From ccc24e8732c479711beba9c11b0fdfeb85474597 Mon Sep 17 00:00:00 2001
From: Boyuan Yang
Date: Sun, 7 Nov 2021 13:53:56 +0000
Subject: [PATCH] Import libgav1_0.17.0.orig.tar.xz

[dgit import orig libgav1_0.17.0.orig.tar.xz]
---
 382 files changed, 168116 insertions(+)
diff --git a/.cmake-format.py b/.cmake-format.py
new file mode 100644
index 0000000..90499e5
--- /dev/null
+++ b/.cmake-format.py
@@ -0,0 +1,126 @@
+# Generated with cmake-format 0.5.4
+# --------------------------
+# General Formatting Options
+# --------------------------
+# How wide to allow formatted cmake files
+line_width = 80
+
+# How many spaces to tab for indent
+tab_size = 2
+
+# If arglists are longer than this, break them always
+max_subargs_per_line = 10
+
+# If true, separate flow control names from their parentheses with a space
+separate_ctrl_name_with_space = False
+
+# If true, separate function names from parentheses with a space
+separate_fn_name_with_space = False
+
+# If a statement is wrapped to more than one line, then dangle the closing
+# parenthesis on its own line
+dangle_parens = False
+
+# If the statement spelling length (including space and parenthesis) is larger
+# than the tab width by more than this amount, then force reject un-nested
+# layouts.
+max_prefix_chars = 2
+
+# If a candidate layout is wrapped horizontally but it exceeds this many lines,
+# then reject the layout.
+max_lines_hwrap = 2
+
+# What style line endings to use in the output.
+line_ending = 'unix'
+
+# Format command names consistently as 'lower' or 'upper' case
+command_case = 'lower'
+
+# Format keywords consistently as 'lower' or 'upper' case
+keyword_case = 'unchanged'
+
+# Specify structure for custom cmake functions
+additional_commands = {
+  "foo": {
+    "flags": [
+      "BAR",
+      "BAZ"
+    ],
+    "kwargs": {
+      "HEADERS": "*",
+      "SOURCES": "*",
+      "DEPENDS": "*"
+    }
+  }
+}
+
+# A list of command names which should always be wrapped
+always_wrap = []
+
+# Specify the order of wrapping algorithms during successive reflow attempts
+algorithm_order = [0, 1, 2, 3, 4]
+
+# If true, the argument lists which are known to be sortable will be sorted
+# lexicographically
+enable_sort = False
+
+# If true, the parsers may infer whether or not an argument list is sortable
+# (without annotation).
+autosort = False
+
+# If a comment line starts with at least this many consecutive hash characters,
+# then don't lstrip() them off. This allows for lazy hash rulers where the first
+# hash char is not separated by space
+hashruler_min_length = 10
+
+# A dictionary containing any per-command configuration overrides. Currently
+# only `command_case` is supported.
+per_command = {}
+
+
+# --------------------------
+# Comment Formatting Options
+# --------------------------
+# What character to use for bulleted lists
+bullet_char = '*'
+
+# What character to use as punctuation after numerals in an enumerated list
+enum_char = '.'
+
+# enable comment markup parsing and reflow
+enable_markup = True
+
+# If comment markup is enabled, don't reflow the first comment block in each
+# listfile. Use this to preserve formatting of your copyright/license
+# statements.
+first_comment_is_literal = True
+
+# If comment markup is enabled, don't reflow any comment block which matches
+# this (regex) pattern. Default is `None` (disabled).
+literal_comment_pattern = None
+
+# Regular expression to match preformat fences in comments
+# default=r'^\s*([`~]{3}[`~]*)(.*)$'
+fence_pattern = '^\\s*([`~]{3}[`~]*)(.*)$'
+
+# Regular expression to match rulers in comments
+# default=r'^\s*[^\w\s]{3}.*[^\w\s]{3}$'
+ruler_pattern = '^\\s*[^\\w\\s]{3}.*[^\\w\\s]{3}$'
+
+# If true, then insert a space between the first hash char and remaining hash
+# chars in a hash ruler, and normalize its length to fill the column
+canonicalize_hashrulers = True
+
+
+# ---------------------------------
+# Miscellaneous Options
+# ---------------------------------
+# If true, emit the unicode byte-order mark (BOM) at the start of the file
+emit_byteorder_mark = False
+
+# Specify the encoding of the input file. Defaults to utf-8.
+input_encoding = 'utf-8'
+
+# Specify the encoding of the output file. Defaults to utf-8. Note that cmake
+# only claims to support utf-8 so be careful when using anything else
+output_encoding = 'utf-8'
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..b934084
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1 @@
+* whitespace=tab-in-indent,space-before-tab,trailing-space
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..87ccf24
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/build
+/third_party
diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 0000000..d92ea0a
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,6 @@
+# This is the list of libgav1 authors for copyright purposes.
+#
+# This does not necessarily list everyone who has contributed code, since in
+# some cases, their employer may be the copyright holder. To see the full list
+# of contributors, see the revision history in source control.
+Google LLC
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..4029de1
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,158 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# libgav1 requires modern CMake.
+cmake_minimum_required(VERSION 3.7.1 FATAL_ERROR)
+
+# libgav1 requires C++11.
+set(CMAKE_CXX_STANDARD 11)
+set(ABSL_CXX_STANDARD 11)
+# libgav1 requires C99.
+set(CMAKE_C_STANDARD 99)
+
+project(libgav1 CXX C)
+
+set(libgav1_root "${CMAKE_CURRENT_SOURCE_DIR}")
+set(libgav1_build "${CMAKE_BINARY_DIR}")
+
+if("${libgav1_root}" STREQUAL "${libgav1_build}")
+  message(
+    FATAL_ERROR
+      "Building from within the libgav1 source tree is not supported.\n"
+      "Hint: Run these commands\n" "$ rm -rf CMakeCache.txt CMakeFiles\n"
+      "$ mkdir -p ../libgav1_build\n" "$ cd ../libgav1_build\n"
+      "And re-run CMake from the libgav1_build directory.")
+endif()
+
+set(libgav1_examples "${libgav1_root}/examples")
+set(libgav1_source "${libgav1_root}/src")
+
+include("${libgav1_root}/cmake/libgav1_options.cmake")
+
+libgav1_option(NAME LIBGAV1_ENABLE_OPTIMIZATIONS HELPSTRING
+               "Enables optimized code." VALUE ON)
+libgav1_option(NAME LIBGAV1_ENABLE_AVX2 HELPSTRING "Enables avx2 optimizations."
+               VALUE ON)
+libgav1_option(NAME LIBGAV1_ENABLE_NEON HELPSTRING "Enables neon optimizations."
+               VALUE ON)
+libgav1_option(NAME LIBGAV1_ENABLE_SSE4_1 HELPSTRING
+               "Enables sse4.1 optimizations." VALUE ON)
+libgav1_option(NAME LIBGAV1_ENABLE_TESTS HELPSTRING "Enables tests." VALUE ON)
+libgav1_option(
+  NAME LIBGAV1_VERBOSE HELPSTRING
+  "Enables verbose build system output. Higher numbers are more verbose." VALUE
+  OFF)
+
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Release)
+endif()
+
+# Enable generators like Xcode and Visual Studio to place projects in folders.
+get_property(use_folders_is_set GLOBAL PROPERTY USE_FOLDERS SET)
+if(NOT use_folders_is_set)
+  set_property(GLOBAL PROPERTY USE_FOLDERS TRUE)
+endif()
+
+include(FindThreads)
+
+include("${libgav1_examples}/libgav1_examples.cmake")
+include("${libgav1_root}/cmake/libgav1_build_definitions.cmake")
+include("${libgav1_root}/cmake/libgav1_cpu_detection.cmake")
+include("${libgav1_root}/cmake/libgav1_flags.cmake")
+include("${libgav1_root}/cmake/libgav1_helpers.cmake")
+include("${libgav1_root}/cmake/libgav1_install.cmake")
+include("${libgav1_root}/cmake/libgav1_intrinsics.cmake")
+include("${libgav1_root}/cmake/libgav1_sanitizer.cmake")
+include("${libgav1_root}/cmake/libgav1_targets.cmake")
+include("${libgav1_root}/cmake/libgav1_variables.cmake")
+include("${libgav1_root}/tests/libgav1_tests.cmake")
+include("${libgav1_source}/dsp/libgav1_dsp.cmake")
+include("${libgav1_source}/libgav1_decoder.cmake")
+include("${libgav1_source}/utils/libgav1_utils.cmake")
+
+libgav1_optimization_detect()
+libgav1_set_build_definitions()
+libgav1_set_cxx_flags()
+libgav1_configure_sanitizer()
+
+# Supported bit depth.
+libgav1_track_configuration_variable(LIBGAV1_MAX_BITDEPTH)
+
+# C++ and linker flags.
+libgav1_track_configuration_variable(LIBGAV1_CXX_FLAGS)
+libgav1_track_configuration_variable(LIBGAV1_EXE_LINKER_FLAGS)
+
+# Sanitizer integration.
+libgav1_track_configuration_variable(LIBGAV1_SANITIZE)
+
+# Generated source file directory.
+libgav1_track_configuration_variable(LIBGAV1_GENERATED_SOURCES_DIRECTORY)
+
+# Controls use of std::mutex and absl::Mutex in ThreadPool.
+libgav1_track_configuration_variable(LIBGAV1_THREADPOOL_USE_STD_MUTEX)
+
+if(LIBGAV1_VERBOSE)
+  libgav1_dump_cmake_flag_variables()
+  libgav1_dump_tracked_configuration_variables()
+  libgav1_dump_options()
+endif()
+
+set(libgav1_abseil_build "${libgav1_build}/abseil")
+set(libgav1_gtest_build "${libgav1_build}/gtest")
+
+# Compiler/linker flags must be lists, but come in from the environment as
+# strings. Break them up:
+if(NOT "${LIBGAV1_CXX_FLAGS}" STREQUAL "")
+  separate_arguments(LIBGAV1_CXX_FLAGS)
+endif()
+if(NOT "${LIBGAV1_EXE_LINKER_FLAGS}" STREQUAL "")
+  separate_arguments(LIBGAV1_EXE_LINKER_FLAGS)
+endif()
+
+# Set test-only flags based on LIBGAV1_CXX_FLAGS.
+libgav1_set_test_flags()
+
+set(libgav1_abseil "${libgav1_root}/third_party/abseil-cpp")
+if(NOT EXISTS "${libgav1_abseil}")
+  message(
+    FATAL_ERROR
+      "Abseil not found. This dependency is required by the"
+      " examples & tests and libgav1 when LIBGAV1_THREADPOOL_USE_STD_MUTEX is"
+      " not defined. To continue, download the Abseil repository to"
+      " third_party/abseil-cpp:\n  git \\\n    -C ${libgav1_root} \\\n"
+      "    clone \\\n"
+      "    https://github.com/abseil/abseil-cpp.git third_party/abseil-cpp")
+endif()
+set(ABSL_PROPAGATE_CXX_STD ON)
+add_subdirectory("${libgav1_abseil}" "${libgav1_abseil_build}" EXCLUDE_FROM_ALL)
+
+libgav1_reset_target_lists()
+libgav1_add_dsp_targets()
+libgav1_add_decoder_targets()
+libgav1_add_examples_targets()
+libgav1_add_tests_targets()
+libgav1_add_utils_targets()
+libgav1_setup_install_target()
+
+if(LIBGAV1_ENABLE_TESTS)
+  # include(CTest) or -DBUILD_TESTING=1 aren't used to avoid enabling abseil
+  # tests.
+  enable_testing()
+endif()
+
+if(LIBGAV1_VERBOSE)
+  libgav1_dump_cmake_flag_variables()
+  libgav1_dump_tracked_configuration_variables()
+  libgav1_dump_options()
+endif()
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..69140ff
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,27 @@
+# How to Contribute
+
+We'd love to accept your patches and contributions to this project. There are
+just a few small guidelines you need to follow.
+
+## Contributor License Agreement
+
+Contributions to this project must be accompanied by a Contributor License
+Agreement. You (or your employer) retain the copyright to your contribution;
+this simply gives us permission to use and redistribute your contributions as
+part of the project. Head over to <https://cla.developers.google.com/> to see
+your current agreements on file or to sign a new one.
+
+You generally only need to submit a CLA once, so if you've already submitted
+one (even if it was for a different project), you probably don't need to do it
+again.
+
+## Code reviews
+
+All submissions, including submissions by project members, require review. We
+use a [Gerrit](https://www.gerritcodereview.com) instance hosted at
+https://chromium-review.googlesource.com for this purpose.
+
+## Community Guidelines
+
+This project follows
+[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..d645695
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!) The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..6744291
--- /dev/null
+++ b/README.md
@@ -0,0 +1,195 @@
+# libgav1 -- an AV1 decoder
+
+libgav1 is a Main profile (0) & High profile (1) compliant AV1 decoder. More
+information on the AV1 video format can be found at
+[aomedia.org](https://aomedia.org).
+
+[TOC]
+
+## Building
+
+### Prerequisites
+
+1.  A C++11 compiler. gcc 6+, clang 7+ or Microsoft Visual Studio 2017+ are
+    recommended.
+
+2.  [CMake >= 3.7.1](https://cmake.org/download/)
+
+3.  [Abseil](https://abseil.io)
+
+    From within the libgav1 directory:
+
+    ```shell
+    $ git clone https://github.com/abseil/abseil-cpp.git third_party/abseil-cpp
+    ```
+
+    Note: Abseil is required by the examples and tests. libgav1 will depend on
+    it if `LIBGAV1_THREADPOOL_USE_STD_MUTEX` is set to `0` (see below).
+
+4.  (Optional) [GoogleTest](https://github.com/google/googletest)
+
+    From within the libgav1 directory:
+
+    ```shell
+    $ git clone https://github.com/google/googletest.git third_party/googletest
+    ```
+
+### Compile
+
+```shell
+  $ mkdir build && cd build
+  $ cmake -G "Unix Makefiles" ..
+  $ make
+```
+
+Configuration options:
+
+*   `LIBGAV1_MAX_BITDEPTH`: defines the maximum supported bitdepth (8, 10;
+    default: 10).
+*   `LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS`: define to a non-zero value to disable
+    [symbol reduction](#symbol-reduction) in an optimized build to keep all
+    versions of dsp functions available. Automatically defined in
+    `src/dsp/dsp.h` if unset.
+*   `LIBGAV1_ENABLE_AVX2`: define to a non-zero value to enable avx2
+    optimizations. Automatically defined in `src/utils/cpu.h` if unset.
+*   `LIBGAV1_ENABLE_NEON`: define to a non-zero value to enable NEON
+    optimizations. Automatically defined in `src/utils/cpu.h` if unset.
+*   `LIBGAV1_ENABLE_SSE4_1`: define to a non-zero value to enable sse4.1
+    optimizations. Automatically defined in `src/utils/cpu.h` if unset.
+    Note: setting this to 0 will also disable AVX2.
+*   `LIBGAV1_ENABLE_LOGGING`: define to 0/1 to control debug logging.
+    Automatically defined in `src/utils/logging.h` if unset.
+*   `LIBGAV1_EXAMPLES_ENABLE_LOGGING`: define to 0/1 to control error logging
+    in the examples. Automatically defined in `examples/logging.h` if unset.
+*   `LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK`: define to 1 to enable transform
+    coefficient range checks.
+*   `LIBGAV1_LOG_LEVEL`: controls the maximum allowed log level; see `enum
+    LogSeverity` in `src/utils/logging.h`. Automatically defined in
+    `src/utils/logging.cc` if unset.
+*   `LIBGAV1_THREADPOOL_USE_STD_MUTEX`: controls use of std::mutex and
+    absl::Mutex in ThreadPool. Defining this to 1 will remove any Abseil
+    dependency from the core library. Automatically defined in
+    `src/utils/threadpool.h` if unset. Defaults to 1 on Android & iOS, 0
+    otherwise.
+*   `LIBGAV1_MAX_THREADS`: sets the number of threads that the library is
+    allowed to create. Has to be an integer > 0; otherwise it is ignored. The
+    default value is 128.
+*   `LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER`: the threshold multiplier
+    used to determine when to use frame parallel decoding. Frame parallel
+    decoding will be used if |threads| > |tile_count| * this multiplier (e.g.,
+    with the default multiplier of 4, a 2-tile stream switches to frame
+    parallel decoding once more than 8 threads are allowed). Has to be an
+    integer > 0. The default value is 4. This is an advanced setting intended
+    for testing purposes.
+
+For additional options see:
+
+```shell
+  $ cmake .. -LH
+```
+
+## Testing
+
+*   `gav1_decode` can be used to decode IVF files; see `gav1_decode --help`
+    for options (a minimal version of its decode loop is sketched after this
+    list). Note: tools like [FFmpeg](https://ffmpeg.org) can be used to
+    convert other container formats to IVF.
+
+*   Unit tests are built when `LIBGAV1_ENABLE_TESTS` is set to `1`. The
+    binaries can be invoked directly or with
+    [`ctest`](https://cmake.org/cmake/help/latest/manual/ctest.1.html).
+
+    *   The test input location can be given by setting the
+        `LIBGAV1_TEST_DATA_PATH` environment variable; it defaults to
+        `<path>/tests/data`, where `<path>` is `/data/local/tmp` on Android
+        platforms or the source directory configured with cmake otherwise.
+
+    *   Output is written to the value of the `TMPDIR` or `TEMP` environment
+        variables in that order if set, otherwise `/data/local/tmp` on Android
+        platforms, the value of `LIBGAV1_FLAGS_TMPDIR` if defined during
+        compilation or the current directory if not.
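+
+For programmatic use, the same decode loop that `gav1_decode` implements can be
+written against the public API in `src/gav1/decoder.h`. The sketch below is
+illustrative rather than authoritative: `ReadNextTemporalUnit()` is a
+hypothetical stand-in for an IVF demuxer (see `examples/ivf_parser.h`), error
+handling is reduced to early returns, and frames still buffered at end of
+stream are not drained.
+
+```cpp
+#include <cstdint>
+#include <cstdio>
+#include <vector>
+
+#include "gav1/decoder.h"
+
+// Hypothetical helper: fills |data| with the next temporal unit from an IVF
+// file and returns false at end of stream.
+bool ReadNextTemporalUnit(std::vector<uint8_t>* data);
+
+int main() {
+  libgav1::DecoderSettings settings;
+  settings.threads = 4;  // Capped by LIBGAV1_MAX_THREADS, see above.
+
+  libgav1::Decoder decoder;
+  if (decoder.Init(&settings) != libgav1::kStatusOk) return 1;
+
+  std::vector<uint8_t> temporal_unit;
+  while (ReadNextTemporalUnit(&temporal_unit)) {
+    // |user_private_data| is echoed back with the frame produced by this
+    // temporal unit; gav1_decode uses it to carry timestamps.
+    if (decoder.EnqueueFrame(temporal_unit.data(), temporal_unit.size(),
+                             /*user_private_data=*/0,
+                             /*buffer_private_data=*/nullptr) !=
+        libgav1::kStatusOk) {
+      return 1;
+    }
+    // In non-frame-parallel mode each enqueue is paired with a dequeue;
+    // |buffer| may be nullptr when a temporal unit has no displayable frame.
+    const libgav1::DecoderBuffer* buffer;
+    if (decoder.DequeueFrame(&buffer) != libgav1::kStatusOk) return 1;
+    if (buffer != nullptr) {
+      std::printf("decoded %dx%d frame, bitdepth %d\n",
+                  buffer->displayed_width[0], buffer->displayed_height[0],
+                  buffer->bitdepth);
+    }
+  }
+  return 0;
+}
+```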
+ +## Development + +### Contributing + +See [CONTRIBUTING.md](CONTRIBUTING.md) for details on how to submit patches. + +### Style + +libgav1 follows the +[Google C++ style guide](https://google.github.io/styleguide/cppguide.html) with +formatting enforced by `clang-format`. + +### Comments + +Comments of the form '`// X.Y(.Z).`', '`Section X.Y(.Z).`' or '`... in the +spec`' reference the relevant section(s) in the +[AV1 specification](http://aomediacodec.github.io/av1-spec/av1-spec.pdf). + +### DSP structure + +* `src/dsp/dsp.cc` defines the main entry point: `libgav1::dsp::DspInit()`. + This handles cpu-detection and initializing each logical unit which populate + `libgav1::dsp::Dsp` function tables. +* `src/dsp/dsp.h` contains function and type definitions for all logical units + (e.g., intra-predictors) +* `src/utils/cpu.h` contains definitions for cpu-detection +* base implementations are located in `src/dsp/*.{h,cc}` with platform + specific optimizations in sub-folders +* unit tests define `DISABLED_Speed` test(s) to allow timing of individual + functions + +#### Symbol reduction + +Based on the build configuration unneeded lesser optimizations are removed using +a hierarchical include and define system. Each logical unit in `src/dsp` should +include all platform specific headers in descending order to allow higher level +optimizations to disable lower level ones. See `src/dsp/loop_filter.h` for an +example. + +Each function receives a new define which can be checked in platform specific +headers. The format is: `LIBGAV1__FunctionName` or +`LIBGAV1__[sub-table-index1][...-indexN]`, e.g., +`LIBGAV1_Dsp8bpp_AverageBlend`, +`LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc`. The Dsp-table name is of +the form `Dspbpp` e.g. `Dsp10bpp` for bitdepth == 10 (bpp stands for +bits per pixel). The indices correspond to enum values used as lookups with +leading 'k' removed. Platform specific headers then should first check if the +symbol is defined and if not set the value to the corresponding +`LIBGAV1_CPU_` value from `src/utils/cpu.h`. + +``` + #ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc + #define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_SSE4_1 + #endif +``` + +Within each module the code should check if the symbol is defined to its +specific architecture or forced via `LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS` before +defining the function. The `DSP_ENABLED_(8|10)BPP_*` macros are available to +simplify this check for optimized code. + +``` + #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDc) + ... + + // In unoptimized code use the following structure; there's no equivalent + // define for LIBGAV1_CPU_C as it would require duplicating the function + // defines used in optimized code for only a small benefit to this + // boilerplate. + #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + ... + #else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + #ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcFill + ... 
+``` + +## Bugs + +Please report all bugs to the issue tracker: +https://issuetracker.google.com/issues/new?component=750480&template=1355007 + +## Discussion + +Email: gav1-devel@googlegroups.com + +Web: https://groups.google.com/forum/#!forum/gav1-devel diff --git a/cmake/libgav1-config.cmake.template b/cmake/libgav1-config.cmake.template new file mode 100644 index 0000000..dc253d3 --- /dev/null +++ b/cmake/libgav1-config.cmake.template @@ -0,0 +1,2 @@ +set(LIBGAV1_INCLUDE_DIRS "@LIBGAV1_INCLUDE_DIRS@") +set(LIBGAV1_LIBRARIES "gav1") diff --git a/cmake/libgav1.pc.template b/cmake/libgav1.pc.template new file mode 100644 index 0000000..c571a43 --- /dev/null +++ b/cmake/libgav1.pc.template @@ -0,0 +1,11 @@ +prefix=@prefix@ +exec_prefix=@exec_prefix@ +libdir=@libdir@ +includedir=@includedir@ + +Name: @PROJECT_NAME@ +Description: AV1 decoder library (@LIBGAV1_MAX_BITDEPTH@-bit). +Version: @LIBGAV1_VERSION@ +Cflags: -I${includedir} +Libs: -L${libdir} -lgav1 +Libs.private: @CMAKE_THREAD_LIBS_INIT@ diff --git a/cmake/libgav1_build_definitions.cmake b/cmake/libgav1_build_definitions.cmake new file mode 100644 index 0000000..0d00bb6 --- /dev/null +++ b/cmake/libgav1_build_definitions.cmake @@ -0,0 +1,166 @@ +# Copyright 2019 The libgav1 Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(LIBGAV1_CMAKE_LIBGAV1_BUILD_DEFINITIONS_CMAKE_) + return() +endif() # LIBGAV1_CMAKE_LIBGAV1_BUILD_DEFINITIONS_CMAKE_ +set(LIBGAV1_CMAKE_LIBGAV1_BUILD_DEFINITIONS_CMAKE_ 1) + +macro(libgav1_set_build_definitions) + string(TOLOWER "${CMAKE_BUILD_TYPE}" build_type_lowercase) + + libgav1_load_version_info() + + # Library version info. See the libtool docs for updating the values: + # https://www.gnu.org/software/libtool/manual/libtool.html#Updating-version-info + # + # c=, r=, a= + # + # libtool generates a .so file as .so.[c-a].a.r, while -version-info c:r:a is + # passed to libtool. 
+ # + # We set LIBGAV1_SOVERSION = [c-a].a.r + set(LT_CURRENT 0) + set(LT_REVISION 1) + set(LT_AGE 0) + math(EXPR LIBGAV1_SOVERSION_MAJOR "${LT_CURRENT} - ${LT_AGE}") + set(LIBGAV1_SOVERSION "${LIBGAV1_SOVERSION_MAJOR}.${LT_AGE}.${LT_REVISION}") + unset(LT_CURRENT) + unset(LT_REVISION) + unset(LT_AGE) + + list(APPEND libgav1_include_paths "${libgav1_root}" "${libgav1_root}/src" + "${libgav1_build}" "${libgav1_root}/third_party/abseil-cpp") + list(APPEND libgav1_gtest_include_paths + "third_party/googletest/googlemock/include" + "third_party/googletest/googletest/include" + "third_party/googletest/googletest") + list(APPEND libgav1_test_include_paths ${libgav1_include_paths} + ${libgav1_gtest_include_paths}) + list(APPEND libgav1_defines "LIBGAV1_CMAKE=1" + "LIBGAV1_FLAGS_SRCDIR=\"${libgav1_root}\"" + "LIBGAV1_FLAGS_TMPDIR=\"/tmp\"") + + if(MSVC OR WIN32) + list(APPEND libgav1_defines "_CRT_SECURE_NO_WARNINGS" "NOMINMAX" + "_SCL_SECURE_NO_WARNINGS") + endif() + + if(ANDROID) + if(CMAKE_ANDROID_ARCH_ABI STREQUAL "armeabi-v7a") + set(CMAKE_ANDROID_ARM_MODE ON) + endif() + + if(build_type_lowercase MATCHES "rel") + list(APPEND libgav1_base_cxx_flags "-fno-stack-protector") + endif() + endif() + + list(APPEND libgav1_base_cxx_flags "-Wall" "-Wextra" "-Wmissing-declarations" + "-Wno-sign-compare" "-fvisibility=hidden" + "-fvisibility-inlines-hidden") + + if(BUILD_SHARED_LIBS) + set(CMAKE_POSITION_INDEPENDENT_CODE ON) + set(libgav1_dependency libgav1_shared) + else() + set(libgav1_dependency libgav1_static) + endif() + + list(APPEND libgav1_clang_cxx_flags "-Wextra-semi" "-Wmissing-prototypes" + "-Wshorten-64-to-32") + + if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "6") + # Quiet warnings in copy-list-initialization where {} elision has always + # been allowed. + list(APPEND libgav1_clang_cxx_flags "-Wno-missing-braces") + endif() + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8) + list(APPEND libgav1_clang_cxx_flags "-Wextra-semi-stmt") + endif() + endif() + + if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "7") + # Quiet warnings due to potential snprintf() truncation in threadpool.cc. + list(APPEND libgav1_base_cxx_flags "-Wno-format-truncation") + + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7") + # Quiet gcc 6 vs 7 abi warnings: + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77728 + list(APPEND libgav1_base_cxx_flags "-Wno-psabi") + list(APPEND ABSL_GCC_FLAGS "-Wno-psabi") + endif() + endif() + endif() + + if(build_type_lowercase MATCHES "rel") + list(APPEND libgav1_base_cxx_flags "-Wframe-larger-than=196608") + endif() + + list(APPEND libgav1_msvc_cxx_flags + # Warning level 3. + "/W3" + # Disable warning C4018: + # '' signed/unsigned mismatch + "/wd4018" + # Disable warning C4244: + # 'argument': conversion from '' to + # '', possible loss of data + "/wd4244" + # Disable warning C4267: + # '=': conversion from '' to + # '', possible loss of data + "/wd4267" + # Disable warning C4309: + # 'argument': truncation of constant value + "/wd4309" + # Disable warning C4551: + # function call missing argument list + "/wd4551") + + if(BUILD_SHARED_LIBS) + list(APPEND libgav1_msvc_cxx_flags + # Disable warning C4251: + # 'libgav1::DecoderImpl class member' needs to have + # dll-interface to be used by clients of class + # 'libgav1::Decoder'. 
+ "/wd4251") + endif() + + if(NOT LIBGAV1_MAX_BITDEPTH) + set(LIBGAV1_MAX_BITDEPTH 10) + elseif(NOT LIBGAV1_MAX_BITDEPTH EQUAL 8 AND NOT LIBGAV1_MAX_BITDEPTH EQUAL 10) + libgav1_die("LIBGAV1_MAX_BITDEPTH must be 8 or 10.") + endif() + + list(APPEND libgav1_defines "LIBGAV1_MAX_BITDEPTH=${LIBGAV1_MAX_BITDEPTH}") + + if(DEFINED LIBGAV1_THREADPOOL_USE_STD_MUTEX) + if(NOT LIBGAV1_THREADPOOL_USE_STD_MUTEX EQUAL 0 + AND NOT LIBGAV1_THREADPOOL_USE_STD_MUTEX EQUAL 1) + libgav1_die("LIBGAV1_THREADPOOL_USE_STD_MUTEX must be 0 or 1.") + endif() + + list(APPEND libgav1_defines + "LIBGAV1_THREADPOOL_USE_STD_MUTEX=${LIBGAV1_THREADPOOL_USE_STD_MUTEX}") + endif() + + # Source file names ending in these suffixes will have the appropriate + # compiler flags added to their compile commands to enable intrinsics. + set(libgav1_avx2_source_file_suffix "avx2(_test)?.cc") + set(libgav1_neon_source_file_suffix "neon(_test)?.cc") + set(libgav1_sse4_source_file_suffix "sse4(_test)?.cc") +endmacro() diff --git a/cmake/libgav1_cpu_detection.cmake b/cmake/libgav1_cpu_detection.cmake new file mode 100644 index 0000000..d79b83a --- /dev/null +++ b/cmake/libgav1_cpu_detection.cmake @@ -0,0 +1,52 @@ +# Copyright 2019 The libgav1 Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(LIBGAV1_CMAKE_LIBGAV1_CPU_DETECTION_CMAKE_) + return() +endif() # LIBGAV1_CMAKE_LIBGAV1_CPU_DETECTION_CMAKE_ +set(LIBGAV1_CMAKE_LIBGAV1_CPU_DETECTION_CMAKE_ 1) + +# Detect optimizations available for the current target CPU. +macro(libgav1_optimization_detect) + if(LIBGAV1_ENABLE_OPTIMIZATIONS) + string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" cpu_lowercase) + if(cpu_lowercase MATCHES "^arm|^aarch64") + set(libgav1_have_neon ON) + elseif(cpu_lowercase MATCHES "^x86|amd64") + set(libgav1_have_avx2 ON) + set(libgav1_have_sse4 ON) + endif() + endif() + + if(libgav1_have_avx2 AND LIBGAV1_ENABLE_AVX2) + list(APPEND libgav1_defines "LIBGAV1_ENABLE_AVX2=1") + else() + list(APPEND libgav1_defines "LIBGAV1_ENABLE_AVX2=0") + set(libgav1_have_avx2 OFF) + endif() + + if(libgav1_have_neon AND LIBGAV1_ENABLE_NEON) + list(APPEND libgav1_defines "LIBGAV1_ENABLE_NEON=1") + else() + list(APPEND libgav1_defines "LIBGAV1_ENABLE_NEON=0") + set(libgav1_have_neon, OFF) + endif() + + if(libgav1_have_sse4 AND LIBGAV1_ENABLE_SSE4_1) + list(APPEND libgav1_defines "LIBGAV1_ENABLE_SSE4_1=1") + else() + list(APPEND libgav1_defines "LIBGAV1_ENABLE_SSE4_1=0") + set(libgav1_have_sse4 OFF) + endif() +endmacro() diff --git a/cmake/libgav1_flags.cmake b/cmake/libgav1_flags.cmake new file mode 100644 index 0000000..4f2c4fd --- /dev/null +++ b/cmake/libgav1_flags.cmake @@ -0,0 +1,276 @@ +# Copyright 2019 The libgav1 Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_FLAGS_CMAKE_)
+  return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_FLAGS_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_FLAGS_CMAKE_ 1)
+
+include(CheckCXXCompilerFlag)
+include(CheckCXXSourceCompiles)
+
+# Adds compiler flags specified by FLAGS to the sources specified by SOURCES:
+#
+# libgav1_set_compiler_flags_for_sources(SOURCES <sources> FLAGS <flags>)
+macro(libgav1_set_compiler_flags_for_sources)
+  unset(compiler_SOURCES)
+  unset(compiler_FLAGS)
+  unset(optional_args)
+  unset(single_value_args)
+  set(multi_value_args SOURCES FLAGS)
+  cmake_parse_arguments(compiler "${optional_args}" "${single_value_args}"
+                        "${multi_value_args}" ${ARGN})
+
+  if(NOT (compiler_SOURCES AND compiler_FLAGS))
+    libgav1_die("libgav1_set_compiler_flags_for_sources: SOURCES and "
+                "FLAGS required.")
+  endif()
+
+  set_source_files_properties(${compiler_SOURCES} PROPERTIES COMPILE_FLAGS
+                              ${compiler_FLAGS})
+
+  if(LIBGAV1_VERBOSE GREATER 1)
+    foreach(source ${compiler_SOURCES})
+      foreach(flag ${compiler_FLAGS})
+        message("libgav1_set_compiler_flags_for_sources: source:${source} "
+                "flag:${flag}")
+      endforeach()
+    endforeach()
+  endif()
+endmacro()
+
+# Tests compiler flags stored in list(s) specified by FLAG_LIST_VAR_NAMES, adds
+# flags to $LIBGAV1_CXX_FLAGS when tests pass. Terminates configuration if
+# FLAG_REQUIRED is specified and any flag check fails.
+#
+# ~~~
+# libgav1_test_cxx_flag(FLAG_LIST_VAR_NAMES <list variable name(s)>
+#                       [FLAG_REQUIRED])
+# ~~~
+macro(libgav1_test_cxx_flag)
+  unset(cxx_test_FLAG_LIST_VAR_NAMES)
+  unset(cxx_test_FLAG_REQUIRED)
+  unset(single_value_args)
+  set(optional_args FLAG_REQUIRED)
+  set(multi_value_args FLAG_LIST_VAR_NAMES)
+  cmake_parse_arguments(cxx_test "${optional_args}" "${single_value_args}"
+                        "${multi_value_args}" ${ARGN})
+
+  if(NOT cxx_test_FLAG_LIST_VAR_NAMES)
+    libgav1_die("libgav1_test_cxx_flag: FLAG_LIST_VAR_NAMES required")
+  endif()
+
+  unset(cxx_flags)
+  foreach(list_var ${cxx_test_FLAG_LIST_VAR_NAMES})
+    if(LIBGAV1_VERBOSE)
+      message("libgav1_test_cxx_flag: adding ${list_var} to cxx_flags")
+    endif()
+    list(APPEND cxx_flags ${${list_var}})
+  endforeach()
+
+  if(LIBGAV1_VERBOSE)
+    message("CXX test: all flags: ${cxx_flags}")
+  endif()
+
+  unset(all_cxx_flags)
+  list(APPEND all_cxx_flags ${LIBGAV1_CXX_FLAGS} ${cxx_flags})
+
+  # Turn off output from check_cxx_source_compiles. Print status directly
+  # instead since the logging messages from check_cxx_source_compiles can be
+  # quite confusing.
+  set(CMAKE_REQUIRED_QUIET TRUE)
+
+  # Run the actual compile test.
+  unset(libgav1_all_cxx_flags_pass CACHE)
+  message("--- Running combined CXX flags test, flags: ${all_cxx_flags}")
+  check_cxx_compiler_flag("${all_cxx_flags}" libgav1_all_cxx_flags_pass)
+
+  if(cxx_test_FLAG_REQUIRED AND NOT libgav1_all_cxx_flags_pass)
+    libgav1_die("Flag test failed for required flag(s): "
+                "${all_cxx_flags} and FLAG_REQUIRED specified.")
+  endif()
+
+  if(libgav1_all_cxx_flags_pass)
+    # Test passed: update the global flag list used by the libgav1 target
+    # creation wrappers.
+ set(LIBGAV1_CXX_FLAGS ${cxx_flags}) + list(REMOVE_DUPLICATES LIBGAV1_CXX_FLAGS) + + if(LIBGAV1_VERBOSE) + message("LIBGAV1_CXX_FLAGS=${LIBGAV1_CXX_FLAGS}") + endif() + + message("--- Passed combined CXX flags test") + else() + message("--- Failed combined CXX flags test, testing flags individually.") + + if(cxx_flags) + message("--- Testing flags from $cxx_flags: " "${cxx_flags}") + foreach(cxx_flag ${cxx_flags}) + # Between 3.17.0 and 3.18.2 check_cxx_compiler_flag() sets a normal + # variable at parent scope while check_cxx_source_compiles() continues + # to set an internal cache variable, so we unset both to avoid the + # failure / success state persisting between checks. See + # https://gitlab.kitware.com/cmake/cmake/-/issues/21207. + unset(cxx_flag_test_passed) + unset(cxx_flag_test_passed CACHE) + message("--- Testing flag: ${cxx_flag}") + check_cxx_compiler_flag("${cxx_flag}" cxx_flag_test_passed) + + if(cxx_flag_test_passed) + message("--- Passed test for ${cxx_flag}") + else() + list(REMOVE_ITEM cxx_flags ${cxx_flag}) + message("--- Failed test for ${cxx_flag}, flag removed.") + endif() + endforeach() + + set(LIBGAV1_CXX_FLAGS ${cxx_flags}) + endif() + endif() + + if(LIBGAV1_CXX_FLAGS) + list(REMOVE_DUPLICATES LIBGAV1_CXX_FLAGS) + endif() +endmacro() + +# Tests executable linker flags stored in list specified by FLAG_LIST_VAR_NAME, +# adds flags to $LIBGAV1_EXE_LINKER_FLAGS when test passes. Terminates +# configuration when flag check fails. libgav1_set_cxx_flags() must be called +# before calling this macro because it assumes $LIBGAV1_CXX_FLAGS contains only +# valid CXX flags. +# +# libgav1_test_exe_linker_flag() +macro(libgav1_test_exe_linker_flag) + unset(link_FLAG_LIST_VAR_NAME) + unset(optional_args) + unset(multi_value_args) + set(single_value_args FLAG_LIST_VAR_NAME) + cmake_parse_arguments(link "${optional_args}" "${single_value_args}" + "${multi_value_args}" ${ARGN}) + + if(NOT link_FLAG_LIST_VAR_NAME) + libgav1_die("libgav1_test_link_flag: FLAG_LIST_VAR_NAME required") + endif() + + libgav1_set_and_stringify(DEST linker_flags SOURCE_VARS + ${link_FLAG_LIST_VAR_NAME}) + + if(LIBGAV1_VERBOSE) + message("EXE LINKER test: all flags: ${linker_flags}") + endif() + + # Tests of $LIBGAV1_CXX_FLAGS have already passed. Include them with the + # linker test. + libgav1_set_and_stringify(DEST CMAKE_REQUIRED_FLAGS SOURCE_VARS + LIBGAV1_CXX_FLAGS) + + # Cache the global exe linker flags. + if(CMAKE_EXE_LINKER_FLAGS) + set(cached_CMAKE_EXE_LINKER_FLAGS ${CMAKE_EXE_LINKER_FLAGS}) + libgav1_set_and_stringify(DEST CMAKE_EXE_LINKER_FLAGS SOURCE + ${linker_flags}) + endif() + + libgav1_set_and_stringify(DEST CMAKE_EXE_LINKER_FLAGS SOURCE ${linker_flags} + ${CMAKE_EXE_LINKER_FLAGS}) + + # Turn off output from check_cxx_source_compiles. Print status directly + # instead since the logging messages from check_cxx_source_compiles can be + # quite confusing. + set(CMAKE_REQUIRED_QUIET TRUE) + + message("--- Running EXE LINKER test for flags: ${linker_flags}") + + unset(linker_flag_test_passed CACHE) + set(libgav1_cxx_main "\nint main() { return 0; }") + check_cxx_source_compiles("${libgav1_cxx_main}" linker_flag_test_passed) + + if(NOT linker_flag_test_passed) + libgav1_die("EXE LINKER test failed.") + endif() + + message("--- Passed EXE LINKER flag test.") + + # Restore cached global exe linker flags. 
+ if(cached_CMAKE_EXE_LINKER_FLAGS) + set(CMAKE_EXE_LINKER_FLAGS ${cached_CMAKE_EXE_LINKER_FLAGS}) + else() + unset(CMAKE_EXE_LINKER_FLAGS) + endif() +endmacro() + +# Runs the libgav1 compiler tests. This macro builds up the list of list var(s) +# that is passed to libgav1_test_cxx_flag(). +# +# Note: libgav1_set_build_definitions() must be called before this macro. +macro(libgav1_set_cxx_flags) + unset(cxx_flag_lists) + + if(CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU") + list(APPEND cxx_flag_lists libgav1_base_cxx_flags) + endif() + + # Append clang flags after the base set to allow -Wno* overrides to take + # effect. Some of the base flags may enable a large set of warnings, e.g., + # -Wall. + if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") + list(APPEND cxx_flag_lists libgav1_clang_cxx_flags) + endif() + + if(MSVC) + list(APPEND cxx_flag_lists libgav1_msvc_cxx_flags) + endif() + + if(LIBGAV1_VERBOSE) + if(cxx_flag_lists) + libgav1_set_and_stringify(DEST cxx_flags SOURCE_VARS ${cxx_flag_lists}) + message("libgav1_set_cxx_flags: internal CXX flags: ${cxx_flags}") + endif() + endif() + + if(LIBGAV1_CXX_FLAGS) + list(APPEND cxx_flag_lists LIBGAV1_CXX_FLAGS) + if(LIBGAV1_VERBOSE) + message("libgav1_set_cxx_flags: user CXX flags: ${LIBGAV1_CXX_FLAGS}") + endif() + endif() + + libgav1_test_cxx_flag(FLAG_LIST_VAR_NAMES ${cxx_flag_lists}) +endmacro() + +# Sets LIBGAV1_TEST_C_FLAGS and LIBGAV1_TEST_CXX_FLAGS. +# +# Note: libgav1_set_cxx_flags() must be called before this macro. Furthermore, +# the call to this macro should be made after all additions to LIBGAV1_CXX_FLAGS +# are complete. +macro(libgav1_set_test_flags) + if(LIBGAV1_ENABLE_TESTS) + set(LIBGAV1_TEST_CXX_FLAGS ${LIBGAV1_CXX_FLAGS}) + list(FILTER LIBGAV1_TEST_CXX_FLAGS EXCLUDE REGEX "-Wframe-larger-than") + + if(NOT CMAKE_CXX_COMPILER_ID STREQUAL CMAKE_C_COMPILER_ID) + message( + FATAL_ERROR + "C/CXX compiler mismatch (${CMAKE_C_COMPILER_ID} vs" + " ${CMAKE_CXX_COMPILER_ID})! Compiler flags are only tested using" + " CMAKE_CXX_COMPILER, rerun cmake with CMAKE_C_COMPILER set to the" + " C compiler from the same package as CMAKE_CXX_COMPILER to ensure" + " the build completes successfully.") + endif() + set(LIBGAV1_TEST_C_FLAGS ${LIBGAV1_TEST_CXX_FLAGS}) + list(FILTER LIBGAV1_TEST_C_FLAGS EXCLUDE REGEX + "-fvisibility-inlines-hidden") + endif() +endmacro() diff --git a/cmake/libgav1_helpers.cmake b/cmake/libgav1_helpers.cmake new file mode 100644 index 0000000..ac16257 --- /dev/null +++ b/cmake/libgav1_helpers.cmake @@ -0,0 +1,140 @@ +# Copyright 2019 The libgav1 Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(LIBGAV1_CMAKE_LIBGAV1_HELPERS_CMAKE_) + return() +endif() # LIBGAV1_CMAKE_LIBGAV1_HELPERS_CMAKE_ +set(LIBGAV1_CMAKE_LIBGAV1_HELPERS_CMAKE_ 1) + +# Kills build generation using message(FATAL_ERROR) and outputs all data passed +# to the console via use of $ARGN. +macro(libgav1_die) + # macro parameters are not variables so a temporary is needed to work with + # list(). 
+ set(msg ${ARGN}) + # message(${ARGN}) will merge all list elements with no separator while + # "${ARGN}" will output the list as a ';' delimited string. + list(JOIN msg " " msg) + message(FATAL_ERROR "${msg}") +endmacro() + +# Converts semi-colon delimited list variable(s) to string. Output is written to +# variable supplied via the DEST parameter. Input is from an expanded variable +# referenced by SOURCE and/or variable(s) referenced by SOURCE_VARS. +macro(libgav1_set_and_stringify) + set(optional_args) + set(single_value_args DEST SOURCE_VAR) + set(multi_value_args SOURCE SOURCE_VARS) + cmake_parse_arguments(sas "${optional_args}" "${single_value_args}" + "${multi_value_args}" ${ARGN}) + + if(NOT sas_DEST OR NOT (sas_SOURCE OR sas_SOURCE_VARS)) + libgav1_die("libgav1_set_and_stringify: DEST and at least one of SOURCE " + "SOURCE_VARS required.") + endif() + + unset(${sas_DEST}) + + if(sas_SOURCE) + # $sas_SOURCE is one or more expanded variables, just copy the values to + # $sas_DEST. + set(${sas_DEST} "${sas_SOURCE}") + endif() + + if(sas_SOURCE_VARS) + # $sas_SOURCE_VARS is one or more variable names. Each iteration expands a + # variable and appends it to $sas_DEST. + foreach(source_var ${sas_SOURCE_VARS}) + set(${sas_DEST} "${${sas_DEST}} ${${source_var}}") + endforeach() + + # Because $sas_DEST can be empty when entering this scope leading whitespace + # can be introduced to $sas_DEST on the first iteration of the above loop. + # Remove it: + string(STRIP "${${sas_DEST}}" ${sas_DEST}) + endif() + + # Lists in CMake are simply semicolon delimited strings, so stringification is + # just a find and replace of the semicolon. + string(REPLACE ";" " " ${sas_DEST} "${${sas_DEST}}") + + if(LIBGAV1_VERBOSE GREATER 1) + message("libgav1_set_and_stringify: ${sas_DEST}=${${sas_DEST}}") + endif() +endmacro() + +# Creates a dummy source file in $LIBGAV1_GENERATED_SOURCES_DIRECTORY and adds +# it to the specified target. Optionally adds its path to a list variable. +# +# libgav1_create_dummy_source_file( BASENAME > +# [LISTVAR ]) +macro(libgav1_create_dummy_source_file) + set(optional_args) + set(single_value_args TARGET BASENAME LISTVAR) + set(multi_value_args) + cmake_parse_arguments(cdsf "${optional_args}" "${single_value_args}" + "${multi_value_args}" ${ARGN}) + + if(NOT cdsf_TARGET OR NOT cdsf_BASENAME) + libgav1_die( + "libgav1_create_dummy_source_file: TARGET and BASENAME required.") + endif() + + if(NOT LIBGAV1_GENERATED_SOURCES_DIRECTORY) + set(LIBGAV1_GENERATED_SOURCES_DIRECTORY "${libgav1_build}/gen_src") + endif() + + set(dummy_source_dir "${LIBGAV1_GENERATED_SOURCES_DIRECTORY}") + set(dummy_source_file + "${dummy_source_dir}/libgav1_${cdsf_TARGET}_${cdsf_BASENAME}.cc") + set(dummy_source_code + "// Generated file. 
DO NOT EDIT!\n" + "// C++ source file created for target ${cdsf_TARGET}.\n" + "void libgav1_${cdsf_TARGET}_${cdsf_BASENAME}_dummy_function(void)\;\n" + "void libgav1_${cdsf_TARGET}_${cdsf_BASENAME}_dummy_function(void) {}\n") + file(WRITE "${dummy_source_file}" ${dummy_source_code}) + + target_sources(${cdsf_TARGET} PRIVATE ${dummy_source_file}) + + if(cdsf_LISTVAR) + list(APPEND ${cdsf_LISTVAR} "${dummy_source_file}") + endif() +endmacro() + +# Loads the version components from $libgav1_source/gav1/version.h and sets the +# corresponding CMake variables: +# - LIBGAV1_MAJOR_VERSION +# - LIBGAV1_MINOR_VERSION +# - LIBGAV1_PATCH_VERSION +# - LIBGAV1_VERSION, which is: +# - $LIBGAV1_MAJOR_VERSION.$LIBGAV1_MINOR_VERSION.$LIBGAV1_PATCH_VERSION +macro(libgav1_load_version_info) + file(STRINGS "${libgav1_source}/gav1/version.h" version_file_strings) + foreach(str ${version_file_strings}) + if(str MATCHES "#define LIBGAV1_") + if(str MATCHES "#define LIBGAV1_MAJOR_VERSION ") + string(REPLACE "#define LIBGAV1_MAJOR_VERSION " "" LIBGAV1_MAJOR_VERSION + "${str}") + elseif(str MATCHES "#define LIBGAV1_MINOR_VERSION ") + string(REPLACE "#define LIBGAV1_MINOR_VERSION " "" LIBGAV1_MINOR_VERSION + "${str}") + elseif(str MATCHES "#define LIBGAV1_PATCH_VERSION ") + string(REPLACE "#define LIBGAV1_PATCH_VERSION " "" LIBGAV1_PATCH_VERSION + "${str}") + endif() + endif() + endforeach() + set(LIBGAV1_VERSION "${LIBGAV1_MAJOR_VERSION}.${LIBGAV1_MINOR_VERSION}") + set(LIBGAV1_VERSION "${LIBGAV1_VERSION}.${LIBGAV1_PATCH_VERSION}") +endmacro() diff --git a/cmake/libgav1_install.cmake b/cmake/libgav1_install.cmake new file mode 100644 index 0000000..b7f6006 --- /dev/null +++ b/cmake/libgav1_install.cmake @@ -0,0 +1,60 @@ +# Copyright 2019 The libgav1 Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(LIBGAV1_CMAKE_LIBGAV1_INSTALL_CMAKE_) + return() +endif() # LIBGAV1_CMAKE_LIBGAV1_INSTALL_CMAKE_ +set(LIBGAV1_CMAKE_LIBGAV1_INSTALL_CMAKE_ 1) + +# Sets up the Libgav1 install targets. Must be called after the static library +# target is created. 
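+#
+# Once installed, a downstream build can locate the library through the
+# generated pkg-config file, e.g. (hypothetical compile line):
+#
+#   cc app.c $(pkg-config --cflags --libs libgav1)
+#
+# which expands to -I<includedir> and -L<libdir> -lgav1 per the libgav1.pc
+# template above.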
+macro(libgav1_setup_install_target) + if(NOT (MSVC OR XCODE)) + include(GNUInstallDirs) + + # pkg-config: libgav1.pc + set(prefix "${CMAKE_INSTALL_PREFIX}") + set(exec_prefix "\${prefix}") + set(libdir "\${prefix}/${CMAKE_INSTALL_LIBDIR}") + set(includedir "\${prefix}/${CMAKE_INSTALL_INCLUDEDIR}") + set(libgav1_lib_name "libgav1") + + configure_file("${libgav1_root}/cmake/libgav1.pc.template" + "${libgav1_build}/libgav1.pc" @ONLY NEWLINE_STYLE UNIX) + install(FILES "${libgav1_build}/libgav1.pc" + DESTINATION "${prefix}/${CMAKE_INSTALL_LIBDIR}/pkgconfig") + + # CMake config: libgav1-config.cmake + set(LIBGAV1_INCLUDE_DIRS "${prefix}/${CMAKE_INSTALL_INCLUDEDIR}") + configure_file("${libgav1_root}/cmake/libgav1-config.cmake.template" + "${libgav1_build}/libgav1-config.cmake" @ONLY + NEWLINE_STYLE UNIX) + install( + FILES "${libgav1_build}/libgav1-config.cmake" + DESTINATION "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_DATAROOTDIR}/cmake") + + install( + FILES ${libgav1_api_includes} + DESTINATION "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/gav1") + + install(TARGETS gav1_decode DESTINATION + "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}") + install(TARGETS libgav1_static DESTINATION + "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}") + if(BUILD_SHARED_LIBS) + install(TARGETS libgav1_shared DESTINATION + "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}") + endif() + endif() +endmacro() diff --git a/cmake/libgav1_intrinsics.cmake b/cmake/libgav1_intrinsics.cmake new file mode 100644 index 0000000..a2e9ddb --- /dev/null +++ b/cmake/libgav1_intrinsics.cmake @@ -0,0 +1,135 @@ +# Copyright 2019 The libgav1 Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+if(LIBGAV1_CMAKE_LIBGAV1_INTRINSICS_CMAKE_)
+  return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_INTRINSICS_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_INTRINSICS_CMAKE_ 1)
+
+# Returns the compiler flag for the SIMD intrinsics suffix specified by the
+# SUFFIX argument via the variable specified by the VARIABLE argument:
+# libgav1_get_intrinsics_flag_for_suffix(SUFFIX <suffix> VARIABLE <out var>)
+macro(libgav1_get_intrinsics_flag_for_suffix)
+  unset(intrinsics_SUFFIX)
+  unset(intrinsics_VARIABLE)
+  unset(optional_args)
+  unset(multi_value_args)
+  set(single_value_args SUFFIX VARIABLE)
+  cmake_parse_arguments(intrinsics "${optional_args}" "${single_value_args}"
+                        "${multi_value_args}" ${ARGN})
+
+  if(NOT (intrinsics_SUFFIX AND intrinsics_VARIABLE))
+    message(FATAL_ERROR "libgav1_get_intrinsics_flag_for_suffix: SUFFIX and "
+                        "VARIABLE required.")
+  endif()
+
+  if(intrinsics_SUFFIX MATCHES "neon")
+    if(NOT MSVC)
+      set(${intrinsics_VARIABLE} "${LIBGAV1_NEON_INTRINSICS_FLAG}")
+    endif()
+  elseif(intrinsics_SUFFIX MATCHES "avx2")
+    if(MSVC)
+      set(${intrinsics_VARIABLE} "/arch:AVX2")
+    else()
+      set(${intrinsics_VARIABLE} "-mavx2")
+    endif()
+  elseif(intrinsics_SUFFIX MATCHES "sse4")
+    if(NOT MSVC)
+      set(${intrinsics_VARIABLE} "-msse4.1")
+    endif()
+  else()
+    message(FATAL_ERROR "libgav1_get_intrinsics_flag_for_suffix: Unknown "
+                        "intrinsics suffix: ${intrinsics_SUFFIX}")
+  endif()
+
+  if(LIBGAV1_VERBOSE GREATER 1)
+    message("libgav1_get_intrinsics_flag_for_suffix: "
+            "suffix:${intrinsics_SUFFIX} flag:${${intrinsics_VARIABLE}}")
+  endif()
+endmacro()
+
+# Processes source files specified by SOURCES and adds intrinsics flags as
+# necessary:
+# libgav1_process_intrinsics_sources(TARGET <target> SOURCES <sources>)
+#
+# Detects requirement for intrinsics flags using source file name suffix.
+# Currently supports AVX2, SSE4.1, and NEON (when
+# LIBGAV1_NEON_INTRINSICS_FLAG is set).
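+#
+# A hypothetical invocation (target and file names are illustrative):
+#
+#   libgav1_process_intrinsics_sources(TARGET libgav1_dsp
+#                                      SOURCES "cdef_sse4.cc;cdef_avx2.cc")
+#
+# With gcc or clang this compiles cdef_sse4.cc with -msse4.1 and cdef_avx2.cc
+# with -mavx2, and leaves sources without a recognized suffix untouched.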
+macro(libgav1_process_intrinsics_sources)
+  unset(arg_TARGET)
+  unset(arg_SOURCES)
+  unset(optional_args)
+  set(single_value_args TARGET)
+  set(multi_value_args SOURCES)
+  cmake_parse_arguments(arg "${optional_args}" "${single_value_args}"
+                        "${multi_value_args}" ${ARGN})
+  if(NOT (arg_TARGET AND arg_SOURCES))
+    message(FATAL_ERROR "libgav1_process_intrinsics_sources: TARGET and "
+                        "SOURCES required.")
+  endif()
+
+  if(LIBGAV1_ENABLE_AVX2 AND libgav1_have_avx2)
+    unset(avx2_sources)
+    list(APPEND avx2_sources ${arg_SOURCES})
+
+    list(FILTER avx2_sources INCLUDE REGEX
+         "${libgav1_avx2_source_file_suffix}$")
+
+    if(avx2_sources)
+      unset(avx2_flags)
+      libgav1_get_intrinsics_flag_for_suffix(SUFFIX
+                                             ${libgav1_avx2_source_file_suffix}
+                                             VARIABLE avx2_flags)
+      if(avx2_flags)
+        libgav1_set_compiler_flags_for_sources(SOURCES ${avx2_sources} FLAGS
+                                               ${avx2_flags})
+      endif()
+    endif()
+  endif()
+
+  if(LIBGAV1_ENABLE_SSE4_1 AND libgav1_have_sse4)
+    unset(sse4_sources)
+    list(APPEND sse4_sources ${arg_SOURCES})
+
+    list(FILTER sse4_sources INCLUDE REGEX
+         "${libgav1_sse4_source_file_suffix}$")
+
+    if(sse4_sources)
+      unset(sse4_flags)
+      libgav1_get_intrinsics_flag_for_suffix(SUFFIX
+                                             ${libgav1_sse4_source_file_suffix}
+                                             VARIABLE sse4_flags)
+      if(sse4_flags)
+        libgav1_set_compiler_flags_for_sources(SOURCES ${sse4_sources} FLAGS
+                                               ${sse4_flags})
+      endif()
+    endif()
+  endif()
+
+  if(LIBGAV1_ENABLE_NEON AND libgav1_have_neon)
+    unset(neon_sources)
+    list(APPEND neon_sources ${arg_SOURCES})
+    list(FILTER neon_sources INCLUDE REGEX
+         "${libgav1_neon_source_file_suffix}$")
+
+    if(neon_sources AND LIBGAV1_NEON_INTRINSICS_FLAG)
+      unset(neon_flags)
+      libgav1_get_intrinsics_flag_for_suffix(SUFFIX
+                                             ${libgav1_neon_source_file_suffix}
+                                             VARIABLE neon_flags)
+      if(neon_flags)
+        libgav1_set_compiler_flags_for_sources(SOURCES ${neon_sources} FLAGS
+                                               ${neon_flags})
+      endif()
+    endif()
+  endif()
+endmacro()
diff --git a/cmake/libgav1_options.cmake b/cmake/libgav1_options.cmake
new file mode 100644
index 0000000..6327bee
--- /dev/null
+++ b/cmake/libgav1_options.cmake
@@ -0,0 +1,55 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_OPTIONS_CMAKE_)
+  return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_OPTIONS_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_OPTIONS_CMAKE_ 1)
+
+# Simple wrapper for CMake's builtin option command that tracks libgav1's build
+# options in the list variable $libgav1_options.
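+#
+# For example (hypothetical values):
+#
+#   libgav1_option(NAME LIBGAV1_ENABLE_TESTS HELPSTRING "Enables tests." VALUE ON)
+#
+# behaves like option(LIBGAV1_ENABLE_TESTS "Enables tests." ON) and also
+# records the option name in $libgav1_options for libgav1_dump_options().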
+macro(libgav1_option) + unset(option_NAME) + unset(option_HELPSTRING) + unset(option_VALUE) + unset(optional_args) + unset(multi_value_args) + set(single_value_args NAME HELPSTRING VALUE) + cmake_parse_arguments(option "${optional_args}" "${single_value_args}" + "${multi_value_args}" ${ARGN}) + + if(NOT (option_NAME AND option_HELPSTRING AND DEFINED option_VALUE)) + message(FATAL_ERROR "libgav1_option: NAME HELPSTRING and VALUE required.") + endif() + + option(${option_NAME} ${option_HELPSTRING} ${option_VALUE}) + + if(LIBGAV1_VERBOSE GREATER 2) + message("--------- libgav1_option ---------\n" + "option_NAME=${option_NAME}\n" + "option_HELPSTRING=${option_HELPSTRING}\n" + "option_VALUE=${option_VALUE}\n" + "------------------------------------------\n") + endif() + + list(APPEND libgav1_options ${option_NAME}) + list(REMOVE_DUPLICATES libgav1_options) +endmacro() + +# Dumps the $libgav1_options list via CMake message command. +macro(libgav1_dump_options) + foreach(option_name ${libgav1_options}) + message("${option_name}: ${${option_name}}") + endforeach() +endmacro() diff --git a/cmake/libgav1_sanitizer.cmake b/cmake/libgav1_sanitizer.cmake new file mode 100644 index 0000000..2f9ee07 --- /dev/null +++ b/cmake/libgav1_sanitizer.cmake @@ -0,0 +1,47 @@ +# Copyright 2019 The libgav1 Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(LIBGAV1_CMAKE_LIBGAV1_SANITIZER_CMAKE_) + return() +endif() # LIBGAV1_CMAKE_LIBGAV1_SANITIZER_CMAKE_ +set(LIBGAV1_CMAKE_LIBGAV1_SANITIZER_CMAKE_ 1) + +macro(libgav1_configure_sanitizer) + if(LIBGAV1_SANITIZE AND NOT MSVC) + if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") + if(LIBGAV1_SANITIZE MATCHES "cfi") + list(APPEND LIBGAV1_CXX_FLAGS "-flto" "-fno-sanitize-trap=cfi") + list(APPEND LIBGAV1_EXE_LINKER_FLAGS "-flto" "-fno-sanitize-trap=cfi" + "-fuse-ld=gold") + endif() + + if(${CMAKE_SIZEOF_VOID_P} EQUAL 4 + AND LIBGAV1_SANITIZE MATCHES "integer|undefined") + list(APPEND LIBGAV1_EXE_LINKER_FLAGS "--rtlib=compiler-rt" "-lgcc_s") + endif() + endif() + + list(APPEND LIBGAV1_CXX_FLAGS "-fsanitize=${LIBGAV1_SANITIZE}") + list(APPEND LIBGAV1_EXE_LINKER_FLAGS "-fsanitize=${LIBGAV1_SANITIZE}") + + # Make sanitizer callstacks accurate. + list(APPEND LIBGAV1_CXX_FLAGS "-fno-omit-frame-pointer" + "-fno-optimize-sibling-calls") + + # Check the linker flags first as they may be required in the compile check + # to avoid undefined symbols related to the sanitizer. + libgav1_test_exe_linker_flag(FLAG_LIST_VAR_NAME LIBGAV1_EXE_LINKER_FLAGS) + libgav1_test_cxx_flag(FLAG_LIST_VAR_NAMES LIBGAV1_CXX_FLAGS FLAG_REQUIRED) + endif() +endmacro() diff --git a/cmake/libgav1_targets.cmake b/cmake/libgav1_targets.cmake new file mode 100644 index 0000000..f8326a9 --- /dev/null +++ b/cmake/libgav1_targets.cmake @@ -0,0 +1,397 @@ +# Copyright 2019 The libgav1 Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(LIBGAV1_CMAKE_GAV1_TARGETS_CMAKE_) + return() +endif() # LIBGAV1_CMAKE_GAV1_TARGETS_CMAKE_ +set(LIBGAV1_CMAKE_GAV1_TARGETS_CMAKE_ 1) + +if(LIBGAV1_IDE_FOLDER) + set(LIBGAV1_EXAMPLES_IDE_FOLDER "${LIBGAV1_IDE_FOLDER}/examples") + set(LIBGAV1_TESTS_IDE_FOLDER "${LIBGAV1_IDE_FOLDER}/tests") +else() + set(LIBGAV1_EXAMPLES_IDE_FOLDER "libgav1_examples") + set(LIBGAV1_TESTS_IDE_FOLDER "libgav1_tests") +endif() + +# Resets list variables used to track libgav1 targets. +macro(libgav1_reset_target_lists) + unset(libgav1_targets) + unset(libgav1_exe_targets) + unset(libgav1_lib_targets) + unset(libgav1_objlib_targets) + unset(libgav1_sources) + unset(libgav1_test_targets) +endmacro() + +# Creates an executable target. The target name is passed as a parameter to the +# NAME argument, and the sources passed as a parameter to the SOURCES argument: +# libgav1_add_executable(NAME SOURCES [optional args]) +# +# Optional args: +# cmake-format: off +# - OUTPUT_NAME: Override output file basename. Target basename defaults to +# NAME. +# - TEST: Flag. Presence means treat executable as a test. +# - DEFINES: List of preprocessor macro definitions. +# - INCLUDES: list of include directories for the target. +# - COMPILE_FLAGS: list of compiler flags for the target. +# - LINK_FLAGS: List of linker flags for the target. +# - OBJLIB_DEPS: List of CMake object library target dependencies. +# - LIB_DEPS: List of CMake library dependencies. +# cmake-format: on +# +# Sources passed to this macro are added to $libgav1_test_sources when TEST is +# specified. Otherwise sources are added to $libgav1_sources. +# +# Targets passed to this macro are always added $libgav1_targets. When TEST is +# specified targets are also added to list $libgav1_test_targets. Otherwise +# targets are added to $libgav1_exe_targets. 
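+#
+# A hypothetical invocation (argument values are illustrative):
+#
+#   libgav1_add_executable(NAME gav1_decode
+#                          SOURCES examples/gav1_decode.cc
+#                          LIB_DEPS ${libgav1_dependency})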
+macro(libgav1_add_executable) + unset(exe_TEST) + unset(exe_TEST_DEFINES_MAIN) + unset(exe_NAME) + unset(exe_OUTPUT_NAME) + unset(exe_SOURCES) + unset(exe_DEFINES) + unset(exe_INCLUDES) + unset(exe_COMPILE_FLAGS) + unset(exe_LINK_FLAGS) + unset(exe_OBJLIB_DEPS) + unset(exe_LIB_DEPS) + set(optional_args TEST) + set(single_value_args NAME OUTPUT_NAME) + set(multi_value_args SOURCES DEFINES INCLUDES COMPILE_FLAGS LINK_FLAGS + OBJLIB_DEPS LIB_DEPS) + + cmake_parse_arguments(exe "${optional_args}" "${single_value_args}" + "${multi_value_args}" ${ARGN}) + + if(LIBGAV1_VERBOSE GREATER 1) + message("--------- libgav1_add_executable ---------\n" + "exe_TEST=${exe_TEST}\n" + "exe_TEST_DEFINES_MAIN=${exe_TEST_DEFINES_MAIN}\n" + "exe_NAME=${exe_NAME}\n" + "exe_OUTPUT_NAME=${exe_OUTPUT_NAME}\n" + "exe_SOURCES=${exe_SOURCES}\n" + "exe_DEFINES=${exe_DEFINES}\n" + "exe_INCLUDES=${exe_INCLUDES}\n" + "exe_COMPILE_FLAGS=${exe_COMPILE_FLAGS}\n" + "exe_LINK_FLAGS=${exe_LINK_FLAGS}\n" + "exe_OBJLIB_DEPS=${exe_OBJLIB_DEPS}\n" + "exe_LIB_DEPS=${exe_LIB_DEPS}\n" + "------------------------------------------\n") + endif() + + if(NOT (exe_NAME AND exe_SOURCES)) + message(FATAL_ERROR "libgav1_add_executable: NAME and SOURCES required.") + endif() + + list(APPEND libgav1_targets ${exe_NAME}) + if(exe_TEST) + list(APPEND libgav1_test_targets ${exe_NAME}) + list(APPEND libgav1_test_sources ${exe_SOURCES}) + else() + list(APPEND libgav1_exe_targets ${exe_NAME}) + list(APPEND libgav1_sources ${exe_SOURCES}) + endif() + + add_executable(${exe_NAME} ${exe_SOURCES}) + if(exe_TEST) + add_test(NAME ${exe_NAME} COMMAND ${exe_NAME}) + set_property(TARGET ${exe_NAME} PROPERTY FOLDER ${LIBGAV1_TESTS_IDE_FOLDER}) + else() + set_property(TARGET ${exe_NAME} + PROPERTY FOLDER ${LIBGAV1_EXAMPLES_IDE_FOLDER}) + endif() + + if(exe_OUTPUT_NAME) + set_target_properties(${exe_NAME} PROPERTIES OUTPUT_NAME ${exe_OUTPUT_NAME}) + endif() + + libgav1_process_intrinsics_sources(TARGET ${exe_NAME} SOURCES ${exe_SOURCES}) + + if(exe_DEFINES) + target_compile_definitions(${exe_NAME} PRIVATE ${exe_DEFINES}) + endif() + + if(exe_INCLUDES) + target_include_directories(${exe_NAME} PRIVATE ${exe_INCLUDES}) + endif() + + unset(exe_LIBGAV1_COMPILE_FLAGS) + if(exe_TEST) + list(FILTER exe_SOURCES INCLUDE REGEX "\\.c$") + list(LENGTH exe_SOURCES exe_SOURCES_length) + if(exe_SOURCES_length EQUAL 0) + set(exe_LIBGAV1_COMPILE_FLAGS ${LIBGAV1_TEST_CXX_FLAGS}) + else() + set(exe_LIBGAV1_COMPILE_FLAGS ${LIBGAV1_TEST_C_FLAGS}) + endif() + else() + set(exe_LIBGAV1_COMPILE_FLAGS ${LIBGAV1_CXX_FLAGS}) + endif() + + if(exe_COMPILE_FLAGS OR exe_LIBGAV1_COMPILE_FLAGS) + target_compile_options(${exe_NAME} + PRIVATE ${exe_COMPILE_FLAGS} + ${exe_LIBGAV1_COMPILE_FLAGS}) + endif() + + if(exe_LINK_FLAGS OR LIBGAV1_EXE_LINKER_FLAGS) + list(APPEND exe_LINK_FLAGS "${LIBGAV1_EXE_LINKER_FLAGS}") + if(${CMAKE_VERSION} VERSION_LESS "3.13") + # LINK_FLAGS is managed as a string. 
+      libgav1_set_and_stringify(SOURCE "${exe_LINK_FLAGS}" DEST exe_LINK_FLAGS)
+      set_target_properties(${exe_NAME}
+                            PROPERTIES LINK_FLAGS "${exe_LINK_FLAGS}")
+    else()
+      target_link_options(${exe_NAME} PRIVATE ${exe_LINK_FLAGS})
+    endif()
+  endif()
+
+  if(exe_OBJLIB_DEPS)
+    foreach(objlib_dep ${exe_OBJLIB_DEPS})
+      target_sources(${exe_NAME} PRIVATE $<TARGET_OBJECTS:${objlib_dep}>)
+    endforeach()
+  endif()
+
+  if(CMAKE_THREAD_LIBS_INIT)
+    list(APPEND exe_LIB_DEPS ${CMAKE_THREAD_LIBS_INIT})
+  endif()
+
+  if(BUILD_SHARED_LIBS AND (MSVC OR WIN32))
+    target_compile_definitions(${exe_NAME} PRIVATE "LIBGAV1_BUILDING_DLL=0")
+  endif()
+
+  if(exe_LIB_DEPS)
+    unset(exe_static)
+    if("${CMAKE_EXE_LINKER_FLAGS} ${LIBGAV1_EXE_LINKER_FLAGS}" MATCHES "static")
+      set(exe_static ON)
+    endif()
+
+    if(exe_static AND CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
+      # Third party dependencies can introduce dependencies on system and test
+      # libraries. Since the target created here is an executable, and CMake
+      # does not provide a method of controlling order of link dependencies,
+      # wrap all of the dependencies of this target in start/end group flags to
+      # ensure that dependencies of third party targets can be resolved when
+      # those dependencies happen to be resolved by dependencies of the current
+      # target.
+      list(INSERT exe_LIB_DEPS 0 -Wl,--start-group)
+      list(APPEND exe_LIB_DEPS -Wl,--end-group)
+    endif()
+    target_link_libraries(${exe_NAME} PRIVATE ${exe_LIB_DEPS})
+  endif()
+endmacro()
+
+# Creates a library target of the specified type. The target name is passed as
+# a parameter to the NAME argument, the type as a parameter to the TYPE
+# argument, and the sources passed as a parameter to the SOURCES argument:
+# libgav1_add_library(NAME <name> TYPE <type> SOURCES <sources> [optional args])
+#
+# Optional args:
+# cmake-format: off
+# - OUTPUT_NAME: Override output file basename. Target basename defaults to
+#   NAME. OUTPUT_NAME is ignored when BUILD_SHARED_LIBS is enabled and CMake
+#   is generating a build for which MSVC or WIN32 are true. This is to avoid
+#   output basename collisions with DLL import libraries.
+# - TEST: Flag. Presence means treat library as a test.
+# - DEFINES: List of preprocessor macro definitions.
+# - INCLUDES: list of include directories for the target.
+# - COMPILE_FLAGS: list of compiler flags for the target.
+# - LINK_FLAGS: List of linker flags for the target.
+# - OBJLIB_DEPS: List of CMake object library target dependencies.
+# - LIB_DEPS: List of CMake library dependencies.
+# - PUBLIC_INCLUDES: List of include paths to export to dependents.
+# cmake-format: on
+#
+# Sources passed to the macro are added to the lists tracking libgav1 sources:
+# cmake-format: off
+# - When TEST is specified sources are added to $libgav1_test_sources.
+# - Otherwise sources are added to $libgav1_sources.
+# cmake-format: on
+#
+# Targets passed to this macro are added to the lists tracking libgav1 targets:
+# cmake-format: off
+# - Targets are always added to $libgav1_targets.
+# - When the TEST flag is specified, targets are added to
+#   $libgav1_test_targets.
+# - When TEST is not specified:
+#   - Libraries of type SHARED are added to $libgav1_dylib_targets.
+#   - Libraries of type OBJECT are added to $libgav1_objlib_targets.
+#   - Libraries of type STATIC are added to $libgav1_lib_targets.
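+#
+# A hypothetical invocation (argument values are illustrative):
+#   libgav1_add_library(NAME libgav1_utils TYPE OBJECT
+#                       SOURCES src/utils/threadpool.cc)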
+# cmake-format: on
+macro(libgav1_add_library)
+  unset(lib_TEST)
+  unset(lib_NAME)
+  unset(lib_OUTPUT_NAME)
+  unset(lib_TYPE)
+  unset(lib_SOURCES)
+  unset(lib_DEFINES)
+  unset(lib_INCLUDES)
+  unset(lib_COMPILE_FLAGS)
+  unset(lib_LINK_FLAGS)
+  unset(lib_OBJLIB_DEPS)
+  unset(lib_LIB_DEPS)
+  unset(lib_PUBLIC_INCLUDES)
+  set(optional_args TEST)
+  set(single_value_args NAME OUTPUT_NAME TYPE)
+  set(multi_value_args SOURCES DEFINES INCLUDES COMPILE_FLAGS LINK_FLAGS
+      OBJLIB_DEPS LIB_DEPS PUBLIC_INCLUDES)
+
+  cmake_parse_arguments(lib "${optional_args}" "${single_value_args}"
+                        "${multi_value_args}" ${ARGN})
+
+  if(LIBGAV1_VERBOSE GREATER 1)
+    message("--------- libgav1_add_library ---------\n"
+            "lib_TEST=${lib_TEST}\n"
+            "lib_NAME=${lib_NAME}\n"
+            "lib_OUTPUT_NAME=${lib_OUTPUT_NAME}\n"
+            "lib_TYPE=${lib_TYPE}\n"
+            "lib_SOURCES=${lib_SOURCES}\n"
+            "lib_DEFINES=${lib_DEFINES}\n"
+            "lib_INCLUDES=${lib_INCLUDES}\n"
+            "lib_COMPILE_FLAGS=${lib_COMPILE_FLAGS}\n"
+            "lib_LINK_FLAGS=${lib_LINK_FLAGS}\n"
+            "lib_OBJLIB_DEPS=${lib_OBJLIB_DEPS}\n"
+            "lib_LIB_DEPS=${lib_LIB_DEPS}\n"
+            "lib_PUBLIC_INCLUDES=${lib_PUBLIC_INCLUDES}\n"
+            "---------------------------------------\n")
+  endif()
+
+  if(NOT (lib_NAME AND lib_TYPE AND lib_SOURCES))
+    message(FATAL_ERROR "libgav1_add_library: NAME, TYPE and SOURCES required.")
+  endif()
+
+  list(APPEND libgav1_targets ${lib_NAME})
+  if(lib_TEST)
+    list(APPEND libgav1_test_targets ${lib_NAME})
+    list(APPEND libgav1_test_sources ${lib_SOURCES})
+  else()
+    list(APPEND libgav1_sources ${lib_SOURCES})
+    if(lib_TYPE STREQUAL OBJECT)
+      list(APPEND libgav1_objlib_targets ${lib_NAME})
+    elseif(lib_TYPE STREQUAL SHARED)
+      list(APPEND libgav1_dylib_targets ${lib_NAME})
+    elseif(lib_TYPE STREQUAL STATIC)
+      list(APPEND libgav1_lib_targets ${lib_NAME})
+    else()
+      message(WARNING "libgav1_add_library: Unhandled type: ${lib_TYPE}")
+    endif()
+  endif()
+
+  add_library(${lib_NAME} ${lib_TYPE} ${lib_SOURCES})
+  libgav1_process_intrinsics_sources(TARGET ${lib_NAME} SOURCES ${lib_SOURCES})
+
+  if(lib_OUTPUT_NAME)
+    if(NOT (BUILD_SHARED_LIBS AND (MSVC OR WIN32)))
+      set_target_properties(${lib_NAME}
+                            PROPERTIES OUTPUT_NAME ${lib_OUTPUT_NAME})
+    endif()
+  endif()
+
+  if(lib_DEFINES)
+    target_compile_definitions(${lib_NAME} PRIVATE ${lib_DEFINES})
+  endif()
+
+  if(lib_INCLUDES)
+    target_include_directories(${lib_NAME} PRIVATE ${lib_INCLUDES})
+  endif()
+
+  if(lib_PUBLIC_INCLUDES)
+    target_include_directories(${lib_NAME} PUBLIC ${lib_PUBLIC_INCLUDES})
+  endif()
+
+  if(lib_COMPILE_FLAGS OR LIBGAV1_CXX_FLAGS)
+    target_compile_options(${lib_NAME}
+                           PRIVATE ${lib_COMPILE_FLAGS} ${LIBGAV1_CXX_FLAGS})
+  endif()
+
+  if(lib_LINK_FLAGS)
+    set_target_properties(${lib_NAME} PROPERTIES LINK_FLAGS ${lib_LINK_FLAGS})
+  endif()
+
+  if(lib_OBJLIB_DEPS)
+    foreach(objlib_dep ${lib_OBJLIB_DEPS})
+      target_sources(${lib_NAME} PRIVATE $<TARGET_OBJECTS:${objlib_dep}>)
+    endforeach()
+  endif()
+
+  if(lib_LIB_DEPS)
+    if(lib_TYPE STREQUAL STATIC)
+      set(link_type PUBLIC)
+    else()
+      set(link_type PRIVATE)
+      if(lib_TYPE STREQUAL SHARED AND CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
+        # The libgav1 shared object uses the static libgav1 as input to turn it
+        # into a shared object. Include everything from the static library in
+        # the shared object.
+ if(APPLE) + list(INSERT lib_LIB_DEPS 0 -Wl,-force_load) + else() + list(INSERT lib_LIB_DEPS 0 -Wl,--whole-archive) + list(APPEND lib_LIB_DEPS -Wl,--no-whole-archive) + endif() + endif() + endif() + target_link_libraries(${lib_NAME} ${link_type} ${lib_LIB_DEPS}) + endif() + + if(NOT MSVC AND lib_NAME MATCHES "^lib") + # Non-MSVC generators prepend lib to static lib target file names. Libgav1 + # already includes lib in its name. Avoid naming output files liblib*. + set_target_properties(${lib_NAME} PROPERTIES PREFIX "") + endif() + + if(lib_TYPE STREQUAL SHARED AND NOT MSVC) + set_target_properties(${lib_NAME} + PROPERTIES VERSION ${LIBGAV1_SOVERSION} SOVERSION + ${LIBGAV1_SOVERSION_MAJOR}) + endif() + + if(BUILD_SHARED_LIBS AND (MSVC OR WIN32)) + if(lib_TYPE STREQUAL SHARED) + target_compile_definitions(${lib_NAME} PRIVATE "LIBGAV1_BUILDING_DLL=1") + else() + target_compile_definitions(${lib_NAME} PRIVATE "LIBGAV1_BUILDING_DLL=0") + endif() + endif() + + # Determine if $lib_NAME is a header only target. + set(sources_list ${lib_SOURCES}) + list(FILTER sources_list INCLUDE REGEX cc$) + if(NOT sources_list) + if(NOT XCODE) + # This is a header only target. Tell CMake the link language. + set_target_properties(${lib_NAME} PROPERTIES LINKER_LANGUAGE CXX) + else() + # The Xcode generator ignores LINKER_LANGUAGE. Add a dummy cc file. + libgav1_create_dummy_source_file(TARGET ${lib_NAME} BASENAME ${lib_NAME}) + endif() + endif() + + if(lib_TEST) + set_property(TARGET ${lib_NAME} PROPERTY FOLDER ${LIBGAV1_TESTS_IDE_FOLDER}) + else() + set(sources_list ${lib_SOURCES}) + list(FILTER sources_list INCLUDE REGEX examples) + if(sources_list) + set_property(TARGET ${lib_NAME} + PROPERTY FOLDER ${LIBGAV1_EXAMPLES_IDE_FOLDER}) + else() + set_property(TARGET ${lib_NAME} PROPERTY FOLDER ${LIBGAV1_IDE_FOLDER}) + endif() + endif() +endmacro() diff --git a/cmake/libgav1_variables.cmake b/cmake/libgav1_variables.cmake new file mode 100644 index 0000000..0dd0f37 --- /dev/null +++ b/cmake/libgav1_variables.cmake @@ -0,0 +1,78 @@ +# Copyright 2019 The libgav1 Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(LIBGAV1_CMAKE_LIBGAV1_VARIABLES_CMAKE_) + return() +endif() # LIBGAV1_CMAKE_LIBGAV1_VARIABLES_CMAKE_ +set(LIBGAV1_CMAKE_LIBGAV1_VARIABLES_CMAKE_ 1) + +# Halts generation when $variable_name does not refer to a directory that +# exists. +macro(libgav1_variable_must_be_directory variable_name) + if("${variable_name}" STREQUAL "") + message( + FATAL_ERROR + "Empty variable_name passed to libgav1_variable_must_be_directory.") + endif() + + if("${${variable_name}}" STREQUAL "") + message( + FATAL_ERROR + "Empty variable ${variable_name} is required to build libgav1.") + endif() + + if(NOT IS_DIRECTORY "${${variable_name}}") + message( + FATAL_ERROR + "${variable_name}, which is ${${variable_name}}, does not refer to a\n" + "directory.") + endif() +endmacro() + +# Adds $var_name to the tracked variables list. 
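+# For example, a hypothetical call such as
+# libgav1_track_configuration_variable(LIBGAV1_MAX_BITDEPTH) appends that name
+# to $libgav1_configuration_variables so that
+# libgav1_dump_tracked_configuration_variables() can report its value.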
+macro(libgav1_track_configuration_variable var_name) + if(LIBGAV1_VERBOSE GREATER 2) + message("---- libgav1_track_configuration_variable ----\n" + "var_name=${var_name}\n" + "----------------------------------------------\n") + endif() + + list(APPEND libgav1_configuration_variables ${var_name}) + list(REMOVE_DUPLICATES libgav1_configuration_variables) +endmacro() + +# Logs current C++ and executable linker flags via CMake's message command. +macro(libgav1_dump_cmake_flag_variables) + unset(flag_variables) + list(APPEND flag_variables "CMAKE_CXX_FLAGS_INIT" "CMAKE_CXX_FLAGS" + "CMAKE_EXE_LINKER_FLAGS_INIT" "CMAKE_EXE_LINKER_FLAGS") + if(CMAKE_BUILD_TYPE) + list(APPEND flag_variables "CMAKE_BUILD_TYPE" + "CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}_INIT" + "CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}" + "CMAKE_EXE_LINKER_FLAGS_${CMAKE_BUILD_TYPE}_INIT" + "CMAKE_EXE_LINKER_FLAGS_${CMAKE_BUILD_TYPE}") + endif() + foreach(flag_variable ${flag_variables}) + message("${flag_variable}:${${flag_variable}}") + endforeach() +endmacro() + +# Dumps the variables tracked in $libgav1_configuration_variables via CMake's +# message command. +macro(libgav1_dump_tracked_configuration_variables) + foreach(config_variable ${libgav1_configuration_variables}) + message("${config_variable}:${${config_variable}}") + endforeach() +endmacro() diff --git a/cmake/toolchains/aarch64-linux-gnu.cmake b/cmake/toolchains/aarch64-linux-gnu.cmake new file mode 100644 index 0000000..fdcb012 --- /dev/null +++ b/cmake/toolchains/aarch64-linux-gnu.cmake @@ -0,0 +1,35 @@ +# Copyright 2019 The libgav1 Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(LIBGAV1_CMAKE_TOOLCHAINS_AARCH64_LINUX_GNU_CMAKE_) + return() +endif() # LIBGAV1_CMAKE_TOOLCHAINS_AARCH64_LINUX_GNU_CMAKE_ +set(LIBGAV1_CMAKE_TOOLCHAINS_AARCH64_LINUX_GNU_CMAKE_ 1) + +set(CMAKE_SYSTEM_NAME "Linux") + +if("${CROSS}" STREQUAL "") + set(CROSS aarch64-linux-gnu-) +endif() + +# For c_decoder_test.c and c_version_test.c. +if(NOT CMAKE_C_COMPILER) + set(CMAKE_C_COMPILER ${CROSS}gcc) +endif() +set(CMAKE_C_FLAGS_INIT "-march=armv8-a") +if(NOT CMAKE_CXX_COMPILER) + set(CMAKE_CXX_COMPILER ${CROSS}g++) +endif() +set(CMAKE_CXX_FLAGS_INIT "-march=armv8-a") +set(CMAKE_SYSTEM_PROCESSOR "aarch64") diff --git a/cmake/toolchains/android.cmake b/cmake/toolchains/android.cmake new file mode 100644 index 0000000..492957b --- /dev/null +++ b/cmake/toolchains/android.cmake @@ -0,0 +1,53 @@ +# Copyright 2019 The libgav1 Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
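+
+# Hypothetical usage (the NDK path is illustrative):
+#
+#   cmake .. -DCMAKE_TOOLCHAIN_FILE=cmake/toolchains/android.cmake \
+#     -DLIBGAV1_ANDROID_NDK_PATH=/path/to/android-ndk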
+ +if(LIBGAV1_CMAKE_TOOLCHAINS_ANDROID_CMAKE_) + return() +endif() # LIBGAV1_CMAKE_TOOLCHAINS_ANDROID_CMAKE_ + +# Additional ANDROID_* settings are available, see: +# https://developer.android.com/ndk/guides/cmake#variables + +if(NOT ANDROID_PLATFORM) + set(ANDROID_PLATFORM android-21) +endif() + +# Choose target architecture with: +# +# -DANDROID_ABI={armeabi-v7a,armeabi-v7a with NEON,arm64-v8a,x86,x86_64} +if(NOT ANDROID_ABI) + set(ANDROID_ABI arm64-v8a) +endif() + +# Force arm mode for 32-bit targets (instead of the default thumb) to improve +# performance. +if(NOT ANDROID_ARM_MODE) + set(ANDROID_ARM_MODE arm) +endif() + +# Toolchain files don't have access to cached variables: +# https://gitlab.kitware.com/cmake/cmake/issues/16170. Set an intermediate +# environment variable when loaded the first time. +if(LIBGAV1_ANDROID_NDK_PATH) + set(ENV{LIBGAV1_ANDROID_NDK_PATH} "${LIBGAV1_ANDROID_NDK_PATH}") +else() + set(LIBGAV1_ANDROID_NDK_PATH "$ENV{LIBGAV1_ANDROID_NDK_PATH}") +endif() + +if(NOT LIBGAV1_ANDROID_NDK_PATH) + message(FATAL_ERROR "LIBGAV1_ANDROID_NDK_PATH not set.") + return() +endif() + +include("${LIBGAV1_ANDROID_NDK_PATH}/build/cmake/android.toolchain.cmake") diff --git a/cmake/toolchains/arm-linux-gnueabihf.cmake b/cmake/toolchains/arm-linux-gnueabihf.cmake new file mode 100644 index 0000000..7448f54 --- /dev/null +++ b/cmake/toolchains/arm-linux-gnueabihf.cmake @@ -0,0 +1,36 @@ +# Copyright 2019 The libgav1 Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(LIBGAV1_CMAKE_TOOLCHAINS_ARM_LINUX_GNUEABIHF_CMAKE_) + return() +endif() # LIBGAV1_CMAKE_TOOLCHAINS_ARM_LINUX_GNUEABIHF_CMAKE_ +set(LIBGAV1_CMAKE_TOOLCHAINS_ARM_LINUX_GNUEABIHF_CMAKE_ 1) + +set(CMAKE_SYSTEM_NAME "Linux") + +if("${CROSS}" STREQUAL "") + set(CROSS arm-linux-gnueabihf-) +endif() + +# For c_decoder_test.c and c_version_test.c. +if(NOT CMAKE_C_COMPILER) + set(CMAKE_C_COMPILER ${CROSS}gcc) +endif() +set(CMAKE_C_FLAGS_INIT "-march=armv7-a -marm") +if(NOT CMAKE_CXX_COMPILER) + set(CMAKE_CXX_COMPILER ${CROSS}g++) +endif() +set(CMAKE_CXX_FLAGS_INIT "-march=armv7-a -marm") +set(CMAKE_SYSTEM_PROCESSOR "armv7") +set(LIBGAV1_NEON_INTRINSICS_FLAG "-mfpu=neon") diff --git a/codereview.settings b/codereview.settings new file mode 100644 index 0000000..ccba2ee --- /dev/null +++ b/codereview.settings @@ -0,0 +1,4 @@ +# This file is used by git cl to get repository specific information. +GERRIT_HOST: True +CODE_REVIEW_SERVER: chromium-review.googlesource.com +GERRIT_SQUASH_UPLOADS: False diff --git a/examples/file_reader.cc b/examples/file_reader.cc new file mode 100644 index 0000000..b096722 --- /dev/null +++ b/examples/file_reader.cc @@ -0,0 +1,186 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_reader.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+#include <string>
+#include <vector>
+
+#if defined(_WIN32)
+#include <fcntl.h>
+#include <io.h>
+#endif
+
+#include "examples/file_reader_constants.h"
+#include "examples/file_reader_factory.h"
+#include "examples/file_reader_interface.h"
+#include "examples/ivf_parser.h"
+#include "examples/logging.h"
+
+namespace libgav1 {
+namespace {
+
+FILE* SetBinaryMode(FILE* stream) {
+#if defined(_WIN32)
+  _setmode(_fileno(stream), _O_BINARY);
+#endif
+  return stream;
+}
+
+}  // namespace
+
+bool FileReader::registered_in_factory_ =
+    FileReaderFactory::RegisterReader(FileReader::Open);
+
+FileReader::~FileReader() {
+  if (owns_file_) fclose(file_);
+}
+
+std::unique_ptr<FileReaderInterface> FileReader::Open(
+    const std::string& file_name, const bool error_tolerant) {
+  if (file_name.empty()) return nullptr;
+
+  FILE* raw_file_ptr;
+
+  bool owns_file = true;
+  if (file_name == "-") {
+    raw_file_ptr = SetBinaryMode(stdin);
+    owns_file = false;  // stdin is owned by the Standard C Library.
+  } else {
+    raw_file_ptr = fopen(file_name.c_str(), "rb");
+  }
+
+  if (raw_file_ptr == nullptr) {
+    return nullptr;
+  }
+
+  std::unique_ptr<FileReader> file(
+      new (std::nothrow) FileReader(raw_file_ptr, owns_file, error_tolerant));
+  if (file == nullptr) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Out of memory");
+    if (owns_file) fclose(raw_file_ptr);
+    return nullptr;
+  }
+
+  if (!file->ReadIvfFileHeader()) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Unsupported file type");
+    return nullptr;
+  }
+
+  return file;
+}
+
+// IVF Frame Header format, from https://wiki.multimedia.cx/index.php/IVF
+// bytes 0-3    size of frame in bytes (not including the 12-byte header)
+// bytes 4-11   64-bit presentation timestamp
+// bytes 12..   frame data
+bool FileReader::ReadTemporalUnit(std::vector<uint8_t>* const tu_data,
+                                  int64_t* const timestamp) {
+  if (tu_data == nullptr) return false;
+  tu_data->clear();
+
+  uint8_t header_buffer[kIvfFrameHeaderSize];
+  const size_t num_read = fread(header_buffer, 1, kIvfFrameHeaderSize, file_);
+
+  if (IsEndOfFile()) {
+    if (num_read != 0) {
+      LIBGAV1_EXAMPLES_LOG_ERROR(
+          "Cannot read IVF frame header: Not enough data available");
+      return false;
+    }
+
+    return true;
+  }
+
+  IvfFrameHeader ivf_frame_header;
+  if (!ParseIvfFrameHeader(header_buffer, &ivf_frame_header)) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Could not parse IVF frame header");
+    if (error_tolerant_) {
+      ivf_frame_header.frame_size =
+          std::min(ivf_frame_header.frame_size, size_t{kMaxTemporalUnitSize});
+    } else {
+      return false;
+    }
+  }
+
+  if (timestamp != nullptr) *timestamp = ivf_frame_header.timestamp;
+
+  tu_data->resize(ivf_frame_header.frame_size);
+  const size_t size_read =
+      fread(tu_data->data(), 1, ivf_frame_header.frame_size, file_);
+  if (size_read != ivf_frame_header.frame_size) {
+    LIBGAV1_EXAMPLES_LOG_ERROR(
+        "Unexpected EOF or I/O error reading frame data");
+    if (error_tolerant_) {
+      tu_data->resize(size_read);
+    } else {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Attempt to read an IVF file header. Returns true for success, and false for
+// failure.
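+//
+// As a worked example of the layout documented below (illustrative, not
+// normative), the 32-byte file header of a 352x288 'AV01' stream would begin
+// with the following bytes, multi-byte fields little endian:
+//   'D' 'K' 'I' 'F'   signature
+//   0x00 0x00         version (0)
+//   0x20 0x00         header length (32)
+//   'A' 'V' '0' '1'   codec FourCC
+//   0x60 0x01         width (352)
+//   0x20 0x01         height (288)
+//   ...               frame rate, time scale, frame count, unused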
+//
+// IVF File Header format, from https://wiki.multimedia.cx/index.php/IVF
+// bytes 0-3    signature: 'DKIF'
+// bytes 4-5    version (should be 0)
+// bytes 6-7    length of header in bytes
+// bytes 8-11   codec FourCC (e.g., 'VP80')
+// bytes 12-13  width in pixels
+// bytes 14-15  height in pixels
+// bytes 16-19  frame rate
+// bytes 20-23  time scale
+// bytes 24-27  number of frames in file
+// bytes 28-31  unused
+//
+// Note: The rate and scale fields correspond to the numerator and denominator
+// of frame rate (fps) or time base (the reciprocal of frame rate) as follows:
+//
+// bytes 16-19  frame rate  timebase.den  framerate.numerator
+// bytes 20-23  time scale  timebase.num  framerate.denominator
+bool FileReader::ReadIvfFileHeader() {
+  uint8_t header_buffer[kIvfFileHeaderSize];
+  const size_t num_read = fread(header_buffer, 1, kIvfFileHeaderSize, file_);
+  if (num_read != kIvfFileHeaderSize) {
+    LIBGAV1_EXAMPLES_LOG_ERROR(
+        "Cannot read IVF header: Not enough data available");
+    return false;
+  }
+
+  IvfFileHeader ivf_file_header;
+  if (!ParseIvfFileHeader(header_buffer, &ivf_file_header)) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Could not parse IVF file header");
+    if (error_tolerant_) {
+      ivf_file_header = {};
+    } else {
+      return false;
+    }
+  }
+
+  width_ = ivf_file_header.width;
+  height_ = ivf_file_header.height;
+  frame_rate_ = ivf_file_header.frame_rate_numerator;
+  time_scale_ = ivf_file_header.frame_rate_denominator;
+  type_ = kFileTypeIvf;
+
+  return true;
+}
+
+}  // namespace libgav1
diff --git a/examples/file_reader.h b/examples/file_reader.h
new file mode 100644
index 0000000..c342a20
--- /dev/null
+++ b/examples/file_reader.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_READER_H_
+#define LIBGAV1_EXAMPLES_FILE_READER_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "examples/file_reader_interface.h"
+
+namespace libgav1 {
+
+// Temporal Unit based file reader class. Currently supports only IVF files.
+class FileReader : public FileReaderInterface {
+ public:
+  enum FileType {
+    kFileTypeUnknown,
+    kFileTypeIvf,
+  };
+
+  // Creates and returns a FileReader that reads from |file_name|.
+  // If |error_tolerant| is true, format and read errors are ignored, and
+  // ReadTemporalUnit() may return truncated data.
+  // Returns nullptr when the file does not exist, cannot be read, or is not an
+  // IVF file.
+  static std::unique_ptr<FileReaderInterface> Open(
+      const std::string& file_name, bool error_tolerant = false);
+
+  FileReader() = delete;
+  FileReader(const FileReader&) = delete;
+  FileReader& operator=(const FileReader&) = delete;
+
+  // Closes |file_|.
+  ~FileReader() override;
+
+  // Reads a temporal unit from |file_| and writes the data to |tu_data|.
+  // Returns true when:
+  //   - A temporal unit is read successfully, or
+  //   - At end of file.
+  // When ReadTemporalUnit() is called at the end of the file, it will return
+  // true without writing any data to |tu_data|.
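+  //
+  // A minimal read loop, mirroring how the accompanying tests drive this
+  // method (illustrative sketch only):
+  //
+  //   std::vector<uint8_t> tu_data;
+  //   while (!reader->IsEndOfFile()) {
+  //     tu_data.clear();
+  //     if (!reader->ReadTemporalUnit(&tu_data, nullptr)) break;  // Error.
+  //     // ... pass tu_data to the decoder ...
+  //   }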
+  //
+  // The |timestamp| pointer is optional: callers not interested in timestamps
+  // can pass nullptr. When |timestamp| is not a nullptr, this function returns
+  // the presentation timestamp from the IVF frame header.
+  /*LIBGAV1_MUST_USE_RESULT*/ bool ReadTemporalUnit(
+      std::vector<uint8_t>* tu_data, int64_t* timestamp) override;
+
+  /*LIBGAV1_MUST_USE_RESULT*/ bool IsEndOfFile() const override {
+    return feof(file_) != 0;
+  }
+
+  // The values returned by these accessors are strictly informative. No
+  // validation is performed when they are read from the IVF file header.
+  size_t width() const override { return width_; }
+  size_t height() const override { return height_; }
+  size_t frame_rate() const override { return frame_rate_; }
+  size_t time_scale() const override { return time_scale_; }
+
+ private:
+  FileReader(FILE* file, bool owns_file, bool error_tolerant)
+      : file_(file), owns_file_(owns_file), error_tolerant_(error_tolerant) {}
+
+  bool ReadIvfFileHeader();
+
+  FILE* file_ = nullptr;
+  size_t width_ = 0;
+  size_t height_ = 0;
+  size_t frame_rate_ = 0;
+  size_t time_scale_ = 0;
+  FileType type_ = kFileTypeUnknown;
+  // True if this object owns file_ and is responsible for closing it when
+  // done.
+  const bool owns_file_;
+  const bool error_tolerant_;
+
+  static bool registered_in_factory_;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_EXAMPLES_FILE_READER_H_
diff --git a/examples/file_reader_constants.cc b/examples/file_reader_constants.cc
new file mode 100644
index 0000000..8439071
--- /dev/null
+++ b/examples/file_reader_constants.cc
@@ -0,0 +1,23 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_reader_constants.h"
+
+namespace libgav1 {
+
+const char kIvfSignature[4] = {'D', 'K', 'I', 'F'};
+const char kAv1FourCcUpper[4] = {'A', 'V', '0', '1'};
+const char kAv1FourCcLower[4] = {'a', 'v', '0', '1'};
+
+}  // namespace libgav1
diff --git a/examples/file_reader_constants.h b/examples/file_reader_constants.h
new file mode 100644
index 0000000..00922b4
--- /dev/null
+++ b/examples/file_reader_constants.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_READER_CONSTANTS_H_
+#define LIBGAV1_EXAMPLES_FILE_READER_CONSTANTS_H_
+
+namespace libgav1 {
+
+enum {
+  kIvfHeaderVersion = 0,
+  kIvfFrameHeaderSize = 12,
+  kIvfFileHeaderSize = 32,
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+  kMaxTemporalUnitSize = 512 * 1024,
+#else
+  kMaxTemporalUnitSize = 256 * 1024 * 1024,
+#endif
+};
+
+extern const char kIvfSignature[4];
+extern const char kAv1FourCcUpper[4];
+extern const char kAv1FourCcLower[4];
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_EXAMPLES_FILE_READER_CONSTANTS_H_
diff --git a/examples/file_reader_factory.cc b/examples/file_reader_factory.cc
new file mode 100644
index 0000000..d5260eb
--- /dev/null
+++ b/examples/file_reader_factory.cc
@@ -0,0 +1,51 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_reader_factory.h"
+
+#include <new>
+
+#include "examples/logging.h"
+
+namespace libgav1 {
+namespace {
+
+std::vector<FileReaderFactory::OpenFunction>* GetFileReaderOpenFunctions() {
+  static auto* open_functions =
+      new (std::nothrow) std::vector<FileReaderFactory::OpenFunction>();
+  return open_functions;
+}
+
+}  // namespace
+
+bool FileReaderFactory::RegisterReader(OpenFunction open_function) {
+  if (open_function == nullptr) return false;
+  auto* open_functions = GetFileReaderOpenFunctions();
+  const size_t num_readers = open_functions->size();
+  open_functions->push_back(open_function);
+  return open_functions->size() == num_readers + 1;
+}
+
+std::unique_ptr<FileReaderInterface> FileReaderFactory::OpenReader(
+    const std::string& file_name, const bool error_tolerant /*= false*/) {
+  for (auto* open_function : *GetFileReaderOpenFunctions()) {
+    auto reader = open_function(file_name, error_tolerant);
+    if (reader == nullptr) continue;
+    return reader;
+  }
+  LIBGAV1_EXAMPLES_LOG_ERROR("No file reader able to open input");
+  return nullptr;
+}
+
+}  // namespace libgav1
diff --git a/examples/file_reader_factory.h b/examples/file_reader_factory.h
new file mode 100644
index 0000000..0f53484
--- /dev/null
+++ b/examples/file_reader_factory.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_READER_FACTORY_H_
+#define LIBGAV1_EXAMPLES_FILE_READER_FACTORY_H_
+
+#include <memory>
+#include <string>
+
+#include "examples/file_reader_interface.h"
+
+namespace libgav1 {
+
+class FileReaderFactory {
+ public:
+  using OpenFunction = std::unique_ptr<FileReaderInterface> (*)(
+      const std::string& file_name, bool error_tolerant);
+
+  FileReaderFactory() = delete;
+  FileReaderFactory(const FileReaderFactory&) = delete;
+  FileReaderFactory& operator=(const FileReaderFactory&) = delete;
+  ~FileReaderFactory() = default;
+
+  // Registers the OpenFunction for a FileReaderInterface and returns true when
+  // registration succeeds.
+  static bool RegisterReader(OpenFunction open_function);
+
+  // Passes |file_name| to each OpenFunction until one succeeds. Returns
+  // nullptr when no reader is found for |file_name|. Otherwise a
+  // FileReaderInterface is returned. If |error_tolerant| is true and the
+  // reader supports it, some format and read errors may be ignored and
+  // partial data returned.
+  static std::unique_ptr<FileReaderInterface> OpenReader(
+      const std::string& file_name, bool error_tolerant = false);
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_EXAMPLES_FILE_READER_FACTORY_H_
diff --git a/examples/file_reader_factory_test.cc b/examples/file_reader_factory_test.cc
new file mode 100644
index 0000000..346f9f8
--- /dev/null
+++ b/examples/file_reader_factory_test.cc
@@ -0,0 +1,114 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_reader_factory.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <new>
+#include <string>
+#include <vector>
+
+#include "absl/memory/memory.h"
+#include "examples/file_reader_interface.h"
+#include "gtest/gtest.h"
+
+namespace libgav1 {
+namespace {
+
+class AlwaysFailFileReader : public FileReaderInterface {
+ public:
+  static std::unique_ptr<FileReaderInterface> Open(
+      const std::string& /*file_name*/, bool /*error_tolerant*/) {
+    return nullptr;
+  }
+
+  AlwaysFailFileReader() = delete;
+  AlwaysFailFileReader(const AlwaysFailFileReader&) = delete;
+  AlwaysFailFileReader& operator=(const AlwaysFailFileReader&) = delete;
+  // Note this isn't overridden as the class can never be instantiated. This
+  // avoids an unused function warning.
+  // ~AlwaysFailFileReader() override = default;
+
+  bool ReadTemporalUnit(std::vector<uint8_t>* /*data*/,
+                        int64_t* /*pts*/) override {
+    return false;
+  }
+  bool IsEndOfFile() const override { return false; }
+
+  size_t width() const override { return 0; }
+  size_t height() const override { return 0; }
+  size_t frame_rate() const override { return 0; }
+  size_t time_scale() const override { return 0; }
+
+  static bool is_registered_;
+};
+
+class AlwaysOkFileReader : public FileReaderInterface {
+ public:
+  static std::unique_ptr<FileReaderInterface> Open(
+      const std::string& /*file_name*/, bool /*error_tolerant*/) {
+    auto reader = absl::WrapUnique(new (std::nothrow) AlwaysOkFileReader());
+
+    return reader;
+  }
+
+  AlwaysOkFileReader(const AlwaysOkFileReader&) = delete;
+  AlwaysOkFileReader& operator=(const AlwaysOkFileReader&) = delete;
+  ~AlwaysOkFileReader() override = default;
+
+  bool ReadTemporalUnit(std::vector<uint8_t>* /*data*/,
+                        int64_t* /*pts*/) override {
+    return true;
+  }
+  bool IsEndOfFile() const override { return true; }
+
+  size_t width() const override { return 1; }
+  size_t height() const override { return 1; }
+  size_t frame_rate() const override { return 1; }
+  size_t time_scale() const override { return 1; }
+
+  static bool is_registered_;
+
+ private:
+  AlwaysOkFileReader() = default;
+};
+
+bool AlwaysFailFileReader::is_registered_ =
+    FileReaderFactory::RegisterReader(AlwaysFailFileReader::Open);
+
+bool AlwaysOkFileReader::is_registered_ =
+    FileReaderFactory::RegisterReader(AlwaysOkFileReader::Open);
+
+TEST(FileReaderFactoryTest, RegistrationFail) {
+  EXPECT_FALSE(FileReaderFactory::RegisterReader(nullptr));
+}
+
+TEST(FileReaderFactoryTest, OpenReader) {
+  ASSERT_TRUE(AlwaysOkFileReader::is_registered_);
+  ASSERT_TRUE(AlwaysFailFileReader::is_registered_);
+
+  auto reader = FileReaderFactory::OpenReader("fake file");
+  EXPECT_NE(reader, nullptr);
+  EXPECT_TRUE(reader->IsEndOfFile());
+  EXPECT_TRUE(reader->ReadTemporalUnit(nullptr, nullptr));
+  EXPECT_EQ(reader->width(), 1);
+  EXPECT_EQ(reader->height(), 1);
+  EXPECT_EQ(reader->frame_rate(), 1);
+  EXPECT_EQ(reader->time_scale(), 1);
+}
+
+}  // namespace
+}  // namespace libgav1
diff --git a/examples/file_reader_interface.h b/examples/file_reader_interface.h
new file mode 100644
index 0000000..d8f7030
--- /dev/null
+++ b/examples/file_reader_interface.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_READER_INTERFACE_H_
+#define LIBGAV1_EXAMPLES_FILE_READER_INTERFACE_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+namespace libgav1 {
+
+class FileReaderInterface {
+ public:
+  FileReaderInterface() = default;
+  FileReaderInterface(const FileReaderInterface&) = delete;
+  FileReaderInterface& operator=(const FileReaderInterface&) = delete;
+
+  FileReaderInterface(FileReaderInterface&&) = default;
+  FileReaderInterface& operator=(FileReaderInterface&&) = default;
+
+  // Closes the file.
+  virtual ~FileReaderInterface() = default;
+
+  // Reads a temporal unit from the file and writes the data to |tu_data|.
+  // Returns true when:
+  //   - A temporal unit is read successfully, or
+  //   - At end of file.
+  // When ReadTemporalUnit() is called at the end of the file, it will return
+  // true without writing any data to |tu_data|.
+  //
+  // The |timestamp| pointer is optional: callers not interested in timestamps
+  // can pass nullptr. When |timestamp| is not a nullptr, this function returns
+  // the presentation timestamp of the temporal unit.
+  /*LIBGAV1_MUST_USE_RESULT*/ virtual bool ReadTemporalUnit(
+      std::vector<uint8_t>* tu_data, int64_t* timestamp) = 0;
+
+  /*LIBGAV1_MUST_USE_RESULT*/ virtual bool IsEndOfFile() const = 0;
+
+  // The values returned by these accessors are strictly informative. No
+  // validation is performed when they are read from file.
+  virtual size_t width() const = 0;
+  virtual size_t height() const = 0;
+  virtual size_t frame_rate() const = 0;
+  virtual size_t time_scale() const = 0;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_EXAMPLES_FILE_READER_INTERFACE_H_
diff --git a/examples/file_reader_test.cc b/examples/file_reader_test.cc
new file mode 100644
index 0000000..53e27f7
--- /dev/null
+++ b/examples/file_reader_test.cc
@@ -0,0 +1,126 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_reader.h"
+
+#include <cstdint>
+#include <cstdio>
+#include <string>
+#include <vector>
+
+#include "examples/file_reader_interface.h"
+#include "examples/file_reader_test_common.h"
+#include "gtest/gtest.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace {
+
+// For use with tests that expect Open() failure to distinguish failure due to
+// the file contents versus failure due to a missing file.
+bool FileCanBeRead(const std::string& filename) {
+  FILE* const file = fopen(filename.c_str(), "r");
+  if (file != nullptr) {
+    fclose(file);
+    return true;
+  }
+  return false;
+}
+
+TEST(FileReaderTest, FailOpen) {
+  EXPECT_EQ(FileReader::Open(""), nullptr);
+  const std::string filename =
+      test_utils::GetTestInputFilePath("ivf-signature-only");
+  SCOPED_TRACE("Filename: " + filename);
+  EXPECT_TRUE(FileCanBeRead(filename));
+  EXPECT_EQ(FileReader::Open(filename), nullptr);
+}
+
+TEST(FileReaderTest, Open) {
+  const std::string filenames[] = {
+      test_utils::GetTestInputFilePath("five-frames.ivf"),
+      test_utils::GetTestInputFilePath("ivf-header-and-truncated-frame-header"),
+      test_utils::GetTestInputFilePath("ivf-header-only"),
+      test_utils::GetTestInputFilePath("one-frame-truncated.ivf"),
+      test_utils::GetTestInputFilePath("one-frame.ivf"),
+  };
+  for (const auto& filename : filenames) {
+    EXPECT_NE(FileReader::Open(filename), nullptr) << "Filename: " << filename;
+  }
+}
+
+TEST_P(FileReaderFailTest, FailRead) {
+  ASSERT_FALSE(reader_->ReadTemporalUnit(&tu_data_, nullptr));
+}
+
+TEST_P(FileReaderErrorTolerant, ReadThroughEndOfFile) {
+  while (!reader_->IsEndOfFile()) {
+    tu_data_.clear();
+    ASSERT_TRUE(reader_->ReadTemporalUnit(&tu_data_, nullptr));
+    ASSERT_GT(tu_data_.size(), 0);
+  }
+}
+
+TEST_P(FileReaderTestNoTimeStamps, ReadThroughEndOfFile) {
+  while (!reader_->IsEndOfFile()) {
+    tu_data_.clear();
+    ASSERT_TRUE(reader_->ReadTemporalUnit(&tu_data_, nullptr));
+  }
+}
+
+TEST_P(FileReaderTestWithTimeStamps, ReadThroughEndOfFile) {
+  int64_t timestamp = 0;
+  while (!reader_->IsEndOfFile()) {
+    tu_data_.clear();
+    ASSERT_TRUE(reader_->ReadTemporalUnit(&tu_data_, &timestamp));
+    if (!tu_data_.empty()) {
+      last_timestamp_ = timestamp;
+    }
+  }
+  ASSERT_TRUE(tu_data_.empty());
+  ASSERT_EQ(last_timestamp_, expected_last_timestamp_);
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    FailRead, FileReaderFailTest,
+    testing::Values(
+        FileReaderTestParameters(FileReader::Open,
+                                 "ivf-header-and-truncated-frame-header"),
+        FileReaderTestParameters(FileReader::Open, "one-frame-truncated.ivf")));
+
+INSTANTIATE_TEST_SUITE_P(ReadThroughEndOfFile, FileReaderErrorTolerant,
+                         testing::Values(FileReaderTestParameters(
+                             FileReader::Open, "one-frame-truncated.ivf")));
+
+INSTANTIATE_TEST_SUITE_P(
+    ReadThroughEndOfFile, FileReaderTestNoTimeStamps,
+    testing::Values(FileReaderTestParameters(FileReader::Open, "one-frame.ivf"),
+                    FileReaderTestParameters(FileReader::Open,
+                                             "one-frame-large-timestamp.ivf"),
+                    FileReaderTestParameters(FileReader::Open,
+                                             "five-frames.ivf")));
+
+INSTANTIATE_TEST_SUITE_P(
+    ReadThroughEndOfFile, FileReaderTestWithTimeStamps,
+    testing::Values(
+        FileReaderTestWithTimeStampsParameters(FileReader::Open,
+                                               "one-frame.ivf", 0),
+        FileReaderTestWithTimeStampsParameters(FileReader::Open,
+                                               "one-frame-large-timestamp.ivf",
+                                               4294967296),
+        FileReaderTestWithTimeStampsParameters(FileReader::Open,
+                                               "five-frames.ivf", 4)));
+
+}  // namespace
+}  // namespace libgav1
diff --git a/examples/file_reader_test_common.cc b/examples/file_reader_test_common.cc
new file mode 100644
index 0000000..735dd9e
--- /dev/null
+++ b/examples/file_reader_test_common.cc
@@ -0,0 +1,43 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_reader_test_common.h"
+
+#include <ostream>
+
+#include "examples/file_reader.h"
+
+namespace libgav1 {
+
+std::ostream& operator<<(std::ostream& stream,
+                         const FileReaderTestParameters& parameters) {
+  stream << "open_function="
+         << ((parameters.open_function == FileReader::Open) ? "FileReader"
+                                                            : "Unknown")
+         << ", file_name=" << parameters.file_name;
+  return stream;
+}
+
+std::ostream& operator<<(
+    std::ostream& stream,
+    const FileReaderTestWithTimeStampsParameters& parameters) {
+  stream << "open_function="
+         << ((parameters.open_function == FileReader::Open) ? "FileReader"
+                                                            : "Unknown")
+         << ", file_name=" << parameters.file_name
+         << ", expected_last_timestamp=" << parameters.expected_last_timestamp;
+  return stream;
+}
+
+}  // namespace libgav1
diff --git a/examples/file_reader_test_common.h b/examples/file_reader_test_common.h
new file mode 100644
index 0000000..187a6ac
--- /dev/null
+++ b/examples/file_reader_test_common.h
@@ -0,0 +1,171 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_READER_TEST_COMMON_H_
+#define LIBGAV1_EXAMPLES_FILE_READER_TEST_COMMON_H_
+
+#include <cstdint>
+#include <memory>
+#include <ostream>
+#include <string>
+#include <vector>
+
+#include "examples/file_reader.h"
+#include "examples/file_reader_factory.h"
+#include "examples/file_reader_interface.h"
+#include "gtest/gtest.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+
+struct FileReaderTestParameters {
+  FileReaderTestParameters() = default;
+  FileReaderTestParameters(FileReaderFactory::OpenFunction open_function,
+                           const char* file_name)
+      : open_function(open_function), file_name(file_name) {}
+  FileReaderTestParameters(const FileReaderTestParameters&) = default;
+  FileReaderTestParameters& operator=(const FileReaderTestParameters&) =
+      delete;
+  FileReaderTestParameters(FileReaderTestParameters&&) = default;
+  FileReaderTestParameters& operator=(FileReaderTestParameters&&) = default;
+  ~FileReaderTestParameters() = default;
+
+  FileReaderFactory::OpenFunction open_function = nullptr;
+  const char* file_name = nullptr;
+};
+
+class FileReaderTestBase {
+ public:
+  FileReaderTestBase() = default;
+  FileReaderTestBase(const FileReaderTestBase&) = delete;
+  FileReaderTestBase& operator=(const FileReaderTestBase&) = delete;
+  FileReaderTestBase(FileReaderTestBase&&) = default;
+  FileReaderTestBase& operator=(FileReaderTestBase&&) = default;
+  ~FileReaderTestBase() = default;
+
+ protected:
+  void OpenReader(const char* file_name,
+                  FileReaderFactory::OpenFunction open_function) {
+    file_name_ = test_utils::GetTestInputFilePath(file_name);
+    reader_ = open_function(file_name_, /*error_tolerant=*/false);
+    ASSERT_NE(reader_, nullptr);
+  }
+
+  std::string file_name_;
+  std::unique_ptr<FileReaderInterface> reader_;
+  std::vector<uint8_t> tu_data_;
+};
+
+class FileReaderFailTest
+    : public FileReaderTestBase,
+      public testing::TestWithParam<FileReaderTestParameters> {
+ public:
+  FileReaderFailTest() = default;
+  FileReaderFailTest(const FileReaderTestBase&) = delete;
+  FileReaderFailTest& operator=(const FileReaderTestBase&) = delete;
+  ~FileReaderFailTest() override = default;
+
+ protected:
+  void SetUp() override {
+    OpenReader(GetParam().file_name, GetParam().open_function);
+  }
+};
+
+class FileReaderTestNoTimeStamps
+    : public FileReaderTestBase,
+      public testing::TestWithParam<FileReaderTestParameters> {
+ public:
+  FileReaderTestNoTimeStamps() = default;
+  FileReaderTestNoTimeStamps(const FileReaderTestNoTimeStamps&) = delete;
+  FileReaderTestNoTimeStamps& operator=(const FileReaderTestNoTimeStamps&) =
+      delete;
+  ~FileReaderTestNoTimeStamps() override = default;
+
+ protected:
+  void SetUp() override {
+    OpenReader(GetParam().file_name, GetParam().open_function);
+  }
+};
+
+class FileReaderErrorTolerant
+    : public FileReaderTestBase,
+      public testing::TestWithParam<FileReaderTestParameters> {
+ public:
+  FileReaderErrorTolerant() = default;
+  FileReaderErrorTolerant(const FileReaderErrorTolerant&) = delete;
+  FileReaderErrorTolerant& operator=(const FileReaderErrorTolerant&) = delete;
+  ~FileReaderErrorTolerant() override = default;
+
+ protected:
+  void SetUp() override {
+    file_name_ = test_utils::GetTestInputFilePath(GetParam().file_name);
+    reader_ = GetParam().open_function(file_name_, /*error_tolerant=*/true);
+    ASSERT_NE(reader_, nullptr);
+  }
+};
+
+struct FileReaderTestWithTimeStampsParameters {
+  FileReaderTestWithTimeStampsParameters() = default;
+  FileReaderTestWithTimeStampsParameters(
+      FileReaderFactory::OpenFunction open_function, const char* file_name,
+      int64_t expected_last_timestamp)
+      : open_function(open_function),
+        file_name(file_name),
+        expected_last_timestamp(expected_last_timestamp) {}
+  FileReaderTestWithTimeStampsParameters(
+      const FileReaderTestWithTimeStampsParameters&) = default;
+  FileReaderTestWithTimeStampsParameters& operator=(
+      const FileReaderTestWithTimeStampsParameters&) = delete;
+  FileReaderTestWithTimeStampsParameters(
+      FileReaderTestWithTimeStampsParameters&&) = default;
+  FileReaderTestWithTimeStampsParameters& operator=(
+      FileReaderTestWithTimeStampsParameters&&) = default;
+  ~FileReaderTestWithTimeStampsParameters() = default;
+
+  FileReaderFactory::OpenFunction open_function = nullptr;
+  const char* file_name = nullptr;
+  int64_t expected_last_timestamp = 0;
+};
+
+std::ostream& operator<<(std::ostream& stream,
+                         const FileReaderTestParameters& parameters);
+
+std::ostream& operator<<(
+    std::ostream& stream,
+    const FileReaderTestWithTimeStampsParameters& parameters);
+
+class FileReaderTestWithTimeStamps
+    : public FileReaderTestBase,
+      public testing::TestWithParam<FileReaderTestWithTimeStampsParameters> {
+ public:
+  FileReaderTestWithTimeStamps() = default;
+  FileReaderTestWithTimeStamps(const FileReaderTestWithTimeStamps&) = delete;
+  FileReaderTestWithTimeStamps& operator=(const FileReaderTestWithTimeStamps&) =
+      delete;
+  ~FileReaderTestWithTimeStamps() override = default;
+
+ protected:
+  void SetUp() override {
+    FileReaderTestWithTimeStampsParameters parameters = GetParam();
+    OpenReader(parameters.file_name, parameters.open_function);
+    expected_last_timestamp_ = parameters.expected_last_timestamp;
+  }
+
+  int64_t last_timestamp_ = 0;
+  int64_t expected_last_timestamp_ = 0;
+};
+
+}  // namespace libgav1
+#endif  // LIBGAV1_EXAMPLES_FILE_READER_TEST_COMMON_H_
diff --git a/examples/file_writer.cc b/examples/file_writer.cc
new file mode 100644
index 0000000..54afe14
--- /dev/null
+++ b/examples/file_writer.cc
@@ -0,0 +1,183 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_writer.h"
+
+#include <cerrno>
+#include <cstdio>
+#include <cstring>
+#include <new>
+#include <string>
+
+#if defined(_WIN32)
+#include <fcntl.h>
+#include <io.h>
+#endif
+
+#include "examples/logging.h"
+
+namespace libgav1 {
+namespace {
+
+FILE* SetBinaryMode(FILE* stream) {
+#if defined(_WIN32)
+  _setmode(_fileno(stream), _O_BINARY);
+#endif
+  return stream;
+}
+
+std::string GetY4mColorSpaceString(
+    const FileWriter::Y4mParameters& y4m_parameters) {
+  std::string color_space_string;
+  switch (y4m_parameters.image_format) {
+    case kImageFormatMonochrome400:
+      color_space_string = "mono";
+      break;
+    case kImageFormatYuv420:
+      if (y4m_parameters.bitdepth == 8) {
+        if (y4m_parameters.chroma_sample_position ==
+            kChromaSamplePositionVertical) {
+          color_space_string = "420mpeg2";
+        } else if (y4m_parameters.chroma_sample_position ==
+                   kChromaSamplePositionColocated) {
+          color_space_string = "420";
+        } else {
+          color_space_string = "420jpeg";
+        }
+      } else {
+        color_space_string = "420";
+      }
+      break;
+    case kImageFormatYuv422:
+      color_space_string = "422";
+      break;
+    case kImageFormatYuv444:
+      color_space_string = "444";
+      break;
+  }
+
+  if (y4m_parameters.bitdepth > 8) {
+    const bool monochrome =
+        y4m_parameters.image_format == kImageFormatMonochrome400;
+    if (!monochrome) color_space_string += "p";
+    color_space_string += std::to_string(y4m_parameters.bitdepth);
+  }
+
+  return color_space_string;
+}
+
+}  // namespace
+
+FileWriter::~FileWriter() { fclose(file_); }
+
+std::unique_ptr<FileWriter> FileWriter::Open(
+    const std::string& file_name, FileType file_type,
+    const Y4mParameters* const y4m_parameters) {
+  if (file_name.empty() ||
+      (file_type == kFileTypeY4m && y4m_parameters == nullptr) ||
+      (file_type != kFileTypeRaw && file_type != kFileTypeY4m)) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Invalid parameters");
+    return nullptr;
+  }
+
+  FILE* raw_file_ptr;
+
+  if (file_name == "-") {
+    raw_file_ptr = SetBinaryMode(stdout);
+  } else {
+    raw_file_ptr = fopen(file_name.c_str(), "wb");
+  }
+
+  if (raw_file_ptr == nullptr) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Unable to open output file");
+    return nullptr;
+  }
+
+  std::unique_ptr<FileWriter> file(new (std::nothrow)
+                                       FileWriter(raw_file_ptr));
+  if (file == nullptr) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Out of memory");
+    fclose(raw_file_ptr);
+    return nullptr;
+  }
+
+  if (file_type == kFileTypeY4m && !file->WriteY4mFileHeader(*y4m_parameters)) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Error writing Y4M file header");
+    return nullptr;
+  }
+
+  file->file_type_ = file_type;
+  return file;
+}
+
+bool FileWriter::WriteFrame(const DecoderBuffer& frame_buffer) {
+  if (file_type_ == kFileTypeY4m) {
+    const char kY4mFrameHeader[] = "FRAME\n";
+    if (fwrite(kY4mFrameHeader, 1, strlen(kY4mFrameHeader), file_) !=
+        strlen(kY4mFrameHeader)) {
+      LIBGAV1_EXAMPLES_LOG_ERROR("Error writing Y4M frame header");
+      return false;
+    }
+  }
+
+  const size_t pixel_size =
+      (frame_buffer.bitdepth == 8) ? sizeof(uint8_t) : sizeof(uint16_t);
+  for (int plane_index = 0; plane_index < frame_buffer.NumPlanes();
+       ++plane_index) {
+    const int height = frame_buffer.displayed_height[plane_index];
+    const int width = frame_buffer.displayed_width[plane_index];
+    const int stride = frame_buffer.stride[plane_index];
+    const uint8_t* const plane_pointer = frame_buffer.plane[plane_index];
+    for (int row = 0; row < height; ++row) {
+      const uint8_t* const row_pointer = &plane_pointer[row * stride];
+      if (fwrite(row_pointer, pixel_size, width, file_) !=
+          static_cast<size_t>(width)) {
+        char error_string[256];
+        snprintf(error_string, sizeof(error_string),
+                 "File write failed: %s (errno=%d)", strerror(errno), errno);
+        LIBGAV1_EXAMPLES_LOG_ERROR(error_string);
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+// Writes Y4M file header to |file_| and returns true when successful.
+//
+// A Y4M file begins with a plaintext file signature of 'YUV4MPEG2 '.
+//
+// Following the signature is any number of optional parameters preceded by a
+// space. We always write:
+//
+// Width: 'W' followed by image width in pixels.
+// Height: 'H' followed by image height in pixels.
+// Frame Rate: 'F' followed by frames/second in the form numerator:denominator.
+// Interlacing: 'I' followed by 'p' for progressive.
+// Color space: 'C' followed by a string representation of the color space.
+//
+// More info here: https://wiki.multimedia.cx/index.php/YUV4MPEG2
+bool FileWriter::WriteY4mFileHeader(const Y4mParameters& y4m_parameters) {
+  std::string y4m_header = "YUV4MPEG2";
+  y4m_header += " W" + std::to_string(y4m_parameters.width);
+  y4m_header += " H" + std::to_string(y4m_parameters.height);
+  y4m_header += " F" + std::to_string(y4m_parameters.frame_rate_numerator) +
+                ":" + std::to_string(y4m_parameters.frame_rate_denominator);
+  y4m_header += " Ip C" + GetY4mColorSpaceString(y4m_parameters);
+  y4m_header += "\n";
+  return fwrite(y4m_header.c_str(), 1, y4m_header.length(), file_) ==
+         y4m_header.length();
+}
+
+}  // namespace libgav1
diff --git a/examples/file_writer.h b/examples/file_writer.h
new file mode 100644
index 0000000..00f6cc3
--- /dev/null
+++ b/examples/file_writer.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_WRITER_H_
+#define LIBGAV1_EXAMPLES_FILE_WRITER_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+#include <string>
+
+#include "gav1/decoder_buffer.h"
+
+namespace libgav1 {
+
+// Frame based file writer class. Supports only Y4M (YUV4MPEG2) and RAW output.
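+//
+// Typical usage (an illustrative sketch; error handling omitted): open a
+// writer, then pass each decoded frame to WriteFrame(). For example, an 8-bit
+// 4:2:0 352x288 30fps stream yields the Y4M file header
+// "YUV4MPEG2 W352 H288 F30:1 Ip C420jpeg".
+//
+//   FileWriter::Y4mParameters y4m = {352, 288, 30, 1,
+//                                    kChromaSamplePositionUnknown,
+//                                    kImageFormatYuv420, 8};
+//   auto writer = FileWriter::Open("out.y4m", FileWriter::kFileTypeY4m, &y4m);
+//   // For each decoded DecoderBuffer |frame|: writer->WriteFrame(frame);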
+class FileWriter {
+ public:
+  enum FileType : uint8_t {
+    kFileTypeRaw,
+    kFileTypeY4m,
+  };
+
+  struct Y4mParameters {
+    Y4mParameters() = default;
+    Y4mParameters(size_t width, size_t height, size_t frame_rate_numerator,
+                  size_t frame_rate_denominator,
+                  ChromaSamplePosition chroma_sample_position,
+                  ImageFormat image_format, size_t bitdepth)
+        : width(width),
+          height(height),
+          frame_rate_numerator(frame_rate_numerator),
+          frame_rate_denominator(frame_rate_denominator),
+          chroma_sample_position(chroma_sample_position),
+          image_format(image_format),
+          bitdepth(bitdepth) {}
+
+    Y4mParameters(const Y4mParameters& rhs) = default;
+    Y4mParameters& operator=(const Y4mParameters& rhs) = default;
+    Y4mParameters(Y4mParameters&& rhs) = default;
+    Y4mParameters& operator=(Y4mParameters&& rhs) = default;
+
+    size_t width = 0;
+    size_t height = 0;
+    size_t frame_rate_numerator = 30;
+    size_t frame_rate_denominator = 1;
+    ChromaSamplePosition chroma_sample_position = kChromaSamplePositionUnknown;
+    ImageFormat image_format = kImageFormatYuv420;
+    size_t bitdepth = 8;
+  };
+
+  // Opens |file_name|. When |file_type| is kFileTypeY4m the Y4M file header is
+  // written out to |file_| before this method returns.
+  //
+  // Returns a FileWriter instance after the file is opened successfully for
+  // kFileTypeRaw files, and after the Y4M file header bytes are written for
+  // kFileTypeY4m files. Returns nullptr upon failure.
+  static std::unique_ptr<FileWriter> Open(const std::string& file_name,
+                                          FileType file_type,
+                                          const Y4mParameters* y4m_parameters);
+
+  FileWriter() = delete;
+  FileWriter(const FileWriter&) = delete;
+  FileWriter& operator=(const FileWriter&) = delete;
+
+  FileWriter(FileWriter&&) = default;
+  FileWriter& operator=(FileWriter&&) = default;
+
+  // Closes |file_|.
+  ~FileWriter();
+
+  // Writes the frame data in |frame_buffer| to |file_|. Returns true after
+  // successful write of |frame_buffer| data.
+  /*LIBGAV1_MUST_USE_RESULT*/ bool WriteFrame(
+      const DecoderBuffer& frame_buffer);
+
+ private:
+  explicit FileWriter(FILE* file) : file_(file) {}
+
+  bool WriteY4mFileHeader(const Y4mParameters& y4m_parameters);
+
+  FILE* file_ = nullptr;
+  FileType file_type_ = kFileTypeRaw;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_EXAMPLES_FILE_WRITER_H_
diff --git a/examples/file_writer_test.cc b/examples/file_writer_test.cc
new file mode 100644
index 0000000..481808c
--- /dev/null
+++ b/examples/file_writer_test.cc
@@ -0,0 +1,495 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_writer.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <new>
+#include <ostream>
+#include <string>
+
+#include "absl/memory/memory.h"
+#include "gav1/decoder_buffer.h"
+#include "gtest/gtest.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace {
+
+const char kExpectedY4mHeader8bit[] = "YUV4MPEG2 W352 H288 F30:1 Ip C420jpeg\n";
+const char kExpectedY4mHeader10bit[] = "YUV4MPEG2 W352 H288 F30:1 Ip C420p10\n";
+const char kExpectedY4mHeader8bitMonochrome[] =
+    "YUV4MPEG2 W352 H288 F30:1 Ip Cmono\n";
+const char kExpectedY4mHeader10bitMonochrome[] =
+    "YUV4MPEG2 W352 H288 F30:1 Ip Cmono10\n";
+
+// Note: These are non-const because DecoderBuffer.plane is non-const.
+char fake_plane0[] = "PLANE0\n";
+char fake_plane1[] = "PLANE1\n";
+char fake_plane2[] = "PLANE2\n";
+
+constexpr size_t kExpectedRawDataBufferCount = 3;
+const char* kExpectedRawData[kExpectedRawDataBufferCount] = {
+    fake_plane0, fake_plane1, fake_plane2};
+
+const char* const kExpectedRawDataMonochrome = fake_plane0;
+
+constexpr size_t kExpectedY4mDataBufferCount = 5;
+const char* const kExpectedY4mFileData8bit[kExpectedY4mDataBufferCount] = {
+    kExpectedY4mHeader8bit, "FRAME\n", fake_plane0, fake_plane1, fake_plane2};
+const char* const kExpectedY4mFileData10bit[kExpectedY4mDataBufferCount] = {
+    kExpectedY4mHeader10bit, "FRAME\n", fake_plane0, fake_plane1, fake_plane2};
+
+constexpr size_t kExpectedY4mDataBufferCountMonochrome = 3;
+const char* const
+    kExpectedY4mFileData8bitMonochrome[kExpectedY4mDataBufferCountMonochrome] =
+        {kExpectedY4mHeader8bitMonochrome, "FRAME\n", fake_plane0};
+const char* const
+    kExpectedY4mFileData10bitMonochrome[kExpectedY4mDataBufferCountMonochrome] =
+        {kExpectedY4mHeader10bitMonochrome, "FRAME\n", fake_plane0};
+
+// TODO(tomfinegan): Add a bitdepth arg, and test writing 10 bit frame buffers.
+std::unique_ptr<DecoderBuffer> GetFakeDecoderBuffer(ImageFormat image_format) {
+  auto buffer = absl::make_unique<DecoderBuffer>();
+  if (buffer == nullptr) return nullptr;
+  buffer->chroma_sample_position = kChromaSamplePositionUnknown;
+  buffer->image_format = image_format;
+  buffer->bitdepth = 8;
+  buffer->displayed_width[0] = static_cast<int>(strlen(fake_plane0));
+  buffer->displayed_width[1] = static_cast<int>(strlen(fake_plane1));
+  buffer->displayed_width[2] = static_cast<int>(strlen(fake_plane2));
+  buffer->displayed_height[0] = 1;
+  buffer->displayed_height[1] = 1;
+  buffer->displayed_height[2] = 1;
+  buffer->stride[0] = static_cast<int>(strlen(fake_plane0));
+  buffer->stride[1] = static_cast<int>(strlen(fake_plane1));
+  buffer->stride[2] = static_cast<int>(strlen(fake_plane2));
+  buffer->plane[0] = reinterpret_cast<uint8_t*>(fake_plane0);
+  buffer->plane[1] = reinterpret_cast<uint8_t*>(fake_plane1);
+  buffer->plane[2] = reinterpret_cast<uint8_t*>(fake_plane2);
+  buffer->user_private_data = 0;
+  buffer->buffer_private_data = nullptr;
+  return buffer;
+}
+
+TEST(FileWriterTest, FailOpen) {
+  EXPECT_EQ(FileWriter::Open(test_utils::GetTestOutputFilePath("fail_open"),
+                             static_cast<FileWriter::FileType>(3), nullptr),
+            nullptr);
+  EXPECT_EQ(FileWriter::Open(test_utils::GetTestOutputFilePath("fail_open"),
+                             FileWriter::kFileTypeY4m, nullptr),
+            nullptr);
+}
+
+struct FileWriterY4mHeaderTestParameters {
+  FileWriterY4mHeaderTestParameters() = default;
+  FileWriterY4mHeaderTestParameters(const FileWriterY4mHeaderTestParameters&) =
+      default;
+  FileWriterY4mHeaderTestParameters& operator=(
+      const FileWriterY4mHeaderTestParameters&) = default;
+  FileWriterY4mHeaderTestParameters(FileWriterY4mHeaderTestParameters&&) =
+      default;
+  FileWriterY4mHeaderTestParameters& operator=(
+      FileWriterY4mHeaderTestParameters&&) = default;
+  ~FileWriterY4mHeaderTestParameters() = default;
+
+  FileWriterY4mHeaderTestParameters(std::string file_name,
+                                    ChromaSamplePosition chroma_sample_position,
+                                    ImageFormat image_format, int bitdepth,
+                                    const char* expected_header_string)
+      : file_name(std::move(file_name)),
+        chroma_sample_position(chroma_sample_position),
+        image_format(image_format),
+        bitdepth(bitdepth),
+        expected_header_string(expected_header_string) {}
+  std::string file_name;
+  ChromaSamplePosition chroma_sample_position = kChromaSamplePositionUnknown;
+  ImageFormat image_format = kImageFormatMonochrome400;
+  int bitdepth = 8;
+  const char* expected_header_string = nullptr;
+};
+
+std::ostream& operator<<(std::ostream& stream,
+                         const FileWriterY4mHeaderTestParameters& parameters) {
+  stream << "file_name=" << parameters.file_name << "\n"
+         << "chroma_sample_position=" << parameters.chroma_sample_position
+         << "\n"
+         << "image_format=" << parameters.image_format << "\n"
+         << "bitdepth=" << parameters.bitdepth << "\n"
+         << "expected_header_string=" << parameters.expected_header_string
+         << "\n";
+  return stream;
+}
+
+class FileWriterY4mHeaderTest
+    : public testing::TestWithParam<FileWriterY4mHeaderTestParameters> {
+ public:
+  FileWriterY4mHeaderTest() {
+    test_parameters_ = GetParam();
+    y4m_parameters_.width = 352;
+    y4m_parameters_.height = 288;
+    y4m_parameters_.frame_rate_numerator = 30;
+    y4m_parameters_.frame_rate_denominator = 1;
+    y4m_parameters_.chroma_sample_position =
+        test_parameters_.chroma_sample_position;
+    y4m_parameters_.image_format = test_parameters_.image_format;
+    y4m_parameters_.bitdepth = test_parameters_.bitdepth;
+  }
+  FileWriterY4mHeaderTest(const FileWriterY4mHeaderTest&) = delete;
+  FileWriterY4mHeaderTest& operator=(const FileWriterY4mHeaderTest&) = delete;
+  ~FileWriterY4mHeaderTest() override = default;
+
+ protected:
+  FileWriterY4mHeaderTestParameters test_parameters_;
+  FileWriter::Y4mParameters y4m_parameters_;
+};
+
+TEST_P(FileWriterY4mHeaderTest, WriteY4mHeader) {
+  const std::string file_name =
+      test_utils::GetTestOutputFilePath(test_parameters_.file_name);
+  EXPECT_NE(
+      FileWriter::Open(file_name, FileWriter::kFileTypeY4m, &y4m_parameters_),
+      nullptr);
+  std::string y4m_header_string;
+  test_utils::GetTestData(test_parameters_.file_name, true,
+                          &y4m_header_string);
+  EXPECT_STREQ(y4m_header_string.c_str(),
+               test_parameters_.expected_header_string);
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    WriteY4mHeader, FileWriterY4mHeaderTest,
+    testing::Values(
+        FileWriterY4mHeaderTestParameters(
+            "y4m_header_8bit", kChromaSamplePositionUnknown, kImageFormatYuv420,
+            /*bitdepth=*/8, kExpectedY4mHeader8bit),
+        FileWriterY4mHeaderTestParameters("y4m_header_10bit",
+                                          kChromaSamplePositionUnknown,
+                                          kImageFormatYuv420, /*bitdepth=*/10,
+                                          kExpectedY4mHeader10bit),
+        FileWriterY4mHeaderTestParameters("y4m_header_8bit_monochrome",
+                                          kChromaSamplePositionUnknown,
+                                          kImageFormatMonochrome400,
+                                          /*bitdepth=*/8,
+                                          kExpectedY4mHeader8bitMonochrome),
+        FileWriterY4mHeaderTestParameters("y4m_header_10bit_monochrome",
+                                          kChromaSamplePositionUnknown,
+                                          kImageFormatMonochrome400,
+                                          /*bitdepth=*/10,
+                                          kExpectedY4mHeader10bitMonochrome)));
+
+struct FileWriterTestParameters {
+  FileWriterTestParameters() = default;
+  FileWriterTestParameters(const FileWriterTestParameters&) = default;
+  FileWriterTestParameters& operator=(const FileWriterTestParameters&) =
+      default;
+  FileWriterTestParameters(FileWriterTestParameters&&) = default;
+  FileWriterTestParameters& operator=(FileWriterTestParameters&&) = default;
+  ~FileWriterTestParameters() = default;
+
+  FileWriterTestParameters(std::string file_name,
+                           FileWriter::FileType file_type,
+                           const FileWriter::Y4mParameters* y4m_parameters,
+                           size_t num_frames)
+      : file_name(std::move(file_name)),
+        file_type(file_type),
+        y4m_parameters(y4m_parameters),
+        num_frames(num_frames) {}
+  std::string file_name;
+  FileWriter::FileType file_type = FileWriter::kFileTypeRaw;
+  const FileWriter::Y4mParameters* y4m_parameters = nullptr;
+  size_t num_frames = 1;
+};
+
+std::ostream& operator<<(std::ostream& stream,
+                         const ChromaSamplePosition& position) {
+  switch (position) {
+    case kChromaSamplePositionUnknown:
+      stream << "kChromaSamplePositionUnknown";
+      break;
+    case kChromaSamplePositionVertical:
+      stream << "kChromaSamplePositionVertical";
+      break;
+    case kChromaSamplePositionColocated:
+      stream << "kChromaSamplePositionColocated";
+      break;
+    case kChromaSamplePositionReserved:
+      stream << "kChromaSamplePositionReserved";
+      break;
+  }
+  return stream;
+}
+
+std::ostream& operator<<(std::ostream& stream,
+                         const ImageFormat& image_format) {
+  switch (image_format) {
+    case kImageFormatMonochrome400:
+      stream << "kImageFormatMonochrome400";
+      break;
+    case kImageFormatYuv420:
+      stream << "kImageFormatYuv420";
+      break;
+    case kImageFormatYuv422:
+      stream << "kImageFormatYuv422";
+      break;
+    case kImageFormatYuv444:
+      stream << "kImageFormatYuv444";
+      break;
+  }
+  return stream;
+}
+
+std::ostream& operator<<(std::ostream& stream,
+                         const FileWriter::Y4mParameters& parameters) {
+  stream << "y4m_parameters:\n"
+         << " width=" << parameters.width << "\n"
+         << " height=" << parameters.height << "\n"
+         << " frame_rate_numerator=" << parameters.frame_rate_numerator << "\n"
+         << " frame_rate_denominator=" << parameters.frame_rate_denominator
+         << "\n"
+         << " chroma_sample_position=" << parameters.chroma_sample_position
+         << "\n"
+         << " image_format=" << parameters.image_format << "\n"
+         << " bitdepth=" << parameters.bitdepth << "\n";
+
+  return stream;
+}
+
+std::ostream& operator<<(std::ostream& stream,
+                         const FileWriterTestParameters& parameters) {
+  stream << "file_name=" << parameters.file_name << "\n"
+         << "file_type="
+         << (parameters.file_type == FileWriter::kFileTypeRaw ? "kFileTypeRaw"
+                                                              : "kFileTypeY4m")
+         << "\n";
+  if (parameters.y4m_parameters != nullptr) {
+    stream << *parameters.y4m_parameters;
+  } else {
+    stream << "y4m_parameters: <nullptr>\n";
+  }
+  stream << "num_frames=" << parameters.num_frames << "\n";
+  return stream;
+}
+
+class FileWriterTestBase
+    : public testing::TestWithParam<FileWriterTestParameters> {
+ public:
+  FileWriterTestBase() = default;
+  FileWriterTestBase(const FileWriterTestBase&) = delete;
+  FileWriterTestBase& operator=(const FileWriterTestBase&) = delete;
+  ~FileWriterTestBase() override = default;
+
+ protected:
+  void SetUp() override { OpenWriter(GetParam()); }
+
+  void OpenWriter(const FileWriterTestParameters& parameters) {
+    parameters_ = parameters;
+    parameters_.file_name = parameters.file_name;
+    file_writer_ = FileWriter::Open(
+        test_utils::GetTestOutputFilePath(parameters.file_name),
+        parameters_.file_type, parameters_.y4m_parameters);
+    ASSERT_NE(file_writer_, nullptr);
+  }
+
+  void WriteFramesAndCloseFile() {
+    if (parameters_.y4m_parameters != nullptr) {
+      image_format_ = parameters_.y4m_parameters->image_format;
+    }
+    decoder_buffer_ = GetFakeDecoderBuffer(image_format_);
+    for (size_t frame_num = 0; frame_num < parameters_.num_frames;
+         ++frame_num) {
+      ASSERT_TRUE(file_writer_->WriteFrame(*decoder_buffer_));
+    }
+    file_writer_ = nullptr;
+  }
+
+  ImageFormat image_format_ = kImageFormatYuv420;
+  FileWriterTestParameters parameters_;
+  std::unique_ptr<FileWriter> file_writer_;
+  std::unique_ptr<DecoderBuffer> decoder_buffer_;
+};
+
+class FileWriterTestRaw : public FileWriterTestBase {
+ public:
+  FileWriterTestRaw() = default;
+  FileWriterTestRaw(const FileWriterTestRaw&) = delete;
+  FileWriterTestRaw& operator=(const FileWriterTestRaw&) = delete;
+  ~FileWriterTestRaw() override = default;
+
+ protected:
+  void SetUp() override { FileWriterTestBase::SetUp(); }
+};
+
+class FileWriterTestY4m : public FileWriterTestBase {
+ public:
+  FileWriterTestY4m() = default;
+  FileWriterTestY4m(const FileWriterTestY4m&) = delete;
+  FileWriterTestY4m& operator=(const FileWriterTestY4m&) = delete;
+  ~FileWriterTestY4m() override = default;
+
+ protected:
+  void SetUp() override { FileWriterTestBase::SetUp(); }
+};
+
+TEST_P(FileWriterTestRaw, WriteRawFrames) {
+  WriteFramesAndCloseFile();
+
+  std::string actual_file_data;
+  test_utils::GetTestData(parameters_.file_name, true, &actual_file_data);
+
+  std::string expected_file_data;
+  for (size_t frame_num = 0; frame_num < parameters_.num_frames; ++frame_num) {
+    if (image_format_ == kImageFormatMonochrome400) {
+      expected_file_data += kExpectedRawDataMonochrome;
+    } else {
+      for (const auto& buffer : kExpectedRawData) {
+        expected_file_data += buffer;
+      }
+    }
+  }
+
+  ASSERT_EQ(actual_file_data, expected_file_data);
+}
+
+TEST_P(FileWriterTestY4m, WriteY4mFrames) {
+  WriteFramesAndCloseFile();
+
+  std::string actual_file_data;
+  test_utils::GetTestData(parameters_.file_name, true, &actual_file_data);
+
+  std::string expected_file_data;
+  for (size_t frame_num = 0; frame_num < parameters_.num_frames; ++frame_num) {
+    if (image_format_ == kImageFormatMonochrome400) {
+      const char* const* expected_data_planes =
+          (parameters_.y4m_parameters->bitdepth == 8)
+              ? kExpectedY4mFileData8bitMonochrome
+              : kExpectedY4mFileData10bitMonochrome;
+      // Skip the Y4M file header "plane" after frame 0.
+      for (size_t buffer_num = (frame_num == 0) ? 0 : 1;
+           buffer_num < kExpectedY4mDataBufferCountMonochrome; ++buffer_num) {
+        expected_file_data += expected_data_planes[buffer_num];
+      }
+    } else {
+      const char* const* expected_data_planes =
+          (parameters_.y4m_parameters->bitdepth == 8)
+              ? kExpectedY4mFileData8bit
+              : kExpectedY4mFileData10bit;
+
+      // Skip the Y4M file header "plane" after frame 0.
+      for (size_t buffer_num = (frame_num == 0) ? 0 : 1;
+           buffer_num < kExpectedY4mDataBufferCount; ++buffer_num) {
+        expected_file_data += expected_data_planes[buffer_num];
+      }
+    }
+  }
+
+  ASSERT_EQ(actual_file_data, expected_file_data);
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    WriteRawFrames, FileWriterTestRaw,
+    testing::Values(
+        FileWriterTestParameters("raw_frames_test_1frame",
+                                 FileWriter::kFileTypeRaw,
+                                 /*y4m_parameters=*/nullptr,
+                                 /*num_frames=*/1),
+        FileWriterTestParameters("raw_frames_test_5frames",
+                                 FileWriter::kFileTypeRaw,
+                                 /*y4m_parameters=*/nullptr,
+                                 /*num_frames=*/5),
+        FileWriterTestParameters("raw_frames_test_1frame_monochrome",
+                                 FileWriter::kFileTypeRaw,
+                                 /*y4m_parameters=*/nullptr,
+                                 /*num_frames=*/1),
+        FileWriterTestParameters("raw_frames_test_5frames_monochrome",
+                                 FileWriter::kFileTypeRaw,
+                                 /*y4m_parameters=*/nullptr,
+                                 /*num_frames=*/5)));
+
+const FileWriter::Y4mParameters kY4mParameters8Bit = {
+    352,  // width
+    288,  // height
+    30,   // frame_rate_numerator
+    1,    // frame_rate_denominator
+    kChromaSamplePositionUnknown,
+    kImageFormatYuv420,
+    8  // bitdepth
+};
+
+const FileWriter::Y4mParameters kY4mParameters10Bit = {
+    352,  // width
+    288,  // height
+    30,   // frame_rate_numerator
+    1,    // frame_rate_denominator
+    kChromaSamplePositionUnknown,
+    kImageFormatYuv420,
+    10  // bitdepth
+};
+
+const FileWriter::Y4mParameters kY4mParameters8BitMonochrome = {
+    352,  // width
+    288,  // height
+    30,   // frame_rate_numerator
+    1,    // frame_rate_denominator
+    kChromaSamplePositionUnknown,
+    kImageFormatMonochrome400,
+    8  // bitdepth
+};
+
+const FileWriter::Y4mParameters kY4mParameters10BitMonochrome = {
+    352,  // width
+    288,  // height
+    30,   // frame_rate_numerator
+    1,    // frame_rate_denominator
+    kChromaSamplePositionUnknown,
+    kImageFormatMonochrome400,
+    10  // bitdepth
+};
+
+INSTANTIATE_TEST_SUITE_P(
+    WriteY4mFrames, FileWriterTestY4m,
+    testing::Values(
+        FileWriterTestParameters("y4m_frames_test_8bit_1frame",
+                                 FileWriter::kFileTypeY4m, &kY4mParameters8Bit,
+                                 /*num_frames=*/1),
+        FileWriterTestParameters("y4m_frames_test_8bit_5frames",
+                                 FileWriter::kFileTypeY4m, &kY4mParameters8Bit,
+                                 /*num_frames=*/5),
+        FileWriterTestParameters("y4m_frames_test_10bit_1frame",
+                                 FileWriter::kFileTypeY4m, &kY4mParameters10Bit,
+                                 /*num_frames=*/1),
+        FileWriterTestParameters("y4m_frames_test_10bit_5frames",
+                                 FileWriter::kFileTypeY4m, &kY4mParameters10Bit,
+                                 /*num_frames=*/5),
+        FileWriterTestParameters("y4m_frames_test_8bit_1frame_monochrome",
+                                 FileWriter::kFileTypeY4m,
+                                 &kY4mParameters8BitMonochrome,
+                                 /*num_frames=*/1),
+        FileWriterTestParameters("y4m_frames_test_8bit_5frames_monochrome",
+                                 FileWriter::kFileTypeY4m,
+                                 &kY4mParameters8BitMonochrome,
+                                 /*num_frames=*/5),
+        FileWriterTestParameters("y4m_frames_test_10bit_1frame_monochrome",
+                                 FileWriter::kFileTypeY4m,
+                                 &kY4mParameters10BitMonochrome,
+                                 /*num_frames=*/1),
+        FileWriterTestParameters("y4m_frames_test_10bit_5frames_monochrome",
+                                 FileWriter::kFileTypeY4m,
+                                 &kY4mParameters10BitMonochrome,
+                                 /*num_frames=*/5)));
+
+}  // namespace
+}  // namespace libgav1
diff --git a/examples/gav1_decode.cc b/examples/gav1_decode.cc
new file mode 100644
index 0000000..1408e8c
--- /dev/null
+++ b/examples/gav1_decode.cc
@@ -0,0 +1,455 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cerrno>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <deque>
+#include <memory>
+#include <new>
+#include <vector>
+
+#include "absl/strings/numbers.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "examples/file_reader_factory.h"
+#include "examples/file_reader_interface.h"
+#include "examples/file_writer.h"
+#include "gav1/decoder.h"
+
+#ifdef GAV1_DECODE_USE_CV_PIXEL_BUFFER_POOL
+#include "examples/gav1_decode_cv_pixel_buffer_pool.h"
+#endif
+
+namespace {
+
+struct Options {
+  const char* input_file_name = nullptr;
+  const char* output_file_name = nullptr;
+  const char* frame_timing_file_name = nullptr;
+  libgav1::FileWriter::FileType output_file_type =
+      libgav1::FileWriter::kFileTypeRaw;
+  uint8_t post_filter_mask = 0x1f;
+  int threads = 1;
+  bool frame_parallel = false;
+  bool output_all_layers = false;
+  int operating_point = 0;
+  int limit = 0;
+  int skip = 0;
+  int verbose = 0;
+};
+
+struct Timing {
+  absl::Duration input;
+  absl::Duration dequeue;
+};
+
+struct FrameTiming {
+  absl::Time enqueue;
+  absl::Time dequeue;
+};
+
+void PrintHelp(FILE* const fout) {
+  fprintf(fout,
+          "Usage: gav1_decode [options] <input file>"
+          " [-o <output file>]\n");
+  fprintf(fout, "\n");
+  fprintf(fout, "Options:\n");
+  fprintf(fout, "  -h, --help This help message.\n");
+  fprintf(fout, "  --threads <positive number> (Default 1).\n");
+  fprintf(fout, "  --frame_parallel.\n");
+  fprintf(fout,
+          "  --limit <integer> Stop decoding after N frames (0 = all).\n");
+  fprintf(fout, "  --skip <integer> Skip initial N frames (Default 0).\n");
+  fprintf(fout, "  --version.\n");
+  fprintf(fout, "  --y4m (Default false).\n");
+  fprintf(fout, "  --raw (Default true).\n");
+  fprintf(fout, "  -v logging verbosity, can be used multiple times.\n");
+  fprintf(fout, "  --all_layers.\n");
+  fprintf(fout,
+          "  --operating_point <integer between 0 and 31> (Default 0).\n");
+  fprintf(fout,
+          "  --frame_timing <file> Output per-frame timing to <file> in tsv"
+          " format.\n   Yields meaningful results only when frame parallel is"
+          " off.\n");
+  fprintf(fout, "\nAdvanced settings:\n");
+  fprintf(fout, "  --post_filter_mask <integer> (Default 0x1f).\n");
+  fprintf(fout,
+          "   Mask indicating which post filters should be applied to the"
+          " reconstructed\n   frame. This may be given as octal, decimal or"
+          " hexadecimal. From LSB:\n");
+  fprintf(fout, "    Bit 0: Loop filter (deblocking filter)\n");
+  fprintf(fout, "    Bit 1: Cdef\n");
+  fprintf(fout, "    Bit 2: SuperRes\n");
+  fprintf(fout, "    Bit 3: Loop Restoration\n");
+  fprintf(fout, "    Bit 4: Film Grain Synthesis\n");
+}
+
+void ParseOptions(int argc, char* argv[], Options* const options) {
+  for (int i = 1; i < argc; ++i) {
+    int32_t value;
+    if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) {
+      PrintHelp(stdout);
+      exit(EXIT_SUCCESS);
+    } else if (strcmp(argv[i], "-o") == 0) {
+      if (++i >= argc) {
+        fprintf(stderr, "Missing argument for '-o'\n");
+        PrintHelp(stderr);
+        exit(EXIT_FAILURE);
+      }
+      options->output_file_name = argv[i];
+    } else if (strcmp(argv[i], "--frame_timing") == 0) {
+      if (++i >= argc) {
+        fprintf(stderr, "Missing argument for '--frame_timing'\n");
+        PrintHelp(stderr);
+        exit(EXIT_FAILURE);
+      }
+      options->frame_timing_file_name = argv[i];
+    } else if (strcmp(argv[i], "--version") == 0) {
+      printf("gav1_decode, a libgav1 based AV1 decoder\n");
+      printf("libgav1 %s\n", libgav1::GetVersionString());
+      printf("max bitdepth: %d\n", libgav1::Decoder::GetMaxBitdepth());
+      printf("build configuration: %s\n", libgav1::GetBuildConfiguration());
+      exit(EXIT_SUCCESS);
+    } else if (strcmp(argv[i], "-v") == 0) {
+      ++options->verbose;
+    } else if (strcmp(argv[i], "--raw") == 0) {
+      options->output_file_type = libgav1::FileWriter::kFileTypeRaw;
+    } else if (strcmp(argv[i], "--y4m") == 0) {
+      options->output_file_type = libgav1::FileWriter::kFileTypeY4m;
+    } else if (strcmp(argv[i], "--threads") == 0) {
+      if (++i >= argc || !absl::SimpleAtoi(argv[i], &value)) {
+        fprintf(stderr, "Missing/Invalid value for --threads.\n");
+        PrintHelp(stderr);
+        exit(EXIT_FAILURE);
+      }
+      options->threads = value;
+    } else if (strcmp(argv[i], "--frame_parallel") == 0) {
+      options->frame_parallel = true;
+    } else if (strcmp(argv[i], "--all_layers") == 0) {
+      options->output_all_layers = true;
+    } else if (strcmp(argv[i], "--operating_point") == 0) {
+      if (++i >= argc || !absl::SimpleAtoi(argv[i], &value) || value < 0 ||
+          value >= 32) {
+        fprintf(stderr, "Missing/Invalid value for --operating_point.\n");
+        PrintHelp(stderr);
+        exit(EXIT_FAILURE);
+      }
+      options->operating_point = value;
+    } else if (strcmp(argv[i], "--limit") == 0) {
+      if (++i >= argc || !absl::SimpleAtoi(argv[i], &value) || value < 0) {
+        fprintf(stderr, "Missing/Invalid value for --limit.\n");
+        PrintHelp(stderr);
+        exit(EXIT_FAILURE);
+      }
+      options->limit = value;
+    } else if (strcmp(argv[i], "--skip") == 0) {
+      if (++i >= argc || !absl::SimpleAtoi(argv[i], &value) || value < 0) {
+        fprintf(stderr, "Missing/Invalid value for --skip.\n");
+        PrintHelp(stderr);
+        exit(EXIT_FAILURE);
+      }
+      options->skip = value;
+    } else if (strcmp(argv[i], "--post_filter_mask") == 0) {
+      errno = 0;
+      char* endptr = nullptr;
+      value = (++i >= argc) ? -1
+                            // NOLINTNEXTLINE(runtime/deprecated_fn)
+                            : static_cast<int32_t>(strtol(argv[i], &endptr, 0));
+      // Only the last 5 bits of the mask can be set.
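+      // Note: with base 0, strtol() infers the radix from the prefix, so
+      // "0x1f" (hex), "037" (octal), and "31" (decimal) all parse to the
+      // same mask value of 31.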
+      if ((value & ~31) != 0 || errno != 0 || endptr == argv[i]) {
+        fprintf(stderr, "Invalid value for --post_filter_mask.\n");
+        PrintHelp(stderr);
+        exit(EXIT_FAILURE);
+      }
+      options->post_filter_mask = value;
+    } else if (strlen(argv[i]) > 1 && argv[i][0] == '-') {
+      fprintf(stderr, "Unknown option '%s'!\n", argv[i]);
+      exit(EXIT_FAILURE);
+    } else {
+      if (options->input_file_name == nullptr) {
+        options->input_file_name = argv[i];
+      } else {
+        fprintf(stderr, "Found invalid parameter: \"%s\".\n", argv[i]);
+        PrintHelp(stderr);
+        exit(EXIT_FAILURE);
+      }
+    }
+  }
+
+  if (argc < 2 || options->input_file_name == nullptr) {
+    fprintf(stderr, "Input file is required!\n");
+    PrintHelp(stderr);
+    exit(EXIT_FAILURE);
+  }
+}
+
+using InputBuffer = std::vector<uint8_t>;
+
+class InputBuffers {
+ public:
+  ~InputBuffers() {
+    for (auto buffer : free_buffers_) {
+      delete buffer;
+    }
+  }
+  InputBuffer* GetFreeBuffer() {
+    if (free_buffers_.empty()) {
+      auto* const buffer = new (std::nothrow) InputBuffer();
+      if (buffer == nullptr) {
+        fprintf(stderr, "Failed to create input buffer.\n");
+        return nullptr;
+      }
+      free_buffers_.push_back(buffer);
+    }
+    InputBuffer* const buffer = free_buffers_.front();
+    free_buffers_.pop_front();
+    return buffer;
+  }
+
+  void ReleaseInputBuffer(InputBuffer* buffer) {
+    free_buffers_.push_back(buffer);
+  }
+
+ private:
+  std::deque<InputBuffer*> free_buffers_;
+};
+
+void ReleaseInputBuffer(void* callback_private_data,
+                        void* buffer_private_data) {
+  auto* const input_buffers =
+      static_cast<InputBuffers*>(callback_private_data);
+  input_buffers->ReleaseInputBuffer(
+      static_cast<InputBuffer*>(buffer_private_data));
+}
+
+int CloseFile(FILE* stream) { return (stream == nullptr) ? 0 : fclose(stream); }
+
+}  // namespace
+
+int main(int argc, char* argv[]) {
+  Options options;
+  ParseOptions(argc, argv, &options);
+
+  auto file_reader =
+      libgav1::FileReaderFactory::OpenReader(options.input_file_name);
+  if (file_reader == nullptr) {
+    fprintf(stderr, "Cannot open input file!\n");
+    return EXIT_FAILURE;
+  }
+
+  std::unique_ptr<FILE, decltype(&CloseFile)> frame_timing_file(nullptr,
+                                                                &CloseFile);
+  if (options.frame_timing_file_name != nullptr) {
+    frame_timing_file.reset(fopen(options.frame_timing_file_name, "wb"));
+    if (frame_timing_file == nullptr) {
+      fprintf(stderr, "Cannot open frame timing file '%s'!\n",
+              options.frame_timing_file_name);
+      return EXIT_FAILURE;
+    }
+  }
+
+#ifdef GAV1_DECODE_USE_CV_PIXEL_BUFFER_POOL
+  // Reference frames + 1 scratch frame (for either the current frame or the
+  // film grain frame).
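+  // (AV1 keeps up to 8 reference frames in its decoded picture buffer,
+  // NUM_REF_FRAMES in the spec, which is where the 8 below comes from.)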
+  constexpr int kNumBuffers = 8 + 1;
+  std::unique_ptr<Gav1DecodeCVPixelBufferPool> cv_pixel_buffers =
+      Gav1DecodeCVPixelBufferPool::Create(kNumBuffers);
+  if (cv_pixel_buffers == nullptr) {
+    fprintf(stderr, "Cannot create Gav1DecodeCVPixelBufferPool!\n");
+    return EXIT_FAILURE;
+  }
+#endif
+
+  InputBuffers input_buffers;
+  libgav1::Decoder decoder;
+  libgav1::DecoderSettings settings;
+  settings.post_filter_mask = options.post_filter_mask;
+  settings.threads = options.threads;
+  settings.frame_parallel = options.frame_parallel;
+  settings.output_all_layers = options.output_all_layers;
+  settings.operating_point = options.operating_point;
+  settings.blocking_dequeue = true;
+  settings.callback_private_data = &input_buffers;
+  settings.release_input_buffer = ReleaseInputBuffer;
+#ifdef GAV1_DECODE_USE_CV_PIXEL_BUFFER_POOL
+  settings.on_frame_buffer_size_changed = Gav1DecodeOnCVPixelBufferSizeChanged;
+  settings.get_frame_buffer = Gav1DecodeGetCVPixelBuffer;
+  settings.release_frame_buffer = Gav1DecodeReleaseCVPixelBuffer;
+  settings.callback_private_data = cv_pixel_buffers.get();
+  settings.release_input_buffer = nullptr;
+  // TODO(vigneshv): Support frame parallel mode to be used with
+  // CVPixelBufferPool.
+  settings.frame_parallel = false;
+#endif
+  libgav1::StatusCode status = decoder.Init(&settings);
+  if (status != libgav1::kStatusOk) {
+    fprintf(stderr, "Error initializing decoder: %s\n",
+            libgav1::GetErrorString(status));
+    return EXIT_FAILURE;
+  }
+
+  fprintf(stderr, "decoding '%s'\n", options.input_file_name);
+  if (options.verbose > 0 && options.skip > 0) {
+    fprintf(stderr, "skipping %d frame(s).\n", options.skip);
+  }
+
+  int input_frames = 0;
+  int decoded_frames = 0;
+  Timing timing = {};
+  std::vector<FrameTiming> frame_timing;
+  const bool record_frame_timing = frame_timing_file != nullptr;
+  std::unique_ptr<libgav1::FileWriter> file_writer;
+  InputBuffer* input_buffer = nullptr;
+  bool limit_reached = false;
+  bool dequeue_finished = false;
+  const absl::Time decode_loop_start = absl::Now();
+  do {
+    if (input_buffer == nullptr && !file_reader->IsEndOfFile() &&
+        !limit_reached) {
+      input_buffer = input_buffers.GetFreeBuffer();
+      if (input_buffer == nullptr) return EXIT_FAILURE;
+      const absl::Time read_start = absl::Now();
+      if (!file_reader->ReadTemporalUnit(input_buffer,
+                                         /*timestamp=*/nullptr)) {
+        fprintf(stderr, "Error reading input file.\n");
+        return EXIT_FAILURE;
+      }
+      timing.input += absl::Now() - read_start;
+    }
+
+    if (++input_frames <= options.skip) {
+      input_buffers.ReleaseInputBuffer(input_buffer);
+      input_buffer = nullptr;
+      continue;
+    }
+
+    if (input_buffer != nullptr) {
+      if (input_buffer->empty()) {
+        input_buffers.ReleaseInputBuffer(input_buffer);
+        input_buffer = nullptr;
+        continue;
+      }
+
+      const absl::Time enqueue_start = absl::Now();
+      status = decoder.EnqueueFrame(input_buffer->data(), input_buffer->size(),
+                                    static_cast<int64_t>(frame_timing.size()),
+                                    /*buffer_private_data=*/input_buffer);
+      if (status == libgav1::kStatusOk) {
+        if (options.verbose > 1) {
+          fprintf(stderr, "enqueue frame (length %zu)\n",
+                  input_buffer->size());
+        }
+        if (record_frame_timing) {
+          FrameTiming enqueue_time = {enqueue_start, absl::UnixEpoch()};
+          frame_timing.emplace_back(enqueue_time);
+        }
+
+        input_buffer = nullptr;
+        // Continue to enqueue frames until we get a kStatusTryAgain status.
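+        // (If the decoder's input queue is full, EnqueueFrame() returns
+        // kStatusTryAgain instead, and the loop falls through to
+        // DequeueFrame() below to drain a decoded frame first.)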
+        continue;
+      }
+      if (status != libgav1::kStatusTryAgain) {
+        fprintf(stderr, "Unable to enqueue frame: %s\n",
+                libgav1::GetErrorString(status));
+        return EXIT_FAILURE;
+      }
+    }
+
+    const libgav1::DecoderBuffer* buffer;
+    status = decoder.DequeueFrame(&buffer);
+    if (status == libgav1::kStatusNothingToDequeue) {
+      dequeue_finished = true;
+      continue;
+    }
+    if (status != libgav1::kStatusOk) {
+      fprintf(stderr, "Unable to dequeue frame: %s\n",
+              libgav1::GetErrorString(status));
+      return EXIT_FAILURE;
+    }
+    dequeue_finished = false;
+    if (buffer == nullptr) continue;
+    ++decoded_frames;
+    if (options.verbose > 1) {
+      fprintf(stderr, "buffer dequeued\n");
+    }
+
+    if (record_frame_timing) {
+      frame_timing[static_cast<size_t>(buffer->user_private_data)].dequeue =
+          absl::Now();
+    }
+
+    if (options.output_file_name != nullptr && file_writer == nullptr) {
+      libgav1::FileWriter::Y4mParameters y4m_parameters;
+      y4m_parameters.width = buffer->displayed_width[0];
+      y4m_parameters.height = buffer->displayed_height[0];
+      y4m_parameters.frame_rate_numerator = file_reader->frame_rate();
+      y4m_parameters.frame_rate_denominator = file_reader->time_scale();
+      y4m_parameters.chroma_sample_position = buffer->chroma_sample_position;
+      y4m_parameters.image_format = buffer->image_format;
+      y4m_parameters.bitdepth = static_cast<size_t>(buffer->bitdepth);
+      file_writer = libgav1::FileWriter::Open(
+          options.output_file_name, options.output_file_type, &y4m_parameters);
+      if (file_writer == nullptr) {
+        fprintf(stderr, "Cannot open output file!\n");
+        return EXIT_FAILURE;
+      }
+    }
+
+    if (!limit_reached && file_writer != nullptr &&
+        !file_writer->WriteFrame(*buffer)) {
+      fprintf(stderr, "Error writing output file.\n");
+      return EXIT_FAILURE;
+    }
+    if (options.limit > 0 && options.limit == decoded_frames) {
+      limit_reached = true;
+      if (input_buffer != nullptr) {
+        input_buffers.ReleaseInputBuffer(input_buffer);
+      }
+      input_buffer = nullptr;
+      // Clear any in progress frames to ensure the output frame limit is
+      // respected.
+      decoder.SignalEOS();
+    }
+  } while (input_buffer != nullptr ||
+           (!file_reader->IsEndOfFile() && !limit_reached) ||
+           !dequeue_finished);
+  timing.dequeue = absl::Now() - decode_loop_start - timing.input;
+
+  if (record_frame_timing) {
+    // Note timing for frame parallel will be skewed by the time spent queueing
+    // additional frames and in the output queue waiting for previous frames,
+    // the values reported won't be that meaningful.
+    fprintf(frame_timing_file.get(), "frame number\tdecode time us\n");
+    for (size_t i = 0; i < frame_timing.size(); ++i) {
+      const int decode_time_us = static_cast<int>(absl::ToInt64Microseconds(
+          frame_timing[i].dequeue - frame_timing[i].enqueue));
+      fprintf(frame_timing_file.get(), "%zu\t%d\n", i, decode_time_us);
+    }
+  }
+
+  if (options.verbose > 0) {
+    fprintf(stderr, "time to read input: %d us\n",
+            static_cast<int>(absl::ToInt64Microseconds(timing.input)));
+    const int decode_time_us =
+        static_cast<int>(absl::ToInt64Microseconds(timing.dequeue));
+    const double decode_fps =
+        (decode_time_us == 0) ? 0.0 : 1.0e6 * decoded_frames / decode_time_us;
+    fprintf(stderr, "time to decode input: %d us (%d frames, %.2f fps)\n",
+            decode_time_us, decoded_frames, decode_fps);
+  }
+
+  return EXIT_SUCCESS;
+}
diff --git a/examples/gav1_decode_cv_pixel_buffer_pool.cc b/examples/gav1_decode_cv_pixel_buffer_pool.cc
new file mode 100644
index 0000000..6aa4e61
--- /dev/null
+++ b/examples/gav1_decode_cv_pixel_buffer_pool.cc
@@ -0,0 +1,278 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/gav1_decode_cv_pixel_buffer_pool.h"
+
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <memory>
+#include <new>
+#include <type_traits>
+
+namespace {
+
+struct CFTypeDeleter {
+  void operator()(CFTypeRef cf) const { CFRelease(cf); }
+};
+
+using UniqueCFNumberRef =
+    std::unique_ptr<std::remove_pointer<CFNumberRef>::type, CFTypeDeleter>;
+
+using UniqueCFDictionaryRef =
+    std::unique_ptr<std::remove_pointer<CFDictionaryRef>::type, CFTypeDeleter>;
+
+}  // namespace
+
+extern "C" {
+
+libgav1::StatusCode Gav1DecodeOnCVPixelBufferSizeChanged(
+    void* callback_private_data, int bitdepth,
+    libgav1::ImageFormat image_format, int width, int height, int left_border,
+    int right_border, int top_border, int bottom_border,
+    int stride_alignment) {
+  auto* buffer_pool =
+      static_cast<Gav1DecodeCVPixelBufferPool*>(callback_private_data);
+  return buffer_pool->OnCVPixelBufferSizeChanged(
+      bitdepth, image_format, width, height, left_border, right_border,
+      top_border, bottom_border, stride_alignment);
+}
+
+libgav1::StatusCode Gav1DecodeGetCVPixelBuffer(
+    void* callback_private_data, int bitdepth,
+    libgav1::ImageFormat image_format, int width, int height, int left_border,
+    int right_border, int top_border, int bottom_border, int stride_alignment,
+    libgav1::FrameBuffer* frame_buffer) {
+  auto* buffer_pool =
+      static_cast<Gav1DecodeCVPixelBufferPool*>(callback_private_data);
+  return buffer_pool->GetCVPixelBuffer(
+      bitdepth, image_format, width, height, left_border, right_border,
+      top_border, bottom_border, stride_alignment, frame_buffer);
+}
+
+void Gav1DecodeReleaseCVPixelBuffer(void* callback_private_data,
+                                    void* buffer_private_data) {
+  auto* buffer_pool =
+      static_cast<Gav1DecodeCVPixelBufferPool*>(callback_private_data);
+  buffer_pool->ReleaseCVPixelBuffer(buffer_private_data);
+}
+
+}  // extern "C"
+
+// static
+std::unique_ptr<Gav1DecodeCVPixelBufferPool>
+Gav1DecodeCVPixelBufferPool::Create(size_t num_buffers) {
+  std::unique_ptr<Gav1DecodeCVPixelBufferPool> buffer_pool(
+      new (std::nothrow) Gav1DecodeCVPixelBufferPool(num_buffers));
+  return buffer_pool;
+}
+
+Gav1DecodeCVPixelBufferPool::Gav1DecodeCVPixelBufferPool(size_t num_buffers)
+    : num_buffers_(static_cast<int>(num_buffers)) {}
+
+Gav1DecodeCVPixelBufferPool::~Gav1DecodeCVPixelBufferPool() {
+  CVPixelBufferPoolRelease(pool_);
+}
+
+libgav1::StatusCode Gav1DecodeCVPixelBufferPool::OnCVPixelBufferSizeChanged(
+    int bitdepth, libgav1::ImageFormat image_format, int width, int height,
+    int left_border, int right_border, int top_border, int bottom_border,
+    int stride_alignment) {
+  if (bitdepth != 8 || (image_format != libgav1::kImageFormatYuv420 &&
+                        image_format != libgav1::kImageFormatMonochrome400)) {
+    fprintf(stderr,
+            "Only bitdepth 8, 4:2:0 videos are supported: bitdepth %d, "
+            "image_format: %d.\n",
+            bitdepth, image_format);
+    return libgav1::kStatusUnimplemented;
+  }
+
+  // stride_alignment must be a power of 2.
+  assert((stride_alignment & (stride_alignment - 1)) == 0);
+
+  // The possible keys for CVPixelBufferPool are:
+  //   kCVPixelBufferPoolMinimumBufferCountKey
+  //   kCVPixelBufferPoolMaximumBufferAgeKey
+  //   kCVPixelBufferPoolAllocationThresholdKey
+  const void* pool_keys[] = {kCVPixelBufferPoolMinimumBufferCountKey};
+  const int min_buffer_count = 10;
+  UniqueCFNumberRef cf_min_buffer_count(
+      CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &min_buffer_count));
+  if (cf_min_buffer_count == nullptr) {
+    fprintf(stderr, "CFNumberCreate failed.\n");
+    return libgav1::kStatusUnknownError;
+  }
+  const void* pool_values[] = {cf_min_buffer_count.get()};
+  UniqueCFDictionaryRef pool_attributes(CFDictionaryCreate(
+      nullptr, pool_keys, pool_values, 1, &kCFTypeDictionaryKeyCallBacks,
+      &kCFTypeDictionaryValueCallBacks));
+  if (pool_attributes == nullptr) {
+    fprintf(stderr, "CFDictionaryCreate failed.\n");
+    return libgav1::kStatusUnknownError;
+  }
+
+  // The pixelBufferAttributes argument to CVPixelBufferPoolCreate() cannot be
+  // null and must contain the pixel format, width, and height, otherwise
+  // CVPixelBufferPoolCreate() fails with kCVReturnInvalidPixelBufferAttributes
+  // (-6682).
+
+  // I420: kCVPixelFormatType_420YpCbCr8Planar (video range).
+  const int pixel_format = (image_format == libgav1::kImageFormatYuv420)
+                               ? kCVPixelFormatType_420YpCbCr8PlanarFullRange
+                               : kCVPixelFormatType_OneComponent8;
+  UniqueCFNumberRef cf_pixel_format(
+      CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &pixel_format));
+  UniqueCFNumberRef cf_width(
+      CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &width));
+  UniqueCFNumberRef cf_height(
+      CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &height));
+  UniqueCFNumberRef cf_left_border(
+      CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &left_border));
+  UniqueCFNumberRef cf_right_border(
+      CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &right_border));
+  UniqueCFNumberRef cf_top_border(
+      CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &top_border));
+  UniqueCFNumberRef cf_bottom_border(
+      CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &bottom_border));
+  UniqueCFNumberRef cf_stride_alignment(
+      CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &stride_alignment));
+
+  const void* buffer_keys[] = {
+      kCVPixelBufferPixelFormatTypeKey,
+      kCVPixelBufferWidthKey,
+      kCVPixelBufferHeightKey,
+      kCVPixelBufferExtendedPixelsLeftKey,
+      kCVPixelBufferExtendedPixelsRightKey,
+      kCVPixelBufferExtendedPixelsTopKey,
+      kCVPixelBufferExtendedPixelsBottomKey,
+      kCVPixelBufferBytesPerRowAlignmentKey,
+  };
+  const void* buffer_values[] = {
+      cf_pixel_format.get(),  cf_width.get(),
+      cf_height.get(),        cf_left_border.get(),
+      cf_right_border.get(),  cf_top_border.get(),
+      cf_bottom_border.get(), cf_stride_alignment.get(),
+  };
+  UniqueCFDictionaryRef buffer_attributes(CFDictionaryCreate(
+      kCFAllocatorDefault, buffer_keys, buffer_values, 8,
+      &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks));
+  if (buffer_attributes == nullptr) {
+    fprintf(stderr, "CFDictionaryCreate of buffer_attributes failed.\n");
+    return libgav1::kStatusUnknownError;
+  }
+  CVPixelBufferPoolRef cv_pool;
+  CVReturn ret = CVPixelBufferPoolCreate(
+      /*allocator=*/nullptr, pool_attributes.get(), buffer_attributes.get(),
+      &cv_pool);
+  if (ret != kCVReturnSuccess) {
fprintf(stderr, "CVPixelBufferPoolCreate failed: %d.\n", + static_cast(ret)); + return libgav1::kStatusOutOfMemory; + } + CVPixelBufferPoolRelease(pool_); + pool_ = cv_pool; + return libgav1::kStatusOk; +} + +libgav1::StatusCode Gav1DecodeCVPixelBufferPool::GetCVPixelBuffer( + int bitdepth, libgav1::ImageFormat image_format, int /*width*/, + int /*height*/, int /*left_border*/, int /*right_border*/, + int /*top_border*/, int /*bottom_border*/, int /*stride_alignment*/, + libgav1::FrameBuffer* frame_buffer) { + static_cast(bitdepth); + assert(bitdepth == 8 && (image_format == libgav1::kImageFormatYuv420 || + image_format == libgav1::kImageFormatMonochrome400)); + const bool is_monochrome = + (image_format == libgav1::kImageFormatMonochrome400); + + // The dictionary must have kCVPixelBufferPoolAllocationThresholdKey, + // otherwise CVPixelBufferPoolCreatePixelBufferWithAuxAttributes() fails with + // kCVReturnWouldExceedAllocationThreshold (-6689). + UniqueCFNumberRef cf_num_buffers( + CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &num_buffers_)); + + const void* buffer_keys[] = { + kCVPixelBufferPoolAllocationThresholdKey, + }; + const void* buffer_values[] = { + cf_num_buffers.get(), + }; + UniqueCFDictionaryRef aux_attributes(CFDictionaryCreate( + kCFAllocatorDefault, buffer_keys, buffer_values, 1, + &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks)); + if (aux_attributes == nullptr) { + fprintf(stderr, "CFDictionaryCreate of aux_attributes failed.\n"); + return libgav1::kStatusUnknownError; + } + + CVPixelBufferRef pixel_buffer; + CVReturn ret = CVPixelBufferPoolCreatePixelBufferWithAuxAttributes( + /*allocator=*/nullptr, pool_, aux_attributes.get(), &pixel_buffer); + if (ret != kCVReturnSuccess) { + fprintf(stderr, + "CVPixelBufferPoolCreatePixelBufferWithAuxAttributes failed: %d.\n", + static_cast(ret)); + return libgav1::kStatusOutOfMemory; + } + + ret = CVPixelBufferLockBaseAddress(pixel_buffer, /*lockFlags=*/0); + if (ret != kCVReturnSuccess) { + fprintf(stderr, "CVPixelBufferLockBaseAddress failed: %d.\n", + static_cast(ret)); + CFRelease(pixel_buffer); + return libgav1::kStatusUnknownError; + } + + // If the pixel format type is kCVPixelFormatType_OneComponent8, the pixel + // buffer is nonplanar (CVPixelBufferIsPlanar returns false and + // CVPixelBufferGetPlaneCount returns 0), but + // CVPixelBufferGetBytesPerRowOfPlane and CVPixelBufferGetBaseAddressOfPlane + // still work for plane index 0, even though the documentation says they + // return NULL for nonplanar pixel buffers. 
+  frame_buffer->stride[0] =
+      static_cast<int>(CVPixelBufferGetBytesPerRowOfPlane(pixel_buffer, 0));
+  frame_buffer->plane[0] = static_cast<uint8_t*>(
+      CVPixelBufferGetBaseAddressOfPlane(pixel_buffer, 0));
+  if (is_monochrome) {
+    frame_buffer->stride[1] = 0;
+    frame_buffer->stride[2] = 0;
+    frame_buffer->plane[1] = nullptr;
+    frame_buffer->plane[2] = nullptr;
+  } else {
+    frame_buffer->stride[1] =
+        static_cast<int>(CVPixelBufferGetBytesPerRowOfPlane(pixel_buffer, 1));
+    frame_buffer->stride[2] =
+        static_cast<int>(CVPixelBufferGetBytesPerRowOfPlane(pixel_buffer, 2));
+    frame_buffer->plane[1] = static_cast<uint8_t*>(
+        CVPixelBufferGetBaseAddressOfPlane(pixel_buffer, 1));
+    frame_buffer->plane[2] = static_cast<uint8_t*>(
+        CVPixelBufferGetBaseAddressOfPlane(pixel_buffer, 2));
+  }
+  frame_buffer->private_data = pixel_buffer;
+
+  return libgav1::kStatusOk;
+}
+
+void Gav1DecodeCVPixelBufferPool::ReleaseCVPixelBuffer(
+    void* buffer_private_data) {
+  auto const pixel_buffer = static_cast<CVPixelBufferRef>(buffer_private_data);
+  CVReturn ret =
+      CVPixelBufferUnlockBaseAddress(pixel_buffer, /*unlockFlags=*/0);
+  if (ret != kCVReturnSuccess) {
+    fprintf(stderr, "%s:%d: CVPixelBufferUnlockBaseAddress failed: %d.\n",
+            __FILE__, __LINE__, static_cast<int>(ret));
+    abort();
+  }
+  CFRelease(pixel_buffer);
+}
diff --git a/examples/gav1_decode_cv_pixel_buffer_pool.h b/examples/gav1_decode_cv_pixel_buffer_pool.h
new file mode 100644
index 0000000..7aee324
--- /dev/null
+++ b/examples/gav1_decode_cv_pixel_buffer_pool.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_GAV1_DECODE_CV_PIXEL_BUFFER_POOL_H_
+#define LIBGAV1_EXAMPLES_GAV1_DECODE_CV_PIXEL_BUFFER_POOL_H_
+
+#include <CoreVideo/CoreVideo.h>
+
+#include <cstddef>
+#include <memory>
+
+#include "gav1/frame_buffer.h"
+
+extern "C" libgav1::StatusCode Gav1DecodeOnCVPixelBufferSizeChanged(
+    void* callback_private_data, int bitdepth,
+    libgav1::ImageFormat image_format, int width, int height, int left_border,
+    int right_border, int top_border, int bottom_border, int stride_alignment);
+
+extern "C" libgav1::StatusCode Gav1DecodeGetCVPixelBuffer(
+    void* callback_private_data, int bitdepth,
+    libgav1::ImageFormat image_format, int width, int height, int left_border,
+    int right_border, int top_border, int bottom_border, int stride_alignment,
+    libgav1::FrameBuffer* frame_buffer);
+
+extern "C" void Gav1DecodeReleaseCVPixelBuffer(void* callback_private_data,
+                                               void* buffer_private_data);
+
+class Gav1DecodeCVPixelBufferPool {
+ public:
+  static std::unique_ptr<Gav1DecodeCVPixelBufferPool> Create(
+      size_t num_buffers);
+
+  // Not copyable or movable.
+  Gav1DecodeCVPixelBufferPool(const Gav1DecodeCVPixelBufferPool&) = delete;
+  Gav1DecodeCVPixelBufferPool& operator=(const Gav1DecodeCVPixelBufferPool&) =
+      delete;
+
+  ~Gav1DecodeCVPixelBufferPool();
+
+  libgav1::StatusCode OnCVPixelBufferSizeChanged(
+      int bitdepth, libgav1::ImageFormat image_format, int width, int height,
+      int left_border, int right_border, int top_border, int bottom_border,
+      int stride_alignment);
+
+  libgav1::StatusCode GetCVPixelBuffer(int bitdepth,
+                                       libgav1::ImageFormat image_format,
+                                       int width, int height, int left_border,
+                                       int right_border, int top_border,
+                                       int bottom_border, int stride_alignment,
+                                       libgav1::FrameBuffer* frame_buffer);
+  void ReleaseCVPixelBuffer(void* buffer_private_data);
+
+ private:
+  Gav1DecodeCVPixelBufferPool(size_t num_buffers);
+
+  CVPixelBufferPoolRef pool_ = nullptr;
+  const int num_buffers_;
+};
+
+#endif  // LIBGAV1_EXAMPLES_GAV1_DECODE_CV_PIXEL_BUFFER_POOL_H_
diff --git a/examples/ivf_parser.cc b/examples/ivf_parser.cc
new file mode 100644
index 0000000..f8adb14
--- /dev/null
+++ b/examples/ivf_parser.cc
@@ -0,0 +1,96 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/ivf_parser.h"
+
+#include <cstdio>
+#include <cstring>
+
+#include "examples/file_reader_constants.h"
+#include "examples/logging.h"
+
+namespace libgav1 {
+namespace {
+
+size_t ReadLittleEndian16(const uint8_t* const buffer) {
+  size_t value = buffer[1] << 8;
+  value |= buffer[0];
+  return value;
+}
+
+size_t ReadLittleEndian32(const uint8_t* const buffer) {
+  size_t value = buffer[3] << 24;
+  value |= buffer[2] << 16;
+  value |= buffer[1] << 8;
+  value |= buffer[0];
+  return value;
+}
+
+}  // namespace
+
+bool ParseIvfFileHeader(const uint8_t* const header_buffer,
+                        IvfFileHeader* const ivf_file_header) {
+  if (header_buffer == nullptr || ivf_file_header == nullptr) return false;
+
+  if (memcmp(kIvfSignature, header_buffer, 4) != 0) {
+    return false;
+  }
+
+  // Verify header version and length.
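+  // For reference, the 32-byte IVF file header layout (little endian):
+  //   bytes 0-3    signature "DKIF"
+  //   bytes 4-5    version (expected 0)
+  //   bytes 6-7    header size in bytes (32)
+  //   bytes 8-11   codec FourCC (e.g. "AV01")
+  //   bytes 12-13  width in pixels
+  //   bytes 14-15  height in pixels
+  //   bytes 16-19  frame rate numerator
+  //   bytes 20-23  frame rate denominator
+  //   bytes 24-27  frame count (often unreliable)
+  //   bytes 28-31  unused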
+  const size_t ivf_header_version = ReadLittleEndian16(&header_buffer[4]);
+  if (ivf_header_version != kIvfHeaderVersion) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Unexpected IVF version");
+  }
+
+  const size_t ivf_header_size = ReadLittleEndian16(&header_buffer[6]);
+  if (ivf_header_size != kIvfFileHeaderSize) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Invalid IVF file header size");
+    return false;
+  }
+
+  if (memcmp(kAv1FourCcLower, &header_buffer[8], 4) != 0 &&
+      memcmp(kAv1FourCcUpper, &header_buffer[8], 4) != 0) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Unsupported codec 4CC");
+    return false;
+  }
+
+  ivf_file_header->width = ReadLittleEndian16(&header_buffer[12]);
+  ivf_file_header->height = ReadLittleEndian16(&header_buffer[14]);
+  ivf_file_header->frame_rate_numerator =
+      ReadLittleEndian32(&header_buffer[16]);
+  ivf_file_header->frame_rate_denominator =
+      ReadLittleEndian32(&header_buffer[20]);
+
+  return true;
+}
+
+bool ParseIvfFrameHeader(const uint8_t* const header_buffer,
+                         IvfFrameHeader* const ivf_frame_header) {
+  if (header_buffer == nullptr || ivf_frame_header == nullptr) return false;
+
+  ivf_frame_header->frame_size = ReadLittleEndian32(header_buffer);
+  if (ivf_frame_header->frame_size > kMaxTemporalUnitSize) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Temporal Unit size exceeds maximum");
+    return false;
+  }
+
+  ivf_frame_header->timestamp = ReadLittleEndian32(&header_buffer[4]);
+  const uint64_t timestamp_hi =
+      static_cast<uint64_t>(ReadLittleEndian32(&header_buffer[8])) << 32;
+  ivf_frame_header->timestamp |= timestamp_hi;
+
+  return true;
+}
+
+}  // namespace libgav1
diff --git a/examples/ivf_parser.h b/examples/ivf_parser.h
new file mode 100644
index 0000000..b6bbc59
--- /dev/null
+++ b/examples/ivf_parser.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_IVF_PARSER_H_
+#define LIBGAV1_EXAMPLES_IVF_PARSER_H_
+
+#include <cstddef>
+#include <cstdint>
+
+namespace libgav1 {
+
+struct IvfFileHeader {
+  IvfFileHeader() = default;
+  IvfFileHeader(const IvfFileHeader& rhs) = default;
+  IvfFileHeader& operator=(const IvfFileHeader& rhs) = default;
+  IvfFileHeader(IvfFileHeader&& rhs) = default;
+  IvfFileHeader& operator=(IvfFileHeader&& rhs) = default;
+
+  size_t width = 0;
+  size_t height = 0;
+  size_t frame_rate_numerator = 0;
+  size_t frame_rate_denominator = 0;
+};
+
+struct IvfFrameHeader {
+  IvfFrameHeader() = default;
+  IvfFrameHeader(const IvfFrameHeader& rhs) = default;
+  IvfFrameHeader& operator=(const IvfFrameHeader& rhs) = default;
+  IvfFrameHeader(IvfFrameHeader&& rhs) = default;
+  IvfFrameHeader& operator=(IvfFrameHeader&& rhs) = default;
+
+  size_t frame_size = 0;
+  int64_t timestamp = 0;
+};
+
+bool ParseIvfFileHeader(const uint8_t* header_buffer,
+                        IvfFileHeader* ivf_file_header);
+
+bool ParseIvfFrameHeader(const uint8_t* header_buffer,
+                         IvfFrameHeader* ivf_frame_header);
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_EXAMPLES_IVF_PARSER_H_
diff --git a/examples/libgav1_examples.cmake b/examples/libgav1_examples.cmake
new file mode 100644
index 0000000..1f949f3
--- /dev/null
+++ b/examples/libgav1_examples.cmake
@@ -0,0 +1,63 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_EXAMPLES_LIBGAV1_EXAMPLES_CMAKE_)
+  return()
+endif() # LIBGAV1_EXAMPLES_LIBGAV1_EXAMPLES_CMAKE_
+set(LIBGAV1_EXAMPLES_LIBGAV1_EXAMPLES_CMAKE_ 1)
+
+set(libgav1_file_reader_sources "${libgav1_examples}/file_reader.cc"
+    "${libgav1_examples}/file_reader.h"
+    "${libgav1_examples}/file_reader_constants.cc"
+    "${libgav1_examples}/file_reader_constants.h"
+    "${libgav1_examples}/file_reader_factory.cc"
+    "${libgav1_examples}/file_reader_factory.h"
+    "${libgav1_examples}/file_reader_interface.h"
+    "${libgav1_examples}/ivf_parser.cc"
+    "${libgav1_examples}/ivf_parser.h"
+    "${libgav1_examples}/logging.h")
+
+set(libgav1_file_writer_sources "${libgav1_examples}/file_writer.cc"
+    "${libgav1_examples}/file_writer.h"
+    "${libgav1_examples}/logging.h")
+
+set(libgav1_decode_sources "${libgav1_examples}/gav1_decode.cc")
+
+macro(libgav1_add_examples_targets)
+  libgav1_add_library(NAME libgav1_file_reader TYPE OBJECT SOURCES
+                      ${libgav1_file_reader_sources} DEFINES ${libgav1_defines}
+                      INCLUDES ${libgav1_include_paths})
+
+  libgav1_add_library(NAME libgav1_file_writer TYPE OBJECT SOURCES
+                      ${libgav1_file_writer_sources} DEFINES ${libgav1_defines}
+                      INCLUDES ${libgav1_include_paths})
+
+  libgav1_add_executable(NAME
+                         gav1_decode
+                         SOURCES
+                         ${libgav1_decode_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_include_paths}
+                         ${libgav1_gtest_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_file_reader
+                         libgav1_file_writer
+                         LIB_DEPS
+                         absl::strings
+                         absl::str_format_internal
+                         absl::time
+                         ${libgav1_dependency})
+endmacro()
diff --git a/examples/logging.h b/examples/logging.h
new file mode 100644
index 0000000..cf5a09f
--- /dev/null
+++ b/examples/logging.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_LOGGING_H_
+#define LIBGAV1_EXAMPLES_LOGGING_H_
+
+#include <cstddef>
+#include <cstdio>
+
+namespace libgav1 {
+namespace examples {
+
+#if !defined(LIBGAV1_EXAMPLES_ENABLE_LOGGING)
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
+#define LIBGAV1_EXAMPLES_ENABLE_LOGGING 0
+#else
+#define LIBGAV1_EXAMPLES_ENABLE_LOGGING 1
+#endif
+#endif
+
+#if LIBGAV1_EXAMPLES_ENABLE_LOGGING
+
+// Compile-time function to get the 'base' file_name, that is, the part of
+// a file_name after the last '/' or '\' path separator. The search starts at
+// the end of the string; the second parameter is the length of the string.
+constexpr const char* Basename(const char* file_name, size_t offset) {
+  return (offset == 0 || file_name[offset - 1] == '/' ||
+          file_name[offset - 1] == '\\')
+             ? file_name + offset
+             : Basename(file_name, offset - 1);
+}
+
+#define LIBGAV1_EXAMPLES_LOG_ERROR(error_string)                              \
+  do {                                                                        \
+    constexpr const char* libgav1_examples_basename =                         \
+        libgav1::examples::Basename(__FILE__, sizeof(__FILE__) - 1);          \
+    fprintf(stderr, "%s:%d (%s): %s.\n", libgav1_examples_basename, __LINE__, \
+            __func__, error_string);                                          \
+  } while (false)
+
+#else  // !LIBGAV1_EXAMPLES_ENABLE_LOGGING
+
+#define LIBGAV1_EXAMPLES_LOG_ERROR(error_string) \
+  do {                                           \
+  } while (false)
+
+#endif  // LIBGAV1_EXAMPLES_ENABLE_LOGGING
+
+}  // namespace examples
+}  // namespace libgav1
+
+#endif  // LIBGAV1_EXAMPLES_LOGGING_H_
diff --git a/src/buffer_pool.cc b/src/buffer_pool.cc
new file mode 100644
index 0000000..c1a5606
--- /dev/null
+++ b/src/buffer_pool.cc
@@ -0,0 +1,218 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/buffer_pool.h"
+
+#include <cassert>
+#include <cstring>
+
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+
+namespace {
+
+// Copies the feature_enabled, feature_data, segment_id_pre_skip, and
+// last_active_segment_id fields of Segmentation.
+void CopySegmentationParameters(const Segmentation& from, Segmentation* to) {
+  memcpy(to->feature_enabled, from.feature_enabled,
+         sizeof(to->feature_enabled));
+  memcpy(to->feature_data, from.feature_data, sizeof(to->feature_data));
+  to->segment_id_pre_skip = from.segment_id_pre_skip;
+  to->last_active_segment_id = from.last_active_segment_id;
+}
+
+}  // namespace
+
+RefCountedBuffer::RefCountedBuffer() = default;
+
+RefCountedBuffer::~RefCountedBuffer() = default;
+
+bool RefCountedBuffer::Realloc(int bitdepth, bool is_monochrome, int width,
+                               int height, int subsampling_x, int subsampling_y,
+                               int left_border, int right_border,
+                               int top_border, int bottom_border) {
+  // The YuvBuffer::Realloc() could call the get frame buffer callback which
+  // will need to be thread safe. So we ensure that we only call Realloc() once
+  // at any given time.
+  std::lock_guard<std::mutex> lock(pool_->mutex_);
+  assert(!buffer_private_data_valid_);
+  if (!yuv_buffer_.Realloc(
+          bitdepth, is_monochrome, width, height, subsampling_x, subsampling_y,
+          left_border, right_border, top_border, bottom_border,
+          pool_->get_frame_buffer_, pool_->callback_private_data_,
+          &buffer_private_data_)) {
+    return false;
+  }
+  buffer_private_data_valid_ = true;
+  return true;
+}
+
+bool RefCountedBuffer::SetFrameDimensions(const ObuFrameHeader& frame_header) {
+  upscaled_width_ = frame_header.upscaled_width;
+  frame_width_ = frame_header.width;
+  frame_height_ = frame_header.height;
+  render_width_ = frame_header.render_width;
+  render_height_ = frame_header.render_height;
+  rows4x4_ = frame_header.rows4x4;
+  columns4x4_ = frame_header.columns4x4;
+  if (frame_header.refresh_frame_flags != 0 &&
+      !IsIntraFrame(frame_header.frame_type)) {
+    const int rows4x4_half = DivideBy2(rows4x4_);
+    const int columns4x4_half = DivideBy2(columns4x4_);
+    if (!reference_info_.Reset(rows4x4_half, columns4x4_half)) {
+      return false;
+    }
+  }
+  return segmentation_map_.Allocate(rows4x4_, columns4x4_);
+}
+
+void RefCountedBuffer::SetGlobalMotions(
+    const std::array<GlobalMotion, kNumReferenceFrameTypes>& global_motions) {
+  for (int ref = kReferenceFrameLast; ref <= kReferenceFrameAlternate; ++ref) {
+    static_assert(sizeof(global_motion_[ref].params) ==
+                      sizeof(global_motions[ref].params),
+                  "");
+    memcpy(global_motion_[ref].params, global_motions[ref].params,
+           sizeof(global_motion_[ref].params));
+  }
+}
+
+void RefCountedBuffer::SetFrameContext(const SymbolDecoderContext& context) {
+  frame_context_ = context;
+  frame_context_.ResetIntraFrameYModeCdf();
+  frame_context_.ResetCounters();
+}
+
+void RefCountedBuffer::GetSegmentationParameters(
+    Segmentation* segmentation) const {
+  CopySegmentationParameters(/*from=*/segmentation_, /*to=*/segmentation);
+}
+
+void RefCountedBuffer::SetSegmentationParameters(
+    const Segmentation& segmentation) {
+  CopySegmentationParameters(/*from=*/segmentation, /*to=*/&segmentation_);
+}
+
+void RefCountedBuffer::SetBufferPool(BufferPool* pool) { pool_ = pool; }
+
+void RefCountedBuffer::ReturnToBufferPool(RefCountedBuffer* ptr) {
+  ptr->pool_->ReturnUnusedBuffer(ptr);
+}
+
+BufferPool::BufferPool(
+    FrameBufferSizeChangedCallback on_frame_buffer_size_changed,
+    GetFrameBufferCallback get_frame_buffer,
+    ReleaseFrameBufferCallback release_frame_buffer,
+    void* callback_private_data) {
+  if (get_frame_buffer != nullptr) {
+    // on_frame_buffer_size_changed may be null.
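+    // get_frame_buffer and release_frame_buffer come as a pair: a custom
+    // allocator must also supply a matching release callback, which the
+    // assert below enforces.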
+    assert(release_frame_buffer != nullptr);
+    on_frame_buffer_size_changed_ = on_frame_buffer_size_changed;
+    get_frame_buffer_ = get_frame_buffer;
+    release_frame_buffer_ = release_frame_buffer;
+    callback_private_data_ = callback_private_data;
+  } else {
+    on_frame_buffer_size_changed_ = OnInternalFrameBufferSizeChanged;
+    get_frame_buffer_ = GetInternalFrameBuffer;
+    release_frame_buffer_ = ReleaseInternalFrameBuffer;
+    callback_private_data_ = &internal_frame_buffers_;
+  }
+}
+
+BufferPool::~BufferPool() {
+  for (const auto* buffer : buffers_) {
+    if (buffer->in_use_) {
+      assert(false && "RefCountedBuffer still in use at destruction time.");
+      LIBGAV1_DLOG(ERROR,
+                   "RefCountedBuffer still in use at destruction time.");
+    }
+    delete buffer;
+  }
+}
+
+bool BufferPool::OnFrameBufferSizeChanged(int bitdepth,
+                                          Libgav1ImageFormat image_format,
+                                          int width, int height,
+                                          int left_border, int right_border,
+                                          int top_border, int bottom_border) {
+  if (on_frame_buffer_size_changed_ == nullptr) return true;
+  return on_frame_buffer_size_changed_(callback_private_data_, bitdepth,
+                                       image_format, width, height,
+                                       left_border, right_border, top_border,
+                                       bottom_border,
+                                       /*stride_alignment=*/16) == kStatusOk;
+}
+
+RefCountedBufferPtr BufferPool::GetFreeBuffer() {
+  // In frame parallel mode, the GetFreeBuffer() calls from ObuParser all
+  // happen from the same thread serially, but the GetFreeBuffer() call in
+  // DecoderImpl::ApplyFilmGrain can happen from multiple threads at the same
+  // time. So this function has to be thread safe.
+  // TODO(b/142583029): Investigate if the GetFreeBuffer() call in
+  // DecoderImpl::ApplyFilmGrain() call can be serialized so that this function
+  // need not be thread safe.
+  std::unique_lock<std::mutex> lock(mutex_);
+  for (auto buffer : buffers_) {
+    if (!buffer->in_use_) {
+      buffer->in_use_ = true;
+      buffer->progress_row_ = -1;
+      buffer->frame_state_ = kFrameStateUnknown;
+      lock.unlock();
+      return RefCountedBufferPtr(buffer, RefCountedBuffer::ReturnToBufferPool);
+    }
+  }
+  lock.unlock();
+  auto* const buffer = new (std::nothrow) RefCountedBuffer();
+  if (buffer == nullptr) {
+    LIBGAV1_DLOG(ERROR, "Failed to allocate a new reference counted buffer.");
+    return RefCountedBufferPtr();
+  }
+  buffer->SetBufferPool(this);
+  buffer->in_use_ = true;
+  buffer->progress_row_ = -1;
+  buffer->frame_state_ = kFrameStateUnknown;
+  lock.lock();
+  const bool ok = buffers_.push_back(buffer);
+  lock.unlock();
+  if (!ok) {
+    LIBGAV1_DLOG(
+        ERROR,
+        "Failed to push the new reference counted buffer into the vector.");
+    delete buffer;
+    return RefCountedBufferPtr();
+  }
+  return RefCountedBufferPtr(buffer, RefCountedBuffer::ReturnToBufferPool);
+}
+
+void BufferPool::Abort() {
+  std::unique_lock<std::mutex> lock(mutex_);
+  for (auto buffer : buffers_) {
+    if (buffer->in_use_) {
+      buffer->Abort();
+    }
+  }
+}
+
+void BufferPool::ReturnUnusedBuffer(RefCountedBuffer* buffer) {
+  std::lock_guard<std::mutex> lock(mutex_);
+  assert(buffer->in_use_);
+  buffer->in_use_ = false;
+  if (buffer->buffer_private_data_valid_) {
+    release_frame_buffer_(callback_private_data_,
+                          buffer->buffer_private_data_);
+    buffer->buffer_private_data_valid_ = false;
+  }
+}
+
+}  // namespace libgav1
diff --git a/src/buffer_pool.h b/src/buffer_pool.h
new file mode 100644
index 0000000..d9eba6d
--- /dev/null
+++ b/src/buffer_pool.h
@@ -0,0 +1,402 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_BUFFER_POOL_H_
+#define LIBGAV1_SRC_BUFFER_POOL_H_
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <climits>
+#include <condition_variable>  // NOLINT (unapproved c++11 header)
+#include <cstdint>
+#include <memory>
+#include <mutex>  // NOLINT (unapproved c++11 header)
+
+#include "src/dsp/common.h"
+#include "src/gav1/decoder_buffer.h"
+#include "src/gav1/frame_buffer.h"
+#include "src/internal_frame_buffer_list.h"
+#include "src/symbol_decoder_context.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/reference_info.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/segmentation_map.h"
+#include "src/utils/types.h"
+#include "src/utils/vector.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+
+class BufferPool;
+
+enum FrameState : uint8_t {
+  kFrameStateUnknown,
+  kFrameStateStarted,
+  kFrameStateParsed,
+  kFrameStateDecoded
+};
+
+// A reference-counted frame buffer. Clients should access it via
+// RefCountedBufferPtr, which manages reference counting transparently.
+// The alignment requirement is due to the SymbolDecoderContext member
+// frame_context_.
+class RefCountedBuffer : public MaxAlignedAllocable {
+ public:
+  // Not copyable or movable.
+  RefCountedBuffer(const RefCountedBuffer&) = delete;
+  RefCountedBuffer& operator=(const RefCountedBuffer&) = delete;
+
+  // Allocates the YUV buffer. Returns true on success. Returns false on
+  // failure. This function ensures the thread safety of the
+  // |get_frame_buffer_| call (i.e.) only one |get_frame_buffer_| call will
+  // happen at a given time.
+  // TODO(b/142583029): In frame parallel mode, we can require the callbacks
+  // to be thread safe so that we can remove the thread safety of this
+  // function and applications can have fine grained locks.
+  //
+  // * |width| and |height| are the image dimensions in pixels.
+  // * |subsampling_x| and |subsampling_y| (either 0 or 1) specify the
+  //   subsampling of the width and height of the chroma planes, respectively.
+  // * |left_border|, |right_border|, |top_border|, and |bottom_border| are
+  //   the sizes (in pixels) of the borders on the left, right, top, and
+  //   bottom sides, respectively.
+  //
+  // NOTE: The strides are a multiple of 16. Since the first row in each plane
+  // is 16-byte aligned, subsequent rows are also 16-byte aligned.
+  bool Realloc(int bitdepth, bool is_monochrome, int width, int height,
+               int subsampling_x, int subsampling_y, int left_border,
+               int right_border, int top_border, int bottom_border);
+
+  YuvBuffer* buffer() { return &yuv_buffer_; }
+
+  // Returns the buffer private data set by the get frame buffer callback when
+  // it allocated the YUV buffer.
+  void* buffer_private_data() const {
+    assert(buffer_private_data_valid_);
+    return buffer_private_data_;
+  }
+
+  // NOTE: In the current frame, this is the frame_type syntax element in the
+  // frame header. In a reference frame, this implements the RefFrameType
+  // array in the spec.
+  FrameType frame_type() const { return frame_type_; }
+  void set_frame_type(FrameType frame_type) { frame_type_ = frame_type; }
+
+  // The sample position for subsampled streams. This is the
+  // chroma_sample_position syntax element in the sequence header.
+  //
+  // NOTE: The decoder does not use chroma_sample_position, but it needs to be
+  // passed on to the client in DecoderBuffer.
+  ChromaSamplePosition chroma_sample_position() const {
+    return chroma_sample_position_;
+  }
+  void set_chroma_sample_position(
+      ChromaSamplePosition chroma_sample_position) {
+    chroma_sample_position_ = chroma_sample_position;
+  }
+
+  // Whether the frame can be used as show existing frame in the future.
+  bool showable_frame() const { return showable_frame_; }
+  void set_showable_frame(bool value) { showable_frame_ = value; }
+
+  // Sets upscaled_width_, frame_width_, frame_height_, render_width_,
+  // render_height_, rows4x4_ and columns4x4_ from the corresponding fields
+  // in frame_header. Allocates reference_info_.motion_field_reference_frame,
+  // reference_info_.motion_field_mv_, and segmentation_map_. Returns true on
+  // success, false on failure.
+  bool SetFrameDimensions(const ObuFrameHeader& frame_header);
+
+  int32_t upscaled_width() const { return upscaled_width_; }
+  int32_t frame_width() const { return frame_width_; }
+  int32_t frame_height() const { return frame_height_; }
+  // RenderWidth() and RenderHeight() return the render size, which is a hint
+  // to the application about the desired display size.
+  int32_t render_width() const { return render_width_; }
+  int32_t render_height() const { return render_height_; }
+  int32_t rows4x4() const { return rows4x4_; }
+  int32_t columns4x4() const { return columns4x4_; }
+
+  int spatial_id() const { return spatial_id_; }
+  void set_spatial_id(int value) { spatial_id_ = value; }
+  int temporal_id() const { return temporal_id_; }
+  void set_temporal_id(int value) { temporal_id_ = value; }
+
+  SegmentationMap* segmentation_map() { return &segmentation_map_; }
+  const SegmentationMap* segmentation_map() const {
+    return &segmentation_map_;
+  }
+
+  // Only the |params| field of each GlobalMotion struct should be used.
+  const std::array<GlobalMotion, kNumReferenceFrameTypes>& GlobalMotions()
+      const {
+    return global_motion_;
+  }
+  // Saves the GlobalMotion array. Only the |params| field of each GlobalMotion
+  // struct is saved.
+  void SetGlobalMotions(
+      const std::array<GlobalMotion, kNumReferenceFrameTypes>& global_motions);
+
+  // Returns the saved CDF tables.
+  const SymbolDecoderContext& FrameContext() const { return frame_context_; }
+  // Saves the CDF tables. The intra_frame_y_mode_cdf table is reset to the
+  // default. The last entry in each table, representing the symbol count for
+  // that context, is set to 0.
+  void SetFrameContext(const SymbolDecoderContext& context);
+
+  const std::array<int8_t, kNumReferenceFrameTypes>& loop_filter_ref_deltas()
+      const {
+    return loop_filter_ref_deltas_;
+  }
+  const std::array<int8_t, kLoopFilterMaxModeDeltas>& loop_filter_mode_deltas()
+      const {
+    return loop_filter_mode_deltas_;
+  }
+  // Saves the ref_deltas and mode_deltas arrays in loop_filter.
+  void SetLoopFilterDeltas(const LoopFilter& loop_filter) {
+    loop_filter_ref_deltas_ = loop_filter.ref_deltas;
+    loop_filter_mode_deltas_ = loop_filter.mode_deltas;
+  }
+
+  // Copies the saved values of the following fields to the Segmentation
+  // struct: feature_enabled, feature_data, segment_id_pre_skip, and
+  // last_active_segment_id. The other fields are left unchanged.
+  void GetSegmentationParameters(Segmentation* segmentation) const;
+  // Saves the feature_enabled, feature_data, segment_id_pre_skip, and
+  // last_active_segment_id fields of the Segmentation struct.
+  void SetSegmentationParameters(const Segmentation& segmentation);
+
+  const FilmGrainParams& film_grain_params() const {
+    return film_grain_params_;
+  }
+  void set_film_grain_params(const FilmGrainParams& params) {
+    film_grain_params_ = params;
+  }
+
+  const ReferenceInfo* reference_info() const { return &reference_info_; }
+  ReferenceInfo* reference_info() { return &reference_info_; }
+
+  // This will wake up the WaitUntil*() functions and make them return false.
+  void Abort() {
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      abort_ = true;
+    }
+    parsed_condvar_.notify_all();
+    decoded_condvar_.notify_all();
+    progress_row_condvar_.notify_all();
+  }
+
+  void SetFrameState(FrameState frame_state) {
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      frame_state_ = frame_state;
+    }
+    if (frame_state == kFrameStateParsed) {
+      parsed_condvar_.notify_all();
+    } else if (frame_state == kFrameStateDecoded) {
+      decoded_condvar_.notify_all();
+      progress_row_condvar_.notify_all();
+    }
+  }
+
+  // Sets the progress of this frame to |progress_row| and notifies any threads
+  // that may be waiting on rows <= |progress_row|.
+  void SetProgress(int progress_row) {
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      if (progress_row_ >= progress_row) return;
+      progress_row_ = progress_row;
+    }
+    progress_row_condvar_.notify_all();
+  }
+
+  void MarkFrameAsStarted() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (frame_state_ != kFrameStateUnknown) return;
+    frame_state_ = kFrameStateStarted;
+  }
+
+  // All the WaitUntil* functions will return true if the desired wait state
+  // was reached successfully. If the return value is false, then the caller
+  // must assume that the wait was not successful and try to stop whatever
+  // they are doing as early as possible.
+
+  // Waits until the frame has been parsed.
+  bool WaitUntilParsed() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    while (frame_state_ < kFrameStateParsed && !abort_) {
+      parsed_condvar_.wait(lock);
+    }
+    return !abort_;
+  }
+
+  // Waits until the |progress_row| has been decoded (as indicated either by
+  // |progress_row_| or |frame_state_|). |progress_row_cache| must not be
+  // nullptr and will be populated with the value of |progress_row_| after the
+  // wait.
+  //
+  // Typical usage of |progress_row_cache| is as follows:
+  // * Initialize |*progress_row_cache| to INT_MIN.
+  // * Call WaitUntil only if |*progress_row_cache| < |progress_row|.
+  bool WaitUntil(int progress_row, int* progress_row_cache) {
+    // If |progress_row| is negative, it means that the wait is on the top
+    // border to be available. The top border will be available when row 0 has
+    // been decoded. So we can simply wait on row 0 instead.
+    progress_row = std::max(progress_row, 0);
+    std::unique_lock<std::mutex> lock(mutex_);
+    while (progress_row_ < progress_row &&
+           frame_state_ != kFrameStateDecoded && !abort_) {
+      progress_row_condvar_.wait(lock);
+    }
+    // Once |frame_state_| reaches kFrameStateDecoded, |progress_row_| may no
+    // longer be updated. So we set |*progress_row_cache| to INT_MAX in that
+    // case.
+    *progress_row_cache =
+        (frame_state_ != kFrameStateDecoded) ? progress_row_ : INT_MAX;
+    return !abort_;
+  }
+
+  // Waits until the entire frame has been decoded.
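+  // A typical caller treats a false return as an abort request, e.g.:
+  //   if (!buffer->WaitUntilDecoded()) return false;  // Abort() was called.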
+  bool WaitUntilDecoded() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    while (frame_state_ != kFrameStateDecoded && !abort_) {
+      decoded_condvar_.wait(lock);
+    }
+    return !abort_;
+  }
+
+ private:
+  friend class BufferPool;
+
+  // Methods for BufferPool:
+  RefCountedBuffer();
+  ~RefCountedBuffer();
+  void SetBufferPool(BufferPool* pool);
+  static void ReturnToBufferPool(RefCountedBuffer* ptr);
+
+  BufferPool* pool_ = nullptr;
+  bool buffer_private_data_valid_ = false;
+  void* buffer_private_data_ = nullptr;
+  YuvBuffer yuv_buffer_;
+  bool in_use_ = false;  // Only used by BufferPool.
+
+  std::mutex mutex_;
+  FrameState frame_state_ LIBGAV1_GUARDED_BY(mutex_) = kFrameStateUnknown;
+  int progress_row_ LIBGAV1_GUARDED_BY(mutex_) = -1;
+  // Signaled when progress_row_ is updated or when frame_state_ is set to
+  // kFrameStateDecoded.
+  std::condition_variable progress_row_condvar_;
+  // Signaled when the frame state is set to kFrameStateParsed.
+  std::condition_variable parsed_condvar_;
+  // Signaled when the frame state is set to kFrameStateDecoded.
+  std::condition_variable decoded_condvar_;
+  bool abort_ LIBGAV1_GUARDED_BY(mutex_) = false;
+
+  FrameType frame_type_ = kFrameKey;
+  ChromaSamplePosition chroma_sample_position_ = kChromaSamplePositionUnknown;
+  bool showable_frame_ = false;
+
+  int32_t upscaled_width_ = 0;
+  int32_t frame_width_ = 0;
+  int32_t frame_height_ = 0;
+  int32_t render_width_ = 0;
+  int32_t render_height_ = 0;
+  int32_t columns4x4_ = 0;
+  int32_t rows4x4_ = 0;
+  int spatial_id_ = 0;
+  int temporal_id_ = 0;
+
+  // segmentation_map_ contains a rows4x4_ by columns4x4_ 2D array.
+  SegmentationMap segmentation_map_;
+
+  // Only the |params| field of each GlobalMotion struct is used.
+  // global_motion_[0] (for kReferenceFrameIntra) is not used.
+  std::array<GlobalMotion, kNumReferenceFrameTypes> global_motion_ = {};
+  SymbolDecoderContext frame_context_;
+  std::array<int8_t, kNumReferenceFrameTypes> loop_filter_ref_deltas_;
+  std::array<int8_t, kLoopFilterMaxModeDeltas> loop_filter_mode_deltas_;
+  // Only the feature_enabled, feature_data, segment_id_pre_skip, and
+  // last_active_segment_id fields of the Segmentation struct are used.
+  //
+  // Note: The spec only requires that we save feature_enabled and
+  // feature_data. Since segment_id_pre_skip and last_active_segment_id depend
+  // on feature_enabled only, we also save their values as an optimization.
+  Segmentation segmentation_ = {};
+  FilmGrainParams film_grain_params_ = {};
+  ReferenceInfo reference_info_;
+};
+
+// RefCountedBufferPtr contains a reference to a RefCountedBuffer.
+//
+// Note: For simplicity, RefCountedBufferPtr is implemented as a
+// std::shared_ptr<RefCountedBuffer>. This requires a heap allocation of the
+// control block for std::shared_ptr. To avoid that heap allocation, we can
+// add a |ref_count_| field to RefCountedBuffer and implement a custom
+// RefCountedBufferPtr class.
+using RefCountedBufferPtr = std::shared_ptr<RefCountedBuffer>;
+
+// BufferPool maintains a pool of RefCountedBuffers.
+class BufferPool {
+ public:
+  BufferPool(FrameBufferSizeChangedCallback on_frame_buffer_size_changed,
+             GetFrameBufferCallback get_frame_buffer,
+             ReleaseFrameBufferCallback release_frame_buffer,
+             void* callback_private_data);
+
+  // Not copyable or movable.
+  BufferPool(const BufferPool&) = delete;
+  BufferPool& operator=(const BufferPool&) = delete;
+
+  ~BufferPool();
+
+  LIBGAV1_MUST_USE_RESULT bool OnFrameBufferSizeChanged(
+      int bitdepth, Libgav1ImageFormat image_format, int width, int height,
+      int left_border, int right_border, int top_border, int bottom_border);
+
+  // Finds a free buffer in the buffer pool and returns a reference to the
+  // free buffer. If there is no free buffer, returns a null pointer. This
+  // function is thread safe.
+  RefCountedBufferPtr GetFreeBuffer();
+
+  // Aborts all the buffers that are in use.
+  void Abort();
+
+ private:
+  friend class RefCountedBuffer;
+
+  // Returns an unused buffer to the buffer pool. Called by RefCountedBuffer
+  // only. This function is thread safe.
+  void ReturnUnusedBuffer(RefCountedBuffer* buffer);
+
+  // Used to make the following functions thread safe: GetFreeBuffer(),
+  // ReturnUnusedBuffer(), RefCountedBuffer::Realloc().
+  std::mutex mutex_;
+
+  // Storing a RefCountedBuffer object in a Vector is complicated because of
+  // the copy/move semantics. So the simplest way around that is to store a
+  // list of pointers in the vector.
+  Vector<RefCountedBuffer*> buffers_ LIBGAV1_GUARDED_BY(mutex_);
+  InternalFrameBufferList internal_frame_buffers_;
+
+  // Frame buffer callbacks.
+  FrameBufferSizeChangedCallback on_frame_buffer_size_changed_;
+  GetFrameBufferCallback get_frame_buffer_;
+  ReleaseFrameBufferCallback release_frame_buffer_;
+  // Private data associated with the frame buffer callbacks.
+  void* callback_private_data_;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_BUFFER_POOL_H_
diff --git a/src/buffer_pool_test.cc b/src/buffer_pool_test.cc
new file mode 100644
index 0000000..abe681e
--- /dev/null
+++ b/src/buffer_pool_test.cc
@@ -0,0 +1,305 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
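+
+// A minimal usage sketch of the BufferPool API declared in src/buffer_pool.h,
+// for illustration only. It assumes the internal frame buffer callbacks from
+// src/internal_frame_buffer_list.h (the same ones the tests below use):
+//
+//   InternalFrameBufferList buffer_list;
+//   BufferPool pool(OnInternalFrameBufferSizeChanged, GetInternalFrameBuffer,
+//                   ReleaseInternalFrameBuffer, &buffer_list);
+//   RefCountedBufferPtr frame = pool.GetFreeBuffer();  // null if none free
+//   // |frame| is a std::shared_ptr<RefCountedBuffer>; the buffer returns to
+//   // the pool automatically once the last reference is dropped.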
+
+#include "src/buffer_pool.h"
+
+#include <climits>
+#include <cstdint>
+#include <memory>
+#include <ostream>
+#include <tuple>
+#include <utility>
+
+#include "gtest/gtest.h"
+#include "src/frame_buffer_utils.h"
+#include "src/gav1/decoder_buffer.h"
+#include "src/gav1/frame_buffer.h"
+#include "src/internal_frame_buffer_list.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+namespace {
+
+TEST(BufferPoolTest, RefCountedBufferPtr) {
+  InternalFrameBufferList buffer_list;
+  BufferPool buffer_pool(OnInternalFrameBufferSizeChanged,
+                         GetInternalFrameBuffer, ReleaseInternalFrameBuffer,
+                         &buffer_list);
+  RefCountedBufferPtr buffer_ptr = buffer_pool.GetFreeBuffer();
+  EXPECT_NE(buffer_ptr, nullptr);
+  EXPECT_EQ(buffer_ptr.use_count(), 1);
+
+  RefCountedBufferPtr buffer_ptr2 = buffer_ptr;
+  RefCountedBufferPtr buffer_ptr3 = buffer_ptr;
+  EXPECT_EQ(buffer_ptr.use_count(), 3);
+  EXPECT_EQ(buffer_ptr2.use_count(), 3);
+  EXPECT_EQ(buffer_ptr3.use_count(), 3);
+
+  buffer_ptr2 = nullptr;
+  EXPECT_EQ(buffer_ptr.use_count(), 2);
+  EXPECT_EQ(buffer_ptr2.use_count(), 0);
+  EXPECT_EQ(buffer_ptr3.use_count(), 2);
+
+  RefCountedBufferPtr buffer_ptr4 = std::move(buffer_ptr);
+  EXPECT_EQ(buffer_ptr.use_count(), 0);
+  EXPECT_EQ(buffer_ptr2.use_count(), 0);
+  EXPECT_EQ(buffer_ptr3.use_count(), 2);
+  EXPECT_EQ(buffer_ptr4.use_count(), 2);
+}
+
+TEST(RefCountedBufferTest, SetFrameDimensions) {
+  InternalFrameBufferList buffer_list;
+  BufferPool buffer_pool(OnInternalFrameBufferSizeChanged,
+                         GetInternalFrameBuffer, ReleaseInternalFrameBuffer,
+                         &buffer_list);
+  RefCountedBufferPtr buffer_ptr = buffer_pool.GetFreeBuffer();
+  EXPECT_NE(buffer_ptr, nullptr);
+
+  // Test the undocumented default values of rows4x4() and columns4x4(). (Not
+  // sure if this is a good idea.)
+  EXPECT_EQ(buffer_ptr->rows4x4(), 0);
+  EXPECT_EQ(buffer_ptr->columns4x4(), 0);
+
+  // Test the side effects of SetFrameDimensions().
+  ObuFrameHeader frame_header = {};
+  frame_header.rows4x4 = 20;
+  frame_header.columns4x4 = 30;
+  EXPECT_TRUE(buffer_ptr->SetFrameDimensions(frame_header));
+  EXPECT_EQ(buffer_ptr->rows4x4(), 20);
+  EXPECT_EQ(buffer_ptr->columns4x4(), 30);
+}
+
+TEST(RefCountedBufferTest, WaitUntil) {
+  InternalFrameBufferList buffer_list;
+  BufferPool buffer_pool(OnInternalFrameBufferSizeChanged,
+                         GetInternalFrameBuffer, ReleaseInternalFrameBuffer,
+                         &buffer_list);
+  RefCountedBufferPtr buffer_ptr = buffer_pool.GetFreeBuffer();
+  EXPECT_NE(buffer_ptr, nullptr);
+
+  int progress_row_cache;
+  buffer_ptr->SetProgress(10);
+  EXPECT_TRUE(buffer_ptr->WaitUntil(5, &progress_row_cache));
+  EXPECT_EQ(progress_row_cache, 10);
+
+  buffer_ptr->SetFrameState(kFrameStateDecoded);
+  EXPECT_TRUE(buffer_ptr->WaitUntil(500, &progress_row_cache));
+  EXPECT_EQ(progress_row_cache, INT_MAX);
+
+  buffer_ptr->Abort();
+  EXPECT_FALSE(buffer_ptr->WaitUntil(50, &progress_row_cache));
+}
+
+constexpr struct Params {
+  int width;
+  int height;
+  int8_t subsampling_x;
+  int8_t subsampling_y;
+  int border;
+} kParams[] = {
+    {1920, 1080, 1, 1, 96},   //
+    {1920, 1080, 1, 1, 64},   //
+    {1920, 1080, 1, 1, 32},   //
+    {1920, 1080, 1, 1, 160},  //
+    {1920, 1080, 1, 0, 160},  //
+    {1920, 1080, 0, 0, 160},  //
+};
+
+std::ostream& operator<<(std::ostream& os, const Params& param) {
+  return os << param.width << "x" << param.height
+            << ", subsampling(x/y): " << static_cast<int>(param.subsampling_x)
+            << "/" << static_cast<int>(param.subsampling_y)
+            << ", border: " << param.border;
+}
+
+class RefCountedBufferReallocTest
+    : public testing::TestWithParam<std::tuple<bool, Params>> {
+ protected:
+  const bool use_external_callbacks_ = std::get<0>(GetParam());
+  const Params& param_ = std::get<1>(GetParam());
+};
+
+TEST_P(RefCountedBufferReallocTest, 8Bit) {
+  InternalFrameBufferList buffer_list;
+  FrameBufferSizeChangedCallback on_frame_buffer_size_changed = nullptr;
+  GetFrameBufferCallback get_frame_buffer = nullptr;
+  ReleaseFrameBufferCallback release_frame_buffer = nullptr;
+  void* callback_private_data = nullptr;
+  if (use_external_callbacks_) {
+    on_frame_buffer_size_changed = OnInternalFrameBufferSizeChanged;
+    get_frame_buffer = GetInternalFrameBuffer;
+    release_frame_buffer = ReleaseInternalFrameBuffer;
+    callback_private_data = &buffer_list;
+  }
+
+  BufferPool buffer_pool(on_frame_buffer_size_changed, get_frame_buffer,
+                         release_frame_buffer, callback_private_data);
+
+  RefCountedBufferPtr buffer_ptr = buffer_pool.GetFreeBuffer();
+  EXPECT_NE(buffer_ptr, nullptr);
+
+  const Libgav1ImageFormat image_format = ComposeImageFormat(
+      /*is_monochrome=*/false, param_.subsampling_x, param_.subsampling_y);
+  EXPECT_TRUE(buffer_pool.OnFrameBufferSizeChanged(
+      /*bitdepth=*/8, image_format, param_.width, param_.height,
+      param_.border, param_.border, param_.border, param_.border));
+
+  EXPECT_TRUE(buffer_ptr->Realloc(
+      /*bitdepth=*/8, /*is_monochrome=*/false, param_.width, param_.height,
+      param_.subsampling_x, param_.subsampling_y, param_.border,
+      param_.border, param_.border, param_.border));
+
+  // The first row of each plane is aligned at 16-byte boundaries.
+  EXPECT_EQ(
+      reinterpret_cast<uintptr_t>(buffer_ptr->buffer()->data(kPlaneY)) % 16,
+      0);
+  EXPECT_EQ(
+      reinterpret_cast<uintptr_t>(buffer_ptr->buffer()->data(kPlaneU)) % 16,
+      0);
+  EXPECT_EQ(
+      reinterpret_cast<uintptr_t>(buffer_ptr->buffer()->data(kPlaneV)) % 16,
+      0);
+
+  // Subsequent rows are aligned at 16-byte boundaries.
+  EXPECT_EQ(buffer_ptr->buffer()->stride(kPlaneY) % 16, 0);
+  EXPECT_EQ(buffer_ptr->buffer()->stride(kPlaneU) % 16, 0);
+  EXPECT_EQ(buffer_ptr->buffer()->stride(kPlaneV) % 16, 0);
+
+  // Check the borders.
+  EXPECT_EQ(buffer_ptr->buffer()->left_border(kPlaneY), param_.border);
+  EXPECT_EQ(buffer_ptr->buffer()->right_border(kPlaneY), param_.border);
+  EXPECT_EQ(buffer_ptr->buffer()->top_border(kPlaneY), param_.border);
+  EXPECT_EQ(buffer_ptr->buffer()->bottom_border(kPlaneY), param_.border);
+  EXPECT_EQ(buffer_ptr->buffer()->left_border(kPlaneU),
+            param_.border >> param_.subsampling_x);
+  EXPECT_EQ(buffer_ptr->buffer()->right_border(kPlaneU),
+            param_.border >> param_.subsampling_x);
+  EXPECT_EQ(buffer_ptr->buffer()->top_border(kPlaneU),
+            param_.border >> param_.subsampling_y);
+  EXPECT_EQ(buffer_ptr->buffer()->bottom_border(kPlaneU),
+            param_.border >> param_.subsampling_y);
+  EXPECT_EQ(buffer_ptr->buffer()->left_border(kPlaneV),
+            param_.border >> param_.subsampling_x);
+  EXPECT_EQ(buffer_ptr->buffer()->right_border(kPlaneV),
+            param_.border >> param_.subsampling_x);
+  EXPECT_EQ(buffer_ptr->buffer()->top_border(kPlaneV),
+            param_.border >> param_.subsampling_y);
+  EXPECT_EQ(buffer_ptr->buffer()->bottom_border(kPlaneV),
+            param_.border >> param_.subsampling_y);
+
+  // Write to the upper-left corner of the border.
+  uint8_t* y_buffer = buffer_ptr->buffer()->data(kPlaneY);
+  int y_stride = buffer_ptr->buffer()->stride(kPlaneY);
+  y_buffer[-buffer_ptr->buffer()->left_border(kPlaneY) -
+           buffer_ptr->buffer()->top_border(kPlaneY) * y_stride] = 0;
+  // Write to the lower-right corner of the border.
+  uint8_t* v_buffer = buffer_ptr->buffer()->data(kPlaneV);
+  int v_stride = buffer_ptr->buffer()->stride(kPlaneV);
+  v_buffer[(buffer_ptr->buffer()->height(kPlaneV) +
+            buffer_ptr->buffer()->bottom_border(kPlaneV) - 1) *
+               v_stride +
+           buffer_ptr->buffer()->width(kPlaneV) +
+           buffer_ptr->buffer()->right_border(kPlaneV) - 1] = 0;
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+TEST_P(RefCountedBufferReallocTest, 10Bit) {
+  InternalFrameBufferList buffer_list;
+  FrameBufferSizeChangedCallback on_frame_buffer_size_changed = nullptr;
+  GetFrameBufferCallback get_frame_buffer = nullptr;
+  ReleaseFrameBufferCallback release_frame_buffer = nullptr;
+  void* callback_private_data = nullptr;
+  if (use_external_callbacks_) {
+    on_frame_buffer_size_changed = OnInternalFrameBufferSizeChanged;
+    get_frame_buffer = GetInternalFrameBuffer;
+    release_frame_buffer = ReleaseInternalFrameBuffer;
+    callback_private_data = &buffer_list;
+  }
+
+  BufferPool buffer_pool(on_frame_buffer_size_changed, get_frame_buffer,
+                         release_frame_buffer, callback_private_data);
+
+  RefCountedBufferPtr buffer_ptr = buffer_pool.GetFreeBuffer();
+  EXPECT_NE(buffer_ptr, nullptr);
+
+  const Libgav1ImageFormat image_format = ComposeImageFormat(
+      /*is_monochrome=*/false, param_.subsampling_x, param_.subsampling_y);
+  EXPECT_TRUE(buffer_pool.OnFrameBufferSizeChanged(
+      /*bitdepth=*/10, image_format, param_.width, param_.height,
+      param_.border, param_.border, param_.border, param_.border));
+
+  EXPECT_TRUE(buffer_ptr->Realloc(
+      /*bitdepth=*/10, /*is_monochrome=*/false, param_.width, param_.height,
+      param_.subsampling_x, param_.subsampling_y, param_.border,
+      param_.border, param_.border, param_.border));
+
+  // The first row of each plane is aligned at 16-byte boundaries.
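+  // (Pointers are converted to uintptr_t below because operator% is not
+  // defined for pointer types.)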
+  EXPECT_EQ(
+      reinterpret_cast<uintptr_t>(buffer_ptr->buffer()->data(kPlaneY)) % 16,
+      0);
+  EXPECT_EQ(
+      reinterpret_cast<uintptr_t>(buffer_ptr->buffer()->data(kPlaneU)) % 16,
+      0);
+  EXPECT_EQ(
+      reinterpret_cast<uintptr_t>(buffer_ptr->buffer()->data(kPlaneV)) % 16,
+      0);
+
+  // Subsequent rows are aligned at 16-byte boundaries.
+  EXPECT_EQ(buffer_ptr->buffer()->stride(kPlaneY) % 16, 0);
+  EXPECT_EQ(buffer_ptr->buffer()->stride(kPlaneU) % 16, 0);
+  EXPECT_EQ(buffer_ptr->buffer()->stride(kPlaneV) % 16, 0);
+
+  // Check the borders.
+  EXPECT_EQ(buffer_ptr->buffer()->left_border(kPlaneY), param_.border);
+  EXPECT_EQ(buffer_ptr->buffer()->right_border(kPlaneY), param_.border);
+  EXPECT_EQ(buffer_ptr->buffer()->top_border(kPlaneY), param_.border);
+  EXPECT_EQ(buffer_ptr->buffer()->bottom_border(kPlaneY), param_.border);
+  EXPECT_EQ(buffer_ptr->buffer()->left_border(kPlaneU),
+            param_.border >> param_.subsampling_x);
+  EXPECT_EQ(buffer_ptr->buffer()->right_border(kPlaneU),
+            param_.border >> param_.subsampling_x);
+  EXPECT_EQ(buffer_ptr->buffer()->top_border(kPlaneU),
+            param_.border >> param_.subsampling_y);
+  EXPECT_EQ(buffer_ptr->buffer()->bottom_border(kPlaneU),
+            param_.border >> param_.subsampling_y);
+  EXPECT_EQ(buffer_ptr->buffer()->left_border(kPlaneV),
+            param_.border >> param_.subsampling_x);
+  EXPECT_EQ(buffer_ptr->buffer()->right_border(kPlaneV),
+            param_.border >> param_.subsampling_x);
+  EXPECT_EQ(buffer_ptr->buffer()->top_border(kPlaneV),
+            param_.border >> param_.subsampling_y);
+  EXPECT_EQ(buffer_ptr->buffer()->bottom_border(kPlaneV),
+            param_.border >> param_.subsampling_y);
+
+  // Write to the upper-left corner of the border.
+  auto* y_buffer =
+      reinterpret_cast<uint16_t*>(buffer_ptr->buffer()->data(kPlaneY));
+  int y_stride = buffer_ptr->buffer()->stride(kPlaneY) / sizeof(uint16_t);
+  y_buffer[-buffer_ptr->buffer()->left_border(kPlaneY) -
+           buffer_ptr->buffer()->top_border(kPlaneY) * y_stride] = 0;
+  // Write to the lower-right corner of the border.
+  auto* v_buffer =
+      reinterpret_cast<uint16_t*>(buffer_ptr->buffer()->data(kPlaneV));
+  int v_stride = buffer_ptr->buffer()->stride(kPlaneV) / sizeof(uint16_t);
+  v_buffer[(buffer_ptr->buffer()->height(kPlaneV) +
+            buffer_ptr->buffer()->bottom_border(kPlaneV) - 1) *
+               v_stride +
+           buffer_ptr->buffer()->width(kPlaneV) +
+           buffer_ptr->buffer()->right_border(kPlaneV) - 1] = 0;
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+INSTANTIATE_TEST_SUITE_P(
+    Default, RefCountedBufferReallocTest,
+    testing::Combine(testing::Bool(),  // use_external_callbacks
+                     testing::ValuesIn(kParams)));
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/c_decoder_test.c b/src/c_decoder_test.c
new file mode 100644
index 0000000..10ef29f
--- /dev/null
+++ b/src/c_decoder_test.c
@@ -0,0 +1,440 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __cplusplus
+#error Do not compile this file with a C++ compiler
+#endif
+
+// clang-format off
+#include "src/gav1/decoder.h"
+// clang-format on
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define ASSERT_EQ(a, b)                                                      \
+  do {                                                                       \
+    if ((a) != (b)) {                                                        \
+      fprintf(stderr, "Assertion failure: (%s) == (%s), at %s:%d\n", #a, #b, \
+              __FILE__, __LINE__);                                           \
+      fprintf(stderr, "C DecoderTest failed\n");                             \
+      exit(1);                                                               \
+    }                                                                        \
+  } while (0)
+
+#define ASSERT_NE(a, b)                                                      \
+  do {                                                                       \
+    if ((a) == (b)) {                                                        \
+      fprintf(stderr, "Assertion failure: (%s) != (%s), at %s:%d\n", #a, #b, \
+              __FILE__, __LINE__);                                           \
+      fprintf(stderr, "C DecoderTest failed\n");                             \
+      exit(1);                                                               \
+    }                                                                        \
+  } while (0)
+
+#define ASSERT_TRUE(a)                                                       \
+  do {                                                                       \
+    if (!(a)) {                                                              \
+      fprintf(stderr, "Assertion failure: %s, at %s:%d\n", #a, __FILE__,     \
+              __LINE__);                                                     \
+      fprintf(stderr, "C DecoderTest failed\n");                             \
+      exit(1);                                                               \
+    }                                                                        \
+  } while (0)
+
+#define ASSERT_FALSE(a)                                                      \
+  do {                                                                       \
+    if (a) {                                                                 \
+      fprintf(stderr, "Assertion failure: !(%s), at %s:%d\n", #a, __FILE__,  \
+              __LINE__);                                                     \
+      fprintf(stderr, "C DecoderTest failed\n");                             \
+      exit(1);                                                               \
+    }                                                                        \
+  } while (0)
+
+// These two frames come from the libaom test vector av1-1-b8-01-size-32x32.ivf
+static const uint8_t kFrame1[] = {
+    0x12, 0x0, 0xa, 0xa, 0x0, 0x0, 0x0, 0x2, 0x27, 0xfe, 0xff, 0xfc,
+    0xc0, 0x20, 0x32, 0x93, 0x2, 0x10, 0x0, 0xa8, 0x80, 0x0, 0x3, 0x0,
+    0x10, 0x10, 0x30, 0x0, 0xd3, 0xc6, 0xc6, 0x82, 0xaa, 0x5e, 0xbf, 0x82,
+    0xf2, 0xa4, 0xa4, 0x29, 0xab, 0xda, 0xd7, 0x1, 0x5, 0x0, 0xb3, 0xde,
+    0xa8, 0x6f, 0x8d, 0xbf, 0x1b, 0xa8, 0x25, 0xc3, 0x84, 0x7c, 0x1a, 0x2b,
+    0x8b, 0x0, 0xff, 0x19, 0x1f, 0x45, 0x7e, 0xe0, 0xbe, 0xe1, 0x3a, 0x63,
+    0xc2, 0xc6, 0x6e, 0xf4, 0xc8, 0xce, 0x11, 0xe1, 0x9f, 0x48, 0x64, 0x72,
+    0xeb, 0xbb, 0x4f, 0xf3, 0x94, 0xb4, 0xb6, 0x9d, 0x4f, 0x4, 0x18, 0x5e,
+    0x5e, 0x1b, 0x65, 0x49, 0x74, 0x90, 0x13, 0x50, 0xef, 0x8c, 0xb8, 0xe8,
+    0xd9, 0x8e, 0x9c, 0xc9, 0x4d, 0xda, 0x60, 0x6a, 0xa, 0xf9, 0x75, 0xd0,
+    0x62, 0x69, 0xd, 0xf5, 0xdc, 0xa9, 0xb9, 0x4c, 0x8, 0x9e, 0x33, 0x15,
+    0xa3, 0xe1, 0x42, 0x0, 0xe2, 0xb0, 0x46, 0xd0, 0xf7, 0xad, 0x55, 0xbc,
+    0x75, 0xe9, 0xe3, 0x1f, 0xa3, 0x41, 0x11, 0xba, 0xaa, 0x81, 0xf3, 0xcb,
+    0x82, 0x87, 0x71, 0x0, 0xe6, 0xb9, 0x8c, 0xe1, 0xe9, 0xd3, 0x21, 0xcc,
+    0xcd, 0xe7, 0x12, 0xb9, 0xe, 0x43, 0x6a, 0xa3, 0x76, 0x5c, 0x35, 0x90,
+    0x45, 0x36, 0x52, 0xb4, 0x2d, 0xa3, 0x55, 0xde, 0x20, 0xf8, 0x80, 0xe1,
+    0x26, 0x46, 0x1b, 0x3f, 0x59, 0xc7, 0x2e, 0x5b, 0x4a, 0x73, 0xf8, 0xb3,
+    0xf4, 0x62, 0xf4, 0xf5, 0xa4, 0xc2, 0xae, 0x9e, 0xa6, 0x9c, 0x10, 0xbb,
+    0xe1, 0xd6, 0x88, 0x75, 0xb9, 0x85, 0x48, 0xe5, 0x7, 0x12, 0xf3, 0x11,
+    0x85, 0x8e, 0xa2, 0x95, 0x9d, 0xed, 0x50, 0xfb, 0x6, 0x5a, 0x1, 0x37,
+    0xc4, 0x8e, 0x9e, 0x73, 0x9b, 0x96, 0x64, 0xbd, 0x42, 0xb, 0x80, 0xde,
+    0x57, 0x86, 0xcb, 0x7d, 0xab, 0x12, 0xb2, 0xcc, 0xe6, 0xea, 0xb5, 0x89,
+    0xeb, 0x91, 0xb3, 0x93, 0xb2, 0x4f, 0x2f, 0x5b, 0xf3, 0x72, 0x12, 0x51,
+    0x56, 0x75, 0xb3, 0xdd, 0x49, 0xb6, 0x5b, 0x77, 0xbe, 0xc5, 0xd7, 0xd4,
+    0xaf, 0xd6, 0x6b, 0x38};
+
+static const uint8_t kFrame2[] = {
+    0x12, 0x0, 0x32, 0x33, 0x30, 0x3, 0xc3, 0x0, 0xa7, 0x2e, 0x46,
+    0xa8, 0x80, 0x0, 0x3, 0x0, 0x10, 0x1, 0x0, 0xa0, 0x0, 0xed,
+    0xb1, 0x51, 0x15, 0x58, 0xc7, 0x69, 0x3, 0x26, 0x35, 0xeb, 0x5a,
+    0x2d, 0x7a, 0x53, 0x24, 0x26, 0x20, 0xa6, 0x11, 0x7, 0x49, 0x76,
+    0xa3, 0xc7, 0x62, 0xf8, 0x3, 0x32, 0xb0, 0x98, 0x17, 0x3d, 0x80};
+
+typedef struct DecoderTest {
+  Libgav1Decoder* decoder;
+  int frames_in_use;
+  void* buffer_private_data;
+  void* released_input_buffer;
+} DecoderTest;
+
+static void
DecoderTestInit(DecoderTest* test) { + test->decoder = NULL; + test->frames_in_use = 0; + test->buffer_private_data = NULL; + test->released_input_buffer = NULL; +} + +static void DecoderTestIncrementFramesInUse(DecoderTest* test) { + ++test->frames_in_use; +} + +static void DecoderTestDecrementFramesInUse(DecoderTest* test) { + --test->frames_in_use; +} + +static void DecoderTestSetReleasedInputBuffer(DecoderTest* test, + void* released_input_buffer) { + test->released_input_buffer = released_input_buffer; +} + +static void DecoderTestSetBufferPrivateData(DecoderTest* test, + void* buffer_private_data) { + test->buffer_private_data = buffer_private_data; +} + +typedef struct FrameBufferPrivate { + uint8_t* data[3]; +} FrameBufferPrivate; + +static Libgav1StatusCode GetFrameBuffer( + void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format, + int width, int height, int left_border, int right_border, int top_border, + int bottom_border, int stride_alignment, Libgav1FrameBuffer* frame_buffer) { + Libgav1FrameBufferInfo info; + Libgav1StatusCode status = Libgav1ComputeFrameBufferInfo( + bitdepth, image_format, width, height, left_border, right_border, + top_border, bottom_border, stride_alignment, &info); + if (status != kLibgav1StatusOk) return status; + + FrameBufferPrivate* buffer_private = + (FrameBufferPrivate*)malloc(sizeof(FrameBufferPrivate)); + if (buffer_private == NULL) return kLibgav1StatusOutOfMemory; + + for (int i = 0; i < 3; ++i) { + const size_t size = (i == 0) ? info.y_buffer_size : info.uv_buffer_size; + buffer_private->data[i] = (uint8_t*)malloc(sizeof(uint8_t) * size); + if (buffer_private->data[i] == NULL) { + for (int j = 0; j < i; j++) { + free(buffer_private->data[j]); + } + free(buffer_private); + return kLibgav1StatusOutOfMemory; + } + } + + uint8_t* const y_buffer = buffer_private->data[0]; + uint8_t* const u_buffer = + (info.uv_buffer_size != 0) ? buffer_private->data[1] : NULL; + uint8_t* const v_buffer = + (info.uv_buffer_size != 0) ? 
buffer_private->data[2] : NULL; + + status = Libgav1SetFrameBuffer(&info, y_buffer, u_buffer, v_buffer, + buffer_private, frame_buffer); + if (status != kLibgav1StatusOk) return status; + + DecoderTest* const decoder_test = (DecoderTest*)callback_private_data; + DecoderTestIncrementFramesInUse(decoder_test); + DecoderTestSetBufferPrivateData(decoder_test, frame_buffer->private_data); + return kLibgav1StatusOk; +} + +static void ReleaseFrameBuffer(void* callback_private_data, + void* buffer_private_data) { + FrameBufferPrivate* buffer_private = (FrameBufferPrivate*)buffer_private_data; + for (int i = 0; i < 3; ++i) { + free(buffer_private->data[i]); + } + free(buffer_private); + DecoderTest* const decoder_test = (DecoderTest*)callback_private_data; + DecoderTestDecrementFramesInUse(decoder_test); +} + +static void ReleaseInputBuffer(void* private_data, void* input_buffer) { + DecoderTestSetReleasedInputBuffer((DecoderTest*)private_data, input_buffer); +} + +static void DecoderTestSetUp(DecoderTest* test) { + Libgav1DecoderSettings settings; + Libgav1DecoderSettingsInitDefault(&settings); + settings.frame_parallel = 0; // false + settings.get_frame_buffer = GetFrameBuffer; + settings.release_frame_buffer = ReleaseFrameBuffer; + settings.callback_private_data = test; + settings.release_input_buffer = ReleaseInputBuffer; + ASSERT_EQ(test->decoder, NULL); + ASSERT_EQ(Libgav1DecoderCreate(&settings, &test->decoder), kLibgav1StatusOk); + ASSERT_NE(test->decoder, NULL); +} + +static void DecoderTestAPIFlowForNonFrameParallelMode(void) { + DecoderTest test; + DecoderTestInit(&test); + DecoderTestSetUp(&test); + + Libgav1StatusCode status; + const Libgav1DecoderBuffer* buffer; + + // Enqueue frame1 for decoding. + status = Libgav1DecoderEnqueueFrame(test.decoder, kFrame1, sizeof(kFrame1), 0, + (uint8_t*)&kFrame1); + ASSERT_EQ(status, kLibgav1StatusOk); + + // In non-frame-parallel mode, decoding happens only in the DequeueFrame call. + // So there should be no frames in use yet. + ASSERT_EQ(test.frames_in_use, 0); + + // Dequeue the output of frame1. + status = Libgav1DecoderDequeueFrame(test.decoder, &buffer); + ASSERT_EQ(status, kLibgav1StatusOk); + ASSERT_NE(buffer, NULL); + ASSERT_EQ(test.released_input_buffer, &kFrame1); + + // libgav1 has decoded frame1 and is holding a reference to it. + ASSERT_EQ(test.frames_in_use, 1); + ASSERT_EQ(test.buffer_private_data, buffer->buffer_private_data); + + // Enqueue frame2 for decoding. + status = Libgav1DecoderEnqueueFrame(test.decoder, kFrame2, sizeof(kFrame2), 0, + (uint8_t*)&kFrame2); + ASSERT_EQ(status, kLibgav1StatusOk); + + ASSERT_EQ(test.frames_in_use, 1); + + // Dequeue the output of frame2. + status = Libgav1DecoderDequeueFrame(test.decoder, &buffer); + ASSERT_EQ(status, kLibgav1StatusOk); + ASSERT_NE(buffer, NULL); + ASSERT_EQ(test.released_input_buffer, &kFrame2); + + ASSERT_EQ(test.frames_in_use, 2); + ASSERT_EQ(test.buffer_private_data, buffer->buffer_private_data); + + // Signal end of stream (method 1). This should ensure that all the references + // are released. + status = Libgav1DecoderSignalEOS(test.decoder); + + // libgav1 should have released all the reference frames now. + ASSERT_EQ(test.frames_in_use, 0); + + // Now, the decoder is ready to accept a new coded video sequence. + + // Enqueue frame1 for decoding. + status = Libgav1DecoderEnqueueFrame(test.decoder, kFrame1, sizeof(kFrame1), 0, + (uint8_t*)&kFrame1); + ASSERT_EQ(status, kLibgav1StatusOk); + + ASSERT_EQ(test.frames_in_use, 0); + + // Dequeue the output of frame1. 
+ status = Libgav1DecoderDequeueFrame(test.decoder, &buffer); + ASSERT_EQ(status, kLibgav1StatusOk); + ASSERT_NE(buffer, NULL); + ASSERT_EQ(test.released_input_buffer, &kFrame1); + + ASSERT_EQ(test.frames_in_use, 1); + ASSERT_EQ(test.buffer_private_data, buffer->buffer_private_data); + + // Enqueue frame2 for decoding. + status = Libgav1DecoderEnqueueFrame(test.decoder, kFrame2, sizeof(kFrame2), 0, + (uint8_t*)&kFrame2); + ASSERT_EQ(status, kLibgav1StatusOk); + + ASSERT_EQ(test.frames_in_use, 1); + + // Dequeue the output of frame2. + status = Libgav1DecoderDequeueFrame(test.decoder, &buffer); + ASSERT_EQ(status, kLibgav1StatusOk); + ASSERT_NE(buffer, NULL); + ASSERT_EQ(test.released_input_buffer, &kFrame2); + + ASSERT_EQ(test.frames_in_use, 2); + ASSERT_EQ(test.buffer_private_data, buffer->buffer_private_data); + + // Signal end of stream (method 2). This should ensure that all the references + // are released. + Libgav1DecoderDestroy(test.decoder); + test.decoder = NULL; + + // libgav1 should have released all the frames now. + ASSERT_EQ(test.frames_in_use, 0); +} + +static void +DecoderTestNonFrameParallelModeEnqueueMultipleFramesWithoutDequeuing(void) { + DecoderTest test; + DecoderTestInit(&test); + DecoderTestSetUp(&test); + + Libgav1StatusCode status; + const Libgav1DecoderBuffer* buffer; + + // Enqueue frame1 for decoding. + status = Libgav1DecoderEnqueueFrame(test.decoder, kFrame1, sizeof(kFrame1), 0, + (uint8_t*)&kFrame1); + ASSERT_EQ(status, kLibgav1StatusOk); + + // Until the output of frame1 is dequeued, no other frames can be enqueued. + status = Libgav1DecoderEnqueueFrame(test.decoder, kFrame2, sizeof(kFrame2), 0, + (uint8_t*)&kFrame2); + ASSERT_EQ(status, kLibgav1StatusTryAgain); + + ASSERT_EQ(test.frames_in_use, 0); + + // Dequeue the output of frame1. + status = Libgav1DecoderDequeueFrame(test.decoder, &buffer); + ASSERT_EQ(status, kLibgav1StatusOk); + ASSERT_NE(buffer, NULL); + ASSERT_EQ(test.released_input_buffer, &kFrame1); + + ASSERT_EQ(test.frames_in_use, 1); + + // Delete the decoder instance. + Libgav1DecoderDestroy(test.decoder); + test.decoder = NULL; + + ASSERT_EQ(test.frames_in_use, 0); +} + +static void DecoderTestNonFrameParallelModeEOSBeforeDequeuingLastFrame(void) { + DecoderTest test; + DecoderTestInit(&test); + DecoderTestSetUp(&test); + + Libgav1StatusCode status; + const Libgav1DecoderBuffer* buffer; + + // Enqueue frame1 for decoding. + status = Libgav1DecoderEnqueueFrame(test.decoder, kFrame1, sizeof(kFrame1), 0, + (uint8_t*)&kFrame1); + ASSERT_EQ(status, kLibgav1StatusOk); + + ASSERT_EQ(test.frames_in_use, 0); + + // Dequeue the output of frame1. + status = Libgav1DecoderDequeueFrame(test.decoder, &buffer); + ASSERT_EQ(status, kLibgav1StatusOk); + ASSERT_NE(buffer, NULL); + ASSERT_EQ(test.released_input_buffer, &kFrame1); + + // Enqueue frame2 for decoding. + status = Libgav1DecoderEnqueueFrame(test.decoder, kFrame2, sizeof(kFrame2), 0, + (uint8_t*)&kFrame2); + ASSERT_EQ(status, kLibgav1StatusOk); + + ASSERT_EQ(test.frames_in_use, 1); + + // Signal end of stream before dequeuing the output of frame2. + status = Libgav1DecoderSignalEOS(test.decoder); + ASSERT_EQ(status, kLibgav1StatusOk); + + // In this case, the output of the last frame that was enqueued is lost (which + // is intentional since end of stream was signaled without dequeueing it). 
+ ASSERT_EQ(test.frames_in_use, 0); + + Libgav1DecoderDestroy(test.decoder); + test.decoder = NULL; +} + +static void DecoderTestNonFrameParallelModeInvalidFrameAfterEOS(void) { + DecoderTest test; + DecoderTestInit(&test); + DecoderTestSetUp(&test); + + Libgav1StatusCode status; + const Libgav1DecoderBuffer* buffer = NULL; + + // Enqueue frame1 for decoding. + status = Libgav1DecoderEnqueueFrame(test.decoder, kFrame1, sizeof(kFrame1), 0, + (uint8_t*)&kFrame1); + ASSERT_EQ(status, kLibgav1StatusOk); + + ASSERT_EQ(test.frames_in_use, 0); + + // Dequeue the output of frame1. + status = Libgav1DecoderDequeueFrame(test.decoder, &buffer); + ASSERT_EQ(status, kLibgav1StatusOk); + ASSERT_NE(buffer, NULL); + ASSERT_EQ(test.released_input_buffer, &kFrame1); + + ASSERT_EQ(test.frames_in_use, 1); + + // Signal end of stream. + status = Libgav1DecoderSignalEOS(test.decoder); + + // libgav1 should have released all the reference frames now. + ASSERT_EQ(test.frames_in_use, 0); + + // Now, the decoder is ready to accept a new coded video sequence. But, we + // try to enqueue a frame that does not have a sequence header (which is not + // allowed). + + // Enqueue frame2 for decoding. + status = Libgav1DecoderEnqueueFrame(test.decoder, kFrame2, sizeof(kFrame2), 0, + (uint8_t*)&kFrame2); + ASSERT_EQ(status, kLibgav1StatusOk); + + ASSERT_EQ(test.frames_in_use, 0); + + // Dequeue the output of frame2 (this will fail since no sequence header has + // been seen since the last EOS signal). + status = Libgav1DecoderDequeueFrame(test.decoder, &buffer); + ASSERT_EQ(status, kLibgav1StatusBitstreamError); + ASSERT_EQ(test.released_input_buffer, &kFrame2); + + ASSERT_EQ(test.frames_in_use, 0); + + Libgav1DecoderDestroy(test.decoder); + test.decoder = NULL; +} + +int main(void) { + fprintf(stderr, "C DecoderTest started\n"); + DecoderTestAPIFlowForNonFrameParallelMode(); + DecoderTestNonFrameParallelModeEnqueueMultipleFramesWithoutDequeuing(); + DecoderTestNonFrameParallelModeEOSBeforeDequeuingLastFrame(); + DecoderTestNonFrameParallelModeInvalidFrameAfterEOS(); + fprintf(stderr, "C DecoderTest passed\n"); + return 0; +} diff --git a/src/c_version_test.c b/src/c_version_test.c new file mode 100644 index 0000000..e198ee7 --- /dev/null +++ b/src/c_version_test.c @@ -0,0 +1,102 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifdef __cplusplus
+#error Do not compile this file with a C++ compiler
+#endif
+
+// clang-format off
+#include "src/gav1/version.h"
+// clang-format on
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define ASSERT_EQ(a, b)                                                      \
+  do {                                                                       \
+    if ((a) != (b)) {                                                        \
+      fprintf(stderr, "Assertion failure: (%s) == (%s), at %s:%d\n", #a, #b, \
+              __FILE__, __LINE__);                                           \
+      fprintf(stderr, "C VersionTest failed\n");                             \
+      exit(1);                                                               \
+    }                                                                        \
+  } while (0)
+
+#define ASSERT_NE(a, b)                                                      \
+  do {                                                                       \
+    if ((a) == (b)) {                                                        \
+      fprintf(stderr, "Assertion failure: (%s) != (%s), at %s:%d\n", #a, #b, \
+              __FILE__, __LINE__);                                           \
+      fprintf(stderr, "C VersionTest failed\n");                             \
+      exit(1);                                                               \
+    }                                                                        \
+  } while (0)
+
+#define ASSERT_TRUE(a)                                                       \
+  do {                                                                       \
+    if (!(a)) {                                                              \
+      fprintf(stderr, "Assertion failure: %s, at %s:%d\n", #a, __FILE__,     \
+              __LINE__);                                                     \
+      fprintf(stderr, "C VersionTest failed\n");                             \
+      exit(1);                                                               \
+    }                                                                        \
+  } while (0)
+
+#define ASSERT_FALSE(a)                                                      \
+  do {                                                                       \
+    if (a) {                                                                 \
+      fprintf(stderr, "Assertion failure: !(%s), at %s:%d\n", #a, __FILE__,  \
+              __LINE__);                                                     \
+      fprintf(stderr, "C VersionTest failed\n");                             \
+      exit(1);                                                               \
+    }                                                                        \
+  } while (0)
+
+static void VersionTestGetVersion(void) {
+  const int library_version = Libgav1GetVersion();
+  ASSERT_EQ((library_version >> 24) & 0xff, 0);
+  // Note if we link against a shared object there's potential for a mismatch
+  // if a different library is loaded at runtime.
+  ASSERT_EQ((library_version >> 16) & 0xff, LIBGAV1_MAJOR_VERSION);
+  ASSERT_EQ((library_version >> 8) & 0xff, LIBGAV1_MINOR_VERSION);
+  ASSERT_EQ(library_version & 0xff, LIBGAV1_PATCH_VERSION);
+
+  const int header_version = LIBGAV1_VERSION;
+  ASSERT_EQ((header_version >> 24) & 0xff, 0);
+  ASSERT_EQ((header_version >> 16) & 0xff, LIBGAV1_MAJOR_VERSION);
+  ASSERT_EQ((header_version >> 8) & 0xff, LIBGAV1_MINOR_VERSION);
+  ASSERT_EQ(header_version & 0xff, LIBGAV1_PATCH_VERSION);
+}
+
+static void VersionTestGetVersionString(void) {
+  const char* version = Libgav1GetVersionString();
+  ASSERT_NE(version, NULL);
+}
+
+static void VersionTestGetBuildConfiguration(void) {
+  const char* config = Libgav1GetBuildConfiguration();
+  ASSERT_NE(config, NULL);
+}
+
+int main(void) {
+  fprintf(stderr, "C VersionTest started\n");
+  VersionTestGetVersion();
+  VersionTestGetVersionString();
+  VersionTestGetBuildConfiguration();
+  fprintf(stderr, "C VersionTest passed\n");
+  return 0;
+}
diff --git a/src/decoder.cc b/src/decoder.cc
new file mode 100644
index 0000000..b9e43e0
--- /dev/null
+++ b/src/decoder.cc
@@ -0,0 +1,119 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
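+
+// A sketch of the typical (non-frame-parallel) call sequence for the C API
+// defined in this file; for a complete, runnable version see
+// src/c_decoder_test.c:
+//
+//   Libgav1DecoderSettings settings;
+//   Libgav1DecoderSettingsInitDefault(&settings);
+//   Libgav1Decoder* decoder;
+//   if (Libgav1DecoderCreate(&settings, &decoder) != kLibgav1StatusOk) ...
+//   Libgav1DecoderEnqueueFrame(decoder, data, size, /*user_private_data=*/0,
+//                              /*buffer_private_data=*/NULL);
+//   const Libgav1DecoderBuffer* buffer;
+//   Libgav1DecoderDequeueFrame(decoder, &buffer);
+//   Libgav1DecoderSignalEOS(decoder);  // releases all reference frames
+//   Libgav1DecoderDestroy(decoder);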
+
+#include "src/gav1/decoder.h"
+
+#include <memory>
+#include <new>
+
+#include "src/decoder_impl.h"
+
+extern "C" {
+
+Libgav1StatusCode Libgav1DecoderCreate(const Libgav1DecoderSettings* settings,
+                                       Libgav1Decoder** decoder_out) {
+  std::unique_ptr<libgav1::Decoder> cxx_decoder(new (std::nothrow)
+                                                    libgav1::Decoder());
+  if (cxx_decoder == nullptr) return kLibgav1StatusOutOfMemory;
+
+  libgav1::DecoderSettings cxx_settings;
+  cxx_settings.threads = settings->threads;
+  cxx_settings.frame_parallel = settings->frame_parallel != 0;
+  cxx_settings.blocking_dequeue = settings->blocking_dequeue != 0;
+  cxx_settings.on_frame_buffer_size_changed =
+      settings->on_frame_buffer_size_changed;
+  cxx_settings.get_frame_buffer = settings->get_frame_buffer;
+  cxx_settings.release_frame_buffer = settings->release_frame_buffer;
+  cxx_settings.release_input_buffer = settings->release_input_buffer;
+  cxx_settings.callback_private_data = settings->callback_private_data;
+  cxx_settings.output_all_layers = settings->output_all_layers != 0;
+  cxx_settings.operating_point = settings->operating_point;
+  cxx_settings.post_filter_mask = settings->post_filter_mask;
+
+  const Libgav1StatusCode status = cxx_decoder->Init(&cxx_settings);
+  if (status == kLibgav1StatusOk) {
+    *decoder_out = reinterpret_cast<Libgav1Decoder*>(cxx_decoder.release());
+  }
+  return status;
+}
+
+void Libgav1DecoderDestroy(Libgav1Decoder* decoder) {
+  auto* cxx_decoder = reinterpret_cast<libgav1::Decoder*>(decoder);
+  delete cxx_decoder;
+}
+
+Libgav1StatusCode Libgav1DecoderEnqueueFrame(Libgav1Decoder* decoder,
+                                             const uint8_t* data, size_t size,
+                                             int64_t user_private_data,
+                                             void* buffer_private_data) {
+  auto* cxx_decoder = reinterpret_cast<libgav1::Decoder*>(decoder);
+  return cxx_decoder->EnqueueFrame(data, size, user_private_data,
+                                   buffer_private_data);
+}
+
+Libgav1StatusCode Libgav1DecoderDequeueFrame(
+    Libgav1Decoder* decoder, const Libgav1DecoderBuffer** out_ptr) {
+  auto* cxx_decoder = reinterpret_cast<libgav1::Decoder*>(decoder);
+  return cxx_decoder->DequeueFrame(out_ptr);
+}
+
+Libgav1StatusCode Libgav1DecoderSignalEOS(Libgav1Decoder* decoder) {
+  auto* cxx_decoder = reinterpret_cast<libgav1::Decoder*>(decoder);
+  return cxx_decoder->SignalEOS();
+}
+
+int Libgav1DecoderGetMaxBitdepth() {
+  return libgav1::Decoder::GetMaxBitdepth();
+}
+
+}  // extern "C"
+
+namespace libgav1 {
+
+Decoder::Decoder() = default;
+
+Decoder::~Decoder() = default;
+
+StatusCode Decoder::Init(const DecoderSettings* const settings) {
+  if (impl_ != nullptr) return kStatusAlready;
+  if (settings != nullptr) settings_ = *settings;
+  return DecoderImpl::Create(&settings_, &impl_);
+}
+
+StatusCode Decoder::EnqueueFrame(const uint8_t* data, const size_t size,
+                                 int64_t user_private_data,
+                                 void* buffer_private_data) {
+  if (impl_ == nullptr) return kStatusNotInitialized;
+  return impl_->EnqueueFrame(data, size, user_private_data,
+                             buffer_private_data);
+}
+
+StatusCode Decoder::DequeueFrame(const DecoderBuffer** out_ptr) {
+  if (impl_ == nullptr) return kStatusNotInitialized;
+  return impl_->DequeueFrame(out_ptr);
+}
+
+StatusCode Decoder::SignalEOS() {
+  if (impl_ == nullptr) return kStatusNotInitialized;
+  // In non-frame-parallel mode, we have to release all the references. This
+  // simply means replacing the |impl_| with a new instance so that all the
+  // existing references are released and the state is cleared.
+  impl_ = nullptr;
+  return DecoderImpl::Create(&settings_, &impl_);
+}
+
+// static.
+int Decoder::GetMaxBitdepth() { return DecoderImpl::GetMaxBitdepth(); }
+
+}  // namespace libgav1
diff --git a/src/decoder_buffer_test.cc b/src/decoder_buffer_test.cc
new file mode 100644
index 0000000..b1d8bb8
--- /dev/null
+++ b/src/decoder_buffer_test.cc
@@ -0,0 +1,38 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/gav1/decoder_buffer.h"
+
+#include "gtest/gtest.h"
+
+namespace libgav1 {
+namespace {
+
+// Tests the emulation of C++ enumerators by constexpr constants.
+TEST(DecoderBufferTest, EnumTest) {
+  ColorRange color_range = kLibgav1ColorRangeFull;
+
+  // Verify that we get the -Wswitch warning unless the switch statement
+  // handles both kColorRangeStudio and kColorRangeFull:
+  //   enumeration value 'kLibgav1ColorRangeFull' not handled in switch
+  switch (color_range) {
+    case kColorRangeStudio:
+      break;
+    case kColorRangeFull:
+      break;
+  }
+}
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/decoder_impl.cc b/src/decoder_impl.cc
new file mode 100644
index 0000000..dbb9e81
--- /dev/null
+++ b/src/decoder_impl.cc
@@ -0,0 +1,1698 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/decoder_impl.h"
+
+#include <atomic>
+#include <cassert>
+#include <cstring>
+#include <memory>
+#include <new>
+#include <utility>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/film_grain.h"
+#include "src/frame_buffer_utils.h"
+#include "src/frame_scratch_buffer.h"
+#include "src/loop_restoration_info.h"
+#include "src/obu_parser.h"
+#include "src/post_filter.h"
+#include "src/prediction_mask.h"
+#include "src/threading_strategy.h"
+#include "src/utils/blocking_counter.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+#include "src/utils/raw_bit_reader.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/threadpool.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int kMaxBlockWidth4x4 = 32;
+constexpr int kMaxBlockHeight4x4 = 32;
+
+// Computes the bottom border size in pixels. If CDEF, loop restoration or
+// SuperRes is enabled, adds extra border pixels to facilitate those steps to
+// happen nearly in-place (a few extra rows instead of an entire frame
+// buffer). The logic in this function should match the corresponding logic
+// for |vertical_shift| in the PostFilter constructor.
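+// For example, if do_cdef and do_superres are true and subsampling_y is 1,
+// the result is
+// Align(kBorderPixels + 2 * (kCdefBorder + kSuperResVerticalBorder), 2),
+// since the extra border is doubled by the subsampling shift.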
+int GetBottomBorderPixels(const bool do_cdef, const bool do_restoration,
+                          const bool do_superres, const int subsampling_y) {
+  int extra_border = 0;
+  if (do_cdef) {
+    extra_border += kCdefBorder;
+  } else if (do_restoration) {
+    // If CDEF is enabled, loop restoration is safe without extra border.
+    extra_border += kRestorationVerticalBorder;
+  }
+  if (do_superres) extra_border += kSuperResVerticalBorder;
+  // Double the number of extra bottom border pixels if the bottom border will
+  // be subsampled.
+  extra_border <<= subsampling_y;
+  return Align(kBorderPixels + extra_border, 2);  // Must be a multiple of 2.
+}
+
+// Sets |frame_scratch_buffer->tile_decoding_failed| to true (while holding on
+// to |frame_scratch_buffer->superblock_row_mutex|) and notifies the first
+// |count| condition variables in
+// |frame_scratch_buffer->superblock_row_progress_condvar|.
+void SetFailureAndNotifyAll(FrameScratchBuffer* const frame_scratch_buffer,
+                            int count) {
+  {
+    std::lock_guard<std::mutex> lock(
+        frame_scratch_buffer->superblock_row_mutex);
+    frame_scratch_buffer->tile_decoding_failed = true;
+  }
+  std::condition_variable* const condvars =
+      frame_scratch_buffer->superblock_row_progress_condvar.get();
+  for (int i = 0; i < count; ++i) {
+    condvars[i].notify_one();
+  }
+}
+
+// Helper class that releases the frame scratch buffer in the destructor.
+class FrameScratchBufferReleaser {
+ public:
+  FrameScratchBufferReleaser(
+      FrameScratchBufferPool* frame_scratch_buffer_pool,
+      std::unique_ptr<FrameScratchBuffer>* frame_scratch_buffer)
+      : frame_scratch_buffer_pool_(frame_scratch_buffer_pool),
+        frame_scratch_buffer_(frame_scratch_buffer) {}
+  ~FrameScratchBufferReleaser() {
+    frame_scratch_buffer_pool_->Release(std::move(*frame_scratch_buffer_));
+  }
+
+ private:
+  FrameScratchBufferPool* const frame_scratch_buffer_pool_;
+  std::unique_ptr<FrameScratchBuffer>* const frame_scratch_buffer_;
+};
+
+// Sets the |frame|'s segmentation map for two cases. The third case is
+// handled in Tile::DecodeBlock().
+void SetSegmentationMap(const ObuFrameHeader& frame_header,
+                        const SegmentationMap* prev_segment_ids,
+                        RefCountedBuffer* const frame) {
+  if (!frame_header.segmentation.enabled) {
+    // All segment_id's are 0.
+    frame->segmentation_map()->Clear();
+  } else if (!frame_header.segmentation.update_map) {
+    // Copy from prev_segment_ids.
+    if (prev_segment_ids == nullptr) {
+      // Treat a null prev_segment_ids pointer as if it pointed to a
+      // segmentation map containing all 0s.
+      frame->segmentation_map()->Clear();
+    } else {
+      frame->segmentation_map()->CopyFrom(*prev_segment_ids);
+    }
+  }
+}
+
+StatusCode DecodeTilesNonFrameParallel(
+    const ObuSequenceHeader& sequence_header,
+    const ObuFrameHeader& frame_header,
+    const Vector<std::unique_ptr<Tile>>& tiles,
+    FrameScratchBuffer* const frame_scratch_buffer,
+    PostFilter* const post_filter) {
+  // Decode in superblock row order.
+  const int block_width4x4 = sequence_header.use_128x128_superblock ? 32 : 16;
+  std::unique_ptr<TileScratchBuffer> tile_scratch_buffer =
+      frame_scratch_buffer->tile_scratch_buffer_pool.Get();
+  if (tile_scratch_buffer == nullptr) return kLibgav1StatusOutOfMemory;
+  for (int row4x4 = 0; row4x4 < frame_header.rows4x4;
+       row4x4 += block_width4x4) {
+    for (const auto& tile_ptr : tiles) {
+      if (!tile_ptr->ProcessSuperBlockRow<kProcessingModeParseAndDecode,
+                                          true>(
+              row4x4, tile_scratch_buffer.get())) {
+        return kLibgav1StatusUnknownError;
+      }
+    }
+    post_filter->ApplyFilteringForOneSuperBlockRow(
+        row4x4, block_width4x4,
+        row4x4 + block_width4x4 >= frame_header.rows4x4,
+        /*do_deblock=*/true);
+  }
+  frame_scratch_buffer->tile_scratch_buffer_pool.Release(
+      std::move(tile_scratch_buffer));
+  return kStatusOk;
+}
+
+StatusCode DecodeTilesThreadedNonFrameParallel(
+    const Vector<std::unique_ptr<Tile>>& tiles,
+    FrameScratchBuffer* const frame_scratch_buffer,
+    PostFilter* const post_filter,
+    BlockingCounterWithStatus* const pending_tiles) {
+  ThreadingStrategy& threading_strategy =
+      frame_scratch_buffer->threading_strategy;
+  const int num_workers = threading_strategy.tile_thread_count();
+  BlockingCounterWithStatus pending_workers(num_workers);
+  std::atomic<int> tile_counter(0);
+  const int tile_count = static_cast<int>(tiles.size());
+  bool tile_decoding_failed = false;
+  // Submit tile decoding jobs to the thread pool.
+  for (int i = 0; i < num_workers; ++i) {
+    threading_strategy.tile_thread_pool()->Schedule([&tiles, tile_count,
+                                                     &tile_counter,
+                                                     &pending_workers,
+                                                     &pending_tiles]() {
+      bool failed = false;
+      int index;
+      while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
+             tile_count) {
+        if (!failed) {
+          const auto& tile_ptr = tiles[index];
+          if (!tile_ptr->ParseAndDecode()) {
+            LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number());
+            failed = true;
+          }
+        } else {
+          pending_tiles->Decrement(false);
+        }
+      }
+      pending_workers.Decrement(!failed);
+    });
+  }
+  // Have the current thread partake in tile decoding.
+  int index;
+  while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
+         tile_count) {
+    if (!tile_decoding_failed) {
+      const auto& tile_ptr = tiles[index];
+      if (!tile_ptr->ParseAndDecode()) {
+        LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number());
+        tile_decoding_failed = true;
+      }
+    } else {
+      pending_tiles->Decrement(false);
+    }
+  }
+  // Wait until all the workers are done. This ensures that all the tiles
+  // have been parsed.
+  tile_decoding_failed |= !pending_workers.Wait();
+  // Wait until all the tiles have been decoded.
+  tile_decoding_failed |= !pending_tiles->Wait();
+  if (tile_decoding_failed) return kStatusUnknownError;
+  assert(threading_strategy.post_filter_thread_pool() != nullptr);
+  post_filter->ApplyFilteringThreaded();
+  return kStatusOk;
+}
+
+StatusCode DecodeTilesFrameParallel(
+    const ObuSequenceHeader& sequence_header,
+    const ObuFrameHeader& frame_header,
+    const Vector<std::unique_ptr<Tile>>& tiles,
+    const SymbolDecoderContext& saved_symbol_decoder_context,
+    const SegmentationMap* const prev_segment_ids,
+    FrameScratchBuffer* const frame_scratch_buffer,
+    PostFilter* const post_filter, RefCountedBuffer* const current_frame) {
+  // Parse the frame.
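+  // (Parsing is done serially over the tiles here; the threaded variant
+  // below distributes the same work across worker threads.)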
+  for (const auto& tile : tiles) {
+    if (!tile->Parse()) {
+      LIBGAV1_DLOG(ERROR, "Failed to parse tile number: %d\n", tile->number());
+      return kStatusUnknownError;
+    }
+  }
+  if (frame_header.enable_frame_end_update_cdf) {
+    frame_scratch_buffer->symbol_decoder_context =
+        saved_symbol_decoder_context;
+  }
+  current_frame->SetFrameContext(frame_scratch_buffer->symbol_decoder_context);
+  SetSegmentationMap(frame_header, prev_segment_ids, current_frame);
+  // Mark frame as parsed.
+  current_frame->SetFrameState(kFrameStateParsed);
+  std::unique_ptr<TileScratchBuffer> tile_scratch_buffer =
+      frame_scratch_buffer->tile_scratch_buffer_pool.Get();
+  if (tile_scratch_buffer == nullptr) {
+    return kStatusOutOfMemory;
+  }
+  const int block_width4x4 = sequence_header.use_128x128_superblock ? 32 : 16;
+  // Decode in superblock row order (inter prediction in the Tile class will
+  // block until the required superblocks in the reference frame are decoded).
+  for (int row4x4 = 0; row4x4 < frame_header.rows4x4;
+       row4x4 += block_width4x4) {
+    for (const auto& tile_ptr : tiles) {
+      if (!tile_ptr->ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
+              row4x4, tile_scratch_buffer.get())) {
+        LIBGAV1_DLOG(ERROR, "Failed to decode tile number: %d\n",
+                     tile_ptr->number());
+        return kStatusUnknownError;
+      }
+    }
+    const int progress_row = post_filter->ApplyFilteringForOneSuperBlockRow(
+        row4x4, block_width4x4,
+        row4x4 + block_width4x4 >= frame_header.rows4x4,
+        /*do_deblock=*/true);
+    if (progress_row >= 0) {
+      current_frame->SetProgress(progress_row);
+    }
+  }
+  // Mark frame as decoded (we no longer care about row-level progress since
+  // the entire frame has been decoded).
+  current_frame->SetFrameState(kFrameStateDecoded);
+  frame_scratch_buffer->tile_scratch_buffer_pool.Release(
+      std::move(tile_scratch_buffer));
+  return kStatusOk;
+}
+
+// Helper function used by DecodeTilesThreadedFrameParallel. Applies the
+// deblocking filter for tile boundaries for the superblock row at |row4x4|.
+void ApplyDeblockingFilterForTileBoundaries(
+    PostFilter* const post_filter,
+    const std::unique_ptr<Tile>* tile_row_base,
+    const ObuFrameHeader& frame_header, int row4x4, int block_width4x4,
+    int tile_columns, bool decode_entire_tiles_in_worker_threads) {
+  // Apply vertical deblock filtering for the first 64 columns of each tile.
+  for (int tile_column = 0; tile_column < tile_columns; ++tile_column) {
+    const Tile& tile = *tile_row_base[tile_column];
+    post_filter->ApplyDeblockFilter(
+        kLoopFilterTypeVertical, row4x4, tile.column4x4_start(),
+        tile.column4x4_start() + kNum4x4InLoopFilterUnit, block_width4x4);
+  }
+  if (decode_entire_tiles_in_worker_threads &&
+      row4x4 == tile_row_base[0]->row4x4_start()) {
+    // This is the first superblock row of a tile row. In this case, apply
+    // horizontal deblock filtering for the entire superblock row.
+    post_filter->ApplyDeblockFilter(kLoopFilterTypeHorizontal, row4x4, 0,
+                                    frame_header.columns4x4, block_width4x4);
+  } else {
+    // Apply horizontal deblock filtering for the first 64 columns of the
+    // first tile.
+    const Tile& first_tile = *tile_row_base[0];
+    post_filter->ApplyDeblockFilter(
+        kLoopFilterTypeHorizontal, row4x4, first_tile.column4x4_start(),
+        first_tile.column4x4_start() + kNum4x4InLoopFilterUnit,
+        block_width4x4);
+    // Apply horizontal deblock filtering for the last 64 columns of the
+    // previous tile and the first 64 columns of the current tile.
+    for (int tile_column = 1; tile_column < tile_columns; ++tile_column) {
+      const Tile& tile = *tile_row_base[tile_column];
+      // If the previous tile has more than 64 columns, then include those
+      // for the horizontal deblock.
+      const Tile& previous_tile = *tile_row_base[tile_column - 1];
+      const int column4x4_start =
+          tile.column4x4_start() -
+          ((tile.column4x4_start() - kNum4x4InLoopFilterUnit !=
+            previous_tile.column4x4_start())
+               ? kNum4x4InLoopFilterUnit
+               : 0);
+      post_filter->ApplyDeblockFilter(
+          kLoopFilterTypeHorizontal, row4x4, column4x4_start,
+          tile.column4x4_start() + kNum4x4InLoopFilterUnit, block_width4x4);
+    }
+    // Apply horizontal deblock filtering for the last 64 columns of the
+    // last tile.
+    const Tile& last_tile = *tile_row_base[tile_columns - 1];
+    // Identify the last column4x4 value and do horizontal filtering for
+    // that column4x4. The value of last column4x4 is the nearest multiple
+    // of 16 that is before tile.column4x4_end().
+    const int column4x4_start = (last_tile.column4x4_end() - 1) & ~15;
+    // If column4x4_start is the same as tile.column4x4_start() then it
+    // means that the last tile has <= 64 columns. So there is nothing left
+    // to deblock (since it was already deblocked in the loop above).
+    if (column4x4_start != last_tile.column4x4_start()) {
+      post_filter->ApplyDeblockFilter(
+          kLoopFilterTypeHorizontal, row4x4, column4x4_start,
+          last_tile.column4x4_end(), block_width4x4);
+    }
+  }
+}
+
+// Helper function used by DecodeTilesThreadedFrameParallel. Decodes the
+// superblock row starting at |row4x4| for tile at index |tile_index| in the
+// list of tiles |tiles|. If the decoding is successful, then it does the
+// following:
+//   * Schedule the next superblock row in the current tile column for
+//     decoding (the next superblock row may be in a different tile than the
+//     current one).
+//   * If an entire superblock row of the frame has been decoded, it notifies
+//     the waiters (if there are any).
+void DecodeSuperBlockRowInTile(
+    const Vector<std::unique_ptr<Tile>>& tiles, size_t tile_index, int row4x4,
+    const int superblock_size4x4, const int tile_columns,
+    const int superblock_rows,
+    FrameScratchBuffer* const frame_scratch_buffer,
+    PostFilter* const post_filter, BlockingCounter* const pending_jobs) {
+  std::unique_ptr<TileScratchBuffer> scratch_buffer =
+      frame_scratch_buffer->tile_scratch_buffer_pool.Get();
+  if (scratch_buffer == nullptr) {
+    SetFailureAndNotifyAll(frame_scratch_buffer, superblock_rows);
+    return;
+  }
+  Tile& tile = *tiles[tile_index];
+  const bool ok = tile.ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
+      row4x4, scratch_buffer.get());
+  frame_scratch_buffer->tile_scratch_buffer_pool.Release(
+      std::move(scratch_buffer));
+  if (!ok) {
+    SetFailureAndNotifyAll(frame_scratch_buffer, superblock_rows);
+    return;
+  }
+  if (post_filter->DoDeblock()) {
+    // Apply vertical deblock filtering for all the columns in this tile
+    // except for the first 64 columns.
+    post_filter->ApplyDeblockFilter(
+        kLoopFilterTypeVertical, row4x4,
+        tile.column4x4_start() + kNum4x4InLoopFilterUnit,
+        tile.column4x4_end(), superblock_size4x4);
+    // Apply horizontal deblock filtering for all the columns in this tile
+    // except for the first and the last 64 columns.
+    // Note about the last tile of each row: For the last tile,
+    // column4x4_end may not be a multiple of 16. In that case it is still
+    // okay to simply subtract 16 since ApplyDeblockFilter() will only do
+    // the filters in increments of 64 columns (or 32 columns for chroma
+    // with subsampling).
+    post_filter->ApplyDeblockFilter(
+        kLoopFilterTypeHorizontal, row4x4,
+        tile.column4x4_start() + kNum4x4InLoopFilterUnit,
+        tile.column4x4_end() - kNum4x4InLoopFilterUnit, superblock_size4x4);
+  }
+  const int superblock_size4x4_log2 = FloorLog2(superblock_size4x4);
+  const int index = row4x4 >> superblock_size4x4_log2;
+  int* const superblock_row_progress =
+      frame_scratch_buffer->superblock_row_progress.get();
+  std::condition_variable* const superblock_row_progress_condvar =
+      frame_scratch_buffer->superblock_row_progress_condvar.get();
+  bool notify;
+  {
+    std::lock_guard<std::mutex> lock(
+        frame_scratch_buffer->superblock_row_mutex);
+    notify = ++superblock_row_progress[index] == tile_columns;
+  }
+  if (notify) {
+    // We are done decoding this superblock row. Notify the post filtering
+    // thread.
+    superblock_row_progress_condvar[index].notify_one();
+  }
+  // Schedule the next superblock row (if one exists).
+  ThreadPool& thread_pool =
+      *frame_scratch_buffer->threading_strategy.thread_pool();
+  const int next_row4x4 = row4x4 + superblock_size4x4;
+  if (!tile.IsRow4x4Inside(next_row4x4)) {
+    tile_index += tile_columns;
+  }
+  if (tile_index >= tiles.size()) return;
+  pending_jobs->IncrementBy(1);
+  thread_pool.Schedule([&tiles, tile_index, next_row4x4, superblock_size4x4,
+                        tile_columns, superblock_rows, frame_scratch_buffer,
+                        post_filter, pending_jobs]() {
+    DecodeSuperBlockRowInTile(tiles, tile_index, next_row4x4,
+                              superblock_size4x4, tile_columns,
+                              superblock_rows, frame_scratch_buffer,
+                              post_filter, pending_jobs);
+    pending_jobs->Decrement();
+  });
+}
+
+StatusCode DecodeTilesThreadedFrameParallel(
+    const ObuSequenceHeader& sequence_header,
+    const ObuFrameHeader& frame_header,
+    const Vector<std::unique_ptr<Tile>>& tiles,
+    const SymbolDecoderContext& saved_symbol_decoder_context,
+    const SegmentationMap* const prev_segment_ids,
+    FrameScratchBuffer* const frame_scratch_buffer,
+    PostFilter* const post_filter, RefCountedBuffer* const current_frame) {
+  // Parse the frame.
+  ThreadPool& thread_pool =
+      *frame_scratch_buffer->threading_strategy.thread_pool();
+  std::atomic<int> tile_counter(0);
+  const int tile_count = static_cast<int>(tiles.size());
+  const int num_workers = thread_pool.num_threads();
+  BlockingCounterWithStatus parse_workers(num_workers);
+  // Submit tile parsing jobs to the thread pool.
+  for (int i = 0; i < num_workers; ++i) {
+    thread_pool.Schedule([&tiles, tile_count, &tile_counter,
+                          &parse_workers]() {
+      bool failed = false;
+      int index;
+      while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
+             tile_count) {
+        if (!failed) {
+          const auto& tile_ptr = tiles[index];
+          if (!tile_ptr->Parse()) {
+            LIBGAV1_DLOG(ERROR, "Error parsing tile #%d", tile_ptr->number());
+            failed = true;
+          }
+        }
+      }
+      parse_workers.Decrement(!failed);
+    });
+  }
+
+  // Have the current thread participate in parsing.
+  bool failed = false;
+  int index;
+  while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
+         tile_count) {
+    if (!failed) {
+      const auto& tile_ptr = tiles[index];
+      if (!tile_ptr->Parse()) {
+        LIBGAV1_DLOG(ERROR, "Error parsing tile #%d", tile_ptr->number());
+        failed = true;
+      }
+    }
+  }
+
+  // Wait until all the parse workers are done. This ensures that all the
+  // tiles have been parsed.
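+  // (|failed| covers parsing errors on this thread; parse_workers.Wait()
+  // reports whether any of the worker threads failed.)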
+  if (!parse_workers.Wait() || failed) {
+    return kLibgav1StatusUnknownError;
+  }
+  if (frame_header.enable_frame_end_update_cdf) {
+    frame_scratch_buffer->symbol_decoder_context =
+        saved_symbol_decoder_context;
+  }
+  current_frame->SetFrameContext(frame_scratch_buffer->symbol_decoder_context);
+  SetSegmentationMap(frame_header, prev_segment_ids, current_frame);
+  current_frame->SetFrameState(kFrameStateParsed);
+
+  // Decode the frame.
+  const int block_width4x4 = sequence_header.use_128x128_superblock ? 32 : 16;
+  const int block_width4x4_log2 =
+      sequence_header.use_128x128_superblock ? 5 : 4;
+  const int superblock_rows =
+      (frame_header.rows4x4 + block_width4x4 - 1) >> block_width4x4_log2;
+  if (!frame_scratch_buffer->superblock_row_progress.Resize(
+          superblock_rows) ||
+      !frame_scratch_buffer->superblock_row_progress_condvar.Resize(
+          superblock_rows)) {
+    return kLibgav1StatusOutOfMemory;
+  }
+  int* const superblock_row_progress =
+      frame_scratch_buffer->superblock_row_progress.get();
+  memset(superblock_row_progress, 0,
+         superblock_rows * sizeof(superblock_row_progress[0]));
+  frame_scratch_buffer->tile_decoding_failed = false;
+  const int tile_columns = frame_header.tile_info.tile_columns;
+  const bool decode_entire_tiles_in_worker_threads =
+      num_workers >= tile_columns;
+  BlockingCounter pending_jobs(
+      decode_entire_tiles_in_worker_threads ? num_workers : tile_columns);
+  if (decode_entire_tiles_in_worker_threads) {
+    // Submit tile decoding jobs to the thread pool.
+    tile_counter = 0;
+    for (int i = 0; i < num_workers; ++i) {
+      thread_pool.Schedule([&tiles, tile_count, &tile_counter, &pending_jobs,
+                            frame_scratch_buffer, superblock_rows]() {
+        bool failed = false;
+        int index;
+        while ((index =
+                    tile_counter.fetch_add(1, std::memory_order_relaxed)) <
+               tile_count) {
+          if (failed) continue;
+          const auto& tile_ptr = tiles[index];
+          if (!tile_ptr->Decode(
+                  &frame_scratch_buffer->superblock_row_mutex,
+                  frame_scratch_buffer->superblock_row_progress.get(),
+                  frame_scratch_buffer->superblock_row_progress_condvar
+                      .get())) {
+            LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number());
+            failed = true;
+            SetFailureAndNotifyAll(frame_scratch_buffer, superblock_rows);
+          }
+        }
+        pending_jobs.Decrement();
+      });
+    }
+  } else {
+    // Schedule the jobs for the first tile row.
+    for (int tile_index = 0; tile_index < tile_columns; ++tile_index) {
+      thread_pool.Schedule([&tiles, tile_index, block_width4x4, tile_columns,
+                            superblock_rows, frame_scratch_buffer,
+                            post_filter, &pending_jobs]() {
+        DecodeSuperBlockRowInTile(tiles, tile_index, 0, block_width4x4,
+                                  tile_columns, superblock_rows,
+                                  frame_scratch_buffer, post_filter,
+                                  &pending_jobs);
+        pending_jobs.Decrement();
+      });
+    }
+  }
+
+  // The current thread will do the post filters.
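+  // For each superblock row, it waits on the corresponding
+  // |superblock_row_progress_condvar| entry until all tile columns have
+  // decoded that row, then applies the filters and updates the frame's
+  // progress.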
+  std::condition_variable* const superblock_row_progress_condvar =
+      frame_scratch_buffer->superblock_row_progress_condvar.get();
+  const std::unique_ptr<Tile>* tile_row_base = &tiles[0];
+  for (int row4x4 = 0, index = 0; row4x4 < frame_header.rows4x4;
+       row4x4 += block_width4x4, ++index) {
+    if (!tile_row_base[0]->IsRow4x4Inside(row4x4)) {
+      tile_row_base += tile_columns;
+    }
+    {
+      std::unique_lock<std::mutex> lock(
+          frame_scratch_buffer->superblock_row_mutex);
+      while (superblock_row_progress[index] != tile_columns &&
+             !frame_scratch_buffer->tile_decoding_failed) {
+        superblock_row_progress_condvar[index].wait(lock);
+      }
+      if (frame_scratch_buffer->tile_decoding_failed) break;
+    }
+    if (post_filter->DoDeblock()) {
+      // Apply the deblocking filter for the tile boundaries of this superblock
+      // row. The deblocking filter for the internal blocks is applied in the
+      // tile worker threads, so this thread only has to handle the tile
+      // boundaries.
+      ApplyDeblockingFilterForTileBoundaries(
+          post_filter, tile_row_base, frame_header, row4x4, block_width4x4,
+          tile_columns, decode_entire_tiles_in_worker_threads);
+    }
+    // Apply all the post filters other than deblocking.
+    const int progress_row = post_filter->ApplyFilteringForOneSuperBlockRow(
+        row4x4, block_width4x4, row4x4 + block_width4x4 >= frame_header.rows4x4,
+        /*do_deblock=*/false);
+    if (progress_row >= 0) {
+      current_frame->SetProgress(progress_row);
+    }
+  }
+  // Wait until all the pending jobs are done. This ensures that all the tiles
+  // have been decoded and wrapped up.
+  pending_jobs.Wait();
+  {
+    std::lock_guard<std::mutex> lock(
+        frame_scratch_buffer->superblock_row_mutex);
+    if (frame_scratch_buffer->tile_decoding_failed) {
+      return kLibgav1StatusUnknownError;
+    }
+  }
+
+  current_frame->SetFrameState(kFrameStateDecoded);
+  return kStatusOk;
+}
+
+}  // namespace
+
+// static
+StatusCode DecoderImpl::Create(const DecoderSettings* settings,
+                               std::unique_ptr<DecoderImpl>* output) {
+  if (settings->threads <= 0) {
+    LIBGAV1_DLOG(ERROR, "Invalid settings->threads: %d.", settings->threads);
+    return kStatusInvalidArgument;
+  }
+  if (settings->frame_parallel) {
+    if (settings->release_input_buffer == nullptr) {
+      LIBGAV1_DLOG(ERROR,
+                   "release_input_buffer callback must not be null when "
+                   "frame_parallel is true.");
+      return kStatusInvalidArgument;
+    }
+  }
+  std::unique_ptr<DecoderImpl> impl(new (std::nothrow) DecoderImpl(settings));
+  if (impl == nullptr) {
+    LIBGAV1_DLOG(ERROR, "Failed to allocate DecoderImpl.");
+    return kStatusOutOfMemory;
+  }
+  const StatusCode status = impl->Init();
+  if (status != kStatusOk) return status;
+  *output = std::move(impl);
+  return kStatusOk;
+}
+
+DecoderImpl::DecoderImpl(const DecoderSettings* settings)
+    : buffer_pool_(settings->on_frame_buffer_size_changed,
+                   settings->get_frame_buffer, settings->release_frame_buffer,
+                   settings->callback_private_data),
+      settings_(*settings) {
+  dsp::DspInit();
+}
+
+DecoderImpl::~DecoderImpl() {
+  // Clean up and wait until all the threads have stopped. We just have to pass
+  // in a dummy status that is not kStatusOk or kStatusTryAgain to trigger the
+  // path that clears all the threads and structs.
+  SignalFailure(kStatusUnknownError);
+  // Release any other frame buffer references that we may be holding on to.
+  ReleaseOutputFrame();
+  output_frame_queue_.Clear();
+  for (auto& reference_frame : state_.reference_frame) {
+    reference_frame = nullptr;
+  }
+}
+
+StatusCode DecoderImpl::Init() {
+  if (!output_frame_queue_.Init(kMaxLayers)) {
+    LIBGAV1_DLOG(ERROR, "output_frame_queue_.Init() failed.");
+    return kStatusOutOfMemory;
+  }
+  return kStatusOk;
+}
+
+StatusCode DecoderImpl::InitializeFrameThreadPoolAndTemporalUnitQueue(
+    const uint8_t* data, size_t size) {
+  is_frame_parallel_ = false;
+  if (settings_.frame_parallel) {
+    DecoderState state;
+    std::unique_ptr<ObuParser> obu(new (std::nothrow) ObuParser(
+        data, size, settings_.operating_point, &buffer_pool_, &state));
+    if (obu == nullptr) {
+      LIBGAV1_DLOG(ERROR, "Failed to allocate OBU parser.");
+      return kStatusOutOfMemory;
+    }
+    RefCountedBufferPtr current_frame;
+    const StatusCode status = obu->ParseOneFrame(&current_frame);
+    if (status != kStatusOk) {
+      LIBGAV1_DLOG(ERROR, "Failed to parse OBU.");
+      return status;
+    }
+    current_frame = nullptr;
+    // We assume that the first frame that was parsed will contain the frame
+    // header. This assumption is usually true in practice. So we will simply
+    // not use frame parallel mode if this is not the case.
+    if (settings_.threads > 1 &&
+        !InitializeThreadPoolsForFrameParallel(
+            settings_.threads, obu->frame_header().tile_info.tile_count,
+            obu->frame_header().tile_info.tile_columns, &frame_thread_pool_,
+            &frame_scratch_buffer_pool_)) {
+      return kStatusOutOfMemory;
+    }
+  }
+  const int max_allowed_frames =
+      (frame_thread_pool_ != nullptr) ? frame_thread_pool_->num_threads() : 1;
+  assert(max_allowed_frames > 0);
+  if (!temporal_units_.Init(max_allowed_frames)) {
+    LIBGAV1_DLOG(ERROR, "temporal_units_.Init() failed.");
+    return kStatusOutOfMemory;
+  }
+  is_frame_parallel_ = frame_thread_pool_ != nullptr;
+  return kStatusOk;
+}
+
+StatusCode DecoderImpl::EnqueueFrame(const uint8_t* data, size_t size,
+                                     int64_t user_private_data,
+                                     void* buffer_private_data) {
+  if (data == nullptr || size == 0) return kStatusInvalidArgument;
+  if (HasFailure()) return kStatusUnknownError;
+  if (!seen_first_frame_) {
+    seen_first_frame_ = true;
+    const StatusCode status =
+        InitializeFrameThreadPoolAndTemporalUnitQueue(data, size);
+    if (status != kStatusOk) {
+      return SignalFailure(status);
+    }
+  }
+  if (temporal_units_.Full()) {
+    return kStatusTryAgain;
+  }
+  if (is_frame_parallel_) {
+    return ParseAndSchedule(data, size, user_private_data, buffer_private_data);
+  }
+  TemporalUnit temporal_unit(data, size, user_private_data,
+                             buffer_private_data);
+  temporal_units_.Push(std::move(temporal_unit));
+  return kStatusOk;
+}
+
+StatusCode DecoderImpl::SignalFailure(StatusCode status) {
+  if (status == kStatusOk || status == kStatusTryAgain) return status;
+  // Set the |failure_status_| first so that any pending jobs in
+  // |frame_thread_pool_| will exit right away when the thread pool is being
+  // released below.
+  {
+    std::lock_guard<std::mutex> lock(mutex_);
+    failure_status_ = status;
+  }
+  // Make sure all waiting threads exit.
+  buffer_pool_.Abort();
+  frame_thread_pool_ = nullptr;
+  while (!temporal_units_.Empty()) {
+    if (settings_.release_input_buffer != nullptr) {
+      settings_.release_input_buffer(
+          settings_.callback_private_data,
+          temporal_units_.Front().buffer_private_data);
+    }
+    temporal_units_.Pop();
+  }
+  return status;
+}
+
+// DequeueFrame() uses the following policy to avoid holding unnecessary frame
+// buffer references in output_frame_: output_frame_ must be null when
+// DequeueFrame() fails to return a frame.
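+// [Editor's note, illustrative only -- this comment block is not part of the
+// upstream libgav1 source.] A minimal sketch of the enqueue/dequeue cycle a
+// caller is expected to drive, assuming a hypothetical helper
+// ReadTemporalUnit() that yields one temporal unit of OBU data at a time:
+//
+//   const uint8_t* data;
+//   size_t size;
+//   while (ReadTemporalUnit(&data, &size)) {
+//     if (decoder->EnqueueFrame(data, size, /*user_private_data=*/0,
+//                               /*buffer_private_data=*/nullptr) !=
+//         kStatusOk) {
+//       break;
+//     }
+//     const DecoderBuffer* buffer;
+//     const StatusCode status = decoder->DequeueFrame(&buffer);
+//     if (status != kStatusOk && status != kStatusNothingToDequeue &&
+//         status != kStatusTryAgain) {
+//       break;  // Decode error.
+//     }
+//     // |buffer| (when non-null) stays valid until the next DequeueFrame().
+//   }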
+StatusCode DecoderImpl::DequeueFrame(const DecoderBuffer** out_ptr) {
+  if (out_ptr == nullptr) {
+    LIBGAV1_DLOG(ERROR, "Invalid argument: out_ptr == nullptr.");
+    return kStatusInvalidArgument;
+  }
+  // We assume a call to DequeueFrame() indicates that the caller is no longer
+  // using the previous output frame, so we can release it.
+  ReleaseOutputFrame();
+  if (temporal_units_.Empty()) {
+    // No input frames to decode.
+    *out_ptr = nullptr;
+    return kStatusNothingToDequeue;
+  }
+  TemporalUnit& temporal_unit = temporal_units_.Front();
+  if (!is_frame_parallel_) {
+    // If |output_frame_queue_| is not empty, then return the first frame from
+    // that queue.
+    if (!output_frame_queue_.Empty()) {
+      RefCountedBufferPtr frame = std::move(output_frame_queue_.Front());
+      output_frame_queue_.Pop();
+      buffer_.user_private_data = temporal_unit.user_private_data;
+      if (output_frame_queue_.Empty()) {
+        temporal_units_.Pop();
+      }
+      const StatusCode status = CopyFrameToOutputBuffer(frame);
+      if (status != kStatusOk) {
+        return status;
+      }
+      *out_ptr = &buffer_;
+      return kStatusOk;
+    }
+    // Decode the next available temporal unit and return.
+    const StatusCode status = DecodeTemporalUnit(temporal_unit, out_ptr);
+    if (status != kStatusOk) {
+      // In case of failure, discard all the output frames that we may be
+      // holding on references to.
+      output_frame_queue_.Clear();
+    }
+    if (settings_.release_input_buffer != nullptr) {
+      settings_.release_input_buffer(settings_.callback_private_data,
+                                     temporal_unit.buffer_private_data);
+    }
+    if (output_frame_queue_.Empty()) {
+      temporal_units_.Pop();
+    }
+    return status;
+  }
+  {
+    std::unique_lock<std::mutex> lock(mutex_);
+    if (settings_.blocking_dequeue) {
+      while (!temporal_unit.decoded && failure_status_ == kStatusOk) {
+        decoded_condvar_.wait(lock);
+      }
+    } else {
+      if (!temporal_unit.decoded && failure_status_ == kStatusOk) {
+        return kStatusTryAgain;
+      }
+    }
+    if (failure_status_ != kStatusOk) {
+      const StatusCode failure_status = failure_status_;
+      lock.unlock();
+      return SignalFailure(failure_status);
+    }
+  }
+  if (settings_.release_input_buffer != nullptr &&
+      !temporal_unit.released_input_buffer) {
+    temporal_unit.released_input_buffer = true;
+    settings_.release_input_buffer(settings_.callback_private_data,
+                                   temporal_unit.buffer_private_data);
+  }
+  if (temporal_unit.status != kStatusOk) {
+    temporal_units_.Pop();
+    return SignalFailure(temporal_unit.status);
+  }
+  if (!temporal_unit.has_displayable_frame) {
+    *out_ptr = nullptr;
+    temporal_units_.Pop();
+    return kStatusOk;
+  }
+  assert(temporal_unit.output_layer_count > 0);
+  StatusCode status = CopyFrameToOutputBuffer(
+      temporal_unit.output_layers[temporal_unit.output_layer_count - 1].frame);
+  temporal_unit.output_layers[temporal_unit.output_layer_count - 1].frame =
+      nullptr;
+  if (status != kStatusOk) {
+    temporal_units_.Pop();
+    return SignalFailure(status);
+  }
+  buffer_.user_private_data = temporal_unit.user_private_data;
+  *out_ptr = &buffer_;
+  if (--temporal_unit.output_layer_count == 0) {
+    temporal_units_.Pop();
+  }
+  return kStatusOk;
+}
+
+StatusCode DecoderImpl::ParseAndSchedule(const uint8_t* data, size_t size,
+                                         int64_t user_private_data,
+                                         void* buffer_private_data) {
+  TemporalUnit temporal_unit(data, size, user_private_data,
+                             buffer_private_data);
+  std::unique_ptr<ObuParser> obu(new (std::nothrow) ObuParser(
+      temporal_unit.data, temporal_unit.size, settings_.operating_point,
+      &buffer_pool_, &state_));
+  if (obu == nullptr) {
+    LIBGAV1_DLOG(ERROR, "Failed to allocate OBU parser.");
+    return kStatusOutOfMemory;
+  }
+  if (has_sequence_header_) {
+    obu->set_sequence_header(sequence_header_);
+  }
+  StatusCode status;
+  int position_in_temporal_unit = 0;
+  while (obu->HasData()) {
+    RefCountedBufferPtr current_frame;
+    status = obu->ParseOneFrame(&current_frame);
+    if (status != kStatusOk) {
+      LIBGAV1_DLOG(ERROR, "Failed to parse OBU.");
+      return status;
+    }
+    if (!MaybeInitializeQuantizerMatrix(obu->frame_header())) {
+      LIBGAV1_DLOG(ERROR, "InitializeQuantizerMatrix() failed.");
+      return kStatusOutOfMemory;
+    }
+    if (!MaybeInitializeWedgeMasks(obu->frame_header().frame_type)) {
+      LIBGAV1_DLOG(ERROR, "InitializeWedgeMasks() failed.");
+      return kStatusOutOfMemory;
+    }
+    if (IsNewSequenceHeader(*obu)) {
+      const ObuSequenceHeader& sequence_header = obu->sequence_header();
+      const Libgav1ImageFormat image_format =
+          ComposeImageFormat(sequence_header.color_config.is_monochrome,
+                             sequence_header.color_config.subsampling_x,
+                             sequence_header.color_config.subsampling_y);
+      const int max_bottom_border = GetBottomBorderPixels(
+          /*do_cdef=*/true, /*do_restoration=*/true,
+          /*do_superres=*/true, sequence_header.color_config.subsampling_y);
+      // TODO(vigneshv): This may not be the right place to call this callback
+      // for the frame parallel case. Investigate and fix it.
+      if (!buffer_pool_.OnFrameBufferSizeChanged(
+              sequence_header.color_config.bitdepth, image_format,
+              sequence_header.max_frame_width, sequence_header.max_frame_height,
+              kBorderPixels, kBorderPixels, kBorderPixels, max_bottom_border)) {
+        LIBGAV1_DLOG(ERROR, "buffer_pool_.OnFrameBufferSizeChanged failed.");
+        return kStatusUnknownError;
+      }
+    }
+    // This can happen when there are multiple spatial/temporal layers and if
+    // all the layers are outside the current operating point.
+    if (current_frame == nullptr) {
+      continue;
+    }
+    // Note that we cannot set EncodedFrame.temporal_unit here. It will be set
+    // in the code below after |temporal_unit| is std::move'd into the
+    // |temporal_units_| queue.
+    if (!temporal_unit.frames.emplace_back(obu.get(), state_, current_frame,
+                                           position_in_temporal_unit++)) {
+      LIBGAV1_DLOG(ERROR, "temporal_unit.frames.emplace_back failed.");
+      return kStatusOutOfMemory;
+    }
+    state_.UpdateReferenceFrames(current_frame,
+                                 obu->frame_header().refresh_frame_flags);
+  }
+  // This function cannot fail after this point. So it is okay to move the
+  // |temporal_unit| into the |temporal_units_| queue.
+  temporal_units_.Push(std::move(temporal_unit));
+  if (temporal_units_.Back().frames.empty()) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    temporal_units_.Back().has_displayable_frame = false;
+    temporal_units_.Back().decoded = true;
+    return kStatusOk;
+  }
+  for (auto& frame : temporal_units_.Back().frames) {
+    EncodedFrame* const encoded_frame = &frame;
+    encoded_frame->temporal_unit = &temporal_units_.Back();
+    frame_thread_pool_->Schedule([this, encoded_frame]() {
+      if (HasFailure()) return;
+      const StatusCode status = DecodeFrame(encoded_frame);
+      encoded_frame->state = {};
+      encoded_frame->frame = nullptr;
+      TemporalUnit& temporal_unit = *encoded_frame->temporal_unit;
+      std::lock_guard<std::mutex> lock(mutex_);
+      if (failure_status_ != kStatusOk) return;
+      // temporal_unit's status defaults to kStatusOk. So we need to set it
+      // only on error. If |failure_status_| is not kStatusOk at this point, it
+      // means that there has already been a failure. So we don't care about
+      // this subsequent failure. We will simply return the error code of the
+      // first failure.
+      if (status != kStatusOk) {
+        temporal_unit.status = status;
+        if (failure_status_ == kStatusOk) {
+          failure_status_ = status;
+        }
+      }
+      temporal_unit.decoded =
+          ++temporal_unit.decoded_count == temporal_unit.frames.size();
+      if (temporal_unit.decoded && settings_.output_all_layers &&
+          temporal_unit.output_layer_count > 1) {
+        std::sort(
+            temporal_unit.output_layers,
+            temporal_unit.output_layers + temporal_unit.output_layer_count);
+      }
+      if (temporal_unit.decoded || failure_status_ != kStatusOk) {
+        decoded_condvar_.notify_one();
+      }
+    });
+  }
+  return kStatusOk;
+}
+
+StatusCode DecoderImpl::DecodeFrame(EncodedFrame* const encoded_frame) {
+  const ObuSequenceHeader& sequence_header = encoded_frame->sequence_header;
+  const ObuFrameHeader& frame_header = encoded_frame->frame_header;
+  RefCountedBufferPtr current_frame = std::move(encoded_frame->frame);
+
+  std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer =
+      frame_scratch_buffer_pool_.Get();
+  if (frame_scratch_buffer == nullptr) {
+    LIBGAV1_DLOG(ERROR, "Error when getting FrameScratchBuffer.");
+    return kStatusOutOfMemory;
+  }
+  // |frame_scratch_buffer| will be released when this local variable goes out
+  // of scope (i.e.) on any return path in this function.
+  FrameScratchBufferReleaser frame_scratch_buffer_releaser(
+      &frame_scratch_buffer_pool_, &frame_scratch_buffer);
+
+  StatusCode status;
+  if (!frame_header.show_existing_frame) {
+    if (encoded_frame->tile_buffers.empty()) {
+      // This means that the last call to ParseOneFrame() did not actually
+      // have any tile groups. This could happen in rare cases (for example,
+      // if there is a Metadata OBU after the TileGroup OBU). We currently do
+      // not have a reason to handle those cases, so we simply continue.
+      return kStatusOk;
+    }
+    status = DecodeTiles(sequence_header, frame_header,
+                         encoded_frame->tile_buffers, encoded_frame->state,
+                         frame_scratch_buffer.get(), current_frame.get());
+    if (status != kStatusOk) {
+      return status;
+    }
+  } else {
+    if (!current_frame->WaitUntilDecoded()) {
+      return kStatusUnknownError;
+    }
+  }
+  if (!frame_header.show_frame && !frame_header.show_existing_frame) {
+    // This frame is not displayable. Not an error.
+    return kStatusOk;
+  }
+  RefCountedBufferPtr film_grain_frame;
+  status = ApplyFilmGrain(
+      sequence_header, frame_header, current_frame, &film_grain_frame,
+      frame_scratch_buffer->threading_strategy.thread_pool());
+  if (status != kStatusOk) {
+    return status;
+  }
+
+  TemporalUnit& temporal_unit = *encoded_frame->temporal_unit;
+  std::lock_guard<std::mutex> lock(mutex_);
+  if (temporal_unit.has_displayable_frame && !settings_.output_all_layers) {
+    assert(temporal_unit.output_frame_position >= 0);
+    // A displayable frame was already found in this temporal unit. This can
+    // happen if there are multiple spatial/temporal layers. Since
+    // |settings_.output_all_layers| is false, we will output only the last
+    // displayable frame.
+    if (temporal_unit.output_frame_position >
+        encoded_frame->position_in_temporal_unit) {
+      return kStatusOk;
+    }
+    // Replace any output frame that we may have seen before with the current
+    // frame.
+    assert(temporal_unit.output_layer_count == 1);
+    --temporal_unit.output_layer_count;
+  }
+  temporal_unit.has_displayable_frame = true;
+  temporal_unit.output_layers[temporal_unit.output_layer_count].frame =
+      std::move(film_grain_frame);
+  temporal_unit.output_layers[temporal_unit.output_layer_count]
+      .position_in_temporal_unit = encoded_frame->position_in_temporal_unit;
+  ++temporal_unit.output_layer_count;
+  temporal_unit.output_frame_position =
+      encoded_frame->position_in_temporal_unit;
+  return kStatusOk;
+}
+
+StatusCode DecoderImpl::DecodeTemporalUnit(const TemporalUnit& temporal_unit,
+                                           const DecoderBuffer** out_ptr) {
+  std::unique_ptr<ObuParser> obu(new (std::nothrow) ObuParser(
+      temporal_unit.data, temporal_unit.size, settings_.operating_point,
+      &buffer_pool_, &state_));
+  if (obu == nullptr) {
+    LIBGAV1_DLOG(ERROR, "Failed to allocate OBU parser.");
+    return kStatusOutOfMemory;
+  }
+  if (has_sequence_header_) {
+    obu->set_sequence_header(sequence_header_);
+  }
+  StatusCode status;
+  std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer =
+      frame_scratch_buffer_pool_.Get();
+  if (frame_scratch_buffer == nullptr) {
+    LIBGAV1_DLOG(ERROR, "Error when getting FrameScratchBuffer.");
+    return kStatusOutOfMemory;
+  }
+  // |frame_scratch_buffer| will be released when this local variable goes out
+  // of scope (i.e.) on any return path in this function.
+  FrameScratchBufferReleaser frame_scratch_buffer_releaser(
+      &frame_scratch_buffer_pool_, &frame_scratch_buffer);
+
+  while (obu->HasData()) {
+    RefCountedBufferPtr current_frame;
+    status = obu->ParseOneFrame(&current_frame);
+    if (status != kStatusOk) {
+      LIBGAV1_DLOG(ERROR, "Failed to parse OBU.");
+      return status;
+    }
+    if (!MaybeInitializeQuantizerMatrix(obu->frame_header())) {
+      LIBGAV1_DLOG(ERROR, "InitializeQuantizerMatrix() failed.");
+      return kStatusOutOfMemory;
+    }
+    if (!MaybeInitializeWedgeMasks(obu->frame_header().frame_type)) {
+      LIBGAV1_DLOG(ERROR, "InitializeWedgeMasks() failed.");
+      return kStatusOutOfMemory;
+    }
+    if (IsNewSequenceHeader(*obu)) {
+      const ObuSequenceHeader& sequence_header = obu->sequence_header();
+      const Libgav1ImageFormat image_format =
+          ComposeImageFormat(sequence_header.color_config.is_monochrome,
+                             sequence_header.color_config.subsampling_x,
+                             sequence_header.color_config.subsampling_y);
+      const int max_bottom_border = GetBottomBorderPixels(
+          /*do_cdef=*/true, /*do_restoration=*/true,
+          /*do_superres=*/true, sequence_header.color_config.subsampling_y);
+      if (!buffer_pool_.OnFrameBufferSizeChanged(
+              sequence_header.color_config.bitdepth, image_format,
+              sequence_header.max_frame_width, sequence_header.max_frame_height,
+              kBorderPixels, kBorderPixels, kBorderPixels, max_bottom_border)) {
+        LIBGAV1_DLOG(ERROR, "buffer_pool_.OnFrameBufferSizeChanged failed.");
+        return kStatusUnknownError;
+      }
+    }
+    if (!obu->frame_header().show_existing_frame) {
+      if (obu->tile_buffers().empty()) {
+        // This means that the last call to ParseOneFrame() did not actually
+        // have any tile groups. This could happen in rare cases (for example,
+        // if there is a Metadata OBU after the TileGroup OBU). We currently do
+        // not have a reason to handle those cases, so we simply continue.
+        continue;
+      }
+      status = DecodeTiles(obu->sequence_header(), obu->frame_header(),
+                           obu->tile_buffers(), state_,
+                           frame_scratch_buffer.get(), current_frame.get());
+      if (status != kStatusOk) {
+        return status;
+      }
+    }
+    state_.UpdateReferenceFrames(current_frame,
+                                 obu->frame_header().refresh_frame_flags);
+    if (obu->frame_header().show_frame ||
+        obu->frame_header().show_existing_frame) {
+      if (!output_frame_queue_.Empty() && !settings_.output_all_layers) {
+        // There is more than one displayable frame in the current operating
+        // point and |settings_.output_all_layers| is false. In this case, we
+        // simply return the last displayable frame as the output frame and
+        // ignore the rest.
+        assert(output_frame_queue_.Size() == 1);
+        output_frame_queue_.Pop();
+      }
+      RefCountedBufferPtr film_grain_frame;
+      status = ApplyFilmGrain(
+          obu->sequence_header(), obu->frame_header(), current_frame,
+          &film_grain_frame,
+          frame_scratch_buffer->threading_strategy.film_grain_thread_pool());
+      if (status != kStatusOk) return status;
+      output_frame_queue_.Push(std::move(film_grain_frame));
+    }
+  }
+  if (output_frame_queue_.Empty()) {
+    // No displayable frame in the temporal unit. Not an error.
+    *out_ptr = nullptr;
+    return kStatusOk;
+  }
+  status = CopyFrameToOutputBuffer(output_frame_queue_.Front());
+  output_frame_queue_.Pop();
+  if (status != kStatusOk) {
+    return status;
+  }
+  buffer_.user_private_data = temporal_unit.user_private_data;
+  *out_ptr = &buffer_;
+  return kStatusOk;
+}
+
+StatusCode DecoderImpl::CopyFrameToOutputBuffer(
+    const RefCountedBufferPtr& frame) {
+  YuvBuffer* yuv_buffer = frame->buffer();
+
+  buffer_.chroma_sample_position = frame->chroma_sample_position();
+
+  if (yuv_buffer->is_monochrome()) {
+    buffer_.image_format = kImageFormatMonochrome400;
+  } else {
+    if (yuv_buffer->subsampling_x() == 0 && yuv_buffer->subsampling_y() == 0) {
+      buffer_.image_format = kImageFormatYuv444;
+    } else if (yuv_buffer->subsampling_x() == 1 &&
+               yuv_buffer->subsampling_y() == 0) {
+      buffer_.image_format = kImageFormatYuv422;
+    } else if (yuv_buffer->subsampling_x() == 1 &&
+               yuv_buffer->subsampling_y() == 1) {
+      buffer_.image_format = kImageFormatYuv420;
+    } else {
+      LIBGAV1_DLOG(ERROR,
+                   "Invalid chroma subsampling values: cannot determine buffer "
+                   "image format.");
+      return kStatusInvalidArgument;
+    }
+  }
+  buffer_.color_range = sequence_header_.color_config.color_range;
+  buffer_.color_primary = sequence_header_.color_config.color_primary;
+  buffer_.transfer_characteristics =
+      sequence_header_.color_config.transfer_characteristics;
+  buffer_.matrix_coefficients =
+      sequence_header_.color_config.matrix_coefficients;
+
+  buffer_.bitdepth = yuv_buffer->bitdepth();
+  const int num_planes =
+      yuv_buffer->is_monochrome() ? kMaxPlanesMonochrome : kMaxPlanes;
+  int plane = kPlaneY;
+  for (; plane < num_planes; ++plane) {
+    buffer_.stride[plane] = yuv_buffer->stride(plane);
+    buffer_.plane[plane] = yuv_buffer->data(plane);
+    buffer_.displayed_width[plane] = yuv_buffer->width(plane);
+    buffer_.displayed_height[plane] = yuv_buffer->height(plane);
+  }
+  for (; plane < kMaxPlanes; ++plane) {
+    buffer_.stride[plane] = 0;
+    buffer_.plane[plane] = nullptr;
+    buffer_.displayed_width[plane] = 0;
+    buffer_.displayed_height[plane] = 0;
+  }
+  buffer_.spatial_id = frame->spatial_id();
+  buffer_.temporal_id = frame->temporal_id();
+  buffer_.buffer_private_data = frame->buffer_private_data();
+  output_frame_ = frame;
+  return kStatusOk;
+}
+
+void DecoderImpl::ReleaseOutputFrame() {
+  for (auto& plane : buffer_.plane) {
+    plane = nullptr;
+  }
+  output_frame_ = nullptr;
+}
+
+StatusCode DecoderImpl::DecodeTiles(
+    const ObuSequenceHeader& sequence_header,
+    const ObuFrameHeader& frame_header,
+    const Vector<TileBuffer>& tile_buffers, const DecoderState& state,
+    FrameScratchBuffer* const frame_scratch_buffer,
+    RefCountedBuffer* const current_frame) {
+  frame_scratch_buffer->tile_scratch_buffer_pool.Reset(
+      sequence_header.color_config.bitdepth);
+  if (!frame_scratch_buffer->loop_restoration_info.Reset(
+          &frame_header.loop_restoration, frame_header.upscaled_width,
+          frame_header.height, sequence_header.color_config.subsampling_x,
+          sequence_header.color_config.subsampling_y,
+          sequence_header.color_config.is_monochrome)) {
+    LIBGAV1_DLOG(ERROR,
+                 "Failed to allocate memory for loop restoration info units.");
+    return kStatusOutOfMemory;
+  }
+  ThreadingStrategy& threading_strategy =
+      frame_scratch_buffer->threading_strategy;
+  if (!is_frame_parallel_ &&
+      !threading_strategy.Reset(frame_header, settings_.threads)) {
+    return kStatusOutOfMemory;
+  }
+  const bool do_cdef =
+      PostFilter::DoCdef(frame_header, settings_.post_filter_mask);
+  const int num_planes = sequence_header.color_config.is_monochrome
+                             ? kMaxPlanesMonochrome
+                             : kMaxPlanes;
+  const bool do_restoration = PostFilter::DoRestoration(
+      frame_header.loop_restoration, settings_.post_filter_mask, num_planes);
+  const bool do_superres =
+      PostFilter::DoSuperRes(frame_header, settings_.post_filter_mask);
+  // Use kBorderPixels for the left, right, and top borders. Only the bottom
+  // border may need to be bigger. Cdef border is needed only if we apply Cdef
+  // without multithreading.
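+  // [Editor's note, illustrative only -- this comment block is not part of
+  // the upstream libgav1 source.] A worked example of the buffer geometry,
+  // assuming an 8-bit 4:2:0 frame (subsampling_x = subsampling_y = 1) with
+  // upscaled_width = 1920 and height = 1080:
+  //
+  //   luma plane:    1920 x 1080 samples
+  //   chroma planes: ((1920 + 1) >> 1) x ((1080 + 1) >> 1) = 960 x 540
+  //
+  // plus kBorderPixels of padding on the left/right/top and |bottom_border|
+  // rows at the bottom; the Realloc() call below allocates this layout.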
+  const int bottom_border = GetBottomBorderPixels(
+      do_cdef && threading_strategy.post_filter_thread_pool() == nullptr,
+      do_restoration, do_superres, sequence_header.color_config.subsampling_y);
+  current_frame->set_chroma_sample_position(
+      sequence_header.color_config.chroma_sample_position);
+  if (!current_frame->Realloc(sequence_header.color_config.bitdepth,
+                              sequence_header.color_config.is_monochrome,
+                              frame_header.upscaled_width, frame_header.height,
+                              sequence_header.color_config.subsampling_x,
+                              sequence_header.color_config.subsampling_y,
+                              /*left_border=*/kBorderPixels,
+                              /*right_border=*/kBorderPixels,
+                              /*top_border=*/kBorderPixels, bottom_border)) {
+    LIBGAV1_DLOG(ERROR, "Failed to allocate memory for the decoder buffer.");
+    return kStatusOutOfMemory;
+  }
+  if (frame_header.cdef.bits > 0) {
+    if (!frame_scratch_buffer->cdef_index.Reset(
+            DivideBy16(frame_header.rows4x4 + kMaxBlockHeight4x4),
+            DivideBy16(frame_header.columns4x4 + kMaxBlockWidth4x4),
+            /*zero_initialize=*/false)) {
+      LIBGAV1_DLOG(ERROR, "Failed to allocate memory for cdef index.");
+      return kStatusOutOfMemory;
+    }
+  }
+  if (do_cdef) {
+    if (!frame_scratch_buffer->cdef_skip.Reset(
+            DivideBy2(frame_header.rows4x4 + kMaxBlockHeight4x4),
+            DivideBy16(frame_header.columns4x4 + kMaxBlockWidth4x4),
+            /*zero_initialize=*/true)) {
+      LIBGAV1_DLOG(ERROR, "Failed to allocate memory for cdef skip.");
+      return kStatusOutOfMemory;
+    }
+  }
+  if (!frame_scratch_buffer->inter_transform_sizes.Reset(
+          frame_header.rows4x4 + kMaxBlockHeight4x4,
+          frame_header.columns4x4 + kMaxBlockWidth4x4,
+          /*zero_initialize=*/false)) {
+    LIBGAV1_DLOG(ERROR, "Failed to allocate memory for inter_transform_sizes.");
+    return kStatusOutOfMemory;
+  }
+  if (frame_header.use_ref_frame_mvs) {
+    if (!frame_scratch_buffer->motion_field.mv.Reset(
+            DivideBy2(frame_header.rows4x4), DivideBy2(frame_header.columns4x4),
+            /*zero_initialize=*/false) ||
+        !frame_scratch_buffer->motion_field.reference_offset.Reset(
+            DivideBy2(frame_header.rows4x4), DivideBy2(frame_header.columns4x4),
+            /*zero_initialize=*/false)) {
+      LIBGAV1_DLOG(ERROR,
+                   "Failed to allocate memory for temporal motion vectors.");
+      return kStatusOutOfMemory;
+    }
+
+    // For each motion vector, only mv[0] needs to be initialized to
+    // kInvalidMvValue; mv[1] need not be initialized and can be set to an
+    // arbitrary value. For simplicity, mv[1] is set to 0. Initializing this
+    // contiguous memory with std::fill is very fast; making the
+    // initialization multi-threaded is not recommended unless the memory
+    // initialized by each thread is still contiguous.
+    MotionVector invalid_mv;
+    invalid_mv.mv[0] = kInvalidMvValue;
+    invalid_mv.mv[1] = 0;
+    MotionVector* const motion_field_mv =
+        &frame_scratch_buffer->motion_field.mv[0][0];
+    std::fill(motion_field_mv,
+              motion_field_mv + frame_scratch_buffer->motion_field.mv.size(),
+              invalid_mv);
+  }
+
+  // The addition of kMaxBlockHeight4x4 and kMaxBlockWidth4x4 is necessary so
+  // that the block parameters cache can be filled in for the last row/column
+  // without having to check for boundary conditions.
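+  // [Editor's note, illustrative only -- this comment block is not part of
+  // the upstream libgav1 source.] For example, a 1920x1080 frame has
+  // columns4x4 = 480 and rows4x4 = 270. Assuming kMaxBlockWidth4x4 and
+  // kMaxBlockHeight4x4 are both 32 (a 128x128 superblock measured in 4x4
+  // units), the cache below is sized 302 x 512, so a write for the
+  // bottom-right superblock can always assume a full superblock's worth of
+  // cache entries without any boundary checks.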
+  if (!frame_scratch_buffer->block_parameters_holder.Reset(
+          frame_header.rows4x4 + kMaxBlockHeight4x4,
+          frame_header.columns4x4 + kMaxBlockWidth4x4)) {
+    return kStatusOutOfMemory;
+  }
+  const dsp::Dsp* const dsp =
+      dsp::GetDspTable(sequence_header.color_config.bitdepth);
+  if (dsp == nullptr) {
+    LIBGAV1_DLOG(ERROR, "Failed to get the dsp table for bitdepth %d.",
+                 sequence_header.color_config.bitdepth);
+    return kStatusInternalError;
+  }
+
+  const int tile_count = frame_header.tile_info.tile_count;
+  assert(tile_count >= 1);
+  Vector<std::unique_ptr<Tile>> tiles;
+  if (!tiles.reserve(tile_count)) {
+    LIBGAV1_DLOG(ERROR, "tiles.reserve(%d) failed.\n", tile_count);
+    return kStatusOutOfMemory;
+  }
+
+  if (threading_strategy.row_thread_pool(0) != nullptr || is_frame_parallel_) {
+    if (frame_scratch_buffer->residual_buffer_pool == nullptr) {
+      frame_scratch_buffer->residual_buffer_pool.reset(
+          new (std::nothrow) ResidualBufferPool(
+              sequence_header.use_128x128_superblock,
+              sequence_header.color_config.subsampling_x,
+              sequence_header.color_config.subsampling_y,
+              sequence_header.color_config.bitdepth == 8 ? sizeof(int16_t)
+                                                         : sizeof(int32_t)));
+      if (frame_scratch_buffer->residual_buffer_pool == nullptr) {
+        LIBGAV1_DLOG(ERROR, "Failed to allocate residual buffer.\n");
+        return kStatusOutOfMemory;
+      }
+    } else {
+      frame_scratch_buffer->residual_buffer_pool->Reset(
+          sequence_header.use_128x128_superblock,
+          sequence_header.color_config.subsampling_x,
+          sequence_header.color_config.subsampling_y,
+          sequence_header.color_config.bitdepth == 8 ? sizeof(int16_t)
+                                                     : sizeof(int32_t));
+    }
+  }
+
+  if (threading_strategy.post_filter_thread_pool() != nullptr && do_cdef) {
+    // We need to store 4 rows per 64x64 unit.
+    const int num_units =
+        MultiplyBy4(RightShiftWithCeiling(frame_header.rows4x4, 4));
+    // subsampling_y is set to zero irrespective of the actual frame's
+    // subsampling since we need to store exactly |num_units| rows of the loop
+    // restoration border pixels.
+    if (!frame_scratch_buffer->cdef_border.Realloc(
+            sequence_header.color_config.bitdepth,
+            sequence_header.color_config.is_monochrome,
+            MultiplyBy4(frame_header.columns4x4), num_units,
+            sequence_header.color_config.subsampling_x,
+            /*subsampling_y=*/0, kBorderPixels, kBorderPixels, kBorderPixels,
+            kBorderPixels, nullptr, nullptr, nullptr)) {
+      return kStatusOutOfMemory;
+    }
+  }
+
+  if (do_restoration &&
+      (do_cdef || threading_strategy.post_filter_thread_pool() != nullptr)) {
+    // We need to store 4 rows per 64x64 unit.
+    const int num_units =
+        MultiplyBy4(RightShiftWithCeiling(frame_header.rows4x4, 4));
+    // subsampling_y is set to zero irrespective of the actual frame's
+    // subsampling since we need to store exactly |num_units| rows of the loop
+    // restoration border pixels.
+    if (!frame_scratch_buffer->loop_restoration_border.Realloc(
+            sequence_header.color_config.bitdepth,
+            sequence_header.color_config.is_monochrome,
+            frame_header.upscaled_width, num_units,
+            sequence_header.color_config.subsampling_x,
+            /*subsampling_y=*/0, kBorderPixels, kBorderPixels, kBorderPixels,
+            kBorderPixels, nullptr, nullptr, nullptr)) {
+      return kStatusOutOfMemory;
+    }
+  }
+
+  if (do_superres) {
+    const int pixel_size = sequence_header.color_config.bitdepth == 8
+                               ? sizeof(uint8_t)
+                               : sizeof(uint16_t);
+    const int coefficients_size = kSuperResFilterTaps *
+                                  Align(frame_header.upscaled_width, 16) *
+                                  pixel_size;
+    if (!frame_scratch_buffer->superres_coefficients[kPlaneTypeY].Resize(
+            coefficients_size)) {
+      LIBGAV1_DLOG(ERROR,
+                   "Failed to Resize superres_coefficients[kPlaneTypeY].");
+      return kStatusOutOfMemory;
+    }
+#if LIBGAV1_MSAN
+    // Quiet SuperRes_NEON() msan warnings.
+    memset(frame_scratch_buffer->superres_coefficients[kPlaneTypeY].get(), 0,
+           coefficients_size);
+#endif
+    const int uv_coefficients_size =
+        kSuperResFilterTaps *
+        Align(SubsampledValue(frame_header.upscaled_width, 1), 16) * pixel_size;
+    if (!sequence_header.color_config.is_monochrome &&
+        sequence_header.color_config.subsampling_x != 0 &&
+        !frame_scratch_buffer->superres_coefficients[kPlaneTypeUV].Resize(
+            uv_coefficients_size)) {
+      LIBGAV1_DLOG(ERROR,
+                   "Failed to Resize superres_coefficients[kPlaneTypeUV].");
+      return kStatusOutOfMemory;
+    }
+#if LIBGAV1_MSAN
+    if (!sequence_header.color_config.is_monochrome &&
+        sequence_header.color_config.subsampling_x != 0) {
+      // Quiet SuperRes_NEON() msan warnings.
+      memset(frame_scratch_buffer->superres_coefficients[kPlaneTypeUV].get(), 0,
+             uv_coefficients_size);
+    }
+#endif
+  }
+
+  if (do_superres && threading_strategy.post_filter_thread_pool() != nullptr) {
+    const int num_threads =
+        threading_strategy.post_filter_thread_pool()->num_threads() + 1;
+    // subsampling_y is set to zero irrespective of the actual frame's
+    // subsampling since we need to store exactly |num_threads| rows of the
+    // down-scaled pixels.
+    // Left and right borders are for line extension. They are doubled for the
+    // Y plane to make sure the U and V planes have enough space after possible
+    // subsampling.
+    if (!frame_scratch_buffer->superres_line_buffer.Realloc(
+            sequence_header.color_config.bitdepth,
+            sequence_header.color_config.is_monochrome,
+            MultiplyBy4(frame_header.columns4x4), num_threads,
+            sequence_header.color_config.subsampling_x,
+            /*subsampling_y=*/0, 2 * kSuperResHorizontalBorder,
+            2 * (kSuperResHorizontalBorder + kSuperResHorizontalPadding), 0, 0,
+            nullptr, nullptr, nullptr)) {
+      LIBGAV1_DLOG(ERROR, "Failed to resize superres line buffer.\n");
+      return kStatusOutOfMemory;
+    }
+  }
+
+  if (is_frame_parallel_ && !IsIntraFrame(frame_header.frame_type)) {
+    // We can parse the current frame if all the reference frames have been
+    // parsed.
+    for (const int index : frame_header.reference_frame_index) {
+      if (!state.reference_frame[index]->WaitUntilParsed()) {
+        return kStatusUnknownError;
+      }
+    }
+  }
+
+  // If prev_segment_ids is a null pointer, it is treated as if it pointed to
+  // a segmentation map containing all 0s.
+  const SegmentationMap* prev_segment_ids = nullptr;
+  if (frame_header.primary_reference_frame == kPrimaryReferenceNone) {
+    frame_scratch_buffer->symbol_decoder_context.Initialize(
+        frame_header.quantizer.base_index);
+  } else {
+    const int index =
+        frame_header
+            .reference_frame_index[frame_header.primary_reference_frame];
+    assert(index != -1);
+    const RefCountedBuffer* prev_frame = state.reference_frame[index].get();
+    frame_scratch_buffer->symbol_decoder_context = prev_frame->FrameContext();
+    if (frame_header.segmentation.enabled &&
+        prev_frame->columns4x4() == frame_header.columns4x4 &&
+        prev_frame->rows4x4() == frame_header.rows4x4) {
+      prev_segment_ids = prev_frame->segmentation_map();
+    }
+  }
+
+  // The Tile class must make use of a separate buffer to store the unfiltered
+  // pixels for the intra prediction of the next superblock row. This is done
+  // only when one of the following conditions is true:
+  //   * is_frame_parallel_ is true.
+  //   * settings_.threads == 1.
+  // In the non-frame-parallel multi-threaded case, we do not run the post
+  // filters in the decode loop, so this buffer need not be used.
+  const bool use_intra_prediction_buffer =
+      is_frame_parallel_ || settings_.threads == 1;
+  if (use_intra_prediction_buffer) {
+    if (!frame_scratch_buffer->intra_prediction_buffers.Resize(
+            frame_header.tile_info.tile_rows)) {
+      LIBGAV1_DLOG(ERROR, "Failed to Resize intra_prediction_buffers.");
+      return kStatusOutOfMemory;
+    }
+    IntraPredictionBuffer* const intra_prediction_buffers =
+        frame_scratch_buffer->intra_prediction_buffers.get();
+    for (int plane = kPlaneY; plane < num_planes; ++plane) {
+      const int subsampling =
+          (plane == kPlaneY) ? 0 : sequence_header.color_config.subsampling_x;
+      const size_t intra_prediction_buffer_size =
+          ((MultiplyBy4(frame_header.columns4x4) >> subsampling) *
+           (sequence_header.color_config.bitdepth == 8 ? sizeof(uint8_t)
+                                                       : sizeof(uint16_t)));
+      for (int tile_row = 0; tile_row < frame_header.tile_info.tile_rows;
+           ++tile_row) {
+        if (!intra_prediction_buffers[tile_row][plane].Resize(
+                intra_prediction_buffer_size)) {
+          LIBGAV1_DLOG(ERROR,
+                       "Failed to allocate intra prediction buffer for tile "
+                       "row %d plane %d.\n",
+                       tile_row, plane);
+          return kStatusOutOfMemory;
+        }
+      }
+    }
+  }
+
+  PostFilter post_filter(frame_header, sequence_header, frame_scratch_buffer,
+                         current_frame->buffer(), dsp,
+                         settings_.post_filter_mask);
+  SymbolDecoderContext saved_symbol_decoder_context;
+  BlockingCounterWithStatus pending_tiles(tile_count);
+  for (int tile_number = 0; tile_number < tile_count; ++tile_number) {
+    std::unique_ptr<Tile> tile = Tile::Create(
+        tile_number, tile_buffers[tile_number].data,
+        tile_buffers[tile_number].size, sequence_header, frame_header,
+        current_frame, state, frame_scratch_buffer, wedge_masks_,
+        quantizer_matrix_, &saved_symbol_decoder_context, prev_segment_ids,
+        &post_filter, dsp, threading_strategy.row_thread_pool(tile_number),
+        &pending_tiles, is_frame_parallel_, use_intra_prediction_buffer);
+    if (tile == nullptr) {
+      LIBGAV1_DLOG(ERROR, "Failed to create tile.");
+      return kStatusOutOfMemory;
+    }
+    tiles.push_back_unchecked(std::move(tile));
+  }
+  assert(tiles.size() == static_cast<size_t>(tile_count));
+  if (is_frame_parallel_) {
+    if (frame_scratch_buffer->threading_strategy.thread_pool() == nullptr) {
+      return DecodeTilesFrameParallel(
+          sequence_header, frame_header, tiles, saved_symbol_decoder_context,
+          prev_segment_ids, frame_scratch_buffer, &post_filter, current_frame);
+    }
+    return DecodeTilesThreadedFrameParallel(
+        sequence_header, frame_header, tiles, saved_symbol_decoder_context,
+        prev_segment_ids, frame_scratch_buffer, &post_filter, current_frame);
+  }
+  StatusCode status;
+  if (settings_.threads == 1) {
+    status = DecodeTilesNonFrameParallel(sequence_header, frame_header, tiles,
+                                         frame_scratch_buffer, &post_filter);
+  } else {
+    status = DecodeTilesThreadedNonFrameParallel(tiles, frame_scratch_buffer,
+                                                 &post_filter, &pending_tiles);
+  }
+  if (status != kStatusOk) return status;
+  if (frame_header.enable_frame_end_update_cdf) {
+    frame_scratch_buffer->symbol_decoder_context = saved_symbol_decoder_context;
+  }
+  current_frame->SetFrameContext(frame_scratch_buffer->symbol_decoder_context);
+  SetSegmentationMap(frame_header, prev_segment_ids, current_frame);
+  return kStatusOk;
+}
+
+StatusCode DecoderImpl::ApplyFilmGrain(
+    const ObuSequenceHeader& sequence_header,
+    const ObuFrameHeader& frame_header,
+    const RefCountedBufferPtr& displayable_frame,
+    RefCountedBufferPtr* film_grain_frame, ThreadPool* thread_pool) {
+  if (!sequence_header.film_grain_params_present ||
+      !displayable_frame->film_grain_params().apply_grain ||
+      (settings_.post_filter_mask & 0x10) == 0) {
+    *film_grain_frame = displayable_frame;
+    return kStatusOk;
+  }
+  if (!frame_header.show_existing_frame &&
+      frame_header.refresh_frame_flags == 0) {
+    // If show_existing_frame is true, then the current frame is a previously
+    // saved reference frame. If refresh_frame_flags is nonzero, then the
+    // state_.UpdateReferenceFrames() call above has saved the current frame as
+    // a reference frame. Therefore, if both of these conditions are false,
+    // then the current frame is not saved as a reference frame.
+    // displayable_frame should hold the only reference to the current frame.
+    assert(displayable_frame.use_count() == 1);
+    // Add film grain noise in place.
+    *film_grain_frame = displayable_frame;
+  } else {
+    *film_grain_frame = buffer_pool_.GetFreeBuffer();
+    if (*film_grain_frame == nullptr) {
+      LIBGAV1_DLOG(ERROR,
+                   "Could not get film_grain_frame from the buffer pool.");
+      return kStatusResourceExhausted;
+    }
+    if (!(*film_grain_frame)
+             ->Realloc(displayable_frame->buffer()->bitdepth(),
+                       displayable_frame->buffer()->is_monochrome(),
+                       displayable_frame->upscaled_width(),
+                       displayable_frame->frame_height(),
+                       displayable_frame->buffer()->subsampling_x(),
+                       displayable_frame->buffer()->subsampling_y(),
+                       kBorderPixelsFilmGrain, kBorderPixelsFilmGrain,
+                       kBorderPixelsFilmGrain, kBorderPixelsFilmGrain)) {
+      LIBGAV1_DLOG(ERROR, "film_grain_frame->Realloc() failed.");
+      return kStatusOutOfMemory;
+    }
+    (*film_grain_frame)
+        ->set_chroma_sample_position(
+            displayable_frame->chroma_sample_position());
+    (*film_grain_frame)->set_spatial_id(displayable_frame->spatial_id());
+    (*film_grain_frame)->set_temporal_id(displayable_frame->temporal_id());
+  }
+  const bool color_matrix_is_identity =
+      sequence_header.color_config.matrix_coefficients ==
+      kMatrixCoefficientsIdentity;
+  assert(displayable_frame->buffer()->stride(kPlaneU) ==
+         displayable_frame->buffer()->stride(kPlaneV));
+  const int input_stride_uv = displayable_frame->buffer()->stride(kPlaneU);
+  assert((*film_grain_frame)->buffer()->stride(kPlaneU) ==
+         (*film_grain_frame)->buffer()->stride(kPlaneV));
+  const int output_stride_uv = (*film_grain_frame)->buffer()->stride(kPlaneU);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  if (displayable_frame->buffer()->bitdepth() > 8) {
+    FilmGrain<10> film_grain(displayable_frame->film_grain_params(),
+                             displayable_frame->buffer()->is_monochrome(),
+                             color_matrix_is_identity,
+                             displayable_frame->buffer()->subsampling_x(),
+                             displayable_frame->buffer()->subsampling_y(),
+                             displayable_frame->upscaled_width(),
+                             displayable_frame->frame_height(), thread_pool);
+    if (!film_grain.AddNoise(
+            displayable_frame->buffer()->data(kPlaneY),
+            displayable_frame->buffer()->stride(kPlaneY),
+            displayable_frame->buffer()->data(kPlaneU),
+            displayable_frame->buffer()->data(kPlaneV), input_stride_uv,
+            (*film_grain_frame)->buffer()->data(kPlaneY),
+            (*film_grain_frame)->buffer()->stride(kPlaneY),
+            (*film_grain_frame)->buffer()->data(kPlaneU),
+            (*film_grain_frame)->buffer()->data(kPlaneV), output_stride_uv)) {
+      LIBGAV1_DLOG(ERROR, "film_grain.AddNoise() failed.");
+      return kStatusOutOfMemory;
+    }
+    return kStatusOk;
+  }
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+  FilmGrain<8> film_grain(displayable_frame->film_grain_params(),
+                          displayable_frame->buffer()->is_monochrome(),
+                          color_matrix_is_identity,
+                          displayable_frame->buffer()->subsampling_x(),
+                          displayable_frame->buffer()->subsampling_y(),
+                          displayable_frame->upscaled_width(),
+                          displayable_frame->frame_height(), thread_pool);
+  if (!film_grain.AddNoise(
+          displayable_frame->buffer()->data(kPlaneY),
+          displayable_frame->buffer()->stride(kPlaneY),
+          displayable_frame->buffer()->data(kPlaneU),
+          displayable_frame->buffer()->data(kPlaneV), input_stride_uv,
+          (*film_grain_frame)->buffer()->data(kPlaneY),
+          (*film_grain_frame)->buffer()->stride(kPlaneY),
+          (*film_grain_frame)->buffer()->data(kPlaneU),
+          (*film_grain_frame)->buffer()->data(kPlaneV), output_stride_uv)) {
+    LIBGAV1_DLOG(ERROR, "film_grain.AddNoise() failed.");
+    return kStatusOutOfMemory;
+  }
+  return kStatusOk;
+}
+
+bool DecoderImpl::IsNewSequenceHeader(const ObuParser& obu) {
+  if (std::find_if(obu.obu_headers().begin(), obu.obu_headers().end(),
+                   [](const ObuHeader& obu_header) {
+                     return obu_header.type == kObuSequenceHeader;
+                   }) == obu.obu_headers().end()) {
+    return false;
+  }
+  const ObuSequenceHeader sequence_header = obu.sequence_header();
+  const bool sequence_header_changed =
+      !has_sequence_header_ ||
+      sequence_header_.color_config.bitdepth !=
+          sequence_header.color_config.bitdepth ||
+      sequence_header_.color_config.is_monochrome !=
+          sequence_header.color_config.is_monochrome ||
+      sequence_header_.color_config.subsampling_x !=
+          sequence_header.color_config.subsampling_x ||
+      sequence_header_.color_config.subsampling_y !=
+          sequence_header.color_config.subsampling_y ||
+      sequence_header_.max_frame_width != sequence_header.max_frame_width ||
+      sequence_header_.max_frame_height != sequence_header.max_frame_height;
+  sequence_header_ = sequence_header;
+  has_sequence_header_ = true;
+  return sequence_header_changed;
+}
+
+bool DecoderImpl::MaybeInitializeWedgeMasks(FrameType frame_type) {
+  if (IsIntraFrame(frame_type) || wedge_masks_initialized_) {
+    return true;
+  }
+  if (!GenerateWedgeMask(&wedge_masks_)) {
+    return false;
+  }
+  wedge_masks_initialized_ = true;
+  return true;
+}
+
+bool DecoderImpl::MaybeInitializeQuantizerMatrix(
+    const ObuFrameHeader& frame_header) {
+  if (quantizer_matrix_initialized_ || !frame_header.quantizer.use_matrix) {
+    return true;
+  }
+  if (!InitializeQuantizerMatrix(&quantizer_matrix_)) {
+    return false;
+  }
+  quantizer_matrix_initialized_ = true;
+  return true;
+}
+
+}  // namespace libgav1
diff --git a/src/decoder_impl.h b/src/decoder_impl.h
new file mode 100644
index 0000000..b52ecdf
--- /dev/null
+++ b/src/decoder_impl.h
@@ -0,0 +1,271 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DECODER_IMPL_H_
+#define LIBGAV1_SRC_DECODER_IMPL_H_
+
+#include <array>
+#include <condition_variable>  // NOLINT (unapproved c++11 header)
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <mutex>  // NOLINT (unapproved c++11 header)
+
+#include "src/buffer_pool.h"
+#include "src/decoder_state.h"
+#include "src/dsp/constants.h"
+#include "src/frame_scratch_buffer.h"
+#include "src/gav1/decoder_buffer.h"
+#include "src/gav1/decoder_settings.h"
+#include "src/gav1/status_code.h"
+#include "src/obu_parser.h"
+#include "src/quantizer.h"
+#include "src/residual_buffer_pool.h"
+#include "src/symbol_decoder_context.h"
+#include "src/tile.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/queue.h"
+#include "src/utils/segmentation_map.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+struct TemporalUnit;
+
+struct EncodedFrame {
+  EncodedFrame(ObuParser* const obu, const DecoderState& state,
+               const RefCountedBufferPtr& frame, int position_in_temporal_unit)
+      : sequence_header(obu->sequence_header()),
+        frame_header(obu->frame_header()),
+        state(state),
+        temporal_unit(nullptr),
+        frame(frame),
+        position_in_temporal_unit(position_in_temporal_unit) {
+    obu->MoveTileBuffers(&tile_buffers);
+    frame->MarkFrameAsStarted();
+  }
+
+  const ObuSequenceHeader sequence_header;
+  const ObuFrameHeader frame_header;
+  Vector<TileBuffer> tile_buffers;
+  DecoderState state;
+  TemporalUnit* temporal_unit;
+  RefCountedBufferPtr frame;
+  const int position_in_temporal_unit;
+};
+
+struct TemporalUnit : public Allocable {
+  // The default constructor is invoked by the Queue<TemporalUnit>::Init()
+  // method. Queue<> does not use the default-constructed elements, so it is
+  // safe for the default constructor to not initialize the members.
+  TemporalUnit() = default;
+  TemporalUnit(const uint8_t* data, size_t size, int64_t user_private_data,
+               void* buffer_private_data)
+      : data(data),
+        size(size),
+        user_private_data(user_private_data),
+        buffer_private_data(buffer_private_data),
+        decoded(false),
+        status(kStatusOk),
+        has_displayable_frame(false),
+        output_frame_position(-1),
+        decoded_count(0),
+        output_layer_count(0),
+        released_input_buffer(false) {}
+
+  const uint8_t* data;
+  size_t size;
+  int64_t user_private_data;
+  void* buffer_private_data;
+
+  // The following members are used only in frame parallel mode.
+  bool decoded;
+  StatusCode status;
+  bool has_displayable_frame;
+  int output_frame_position;
+
+  Vector<EncodedFrame> frames;
+  size_t decoded_count;
+
+  // The struct (and the counter) is used to support output of multiple layers
+  // within a single temporal unit. The decoding process will store the output
+  // frames in |output_layers| in the order they are finished decoding. At the
+  // end of the decoding process, this array will be sorted in reverse order of
+  // |position_in_temporal_unit|. DequeueFrame() will then return the frames in
+  // reverse order (so that the entire process can run with a single counter
+  // variable).
+  struct OutputLayer {
+    // Used by std::sort to sort |output_layers| in reverse order of
+    // |position_in_temporal_unit|.
+    bool operator<(const OutputLayer& rhs) const {
+      return position_in_temporal_unit > rhs.position_in_temporal_unit;
+    }
+
+    RefCountedBufferPtr frame;
+    int position_in_temporal_unit = 0;
+  } output_layers[kMaxLayers];
+  // Number of entries in |output_layers|.
+  int output_layer_count;
+  // Flag to ensure that we release the input buffer only once if there are
+  // multiple output layers.
+  bool released_input_buffer;
+};
+
+class DecoderImpl : public Allocable {
+ public:
+  // The constructor saves a const reference to |*settings|. Therefore
+  // |*settings| must outlive the DecoderImpl object. On success, |*output|
+  // contains a pointer to the newly-created DecoderImpl object. On failure,
+  // |*output| is not modified.
+  static StatusCode Create(const DecoderSettings* settings,
+                           std::unique_ptr<DecoderImpl>* output);
+  ~DecoderImpl();
+  StatusCode EnqueueFrame(const uint8_t* data, size_t size,
+                          int64_t user_private_data,
+                          void* buffer_private_data);
+  StatusCode DequeueFrame(const DecoderBuffer** out_ptr);
+  static constexpr int GetMaxBitdepth() {
+    static_assert(LIBGAV1_MAX_BITDEPTH == 8 || LIBGAV1_MAX_BITDEPTH == 10,
+                  "LIBGAV1_MAX_BITDEPTH must be 8 or 10.");
+    return LIBGAV1_MAX_BITDEPTH;
+  }
+
+ private:
+  explicit DecoderImpl(const DecoderSettings* settings);
+  StatusCode Init();
+  // Called when the first frame is enqueued. It does the OBU parsing for one
+  // temporal unit to retrieve the tile configuration and sets up the frame
+  // threading if frame parallel mode is allowed. It also initializes the
+  // |temporal_units_| queue based on the number of frame threads.
+  //
+  // The following are the limitations of the current implementation:
+  //  * It assumes that all frames in the video have the same tile
+  //    configuration. The frame parallel threading model will not be updated
+  //    based on tile configuration changes mid-stream.
+  //  * The above assumption holds true even when there is a new coded video
+  //    sequence (i.e.) a new sequence header.
+  StatusCode InitializeFrameThreadPoolAndTemporalUnitQueue(const uint8_t* data,
+                                                           size_t size);
+  // Used only in frame parallel mode. Signals failure and waits until the
+  // worker threads are aborted if |status| is a failure status. If |status| is
+  // equal to kStatusOk or kStatusTryAgain, this function does not do anything.
+  // Always returns the input parameter |status| as the return value.
+  //
+  // This function is called only from the application thread (from
+  // EnqueueFrame() and DequeueFrame()).
+  StatusCode SignalFailure(StatusCode status);
+
+  void ReleaseOutputFrame();
+
+  // Decodes all the frames contained in the given temporal unit. Used only in
+  // non frame parallel mode.
+  StatusCode DecodeTemporalUnit(const TemporalUnit& temporal_unit,
+                                const DecoderBuffer** out_ptr);
+  // Used only in frame parallel mode. Does the OBU parsing for |data| and
+  // schedules the individual frames for decoding in the |frame_thread_pool_|.
+  StatusCode ParseAndSchedule(const uint8_t* data, size_t size,
+                              int64_t user_private_data,
+                              void* buffer_private_data);
+  // Decodes the |encoded_frame| and updates the
+  // |encoded_frame->temporal_unit|'s parameters if the decoded frame is a
+  // displayable frame. Used only in frame parallel mode.
+  StatusCode DecodeFrame(EncodedFrame* encoded_frame);
+
+  // Populates |buffer_| with values from |frame|. Adds a reference to |frame|
+  // in |output_frame_|.
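+  // [Editor's note, illustrative only -- this comment block is not part of
+  // the upstream libgav1 source.] The reference held in |output_frame_|
+  // implements the lifetime contract of the public API: the DecoderBuffer
+  // returned by DequeueFrame() stays valid until the next dequeue. Sketch of
+  // the intended sequence:
+  //
+  //   DequeueFrame(&buffer);   // CopyFrameToOutputBuffer() points
+  //                            // buffer_.plane[] into the frame and pins it
+  //                            // via output_frame_.
+  //   ...use buffer...         // Safe: output_frame_ keeps the planes alive.
+  //   DequeueFrame(&buffer2);  // ReleaseOutputFrame() drops the old
+  //                            // reference before a new frame is pinned.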
+  StatusCode CopyFrameToOutputBuffer(const RefCountedBufferPtr& frame);
+  StatusCode DecodeTiles(const ObuSequenceHeader& sequence_header,
+                         const ObuFrameHeader& frame_header,
+                         const Vector<TileBuffer>& tile_buffers,
+                         const DecoderState& state,
+                         FrameScratchBuffer* frame_scratch_buffer,
+                         RefCountedBuffer* current_frame);
+  // Applies film grain synthesis to the |displayable_frame| and stores the
+  // film grain applied frame into |film_grain_frame|. Returns kStatusOk on
+  // success.
+  StatusCode ApplyFilmGrain(const ObuSequenceHeader& sequence_header,
+                            const ObuFrameHeader& frame_header,
+                            const RefCountedBufferPtr& displayable_frame,
+                            RefCountedBufferPtr* film_grain_frame,
+                            ThreadPool* thread_pool);
+
+  bool IsNewSequenceHeader(const ObuParser& obu);
+
+  bool HasFailure() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    return failure_status_ != kStatusOk;
+  }
+
+  // Initializes the |quantizer_matrix_| if necessary and sets
+  // |quantizer_matrix_initialized_| to true.
+  bool MaybeInitializeQuantizerMatrix(const ObuFrameHeader& frame_header);
+
+  // Allocates and generates the |wedge_masks_| if necessary and sets
+  // |wedge_masks_initialized_| to true.
+  bool MaybeInitializeWedgeMasks(FrameType frame_type);
+
+  // Elements in this queue cannot be moved with std::move since the
+  // |EncodedFrame.temporal_unit| stores a pointer to elements in this queue.
+  Queue<TemporalUnit> temporal_units_;
+  DecoderState state_;
+
+  DecoderBuffer buffer_ = {};
+  // |output_frame_| holds a reference to the output frame on behalf of
+  // |buffer_|.
+  RefCountedBufferPtr output_frame_;
+
+  // Queue of output frames that are to be returned in the DequeueFrame()
+  // calls. If |settings_.output_all_layers| is false, this queue will never
+  // contain more than 1 element. This queue is used only when
+  // |is_frame_parallel_| is false.
+  Queue<RefCountedBufferPtr> output_frame_queue_;
+
+  BufferPool buffer_pool_;
+  WedgeMaskArray wedge_masks_;
+  bool wedge_masks_initialized_ = false;
+  QuantizerMatrix quantizer_matrix_;
+  bool quantizer_matrix_initialized_ = false;
+  FrameScratchBufferPool frame_scratch_buffer_pool_;
+
+  // Used to synchronize the accesses into |temporal_units_| in order to update
+  // the "decoded" state of a temporal unit.
+  std::mutex mutex_;
+  std::condition_variable decoded_condvar_;
+  bool is_frame_parallel_;
+  std::unique_ptr<ThreadPool> frame_thread_pool_;
+
+  // In frame parallel mode, there are two primary points of failure:
+  //  1) ParseAndSchedule()
+  //  2) DecodeTiles()
+  // Both of these functions have to respond to the other one failing by
+  // aborting whatever they are doing. This variable is used to accomplish
+  // that. If |failure_status_| is not kStatusOk, then the two functions will
+  // try to abort as early as they can.
+  StatusCode failure_status_ = kStatusOk LIBGAV1_GUARDED_BY(mutex_);
+
+  ObuSequenceHeader sequence_header_ = {};
+  // If true, |sequence_header_| is valid.
+  bool has_sequence_header_ = false;
+
+  const DecoderSettings& settings_;
+  bool seen_first_frame_ = false;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DECODER_IMPL_H_
diff --git a/src/decoder_settings.cc b/src/decoder_settings.cc
new file mode 100644
index 0000000..9399073
--- /dev/null
+++ b/src/decoder_settings.cc
@@ -0,0 +1,33 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/gav1/decoder_settings.h"
+
+extern "C" {
+
+void Libgav1DecoderSettingsInitDefault(Libgav1DecoderSettings* settings) {
+  settings->threads = 1;
+  settings->frame_parallel = 0;    // false
+  settings->blocking_dequeue = 0;  // false
+  settings->on_frame_buffer_size_changed = nullptr;
+  settings->get_frame_buffer = nullptr;
+  settings->release_frame_buffer = nullptr;
+  settings->release_input_buffer = nullptr;
+  settings->callback_private_data = nullptr;
+  settings->output_all_layers = 0;  // false
+  settings->operating_point = 0;
+  settings->post_filter_mask = 0x1f;
+}
+
+}  // extern "C"
diff --git a/src/decoder_state.h b/src/decoder_state.h
new file mode 100644
index 0000000..ea5c792
--- /dev/null
+++ b/src/decoder_state.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DECODER_STATE_H_
+#define LIBGAV1_SRC_DECODER_STATE_H_
+
+#include <array>
+#include <cstdint>
+
+#include "src/buffer_pool.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+
+struct DecoderState {
+  // Section 7.20. Updates frames in the reference_frame array with
+  // |current_frame|, based on the |refresh_frame_flags| bitmask.
+  void UpdateReferenceFrames(const RefCountedBufferPtr& current_frame,
+                             int refresh_frame_flags) {
+    for (int ref_index = 0, mask = refresh_frame_flags; mask != 0;
+         ++ref_index, mask >>= 1) {
+      if ((mask & 1) != 0) {
+        reference_frame_id[ref_index] = current_frame_id;
+        reference_frame[ref_index] = current_frame;
+        reference_order_hint[ref_index] = order_hint;
+      }
+    }
+  }
+
+  // Clears all the reference frames.
+  void ClearReferenceFrames() {
+    reference_frame_id = {};
+    reference_order_hint = {};
+    for (int ref_index = 0; ref_index < kNumReferenceFrameTypes; ++ref_index) {
+      reference_frame[ref_index] = nullptr;
+    }
+  }
+
+  // reference_frame_id and current_frame_id have meaningful values and are
+  // used in checks only if sequence_header_.frame_id_numbers_present is true.
+  // If sequence_header_.frame_id_numbers_present is false, reference_frame_id
+  // and current_frame_id are assigned the default value 0 and are not used in
+  // checks.
+  std::array<int, kNumReferenceFrameTypes> reference_frame_id = {};
+  // A valid value of current_frame_id is an unsigned integer of at most 16
+  // bits. -1 indicates current_frame_id is not initialized.
+  int current_frame_id = -1;
+  // The RefOrderHint array variable in the spec.
+  std::array<uint8_t, kNumReferenceFrameTypes> reference_order_hint = {};
+  // The OrderHint variable in the spec.
+  // order_hint syntax element in the uncompressed header (if
+  // show_existing_frame is false) or RefOrderHint[ frame_to_show_map_idx ]
+  // (if show_existing_frame is true and frame_type is KEY_FRAME). See Section
+  // 5.9.2 and Section 7.4.
+  //
+  // NOTE: When show_existing_frame is false, it is often more convenient to
+  // just use the order_hint field of the frame header as OrderHint. So this
+  // field is mainly used to update the reference_order_hint array in
+  // UpdateReferenceFrames().
+  uint8_t order_hint = 0;
+  // reference_frame_sign_bias[i] (a boolean) specifies the intended direction
+  // of the motion vector in time for each reference frame.
+  // * |false| indicates that the reference frame is a forwards reference
+  //   (i.e., the reference frame is expected to be output before the current
+  //   frame);
+  // * |true| indicates that the reference frame is a backwards reference.
+  // Note: reference_frame_sign_bias[0] (for kReferenceFrameIntra) is not
+  // used.
+  std::array<bool, kNumReferenceFrameTypes> reference_frame_sign_bias = {};
+  // The RefValid[i] variable in the spec does not need to be stored
+  // explicitly. If the RefValid[i] variable in the spec is 0, then
+  // reference_frame[i] is a null pointer. (Whenever the spec sets the
+  // RefValid[i] variable to 0, we set reference_frame[i] to a null pointer.)
+  // If the RefValid[i] variable in the spec is 1, then reference_frame[i]
+  // contains a frame buffer pointer.
+  std::array<RefCountedBufferPtr, kNumReferenceFrameTypes> reference_frame;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DECODER_STATE_H_
diff --git a/src/decoder_test.cc b/src/decoder_test.cc
new file mode 100644
index 0000000..de7d490
--- /dev/null
+++ b/src/decoder_test.cc
@@ -0,0 +1,352 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
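+
+// The tests in this file double as a usage reference for the public decoder
+// API. As a minimal sketch (illustrative only; it assumes the Decoder,
+// DecoderSettings and status codes declared under src/gav1/), a caller drives
+// the non-frame-parallel decoder one frame at a time:
+//
+//   libgav1::Decoder decoder;
+//   libgav1::DecoderSettings settings = {};
+//   if (decoder.Init(&settings) != libgav1::kStatusOk) return;
+//   // One-in, one-out: every EnqueueFrame() is paired with a
+//   // DequeueFrame(), which is where the actual decoding happens.
+//   decoder.EnqueueFrame(data, size, /*user_private_data=*/0,
+//                        /*buffer_private_data=*/nullptr);
+//   const libgav1::DecoderBuffer* buffer;
+//   decoder.DequeueFrame(&buffer);
+//   decoder.SignalEOS();  // Releases all reference frames.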
+ +#include "src/gav1/decoder.h" + +#include +#include +#include +#include + +#include "gtest/gtest.h" + +namespace libgav1 { +namespace { + +// These two frames come from the libaom test vector av1-1-b8-01-size-32x32.ivf +constexpr uint8_t kFrame1[] = { + 0x12, 0x0, 0xa, 0xa, 0x0, 0x0, 0x0, 0x2, 0x27, 0xfe, 0xff, 0xfc, + 0xc0, 0x20, 0x32, 0x93, 0x2, 0x10, 0x0, 0xa8, 0x80, 0x0, 0x3, 0x0, + 0x10, 0x10, 0x30, 0x0, 0xd3, 0xc6, 0xc6, 0x82, 0xaa, 0x5e, 0xbf, 0x82, + 0xf2, 0xa4, 0xa4, 0x29, 0xab, 0xda, 0xd7, 0x1, 0x5, 0x0, 0xb3, 0xde, + 0xa8, 0x6f, 0x8d, 0xbf, 0x1b, 0xa8, 0x25, 0xc3, 0x84, 0x7c, 0x1a, 0x2b, + 0x8b, 0x0, 0xff, 0x19, 0x1f, 0x45, 0x7e, 0xe0, 0xbe, 0xe1, 0x3a, 0x63, + 0xc2, 0xc6, 0x6e, 0xf4, 0xc8, 0xce, 0x11, 0xe1, 0x9f, 0x48, 0x64, 0x72, + 0xeb, 0xbb, 0x4f, 0xf3, 0x94, 0xb4, 0xb6, 0x9d, 0x4f, 0x4, 0x18, 0x5e, + 0x5e, 0x1b, 0x65, 0x49, 0x74, 0x90, 0x13, 0x50, 0xef, 0x8c, 0xb8, 0xe8, + 0xd9, 0x8e, 0x9c, 0xc9, 0x4d, 0xda, 0x60, 0x6a, 0xa, 0xf9, 0x75, 0xd0, + 0x62, 0x69, 0xd, 0xf5, 0xdc, 0xa9, 0xb9, 0x4c, 0x8, 0x9e, 0x33, 0x15, + 0xa3, 0xe1, 0x42, 0x0, 0xe2, 0xb0, 0x46, 0xd0, 0xf7, 0xad, 0x55, 0xbc, + 0x75, 0xe9, 0xe3, 0x1f, 0xa3, 0x41, 0x11, 0xba, 0xaa, 0x81, 0xf3, 0xcb, + 0x82, 0x87, 0x71, 0x0, 0xe6, 0xb9, 0x8c, 0xe1, 0xe9, 0xd3, 0x21, 0xcc, + 0xcd, 0xe7, 0x12, 0xb9, 0xe, 0x43, 0x6a, 0xa3, 0x76, 0x5c, 0x35, 0x90, + 0x45, 0x36, 0x52, 0xb4, 0x2d, 0xa3, 0x55, 0xde, 0x20, 0xf8, 0x80, 0xe1, + 0x26, 0x46, 0x1b, 0x3f, 0x59, 0xc7, 0x2e, 0x5b, 0x4a, 0x73, 0xf8, 0xb3, + 0xf4, 0x62, 0xf4, 0xf5, 0xa4, 0xc2, 0xae, 0x9e, 0xa6, 0x9c, 0x10, 0xbb, + 0xe1, 0xd6, 0x88, 0x75, 0xb9, 0x85, 0x48, 0xe5, 0x7, 0x12, 0xf3, 0x11, + 0x85, 0x8e, 0xa2, 0x95, 0x9d, 0xed, 0x50, 0xfb, 0x6, 0x5a, 0x1, 0x37, + 0xc4, 0x8e, 0x9e, 0x73, 0x9b, 0x96, 0x64, 0xbd, 0x42, 0xb, 0x80, 0xde, + 0x57, 0x86, 0xcb, 0x7d, 0xab, 0x12, 0xb2, 0xcc, 0xe6, 0xea, 0xb5, 0x89, + 0xeb, 0x91, 0xb3, 0x93, 0xb2, 0x4f, 0x2f, 0x5b, 0xf3, 0x72, 0x12, 0x51, + 0x56, 0x75, 0xb3, 0xdd, 0x49, 0xb6, 0x5b, 0x77, 0xbe, 0xc5, 0xd7, 0xd4, + 0xaf, 0xd6, 0x6b, 0x38}; + +constexpr uint8_t kFrame2[] = { + 0x12, 0x0, 0x32, 0x33, 0x30, 0x3, 0xc3, 0x0, 0xa7, 0x2e, 0x46, + 0xa8, 0x80, 0x0, 0x3, 0x0, 0x10, 0x1, 0x0, 0xa0, 0x0, 0xed, + 0xb1, 0x51, 0x15, 0x58, 0xc7, 0x69, 0x3, 0x26, 0x35, 0xeb, 0x5a, + 0x2d, 0x7a, 0x53, 0x24, 0x26, 0x20, 0xa6, 0x11, 0x7, 0x49, 0x76, + 0xa3, 0xc7, 0x62, 0xf8, 0x3, 0x32, 0xb0, 0x98, 0x17, 0x3d, 0x80}; + +class DecoderTest : public testing::Test { + public: + void SetUp() override; + void IncrementFramesInUse() { ++frames_in_use_; } + void DecrementFramesInUse() { --frames_in_use_; } + void SetBufferPrivateData(void* buffer_private_data) { + buffer_private_data_ = buffer_private_data; + } + void SetReleasedInputBuffer(void* released_input_buffer) { + released_input_buffer_ = released_input_buffer; + } + + protected: + std::unique_ptr decoder_; + int frames_in_use_ = 0; + void* buffer_private_data_ = nullptr; + void* released_input_buffer_ = nullptr; +}; + +struct FrameBufferPrivate { + uint8_t* data[3]; +}; + +extern "C" { + +static Libgav1StatusCode GetFrameBuffer( + void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format, + int width, int height, int left_border, int right_border, int top_border, + int bottom_border, int stride_alignment, Libgav1FrameBuffer* frame_buffer) { + Libgav1FrameBufferInfo info; + Libgav1StatusCode status = Libgav1ComputeFrameBufferInfo( + bitdepth, image_format, width, height, left_border, right_border, + top_border, bottom_border, stride_alignment, &info); + if (status != kLibgav1StatusOk) 
+
+  std::unique_ptr<FrameBufferPrivate> buffer_private(new (std::nothrow)
+                                                         FrameBufferPrivate);
+  if (buffer_private == nullptr) return kLibgav1StatusOutOfMemory;
+
+  for (int i = 0; i < 3; ++i) {
+    const size_t size = (i == 0) ? info.y_buffer_size : info.uv_buffer_size;
+    buffer_private->data[i] = new (std::nothrow) uint8_t[size];
+    if (buffer_private->data[i] == nullptr) {
+      return kLibgav1StatusOutOfMemory;
+    }
+  }
+
+  uint8_t* const y_buffer = buffer_private->data[0];
+  uint8_t* const u_buffer =
+      (info.uv_buffer_size != 0) ? buffer_private->data[1] : nullptr;
+  uint8_t* const v_buffer =
+      (info.uv_buffer_size != 0) ? buffer_private->data[2] : nullptr;
+
+  status = Libgav1SetFrameBuffer(&info, y_buffer, u_buffer, v_buffer,
+                                 buffer_private.release(), frame_buffer);
+  if (status != kLibgav1StatusOk) return status;
+
+  auto* const decoder_test = static_cast<DecoderTest*>(callback_private_data);
+  decoder_test->IncrementFramesInUse();
+  decoder_test->SetBufferPrivateData(frame_buffer->private_data);
+  return kLibgav1StatusOk;
+}
+
+static void ReleaseFrameBuffer(void* callback_private_data,
+                               void* buffer_private_data) {
+  auto* buffer_private = static_cast<FrameBufferPrivate*>(buffer_private_data);
+  for (auto& data : buffer_private->data) {
+    delete[] data;
+  }
+  delete buffer_private;
+  auto* const decoder_test = static_cast<DecoderTest*>(callback_private_data);
+  decoder_test->DecrementFramesInUse();
+}
+
+static void ReleaseInputBuffer(void* private_data, void* input_buffer) {
+  auto* const decoder_test = static_cast<DecoderTest*>(private_data);
+  decoder_test->SetReleasedInputBuffer(input_buffer);
+}
+
+}  // extern "C"
+
+void DecoderTest::SetUp() {
+  decoder_.reset(new (std::nothrow) Decoder());
+  ASSERT_NE(decoder_, nullptr);
+  DecoderSettings settings = {};
+  settings.frame_parallel = false;
+  settings.get_frame_buffer = GetFrameBuffer;
+  settings.release_frame_buffer = ReleaseFrameBuffer;
+  settings.callback_private_data = this;
+  settings.release_input_buffer = ReleaseInputBuffer;
+  ASSERT_EQ(decoder_->Init(&settings), kStatusOk);
+}
+
+TEST_F(DecoderTest, APIFlowForNonFrameParallelMode) {
+  StatusCode status;
+  const DecoderBuffer* buffer;
+
+  // Enqueue frame1 for decoding.
+  status = decoder_->EnqueueFrame(kFrame1, sizeof(kFrame1), 0,
+                                  const_cast<uint8_t*>(kFrame1));
+  ASSERT_EQ(status, kStatusOk);
+
+  // In non-frame-parallel mode, decoding happens only in the DequeueFrame
+  // call. So there should be no frames in use yet.
+  EXPECT_EQ(frames_in_use_, 0);
+
+  // Dequeue the output of frame1.
+  status = decoder_->DequeueFrame(&buffer);
+  ASSERT_EQ(status, kStatusOk);
+  ASSERT_NE(buffer, nullptr);
+  EXPECT_EQ(released_input_buffer_, &kFrame1);
+
+  // libgav1 has decoded frame1 and is holding a reference to it.
+  EXPECT_EQ(frames_in_use_, 1);
+  EXPECT_EQ(buffer_private_data_, buffer->buffer_private_data);
+
+  // Enqueue frame2 for decoding.
+  status = decoder_->EnqueueFrame(kFrame2, sizeof(kFrame2), 0,
+                                  const_cast<uint8_t*>(kFrame2));
+  ASSERT_EQ(status, kStatusOk);
+
+  EXPECT_EQ(frames_in_use_, 1);
+
+  // Dequeue the output of frame2.
+  status = decoder_->DequeueFrame(&buffer);
+  ASSERT_EQ(status, kStatusOk);
+  ASSERT_NE(buffer, nullptr);
+  EXPECT_EQ(released_input_buffer_, &kFrame2);
+
+  EXPECT_EQ(frames_in_use_, 2);
+  EXPECT_EQ(buffer_private_data_, buffer->buffer_private_data);
+
+  // Signal end of stream (method 1). This should ensure that all the
+  // references are released.
+  status = decoder_->SignalEOS();
+
+  // libgav1 should have released all the reference frames now.
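+  // (Restating what SignalEOS() does, per DecoderState in
+  // src/decoder_state.h: clearing the reference frames drops the last
+  // RefCountedBufferPtr to each held frame, and every released buffer runs
+  // the ReleaseFrameBuffer callback above, returning frames_in_use_ to 0:
+  //
+  //   for (auto& ref : state.reference_frame) ref = nullptr;
+  //
+  // where |state| names the decoder's internal DecoderState. Illustrative
+  // sketch, not library code.)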
+  EXPECT_EQ(frames_in_use_, 0);
+
+  // Now, the decoder is ready to accept a new coded video sequence.
+
+  // Enqueue frame1 for decoding.
+  status = decoder_->EnqueueFrame(kFrame1, sizeof(kFrame1), 0,
+                                  const_cast<uint8_t*>(kFrame1));
+  ASSERT_EQ(status, kStatusOk);
+
+  EXPECT_EQ(frames_in_use_, 0);
+
+  // Dequeue the output of frame1.
+  status = decoder_->DequeueFrame(&buffer);
+  ASSERT_EQ(status, kStatusOk);
+  ASSERT_NE(buffer, nullptr);
+  EXPECT_EQ(released_input_buffer_, &kFrame1);
+
+  EXPECT_EQ(frames_in_use_, 1);
+  EXPECT_EQ(buffer_private_data_, buffer->buffer_private_data);
+
+  // Enqueue frame2 for decoding.
+  status = decoder_->EnqueueFrame(kFrame2, sizeof(kFrame2), 0,
+                                  const_cast<uint8_t*>(kFrame2));
+  ASSERT_EQ(status, kStatusOk);
+
+  EXPECT_EQ(frames_in_use_, 1);
+
+  // Dequeue the output of frame2.
+  status = decoder_->DequeueFrame(&buffer);
+  ASSERT_EQ(status, kStatusOk);
+  ASSERT_NE(buffer, nullptr);
+  EXPECT_EQ(released_input_buffer_, &kFrame2);
+
+  EXPECT_EQ(frames_in_use_, 2);
+  EXPECT_EQ(buffer_private_data_, buffer->buffer_private_data);
+
+  // Signal end of stream (method 2). This should ensure that all the
+  // references are released.
+  decoder_ = nullptr;
+
+  // libgav1 should have released all the frames now.
+  EXPECT_EQ(frames_in_use_, 0);
+}
+
+TEST_F(DecoderTest, NonFrameParallelModeEnqueueMultipleFramesWithoutDequeuing) {
+  StatusCode status;
+  const DecoderBuffer* buffer;
+
+  // Enqueue frame1 for decoding.
+  status = decoder_->EnqueueFrame(kFrame1, sizeof(kFrame1), 0,
+                                  const_cast<uint8_t*>(kFrame1));
+  ASSERT_EQ(status, kStatusOk);
+
+  // Until the output of frame1 is dequeued, no other frames can be enqueued.
+  status = decoder_->EnqueueFrame(kFrame2, sizeof(kFrame2), 0,
+                                  const_cast<uint8_t*>(kFrame2));
+  ASSERT_EQ(status, kStatusTryAgain);
+
+  EXPECT_EQ(frames_in_use_, 0);
+
+  // Dequeue the output of frame1.
+  status = decoder_->DequeueFrame(&buffer);
+  ASSERT_EQ(status, kStatusOk);
+  ASSERT_NE(buffer, nullptr);
+  EXPECT_EQ(released_input_buffer_, &kFrame1);
+
+  EXPECT_EQ(frames_in_use_, 1);
+
+  // Delete the decoder instance.
+  decoder_ = nullptr;
+
+  EXPECT_EQ(frames_in_use_, 0);
+}
+
+TEST_F(DecoderTest, NonFrameParallelModeEOSBeforeDequeuingLastFrame) {
+  StatusCode status;
+  const DecoderBuffer* buffer;
+
+  // Enqueue frame1 for decoding.
+  status = decoder_->EnqueueFrame(kFrame1, sizeof(kFrame1), 0,
+                                  const_cast<uint8_t*>(kFrame1));
+  ASSERT_EQ(status, kStatusOk);
+
+  EXPECT_EQ(frames_in_use_, 0);
+
+  // Dequeue the output of frame1.
+  status = decoder_->DequeueFrame(&buffer);
+  ASSERT_EQ(status, kStatusOk);
+  ASSERT_NE(buffer, nullptr);
+  EXPECT_EQ(released_input_buffer_, &kFrame1);
+
+  // Enqueue frame2 for decoding.
+  status = decoder_->EnqueueFrame(kFrame2, sizeof(kFrame2), 0,
+                                  const_cast<uint8_t*>(kFrame2));
+  ASSERT_EQ(status, kStatusOk);
+
+  EXPECT_EQ(frames_in_use_, 1);
+
+  // Signal end of stream before dequeuing the output of frame2.
+  status = decoder_->SignalEOS();
+  ASSERT_EQ(status, kStatusOk);
+
+  // In this case, the output of the last frame that was enqueued is lost
+  // (which is intentional since end of stream was signaled without dequeueing
+  // it).
+  EXPECT_EQ(frames_in_use_, 0);
+}
+
+TEST_F(DecoderTest, NonFrameParallelModeInvalidFrameAfterEOS) {
+  StatusCode status;
+  const DecoderBuffer* buffer = nullptr;
+
+  // Enqueue frame1 for decoding.
+  status = decoder_->EnqueueFrame(kFrame1, sizeof(kFrame1), 0,
+                                  const_cast<uint8_t*>(kFrame1));
+  ASSERT_EQ(status, kStatusOk);
+
+  EXPECT_EQ(frames_in_use_, 0);
+
+  // Dequeue the output of frame1.
+  status = decoder_->DequeueFrame(&buffer);
+  ASSERT_EQ(status, kStatusOk);
+  ASSERT_NE(buffer, nullptr);
+  EXPECT_EQ(released_input_buffer_, &kFrame1);
+
+  EXPECT_EQ(frames_in_use_, 1);
+
+  // Signal end of stream.
+  status = decoder_->SignalEOS();
+
+  // libgav1 should have released all the reference frames now.
+  EXPECT_EQ(frames_in_use_, 0);
+
+  // Now, the decoder is ready to accept a new coded video sequence. But, we
+  // try to enqueue a frame that does not have a sequence header (which is not
+  // allowed).
+
+  // Enqueue frame2 for decoding.
+  status = decoder_->EnqueueFrame(kFrame2, sizeof(kFrame2), 0,
+                                  const_cast<uint8_t*>(kFrame2));
+  ASSERT_EQ(status, kStatusOk);
+
+  EXPECT_EQ(frames_in_use_, 0);
+
+  // Dequeue the output of frame2 (this will fail since no sequence header has
+  // been seen since the last EOS signal).
+  status = decoder_->DequeueFrame(&buffer);
+  ASSERT_EQ(status, kStatusBitstreamError);
+  EXPECT_EQ(released_input_buffer_, &kFrame2);
+
+  EXPECT_EQ(frames_in_use_, 0);
+}
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/dsp/arm/average_blend_neon.cc b/src/dsp/arm/average_blend_neon.cc
new file mode 100644
index 0000000..3603750
--- /dev/null
+++ b/src/dsp/arm/average_blend_neon.cc
@@ -0,0 +1,284 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
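+
+// For reference, the scalar operation this file vectorizes (a sketch of the
+// C fallback in src/dsp/average_blend.cc; variable names are illustrative):
+// each output pixel is the rounded, clipped average of two compound
+// prediction samples:
+//
+//   dst[x] = Clip(RightShiftWithRounding(pred_0[x] + pred_1[x],
+//                                        kInterPostRoundBit + 1));
+//
+// The NEON paths below fold the add, the rounding shift and the clip into
+// vqrshrun_n_s16 (8bpp) and vqrshrun_n_s32 plus vminq_u16 (10bpp, after
+// subtracting the compound offset).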
+ +#include "src/dsp/average_blend.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_ENABLE_NEON + +#include + +#include +#include +#include + +#include "src/dsp/arm/common_neon.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr int kInterPostRoundBit = + kInterRoundBitsVertical - kInterRoundBitsCompoundVertical; + +} // namespace + +namespace low_bitdepth { +namespace { + +inline uint8x8_t AverageBlend8Row(const int16_t* LIBGAV1_RESTRICT prediction_0, + const int16_t* LIBGAV1_RESTRICT + prediction_1) { + const int16x8_t pred0 = vld1q_s16(prediction_0); + const int16x8_t pred1 = vld1q_s16(prediction_1); + const int16x8_t res = vaddq_s16(pred0, pred1); + return vqrshrun_n_s16(res, kInterPostRoundBit + 1); +} + +inline void AverageBlendLargeRow(const int16_t* LIBGAV1_RESTRICT prediction_0, + const int16_t* LIBGAV1_RESTRICT prediction_1, + const int width, + uint8_t* LIBGAV1_RESTRICT dest) { + int x = width; + do { + const int16x8_t pred_00 = vld1q_s16(prediction_0); + const int16x8_t pred_01 = vld1q_s16(prediction_1); + prediction_0 += 8; + prediction_1 += 8; + const int16x8_t res0 = vaddq_s16(pred_00, pred_01); + const uint8x8_t res_out0 = vqrshrun_n_s16(res0, kInterPostRoundBit + 1); + const int16x8_t pred_10 = vld1q_s16(prediction_0); + const int16x8_t pred_11 = vld1q_s16(prediction_1); + prediction_0 += 8; + prediction_1 += 8; + const int16x8_t res1 = vaddq_s16(pred_10, pred_11); + const uint8x8_t res_out1 = vqrshrun_n_s16(res1, kInterPostRoundBit + 1); + vst1q_u8(dest, vcombine_u8(res_out0, res_out1)); + dest += 16; + x -= 16; + } while (x != 0); +} + +void AverageBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + const int width, const int height, + void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t dest_stride) { + auto* dst = static_cast(dest); + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y = height; + + if (width == 4) { + do { + const uint8x8_t result = AverageBlend8Row(pred_0, pred_1); + pred_0 += 8; + pred_1 += 8; + + StoreLo4(dst, result); + dst += dest_stride; + StoreHi4(dst, result); + dst += dest_stride; + y -= 2; + } while (y != 0); + return; + } + + if (width == 8) { + do { + vst1_u8(dst, AverageBlend8Row(pred_0, pred_1)); + dst += dest_stride; + pred_0 += 8; + pred_1 += 8; + + vst1_u8(dst, AverageBlend8Row(pred_0, pred_1)); + dst += dest_stride; + pred_0 += 8; + pred_1 += 8; + + y -= 2; + } while (y != 0); + return; + } + + do { + AverageBlendLargeRow(pred_0, pred_1, width, dst); + dst += dest_stride; + pred_0 += width; + pred_1 += width; + + AverageBlendLargeRow(pred_0, pred_1, width, dst); + dst += dest_stride; + pred_0 += width; + pred_1 += width; + + y -= 2; + } while (y != 0); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + dsp->average_blend = AverageBlend_NEON; +} + +} // namespace +} // namespace low_bitdepth + +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +inline uint16x8_t AverageBlend8Row( + const uint16_t* LIBGAV1_RESTRICT prediction_0, + const uint16_t* LIBGAV1_RESTRICT prediction_1, + const int32x4_t compound_offset, const uint16x8_t v_bitdepth) { + const uint16x8_t pred0 = vld1q_u16(prediction_0); + const uint16x8_t pred1 = vld1q_u16(prediction_1); + const uint32x4_t pred_lo = + vaddl_u16(vget_low_u16(pred0), vget_low_u16(pred1)); + const uint32x4_t 
+      vaddl_u16(vget_high_u16(pred0), vget_high_u16(pred1));
+  const int32x4_t offset_lo =
+      vsubq_s32(vreinterpretq_s32_u32(pred_lo), compound_offset);
+  const int32x4_t offset_hi =
+      vsubq_s32(vreinterpretq_s32_u32(pred_hi), compound_offset);
+  const uint16x4_t res_lo = vqrshrun_n_s32(offset_lo, kInterPostRoundBit + 1);
+  const uint16x4_t res_hi = vqrshrun_n_s32(offset_hi, kInterPostRoundBit + 1);
+  return vminq_u16(vcombine_u16(res_lo, res_hi), v_bitdepth);
+}
+
+inline void AverageBlendLargeRow(const uint16_t* LIBGAV1_RESTRICT prediction_0,
+                                 const uint16_t* LIBGAV1_RESTRICT prediction_1,
+                                 const int width,
+                                 uint16_t* LIBGAV1_RESTRICT dest,
+                                 const int32x4_t compound_offset,
+                                 const uint16x8_t v_bitdepth) {
+  int x = width;
+  do {
+    vst1q_u16(dest, AverageBlend8Row(prediction_0, prediction_1,
+                                     compound_offset, v_bitdepth));
+    prediction_0 += 8;
+    prediction_1 += 8;
+    dest += 8;
+
+    vst1q_u16(dest, AverageBlend8Row(prediction_0, prediction_1,
+                                     compound_offset, v_bitdepth));
+    prediction_0 += 8;
+    prediction_1 += 8;
+    dest += 8;
+
+    x -= 16;
+  } while (x != 0);
+}
+
+void AverageBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                       const void* LIBGAV1_RESTRICT prediction_1,
+                       const int width, const int height,
+                       void* LIBGAV1_RESTRICT const dest,
+                       const ptrdiff_t dest_stride) {
+  auto* dst = static_cast<uint16_t*>(dest);
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  int y = height;
+
+  const ptrdiff_t dst_stride = dest_stride >> 1;
+  const int32x4_t compound_offset =
+      vdupq_n_s32(static_cast<int32_t>(kCompoundOffset + kCompoundOffset));
+  const uint16x8_t v_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1);
+  if (width == 4) {
+    do {
+      const uint16x8_t result =
+          AverageBlend8Row(pred_0, pred_1, compound_offset, v_bitdepth);
+      pred_0 += 8;
+      pred_1 += 8;
+
+      vst1_u16(dst, vget_low_u16(result));
+      dst += dst_stride;
+      vst1_u16(dst, vget_high_u16(result));
+      dst += dst_stride;
+      y -= 2;
+    } while (y != 0);
+    return;
+  }
+
+  if (width == 8) {
+    do {
+      vst1q_u16(dst,
+                AverageBlend8Row(pred_0, pred_1, compound_offset, v_bitdepth));
+      dst += dst_stride;
+      pred_0 += 8;
+      pred_1 += 8;
+
+      vst1q_u16(dst,
+                AverageBlend8Row(pred_0, pred_1, compound_offset, v_bitdepth));
+      dst += dst_stride;
+      pred_0 += 8;
+      pred_1 += 8;
+
+      y -= 2;
+    } while (y != 0);
+    return;
+  }
+
+  do {
+    AverageBlendLargeRow(pred_0, pred_1, width, dst, compound_offset,
+                         v_bitdepth);
+    dst += dst_stride;
+    pred_0 += width;
+    pred_1 += width;
+
+    AverageBlendLargeRow(pred_0, pred_1, width, dst, compound_offset,
+                         v_bitdepth);
+    dst += dst_stride;
+    pred_0 += width;
+    pred_1 += width;
+
+    y -= 2;
+  } while (y != 0);
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  dsp->average_blend = AverageBlend_NEON;
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void AverageBlendInit_NEON() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void AverageBlendInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/average_blend_neon.h b/src/dsp/arm/average_blend_neon.h
new file mode 100644
index 0000000..d13bcd6
--- /dev/null
+++ b/src/dsp/arm/average_blend_neon.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_AVERAGE_BLEND_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_AVERAGE_BLEND_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::average_blend. This function is not thread-safe.
+void AverageBlendInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_AverageBlend LIBGAV1_CPU_NEON
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_AVERAGE_BLEND_NEON_H_
diff --git a/src/dsp/arm/cdef_neon.cc b/src/dsp/arm/cdef_neon.cc
new file mode 100644
index 0000000..da271f2
--- /dev/null
+++ b/src/dsp/arm/cdef_neon.cc
@@ -0,0 +1,804 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/cdef.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+#include "src/dsp/cdef.inc"
+
+// ----------------------------------------------------------------------------
+// Refer to CdefDirection_C().
+//
+// int32_t partial[8][15] = {};
+// for (int i = 0; i < 8; ++i) {
+//   for (int j = 0; j < 8; ++j) {
+//     const int x = 1;
+//     partial[0][i + j] += x;
+//     partial[1][i + j / 2] += x;
+//     partial[2][i] += x;
+//     partial[3][3 + i - j / 2] += x;
+//     partial[4][7 + i - j] += x;
+//     partial[5][3 - i / 2 + j] += x;
+//     partial[6][j] += x;
+//     partial[7][i / 2 + j] += x;
+//   }
+// }
+//
+// Using the code above, generate the position count for partial[8][15].
+//
+// partial[0]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
+// partial[1]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[2]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
+// partial[3]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[4]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
+// partial[5]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[6]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
+// partial[7]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+//
+// The SIMD code shifts the input horizontally, then adds vertically to get
+// the correct partial value for the given position.
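+//
+// Worked example (follows from the loops above): partial[0] collects the
+// anti-diagonal sums of the 8x8 block, so partial[0][0] = src[0][0],
+// partial[0][1] = src[0][1] + src[1][0], ..., partial[0][7] = the eight
+// samples of the main anti-diagonal, ..., partial[0][14] = src[7][7]. This
+// matches the 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1 position counts listed above.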
+// ----------------------------------------------------------------------------
+
+// ----------------------------------------------------------------------------
+// partial[0][i + j] += x;
+//
+// 00 01 02 03 04 05 06 07  00 00 00 00 00 00 00
+// 00 10 11 12 13 14 15 16  17 00 00 00 00 00 00
+// 00 00 20 21 22 23 24 25  26 27 00 00 00 00 00
+// 00 00 00 30 31 32 33 34  35 36 37 00 00 00 00
+// 00 00 00 00 40 41 42 43  44 45 46 47 00 00 00
+// 00 00 00 00 00 50 51 52  53 54 55 56 57 00 00
+// 00 00 00 00 00 00 60 61  62 63 64 65 66 67 00
+// 00 00 00 00 00 00 00 70  71 72 73 74 75 76 77
+//
+// partial[4] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D0_D4(uint8x8_t* v_src,
+                                            uint16x8_t* partial_lo,
+                                            uint16x8_t* partial_hi) {
+  const uint8x8_t v_zero = vdup_n_u8(0);
+  // 00 01 02 03 04 05 06 07
+  // 00 10 11 12 13 14 15 16
+  *partial_lo = vaddl_u8(v_src[0], vext_u8(v_zero, v_src[1], 7));
+
+  // 00 00 20 21 22 23 24 25
+  *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[2], 6));
+  // 17 00 00 00 00 00 00 00
+  // 26 27 00 00 00 00 00 00
+  *partial_hi =
+      vaddl_u8(vext_u8(v_src[1], v_zero, 7), vext_u8(v_src[2], v_zero, 6));
+
+  // 00 00 00 30 31 32 33 34
+  *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[3], 5));
+  // 35 36 37 00 00 00 00 00
+  *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[3], v_zero, 5));
+
+  // 00 00 00 00 40 41 42 43
+  *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[4], 4));
+  // 44 45 46 47 00 00 00 00
+  *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[4], v_zero, 4));
+
+  // 00 00 00 00 00 50 51 52
+  *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[5], 3));
+  // 53 54 55 56 57 00 00 00
+  *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[5], v_zero, 3));
+
+  // 00 00 00 00 00 00 60 61
+  *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[6], 2));
+  // 62 63 64 65 66 67 00 00
+  *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[6], v_zero, 2));
+
+  // 00 00 00 00 00 00 00 70
+  *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[7], 1));
+  // 71 72 73 74 75 76 77 00
+  *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[7], v_zero, 1));
+}
+
+// ----------------------------------------------------------------------------
+// partial[1][i + j / 2] += x;
+//
+// A0 = src[0] + src[1], A1 = src[2] + src[3], ...
+//
+// A0 A1 A2 A3 00 00 00 00  00 00 00 00 00 00 00
+// 00 B0 B1 B2 B3 00 00 00  00 00 00 00 00 00 00
+// 00 00 C0 C1 C2 C3 00 00  00 00 00 00 00 00 00
+// 00 00 00 D0 D1 D2 D3 00  00 00 00 00 00 00 00
+// 00 00 00 00 E0 E1 E2 E3  00 00 00 00 00 00 00
+// 00 00 00 00 00 F0 F1 F2  F3 00 00 00 00 00 00
+// 00 00 00 00 00 00 G0 G1  G2 G3 00 00 00 00 00
+// 00 00 00 00 00 00 00 H0  H1 H2 H3 00 00 00 00
+//
+// partial[3] is the same except the source is reversed.
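+//
+// In the function below, vpadalq_u8 performs the horizontal pairing: each
+// pair of adjacent bytes is summed into one uint16 lane, which realizes the
+// j / 2 term, while shifting row i right by 2 * i bytes (vextq_u8) realizes
+// the i term of partial[1][i + j / 2].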
+LIBGAV1_ALWAYS_INLINE void AddPartial_D1_D3(uint8x8_t* v_src,
+                                            uint16x8_t* partial_lo,
+                                            uint16x8_t* partial_hi) {
+  uint8x16_t v_d1_temp[8];
+  const uint8x8_t v_zero = vdup_n_u8(0);
+  const uint8x16_t v_zero_16 = vdupq_n_u8(0);
+
+  for (int i = 0; i < 8; ++i) {
+    v_d1_temp[i] = vcombine_u8(v_src[i], v_zero);
+  }
+
+  *partial_lo = *partial_hi = vdupq_n_u16(0);
+  // A0 A1 A2 A3 00 00 00 00
+  *partial_lo = vpadalq_u8(*partial_lo, v_d1_temp[0]);
+
+  // 00 B0 B1 B2 B3 00 00 00
+  *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[1], 14));
+
+  // 00 00 C0 C1 C2 C3 00 00
+  *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[2], 12));
+  // 00 00 00 D0 D1 D2 D3 00
+  *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[3], 10));
+  // 00 00 00 00 E0 E1 E2 E3
+  *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[4], 8));
+
+  // 00 00 00 00 00 F0 F1 F2
+  *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[5], 6));
+  // F3 00 00 00 00 00 00 00
+  *partial_hi = vpadalq_u8(*partial_hi, vextq_u8(v_d1_temp[5], v_zero_16, 6));
+
+  // 00 00 00 00 00 00 G0 G1
+  *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[6], 4));
+  // G2 G3 00 00 00 00 00 00
+  *partial_hi = vpadalq_u8(*partial_hi, vextq_u8(v_d1_temp[6], v_zero_16, 4));
+
+  // 00 00 00 00 00 00 00 H0
+  *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[7], 2));
+  // H1 H2 H3 00 00 00 00 00
+  *partial_hi = vpadalq_u8(*partial_hi, vextq_u8(v_d1_temp[7], v_zero_16, 2));
+}
+
+// ----------------------------------------------------------------------------
+// partial[7][i / 2 + j] += x;
+//
+// 00 01 02 03 04 05 06 07  00 00 00 00 00 00 00
+// 10 11 12 13 14 15 16 17  00 00 00 00 00 00 00
+// 00 20 21 22 23 24 25 26  27 00 00 00 00 00 00
+// 00 30 31 32 33 34 35 36  37 00 00 00 00 00 00
+// 00 00 40 41 42 43 44 45  46 47 00 00 00 00 00
+// 00 00 50 51 52 53 54 55  56 57 00 00 00 00 00
+// 00 00 00 60 61 62 63 64  65 66 67 00 00 00 00
+// 00 00 00 70 71 72 73 74  75 76 77 00 00 00 00
+//
+// partial[5] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D5_D7(uint8x8_t* v_src,
+                                            uint16x8_t* partial_lo,
+                                            uint16x8_t* partial_hi) {
+  const uint16x8_t v_zero = vdupq_n_u16(0);
+  uint16x8_t v_pair_add[4];
+  // Add vertical source pairs.
+  v_pair_add[0] = vaddl_u8(v_src[0], v_src[1]);
+  v_pair_add[1] = vaddl_u8(v_src[2], v_src[3]);
+  v_pair_add[2] = vaddl_u8(v_src[4], v_src[5]);
+  v_pair_add[3] = vaddl_u8(v_src[6], v_src[7]);
+
+  // 00 01 02 03 04 05 06 07
+  // 10 11 12 13 14 15 16 17
+  *partial_lo = v_pair_add[0];
+  // 00 00 00 00 00 00 00 00
+  // 00 00 00 00 00 00 00 00
+  *partial_hi = vdupq_n_u16(0);
+
+  // 00 20 21 22 23 24 25 26
+  // 00 30 31 32 33 34 35 36
+  *partial_lo = vaddq_u16(*partial_lo, vextq_u16(v_zero, v_pair_add[1], 7));
+  // 27 00 00 00 00 00 00 00
+  // 37 00 00 00 00 00 00 00
+  *partial_hi = vaddq_u16(*partial_hi, vextq_u16(v_pair_add[1], v_zero, 7));
+
+  // 00 00 40 41 42 43 44 45
+  // 00 00 50 51 52 53 54 55
+  *partial_lo = vaddq_u16(*partial_lo, vextq_u16(v_zero, v_pair_add[2], 6));
+  // 46 47 00 00 00 00 00 00
+  // 56 57 00 00 00 00 00 00
+  *partial_hi = vaddq_u16(*partial_hi, vextq_u16(v_pair_add[2], v_zero, 6));
+
+  // 00 00 00 60 61 62 63 64
+  // 00 00 00 70 71 72 73 74
+  *partial_lo = vaddq_u16(*partial_lo, vextq_u16(v_zero, v_pair_add[3], 5));
+  // 65 66 67 00 00 00 00 00
+  // 75 76 77 00 00 00 00 00
+  *partial_hi = vaddq_u16(*partial_hi, vextq_u16(v_pair_add[3], v_zero, 5));
+}
+
+template <int bitdepth>
+LIBGAV1_ALWAYS_INLINE void AddPartial(const void* LIBGAV1_RESTRICT const source,
+                                      ptrdiff_t stride, uint16x8_t* partial_lo,
+                                      uint16x8_t* partial_hi) {
+  const auto* src = static_cast<const uint8_t*>(source);
+
+  // 8x8 input
+  // 00 01 02 03 04 05 06 07
+  // 10 11 12 13 14 15 16 17
+  // 20 21 22 23 24 25 26 27
+  // 30 31 32 33 34 35 36 37
+  // 40 41 42 43 44 45 46 47
+  // 50 51 52 53 54 55 56 57
+  // 60 61 62 63 64 65 66 67
+  // 70 71 72 73 74 75 76 77
+  uint8x8_t v_src[8];
+  if (bitdepth == kBitdepth8) {
+    for (auto& v : v_src) {
+      v = vld1_u8(src);
+      src += stride;
+    }
+  } else {
+    // bitdepth - 8
+    constexpr int src_shift = (bitdepth == kBitdepth10) ? 2 : 4;
+    for (auto& v : v_src) {
+      v = vshrn_n_u16(vld1q_u16(reinterpret_cast<const uint16_t*>(src)),
+                      src_shift);
+      src += stride;
+    }
+  }
+
+  // partial for direction 2
+  // --------------------------------------------------------------------------
+  // partial[2][i] += x;
+  // 00 10 20 30 40 50 60 70  00 00 00 00 00 00 00 00
+  // 01 11 21 31 41 51 61 71  00 00 00 00 00 00 00 00
+  // 02 12 22 32 42 52 62 72  00 00 00 00 00 00 00 00
+  // 03 13 23 33 43 53 63 73  00 00 00 00 00 00 00 00
+  // 04 14 24 34 44 54 64 74  00 00 00 00 00 00 00 00
+  // 05 15 25 35 45 55 65 75  00 00 00 00 00 00 00 00
+  // 06 16 26 36 46 56 66 76  00 00 00 00 00 00 00 00
+  // 07 17 27 37 47 57 67 77  00 00 00 00 00 00 00 00
+  partial_lo[2] = vsetq_lane_u16(SumVector(v_src[0]), vdupq_n_u16(0), 0);
+  partial_lo[2] = vsetq_lane_u16(SumVector(v_src[1]), partial_lo[2], 1);
+  partial_lo[2] = vsetq_lane_u16(SumVector(v_src[2]), partial_lo[2], 2);
+  partial_lo[2] = vsetq_lane_u16(SumVector(v_src[3]), partial_lo[2], 3);
+  partial_lo[2] = vsetq_lane_u16(SumVector(v_src[4]), partial_lo[2], 4);
+  partial_lo[2] = vsetq_lane_u16(SumVector(v_src[5]), partial_lo[2], 5);
+  partial_lo[2] = vsetq_lane_u16(SumVector(v_src[6]), partial_lo[2], 6);
+  partial_lo[2] = vsetq_lane_u16(SumVector(v_src[7]), partial_lo[2], 7);
+
+  // partial for direction 6
+  // --------------------------------------------------------------------------
+  // partial[6][j] += x;
+  // 00 01 02 03 04 05 06 07  00 00 00 00 00 00 00 00
+  // 10 11 12 13 14 15 16 17  00 00 00 00 00 00 00 00
+  // 20 21 22 23 24 25 26 27  00 00 00 00 00 00 00 00
+  // 30 31 32 33 34 35 36 37  00 00 00 00 00 00 00 00
+  // 40 41 42 43 44 45 46 47  00 00 00 00 00 00 00 00
+  // 50 51 52 53 54 55 56 57  00 00 00 00 00 00 00 00
+  // 60 61 62 63 64 65 66 67  00 00 00 00 00 00 00 00
+  // 70 71 72 73 74 75 76 77  00 00 00 00 00 00 00 00
+  partial_lo[6] = vaddl_u8(v_src[0], v_src[1]);
+  for (int i = 2; i < 8; ++i) {
+    partial_lo[6] = vaddw_u8(partial_lo[6], v_src[i]);
+  }
+
+  // partial for direction 0
+  AddPartial_D0_D4(v_src, &partial_lo[0], &partial_hi[0]);
+
+  // partial for direction 1
+  AddPartial_D1_D3(v_src, &partial_lo[1], &partial_hi[1]);
+
+  // partial for direction 7
+  AddPartial_D5_D7(v_src, &partial_lo[7], &partial_hi[7]);
+
+  uint8x8_t v_src_reverse[8];
+  for (int i = 0; i < 8; ++i) {
+    v_src_reverse[i] = vrev64_u8(v_src[i]);
+  }
+
+  // partial for direction 4
+  AddPartial_D0_D4(v_src_reverse, &partial_lo[4], &partial_hi[4]);
+
+  // partial for direction 3
+  AddPartial_D1_D3(v_src_reverse, &partial_lo[3], &partial_hi[3]);
+
+  // partial for direction 5
+  AddPartial_D5_D7(v_src_reverse, &partial_lo[5], &partial_hi[5]);
+}
+
+uint32x4_t Square(uint16x4_t a) { return vmull_u16(a, a); }
+
+uint32x4_t SquareAccumulate(uint32x4_t a, uint16x4_t b) {
+  return vmlal_u16(a, b, b);
+}
+
+// |cost[0]| and |cost[4]| square the input and sum with the corresponding
+// element from the other end of the vector, weighted by the corresponding
+// |kCdefDivisionTable[]| element:
+//   cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) *
+//              kCdefDivisionTable[i + 1];
+//   cost[0] += Square(partial[0][7]) * kCdefDivisionTable[8];
+// Because everything is being summed into a single value the distributive
+// property allows us to mirror the division table and accumulate once.
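+//
+// In equation form (a restatement of the loop above): with w[i] the
+// division-table weight for position i and p[i] = partial[0][i],
+//
+//   cost[0] = sum_{i=0}^{6} w[i] * (p[i]^2 + p[14 - i]^2) + w[7] * p[7]^2
+//           = sum_{i=0}^{14} w_mirrored[i] * p[i]^2
+//
+// so the 15 squared partials (the 16th lane is zero) can be multiplied
+// lane-wise by one mirrored weight vector and reduced with a single
+// SumVector() below.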
+uint32_t Cost0Or4(const uint16x8_t a, const uint16x8_t b,
+                  const uint32x4_t division_table[4]) {
+  uint32x4_t c = vmulq_u32(Square(vget_low_u16(a)), division_table[0]);
+  c = vmlaq_u32(c, Square(vget_high_u16(a)), division_table[1]);
+  c = vmlaq_u32(c, Square(vget_low_u16(b)), division_table[2]);
+  c = vmlaq_u32(c, Square(vget_high_u16(b)), division_table[3]);
+  return SumVector(c);
+}
+
+// |cost[2]| and |cost[6]| square the input and accumulate:
+//   cost[2] += Square(partial[2][i])
+uint32_t SquareAccumulate(const uint16x8_t a) {
+  uint32x4_t c = Square(vget_low_u16(a));
+  c = SquareAccumulate(c, vget_high_u16(a));
+  c = vmulq_n_u32(c, kCdefDivisionTable[7]);
+  return SumVector(c);
+}
+
+uint32_t CostOdd(const uint16x8_t a, const uint16x8_t b, const uint32x4_t mask,
+                 const uint32x4_t division_table[2]) {
+  // Remove elements 0-2.
+  uint32x4_t c = vandq_u32(mask, Square(vget_low_u16(a)));
+  c = vaddq_u32(c, Square(vget_high_u16(a)));
+  c = vmulq_n_u32(c, kCdefDivisionTable[7]);
+
+  c = vmlaq_u32(c, Square(vget_low_u16(a)), division_table[0]);
+  c = vmlaq_u32(c, Square(vget_low_u16(b)), division_table[1]);
+  return SumVector(c);
+}
+
+template <int bitdepth>
+void CdefDirection_NEON(const void* LIBGAV1_RESTRICT const source,
+                        ptrdiff_t stride,
+                        uint8_t* LIBGAV1_RESTRICT const direction,
+                        int* LIBGAV1_RESTRICT const variance) {
+  assert(direction != nullptr);
+  assert(variance != nullptr);
+  const auto* src = static_cast<const uint8_t*>(source);
+
+  uint32_t cost[8];
+  uint16x8_t partial_lo[8], partial_hi[8];
+
+  AddPartial<bitdepth>(src, stride, partial_lo, partial_hi);
+
+  cost[2] = SquareAccumulate(partial_lo[2]);
+  cost[6] = SquareAccumulate(partial_lo[6]);
+
+  const uint32x4_t division_table[4] = {
+      vld1q_u32(kCdefDivisionTable), vld1q_u32(kCdefDivisionTable + 4),
+      vld1q_u32(kCdefDivisionTable + 8), vld1q_u32(kCdefDivisionTable + 12)};
+
+  cost[0] = Cost0Or4(partial_lo[0], partial_hi[0], division_table);
+  cost[4] = Cost0Or4(partial_lo[4], partial_hi[4], division_table);
+
+  const uint32x4_t division_table_odd[2] = {
+      vld1q_u32(kCdefDivisionTableOdd), vld1q_u32(kCdefDivisionTableOdd + 4)};
+
+  const uint32x4_t element_3_mask = {0, 0, 0, static_cast<uint32_t>(-1)};
+
+  cost[1] =
+      CostOdd(partial_lo[1], partial_hi[1], element_3_mask, division_table_odd);
+  cost[3] =
+      CostOdd(partial_lo[3], partial_hi[3], element_3_mask, division_table_odd);
+  cost[5] =
+      CostOdd(partial_lo[5], partial_hi[5], element_3_mask, division_table_odd);
+  cost[7] =
+      CostOdd(partial_lo[7], partial_hi[7], element_3_mask, division_table_odd);
+
+  uint32_t best_cost = 0;
+  *direction = 0;
+  for (int i = 0; i < 8; ++i) {
+    if (cost[i] > best_cost) {
+      best_cost = cost[i];
+      *direction = i;
+    }
+  }
+  *variance = (best_cost - cost[(*direction + 4) & 7]) >> 10;
+}
+
+// -------------------------------------------------------------------------
+// CdefFilter
+
+// Load 4 vectors based on the given |direction|.
+void LoadDirection(const uint16_t* LIBGAV1_RESTRICT const src,
+                   const ptrdiff_t stride, uint16x8_t* output,
+                   const int direction) {
+  // Each |direction| describes a different set of source values. Expand this
+  // set by negating each set. For |direction| == 0 this gives a diagonal line
+  // from top right to bottom left. The first value is y, the second x.
+  // Negative y values move up.
+  //  a  b   c  d
+  //  {-1, 1}, {1, -1}, {-2, 2}, {2, -2}
+  //          c
+  //        a
+  //      0
+  //    b
+  //  d
+  const int y_0 = kCdefDirections[direction][0][0];
+  const int x_0 = kCdefDirections[direction][0][1];
+  const int y_1 = kCdefDirections[direction][1][0];
+  const int x_1 = kCdefDirections[direction][1][1];
+  output[0] = vld1q_u16(src + y_0 * stride + x_0);
+  output[1] = vld1q_u16(src - y_0 * stride - x_0);
+  output[2] = vld1q_u16(src + y_1 * stride + x_1);
+  output[3] = vld1q_u16(src - y_1 * stride - x_1);
+}
+
+// Load 4 vectors based on the given |direction|. Use when |block_width| == 4
+// to do 2 rows at a time.
+void LoadDirection4(const uint16_t* LIBGAV1_RESTRICT const src,
+                    const ptrdiff_t stride, uint16x8_t* output,
+                    const int direction) {
+  const int y_0 = kCdefDirections[direction][0][0];
+  const int x_0 = kCdefDirections[direction][0][1];
+  const int y_1 = kCdefDirections[direction][1][0];
+  const int x_1 = kCdefDirections[direction][1][1];
+  output[0] = vcombine_u16(vld1_u16(src + y_0 * stride + x_0),
+                           vld1_u16(src + y_0 * stride + stride + x_0));
+  output[1] = vcombine_u16(vld1_u16(src - y_0 * stride - x_0),
+                           vld1_u16(src - y_0 * stride + stride - x_0));
+  output[2] = vcombine_u16(vld1_u16(src + y_1 * stride + x_1),
+                           vld1_u16(src + y_1 * stride + stride + x_1));
+  output[3] = vcombine_u16(vld1_u16(src - y_1 * stride - x_1),
+                           vld1_u16(src - y_1 * stride + stride - x_1));
+}
+
+int16x8_t Constrain(const uint16x8_t pixel, const uint16x8_t reference,
+                    const uint16x8_t threshold, const int16x8_t damping) {
+  // If reference > pixel, the difference will be negative, so convert to 0 or
+  // -1.
+  const uint16x8_t sign = vcgtq_u16(reference, pixel);
+  const uint16x8_t abs_diff = vabdq_u16(pixel, reference);
+  const uint16x8_t shifted_diff = vshlq_u16(abs_diff, damping);
+  // For bitdepth == 8, the threshold range is [0, 15] and the damping range
+  // is [3, 6]. If pixel == kCdefLargeValue(0x4000), shifted_diff will always
+  // be larger than threshold. Subtract using saturation will return 0 when
+  // pixel == kCdefLargeValue.
+  static_assert(kCdefLargeValue == 0x4000, "Invalid kCdefLargeValue");
+  const uint16x8_t thresh_minus_shifted_diff =
+      vqsubq_u16(threshold, shifted_diff);
+  const uint16x8_t clamp_abs_diff =
+      vminq_u16(thresh_minus_shifted_diff, abs_diff);
+  // Restore the sign.
+  return vreinterpretq_s16_u16(
+      vsubq_u16(veorq_u16(clamp_abs_diff, sign), sign));
+}
+
+template <typename Pixel>
+uint16x8_t GetMaxPrimary(uint16x8_t* primary_val, uint16x8_t max,
+                         uint16x8_t cdef_large_value_mask) {
+  if (sizeof(Pixel) == 1) {
+    // The source is 16 bits, however, we only really care about the lower
+    // 8 bits. The upper 8 bits contain the "large" flag. After the final
+    // primary max has been calculated, zero out the upper 8 bits. Use this
+    // to find the "16 bit" max.
+    const uint8x16_t max_p01 = vmaxq_u8(vreinterpretq_u8_u16(primary_val[0]),
+                                        vreinterpretq_u8_u16(primary_val[1]));
+    const uint8x16_t max_p23 = vmaxq_u8(vreinterpretq_u8_u16(primary_val[2]),
+                                        vreinterpretq_u8_u16(primary_val[3]));
+    const uint16x8_t max_p = vreinterpretq_u16_u8(vmaxq_u8(max_p01, max_p23));
+    max = vmaxq_u16(max, vandq_u16(max_p, cdef_large_value_mask));
+  } else {
+    // Convert kCdefLargeValue to 0 before calculating max.
+    max = vmaxq_u16(max, vandq_u16(primary_val[0], cdef_large_value_mask));
+    max = vmaxq_u16(max, vandq_u16(primary_val[1], cdef_large_value_mask));
+    max = vmaxq_u16(max, vandq_u16(primary_val[2], cdef_large_value_mask));
+    max = vmaxq_u16(max, vandq_u16(primary_val[3], cdef_large_value_mask));
+  }
+  return max;
+}
+
+template <typename Pixel>
+uint16x8_t GetMaxSecondary(uint16x8_t* secondary_val, uint16x8_t max,
+                           uint16x8_t cdef_large_value_mask) {
+  if (sizeof(Pixel) == 1) {
+    const uint8x16_t max_s01 = vmaxq_u8(vreinterpretq_u8_u16(secondary_val[0]),
+                                        vreinterpretq_u8_u16(secondary_val[1]));
+    const uint8x16_t max_s23 = vmaxq_u8(vreinterpretq_u8_u16(secondary_val[2]),
+                                        vreinterpretq_u8_u16(secondary_val[3]));
+    const uint8x16_t max_s45 = vmaxq_u8(vreinterpretq_u8_u16(secondary_val[4]),
+                                        vreinterpretq_u8_u16(secondary_val[5]));
+    const uint8x16_t max_s67 = vmaxq_u8(vreinterpretq_u8_u16(secondary_val[6]),
+                                        vreinterpretq_u8_u16(secondary_val[7]));
+    const uint16x8_t max_s = vreinterpretq_u16_u8(
+        vmaxq_u8(vmaxq_u8(max_s01, max_s23), vmaxq_u8(max_s45, max_s67)));
+    max = vmaxq_u16(max, vandq_u16(max_s, cdef_large_value_mask));
+  } else {
+    max = vmaxq_u16(max, vandq_u16(secondary_val[0], cdef_large_value_mask));
+    max = vmaxq_u16(max, vandq_u16(secondary_val[1], cdef_large_value_mask));
+    max = vmaxq_u16(max, vandq_u16(secondary_val[2], cdef_large_value_mask));
+    max = vmaxq_u16(max, vandq_u16(secondary_val[3], cdef_large_value_mask));
+    max = vmaxq_u16(max, vandq_u16(secondary_val[4], cdef_large_value_mask));
+    max = vmaxq_u16(max, vandq_u16(secondary_val[5], cdef_large_value_mask));
+    max = vmaxq_u16(max, vandq_u16(secondary_val[6], cdef_large_value_mask));
+    max = vmaxq_u16(max, vandq_u16(secondary_val[7], cdef_large_value_mask));
+  }
+  return max;
+}
+
+template <int width, typename Pixel>
+void StorePixels(void* dest, ptrdiff_t dst_stride, int16x8_t result) {
+  auto* const dst8 = static_cast<uint8_t*>(dest);
+  if (sizeof(Pixel) == 1) {
+    const uint8x8_t dst_pixel = vqmovun_s16(result);
+    if (width == 8) {
+      vst1_u8(dst8, dst_pixel);
+    } else {
+      StoreLo4(dst8, dst_pixel);
+      StoreHi4(dst8 + dst_stride, dst_pixel);
+    }
+  } else {
+    const uint16x8_t dst_pixel = vreinterpretq_u16_s16(result);
+    auto* const dst16 = reinterpret_cast<uint16_t*>(dst8);
+    if (width == 8) {
+      vst1q_u16(dst16, dst_pixel);
+    } else {
+      auto* const dst16_next_row =
+          reinterpret_cast<uint16_t*>(dst8 + dst_stride);
+      vst1_u16(dst16, vget_low_u16(dst_pixel));
+      vst1_u16(dst16_next_row, vget_high_u16(dst_pixel));
+    }
+  }
+}
+
+template <int width, typename Pixel, bool enable_primary = true,
+          bool enable_secondary = true>
+void CdefFilter_NEON(const uint16_t* LIBGAV1_RESTRICT src,
+                     const ptrdiff_t src_stride, const int height,
+                     const int primary_strength, const int secondary_strength,
+                     const int damping, const int direction,
+                     void* LIBGAV1_RESTRICT dest, const ptrdiff_t dst_stride) {
+  static_assert(width == 8 || width == 4, "");
+  static_assert(enable_primary || enable_secondary, "");
+  constexpr bool clipping_required = enable_primary && enable_secondary;
+  auto* dst = static_cast<uint8_t*>(dest);
+  const uint16x8_t cdef_large_value_mask =
+      vdupq_n_u16(static_cast<uint16_t>(~kCdefLargeValue));
+  const uint16x8_t primary_threshold = vdupq_n_u16(primary_strength);
+  const uint16x8_t secondary_threshold = vdupq_n_u16(secondary_strength);
+
+  int16x8_t primary_damping_shift, secondary_damping_shift;
+
+  // FloorLog2() requires input to be > 0.
+  // 8-bit damping range: Y: [3, 6], UV: [2, 5].
+  // 10-bit damping range: Y: [3, 6 + 2], UV: [2, 5 + 2].
+  if (enable_primary) {
+    // 8-bit primary_strength: [0, 15] -> FloorLog2: [0, 3] so a clamp is
+    // necessary for UV filtering.
+    // 10-bit primary_strength: [0, 15 << 2].
+    primary_damping_shift =
+        vdupq_n_s16(-std::max(0, damping - FloorLog2(primary_strength)));
+  }
+
+  if (enable_secondary) {
+    if (sizeof(Pixel) == 1) {
+      // secondary_strength: [0, 4] -> FloorLog2: [0, 2] so no clamp to 0 is
+      // necessary.
+      assert(damping - FloorLog2(secondary_strength) >= 0);
+      secondary_damping_shift =
+          vdupq_n_s16(-(damping - FloorLog2(secondary_strength)));
+    } else {
+      // secondary_strength: [0, 4 << 2]
+      secondary_damping_shift =
+          vdupq_n_s16(-std::max(0, damping - FloorLog2(secondary_strength)));
+    }
+  }
+
+  constexpr int coeff_shift = (sizeof(Pixel) == 1) ? 0 : kBitdepth10 - 8;
+  const int primary_tap_0 =
+      kCdefPrimaryTaps[(primary_strength >> coeff_shift) & 1][0];
+  const int primary_tap_1 =
+      kCdefPrimaryTaps[(primary_strength >> coeff_shift) & 1][1];
+
+  int y = height;
+  do {
+    uint16x8_t pixel;
+    if (width == 8) {
+      pixel = vld1q_u16(src);
+    } else {
+      pixel = vcombine_u16(vld1_u16(src), vld1_u16(src + src_stride));
+    }
+
+    uint16x8_t min = pixel;
+    uint16x8_t max = pixel;
+    int16x8_t sum;
+
+    if (enable_primary) {
+      // Primary |direction|.
+      uint16x8_t primary_val[4];
+      if (width == 8) {
+        LoadDirection(src, src_stride, primary_val, direction);
+      } else {
+        LoadDirection4(src, src_stride, primary_val, direction);
+      }
+
+      if (clipping_required) {
+        min = vminq_u16(min, primary_val[0]);
+        min = vminq_u16(min, primary_val[1]);
+        min = vminq_u16(min, primary_val[2]);
+        min = vminq_u16(min, primary_val[3]);
+
+        max = GetMaxPrimary<Pixel>(primary_val, max, cdef_large_value_mask);
+      }
+
+      sum = Constrain(primary_val[0], pixel, primary_threshold,
+                      primary_damping_shift);
+      sum = vmulq_n_s16(sum, primary_tap_0);
+      sum = vmlaq_n_s16(sum,
+                        Constrain(primary_val[1], pixel, primary_threshold,
+                                  primary_damping_shift),
+                        primary_tap_0);
+      sum = vmlaq_n_s16(sum,
+                        Constrain(primary_val[2], pixel, primary_threshold,
+                                  primary_damping_shift),
+                        primary_tap_1);
+      sum = vmlaq_n_s16(sum,
+                        Constrain(primary_val[3], pixel, primary_threshold,
+                                  primary_damping_shift),
+                        primary_tap_1);
+    } else {
+      sum = vdupq_n_s16(0);
+    }
+
+    if (enable_secondary) {
+      // Secondary |direction| values (+/- 2). Clamp |direction|.
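+      // Scalar restatement of the secondary pass below (illustrative):
+      //   for (int k = 0; k < 8; ++k) {
+      //     sum += sec_tap[k] * Constrain(secondary_val[k] - pixel,
+      //                                   secondary_strength, damping);
+      //   }
+      // where sec_tap is kCdefSecondaryTap0 for the distance-1 neighbors
+      // (k = 0, 1, 4, 5) and kCdefSecondaryTap1 for the distance-2 neighbors
+      // (k = 2, 3, 6, 7), taken along |direction| + 2 and |direction| - 2.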
+      uint16x8_t secondary_val[8];
+      if (width == 8) {
+        LoadDirection(src, src_stride, secondary_val, direction + 2);
+        LoadDirection(src, src_stride, secondary_val + 4, direction - 2);
+      } else {
+        LoadDirection4(src, src_stride, secondary_val, direction + 2);
+        LoadDirection4(src, src_stride, secondary_val + 4, direction - 2);
+      }
+
+      if (clipping_required) {
+        min = vminq_u16(min, secondary_val[0]);
+        min = vminq_u16(min, secondary_val[1]);
+        min = vminq_u16(min, secondary_val[2]);
+        min = vminq_u16(min, secondary_val[3]);
+        min = vminq_u16(min, secondary_val[4]);
+        min = vminq_u16(min, secondary_val[5]);
+        min = vminq_u16(min, secondary_val[6]);
+        min = vminq_u16(min, secondary_val[7]);
+
+        max = GetMaxSecondary<Pixel>(secondary_val, max, cdef_large_value_mask);
+      }
+
+      sum = vmlaq_n_s16(sum,
+                        Constrain(secondary_val[0], pixel, secondary_threshold,
+                                  secondary_damping_shift),
+                        kCdefSecondaryTap0);
+      sum = vmlaq_n_s16(sum,
+                        Constrain(secondary_val[1], pixel, secondary_threshold,
+                                  secondary_damping_shift),
+                        kCdefSecondaryTap0);
+      sum = vmlaq_n_s16(sum,
+                        Constrain(secondary_val[2], pixel, secondary_threshold,
+                                  secondary_damping_shift),
+                        kCdefSecondaryTap1);
+      sum = vmlaq_n_s16(sum,
+                        Constrain(secondary_val[3], pixel, secondary_threshold,
+                                  secondary_damping_shift),
+                        kCdefSecondaryTap1);
+      sum = vmlaq_n_s16(sum,
+                        Constrain(secondary_val[4], pixel, secondary_threshold,
+                                  secondary_damping_shift),
+                        kCdefSecondaryTap0);
+      sum = vmlaq_n_s16(sum,
+                        Constrain(secondary_val[5], pixel, secondary_threshold,
+                                  secondary_damping_shift),
+                        kCdefSecondaryTap0);
+      sum = vmlaq_n_s16(sum,
+                        Constrain(secondary_val[6], pixel, secondary_threshold,
+                                  secondary_damping_shift),
+                        kCdefSecondaryTap1);
+      sum = vmlaq_n_s16(sum,
+                        Constrain(secondary_val[7], pixel, secondary_threshold,
+                                  secondary_damping_shift),
+                        kCdefSecondaryTap1);
+    }
+    // Clip3(pixel + ((8 + sum - (sum < 0)) >> 4), min, max))
+    const int16x8_t sum_lt_0 = vshrq_n_s16(sum, 15);
+    sum = vaddq_s16(sum, sum_lt_0);
+    int16x8_t result = vrsraq_n_s16(vreinterpretq_s16_u16(pixel), sum, 4);
+    if (clipping_required) {
+      result = vminq_s16(result, vreinterpretq_s16_u16(max));
+      result = vmaxq_s16(result, vreinterpretq_s16_u16(min));
+    }
+
+    StorePixels<width, Pixel>(dst, dst_stride, result);
+
+    src += (width == 8) ? src_stride : src_stride << 1;
+    dst += (width == 8) ? dst_stride : dst_stride << 1;
+    y -= (width == 8) ? 1 : 2;
+  } while (y != 0);
+}
+
+}  // namespace
+
+namespace low_bitdepth {
+namespace {
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  dsp->cdef_direction = CdefDirection_NEON<kBitdepth8>;
+  dsp->cdef_filters[0][0] = CdefFilter_NEON<4, uint8_t>;
+  dsp->cdef_filters[0][1] = CdefFilter_NEON<4, uint8_t, /*enable_primary=*/true,
+                                            /*enable_secondary=*/false>;
+  dsp->cdef_filters[0][2] =
+      CdefFilter_NEON<4, uint8_t, /*enable_primary=*/false>;
+  dsp->cdef_filters[1][0] = CdefFilter_NEON<8, uint8_t>;
+  dsp->cdef_filters[1][1] = CdefFilter_NEON<8, uint8_t, /*enable_primary=*/true,
+                                            /*enable_secondary=*/false>;
+  dsp->cdef_filters[1][2] =
+      CdefFilter_NEON<8, uint8_t, /*enable_primary=*/false>;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  dsp->cdef_direction = CdefDirection_NEON<kBitdepth10>;
+  dsp->cdef_filters[0][0] = CdefFilter_NEON<4, uint16_t>;
+  dsp->cdef_filters[0][1] =
+      CdefFilter_NEON<4, uint16_t, /*enable_primary=*/true,
+                      /*enable_secondary=*/false>;
+  dsp->cdef_filters[0][2] =
+      CdefFilter_NEON<4, uint16_t, /*enable_primary=*/false>;
+  dsp->cdef_filters[1][0] = CdefFilter_NEON<8, uint16_t>;
+  dsp->cdef_filters[1][1] =
+      CdefFilter_NEON<8, uint16_t, /*enable_primary=*/true,
+                      /*enable_secondary=*/false>;
+  dsp->cdef_filters[1][2] =
+      CdefFilter_NEON<8, uint16_t, /*enable_primary=*/false>;
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void CdefInit_NEON() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+#else   // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void CdefInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/cdef_neon.h b/src/dsp/arm/cdef_neon.h
new file mode 100644
index 0000000..ef8ed3c
--- /dev/null
+++ b/src/dsp/arm/cdef_neon.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_CDEF_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_CDEF_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::cdef_direction and Dsp::cdef_filters. This function is not
+// thread-safe.
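+//
+// It is invoked once from the dsp initialization (see src/dsp/dsp.cc), after
+// the C defaults are registered, and overwrites the corresponding entries of
+// the per-bitdepth function table, e.g. (illustrative, taken from the .cc
+// file):
+//
+//   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+//   dsp->cdef_direction = CdefDirection_NEON<kBitdepth8>;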
+void CdefInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_CdefDirection LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_CdefFilters LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_CdefDirection LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_CdefFilters LIBGAV1_CPU_NEON
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_CDEF_NEON_H_
diff --git a/src/dsp/arm/common_neon.h b/src/dsp/arm/common_neon.h
new file mode 100644
index 0000000..9c46525
--- /dev/null
+++ b/src/dsp/arm/common_neon.h
@@ -0,0 +1,1208 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_COMMON_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_COMMON_NEON_H_
+
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/utils/compiler_attributes.h"
+
+#if 0
+#include <cstdio>
+#include <string>
+
+constexpr bool kEnablePrintRegs = true;
+
+union DebugRegister {
+  int8_t i8[8];
+  int16_t i16[4];
+  int32_t i32[2];
+  uint8_t u8[8];
+  uint16_t u16[4];
+  uint32_t u32[2];
+};
+
+union DebugRegisterQ {
+  int8_t i8[16];
+  int16_t i16[8];
+  int32_t i32[4];
+  uint8_t u8[16];
+  uint16_t u16[8];
+  uint32_t u32[4];
+};
+
+// Quite useful macro for debugging. Left here for convenience.
+inline void PrintVect(const DebugRegister r, const char* const name, int size) {
+  int n;
+  if (kEnablePrintRegs) {
+    fprintf(stderr, "%s\t: ", name);
+    if (size == 8) {
+      for (n = 0; n < 8; ++n) fprintf(stderr, "%.2x ", r.u8[n]);
+    } else if (size == 16) {
+      for (n = 0; n < 4; ++n) fprintf(stderr, "%.4x ", r.u16[n]);
+    } else if (size == 32) {
+      for (n = 0; n < 2; ++n) fprintf(stderr, "%.8x ", r.u32[n]);
+    }
+    fprintf(stderr, "\n");
+  }
+}
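+
+// Example with the PR() macro defined further below (only compiles when the
+// surrounding "#if 0" is changed to "#if 1"; the printed format is
+// illustrative):
+//
+//   const uint16x8_t sum = vdupq_n_u16(42);
+//   PR(sum);  // prints: sum : 002a 002a 002a 002a 002a 002a 002a 002a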
+
+// Debugging macro for 128-bit types.
+inline void PrintVectQ(const DebugRegisterQ r, const char* const name,
+                       int size) {
+  int n;
+  if (kEnablePrintRegs) {
+    fprintf(stderr, "%s\t: ", name);
+    if (size == 8) {
+      for (n = 0; n < 16; ++n) fprintf(stderr, "%.2x ", r.u8[n]);
+    } else if (size == 16) {
+      for (n = 0; n < 8; ++n) fprintf(stderr, "%.4x ", r.u16[n]);
+    } else if (size == 32) {
+      for (n = 0; n < 4; ++n) fprintf(stderr, "%.8x ", r.u32[n]);
+    }
+    fprintf(stderr, "\n");
+  }
+}
+
+inline void PrintReg(const int32x4x2_t val, const std::string& name) {
+  DebugRegisterQ r;
+  vst1q_s32(r.i32, val.val[0]);
+  const std::string name0 = name + std::string(".val[0]");
+  PrintVectQ(r, name0.c_str(), 32);
+  vst1q_s32(r.i32, val.val[1]);
+  const std::string name1 = name + std::string(".val[1]");
+  PrintVectQ(r, name1.c_str(), 32);
+}
+
+inline void PrintReg(const uint32x4_t val, const char* name) {
+  DebugRegisterQ r;
+  vst1q_u32(r.u32, val);
+  PrintVectQ(r, name, 32);
+}
+
+inline void PrintReg(const uint32x2_t val, const char* name) {
+  DebugRegister r;
+  vst1_u32(r.u32, val);
+  PrintVect(r, name, 32);
+}
+
+inline void PrintReg(const uint16x8_t val, const char* name) {
+  DebugRegisterQ r;
+  vst1q_u16(r.u16, val);
+  PrintVectQ(r, name, 16);
+}
+
+inline void PrintReg(const uint16x4_t val, const char* name) {
+  DebugRegister r;
+  vst1_u16(r.u16, val);
+  PrintVect(r, name, 16);
+}
+
+inline void PrintReg(const uint8x16_t val, const char* name) {
+  DebugRegisterQ r;
+  vst1q_u8(r.u8, val);
+  PrintVectQ(r, name, 8);
+}
+
+inline void PrintReg(const uint8x8_t val, const char* name) {
+  DebugRegister r;
+  vst1_u8(r.u8, val);
+  PrintVect(r, name, 8);
+}
+
+inline void PrintReg(const int32x4_t val, const char* name) {
+  DebugRegisterQ r;
+  vst1q_s32(r.i32, val);
+  PrintVectQ(r, name, 32);
+}
+
+inline void PrintReg(const int32x2_t val, const char* name) {
+  DebugRegister r;
+  vst1_s32(r.i32, val);
+  PrintVect(r, name, 32);
+}
+
+inline void PrintReg(const int16x8_t val, const char* name) {
+  DebugRegisterQ r;
+  vst1q_s16(r.i16, val);
+  PrintVectQ(r, name, 16);
+}
+
+inline void PrintReg(const int16x4_t val, const char* name) {
+  DebugRegister r;
+  vst1_s16(r.i16, val);
+  PrintVect(r, name, 16);
+}
+
+inline void PrintReg(const int8x16_t val, const char* name) {
+  DebugRegisterQ r;
+  vst1q_s8(r.i8, val);
+  PrintVectQ(r, name, 8);
+}
+
+inline void PrintReg(const int8x8_t val, const char* name) {
+  DebugRegister r;
+  vst1_s8(r.i8, val);
+  PrintVect(r, name, 8);
+}
+
+// Print an individual (non-vector) value in decimal format.
+inline void PrintReg(const int x, const char* name) {
+  if (kEnablePrintRegs) {
+    fprintf(stderr, "%s: %d\n", name, x);
+  }
+}
+
+// Print an individual (non-vector) value in hexadecimal format.
+inline void PrintHex(const int x, const char* name) {
+  if (kEnablePrintRegs) {
+    fprintf(stderr, "%s: %x\n", name, x);
+  }
+}
+
+#define PR(x) PrintReg(x, #x)
+#define PD(x) PrintReg(x, #x)
+#define PX(x) PrintHex(x, #x)
+
+#if LIBGAV1_MSAN
+#include <sanitizer/msan_interface.h>
+
+inline void PrintShadow(const void* r, const char* const name,
+                        const size_t size) {
+  if (kEnablePrintRegs) {
+    fprintf(stderr, "Shadow for %s:\n", name);
+    __msan_print_shadow(r, size);
+  }
+}
+#define PS(var, N) PrintShadow(var, #var, N)
+
+#endif  // LIBGAV1_MSAN
+
+#endif  // 0
+
+namespace libgav1 {
+namespace dsp {
+
+//------------------------------------------------------------------------------
+// Load functions.
+
+// Load 2 uint8_t values into lanes 0 and 1. Zeros the register before
+// loading the values.
+// Load 2 uint8_t values into lanes 0 and 1. Zeros the register before loading
+// the values. Use caution when using this in loops because it will re-zero the
+// register before loading on every iteration.
+inline uint8x8_t Load2(const void* const buf) {
+  const uint16x4_t zero = vdup_n_u16(0);
+  uint16_t temp;
+  memcpy(&temp, buf, 2);
+  return vreinterpret_u8_u16(vld1_lane_u16(&temp, zero, 0));
+}
+
+// Load 2 uint8_t values into |lane| * 2 and |lane| * 2 + 1.
+template <int lane>
+inline uint8x8_t Load2(const void* const buf, uint8x8_t val) {
+  uint16_t temp;
+  memcpy(&temp, buf, 2);
+  return vreinterpret_u8_u16(
+      vld1_lane_u16(&temp, vreinterpret_u16_u8(val), lane));
+}
+
+template <int lane>
+inline uint16x4_t Load2(const void* const buf, uint16x4_t val) {
+  uint32_t temp;
+  memcpy(&temp, buf, 4);
+  return vreinterpret_u16_u32(
+      vld1_lane_u32(&temp, vreinterpret_u32_u16(val), lane));
+}
+
+// Load 4 uint8_t values into the low half of a uint8x8_t register. Zeros the
+// register before loading the values. Use caution when using this in loops
+// because it will re-zero the register before loading on every iteration.
+inline uint8x8_t Load4(const void* const buf) {
+  const uint32x2_t zero = vdup_n_u32(0);
+  uint32_t temp;
+  memcpy(&temp, buf, 4);
+  return vreinterpret_u8_u32(vld1_lane_u32(&temp, zero, 0));
+}
+
+// Load 4 uint8_t values into 4 lanes starting with |lane| * 4.
+template <int lane>
+inline uint8x8_t Load4(const void* const buf, uint8x8_t val) {
+  uint32_t temp;
+  memcpy(&temp, buf, 4);
+  return vreinterpret_u8_u32(
+      vld1_lane_u32(&temp, vreinterpret_u32_u8(val), lane));
+}
+
+// Convenience functions for 16-bit loads from a uint8_t* source.
+inline uint16x4_t Load4U16(const void* const buf) {
+  return vld1_u16(static_cast<const uint16_t*>(buf));
+}
+
+inline uint16x8_t Load8U16(const void* const buf) {
+  return vld1q_u16(static_cast<const uint16_t*>(buf));
+}
+
+//------------------------------------------------------------------------------
+// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning.
+
+inline uint8x8_t MaskOverreads(const uint8x8_t source,
+                               const ptrdiff_t over_read_in_bytes) {
+  uint8x8_t dst = source;
+#if LIBGAV1_MSAN
+  if (over_read_in_bytes > 0) {
+    uint8x8_t mask = vdup_n_u8(0);
+    uint8x8_t valid_element_mask = vdup_n_u8(-1);
+    const int valid_bytes =
+        std::min(8, 8 - static_cast<int>(over_read_in_bytes));
+    for (int i = 0; i < valid_bytes; ++i) {
+      // Feed ff bytes into |mask| one at a time.
+      mask = vext_u8(valid_element_mask, mask, 7);
+    }
+    dst = vand_u8(dst, mask);
+  }
+#else
+  static_cast<void>(over_read_in_bytes);
+#endif
+  return dst;
+}
+
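+// Illustrative sketch (editorial, not part of the upstream file): the vext
+// loop above shifts 0xff bytes into |mask| until only the over-read high
+// lanes remain zero, so the masking behaves like this scalar model. Kept
+// under #if 0, matching the debugging block earlier in this header.
+#if 0
+inline void MaskOverreadsScalarModel(uint8_t dst[8],
+                                     const ptrdiff_t over_read_in_bytes) {
+  // Zero the trailing |over_read_in_bytes| bytes, which were never valid.
+  const int valid_bytes = std::max(0, 8 - static_cast<int>(over_read_in_bytes));
+  for (int i = valid_bytes; i < 8; ++i) dst[i] = 0;
+}
+#endif  // 0
+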
+inline uint8x16_t MaskOverreadsQ(const uint8x16_t source,
+                                 const ptrdiff_t over_read_in_bytes) {
+  uint8x16_t dst = source;
+#if LIBGAV1_MSAN
+  if (over_read_in_bytes > 0) {
+    uint8x16_t mask = vdupq_n_u8(0);
+    uint8x16_t valid_element_mask = vdupq_n_u8(-1);
+    const int valid_bytes =
+        std::min(16, 16 - static_cast<int>(over_read_in_bytes));
+    for (int i = 0; i < valid_bytes; ++i) {
+      // Feed ff bytes into |mask| one at a time.
+      mask = vextq_u8(valid_element_mask, mask, 15);
+    }
+    dst = vandq_u8(dst, mask);
+  }
+#else
+  static_cast<void>(over_read_in_bytes);
+#endif
+  return dst;
+}
+
+inline uint8x8_t Load1MsanU8(const uint8_t* const source,
+                             const ptrdiff_t over_read_in_bytes) {
+  return MaskOverreads(vld1_u8(source), over_read_in_bytes);
+}
+
+inline uint8x16_t Load1QMsanU8(const uint8_t* const source,
+                               const ptrdiff_t over_read_in_bytes) {
+  return MaskOverreadsQ(vld1q_u8(source), over_read_in_bytes);
+}
+
+inline uint16x8_t Load1QMsanU16(const uint16_t* const source,
+                                const ptrdiff_t over_read_in_bytes) {
+  return vreinterpretq_u16_u8(MaskOverreadsQ(
+      vreinterpretq_u8_u16(vld1q_u16(source)), over_read_in_bytes));
+}
+
+inline uint16x8x2_t Load2QMsanU16(const uint16_t* const source,
+                                  const ptrdiff_t over_read_in_bytes) {
+  // Relative source index of elements (2 bytes each):
+  // dst.val[0]: 00 02 04 06 08 10 12 14
+  // dst.val[1]: 01 03 05 07 09 11 13 15
+  uint16x8x2_t dst = vld2q_u16(source);
+  dst.val[0] = vreinterpretq_u16_u8(MaskOverreadsQ(
+      vreinterpretq_u8_u16(dst.val[0]), over_read_in_bytes >> 1));
+  dst.val[1] = vreinterpretq_u16_u8(
+      MaskOverreadsQ(vreinterpretq_u8_u16(dst.val[1]),
+                     (over_read_in_bytes >> 1) + (over_read_in_bytes % 4)));
+  return dst;
+}
+
+inline uint32x4_t Load1QMsanU32(const uint32_t* const source,
+                                const ptrdiff_t over_read_in_bytes) {
+  return vreinterpretq_u32_u8(MaskOverreadsQ(
+      vreinterpretq_u8_u32(vld1q_u32(source)), over_read_in_bytes));
+}
+
+//------------------------------------------------------------------------------
+// Store functions.
+
+// Propagate type information to the compiler. Without this the compiler may
+// assume the required alignment of the type (4 bytes in the case of uint32_t)
+// and add alignment hints to the memory access.
+template <typename T>
+inline void ValueToMem(void* const buf, T val) {
+  memcpy(buf, &val, sizeof(val));
+}
+
+// Store 4 int8_t values from the low half of an int8x8_t register.
+inline void StoreLo4(void* const buf, const int8x8_t val) {
+  ValueToMem<int32_t>(buf, vget_lane_s32(vreinterpret_s32_s8(val), 0));
+}
+
+// Store 4 uint8_t values from the low half of a uint8x8_t register.
+inline void StoreLo4(void* const buf, const uint8x8_t val) {
+  ValueToMem<uint32_t>(buf, vget_lane_u32(vreinterpret_u32_u8(val), 0));
+}
+
+// Store 4 uint8_t values from the high half of a uint8x8_t register.
+inline void StoreHi4(void* const buf, const uint8x8_t val) {
+  ValueToMem<uint32_t>(buf, vget_lane_u32(vreinterpret_u32_u8(val), 1));
+}
+
+// Store 2 uint8_t values from |lane| * 2 and |lane| * 2 + 1 of a uint8x8_t
+// register.
+template <int lane>
+inline void Store2(void* const buf, const uint8x8_t val) {
+  ValueToMem<uint16_t>(buf, vget_lane_u16(vreinterpret_u16_u8(val), lane));
+}
+
+// Store 2 uint16_t values from |lane| * 2 and |lane| * 2 + 1 of a uint16x8_t
+// register.
+template <int lane>
+inline void Store2(void* const buf, const uint16x8_t val) {
+  ValueToMem<uint32_t>(buf, vgetq_lane_u32(vreinterpretq_u32_u16(val), lane));
+}
+
+// Store 2 uint16_t values from |lane| * 2 and |lane| * 2 + 1 of a uint16x4_t
+// register.
+template <int lane>
+inline void Store2(void* const buf, const uint16x4_t val) {
+  ValueToMem<uint32_t>(buf, vget_lane_u32(vreinterpret_u32_u16(val), lane));
+}
+
+// Simplify code when caller has |buf| cast as uint8_t*.
+inline void Store4(void* const buf, const uint16x4_t val) {
+  vst1_u16(static_cast<uint16_t*>(buf), val);
+}
+
+// Simplify code when caller has |buf| cast as uint8_t*.
+inline void Store8(void* const buf, const uint16x8_t val) {
+  vst1q_u16(static_cast<uint16_t*>(buf), val);
+}
+
+//------------------------------------------------------------------------------
+// Pointer helpers.
+
+// This function adds |stride|, given as a number of bytes, to a pointer to a
+// larger type, using native pointer arithmetic.
+template <typename T>
+inline T* AddByteStride(T* ptr, const ptrdiff_t stride) {
+  return reinterpret_cast<T*>(
+      const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(ptr) + stride));
+}
+
+//------------------------------------------------------------------------------
+// Multiply.
+
+// Shim vmull_high_u16 for armv7.
+inline uint32x4_t VMullHighU16(const uint16x8_t a, const uint16x8_t b) {
+#if defined(__aarch64__)
+  return vmull_high_u16(a, b);
+#else
+  return vmull_u16(vget_high_u16(a), vget_high_u16(b));
+#endif
+}
+
+// Shim vmull_high_s16 for armv7.
+inline int32x4_t VMullHighS16(const int16x8_t a, const int16x8_t b) {
+#if defined(__aarch64__)
+  return vmull_high_s16(a, b);
+#else
+  return vmull_s16(vget_high_s16(a), vget_high_s16(b));
+#endif
+}
+
+// Shim vmlal_high_u16 for armv7.
+inline uint32x4_t VMlalHighU16(const uint32x4_t a, const uint16x8_t b,
+                               const uint16x8_t c) {
+#if defined(__aarch64__)
+  return vmlal_high_u16(a, b, c);
+#else
+  return vmlal_u16(a, vget_high_u16(b), vget_high_u16(c));
+#endif
+}
+
+// Shim vmlal_high_s16 for armv7.
+inline int32x4_t VMlalHighS16(const int32x4_t a, const int16x8_t b,
+                              const int16x8_t c) {
+#if defined(__aarch64__)
+  return vmlal_high_s16(a, b, c);
+#else
+  return vmlal_s16(a, vget_high_s16(b), vget_high_s16(c));
+#endif
+}
+
+// Shim vmul_laneq_u16 for armv7.
+template <int lane>
+inline uint16x4_t VMulLaneQU16(const uint16x4_t a, const uint16x8_t b) {
+#if defined(__aarch64__)
+  return vmul_laneq_u16(a, b, lane);
+#else
+  if (lane < 4) return vmul_lane_u16(a, vget_low_u16(b), lane & 0x3);
+  return vmul_lane_u16(a, vget_high_u16(b), (lane - 4) & 0x3);
+#endif
+}
+
+// Shim vmulq_laneq_u16 for armv7.
+template <int lane>
+inline uint16x8_t VMulQLaneQU16(const uint16x8_t a, const uint16x8_t b) {
+#if defined(__aarch64__)
+  return vmulq_laneq_u16(a, b, lane);
+#else
+  if (lane < 4) return vmulq_lane_u16(a, vget_low_u16(b), lane & 0x3);
+  return vmulq_lane_u16(a, vget_high_u16(b), (lane - 4) & 0x3);
+#endif
+}
+
+// Shim vmla_laneq_u16 for armv7.
+template <int lane>
+inline uint16x4_t VMlaLaneQU16(const uint16x4_t a, const uint16x4_t b,
+                               const uint16x8_t c) {
+#if defined(__aarch64__)
+  return vmla_laneq_u16(a, b, c, lane);
+#else
+  if (lane < 4) return vmla_lane_u16(a, b, vget_low_u16(c), lane & 0x3);
+  return vmla_lane_u16(a, b, vget_high_u16(c), (lane - 4) & 0x3);
+#endif
+}
+
+// Shim vmlaq_laneq_u16 for armv7.
+template <int lane>
+inline uint16x8_t VMlaQLaneQU16(const uint16x8_t a, const uint16x8_t b,
+                                const uint16x8_t c) {
+#if defined(__aarch64__)
+  return vmlaq_laneq_u16(a, b, c, lane);
+#else
+  if (lane < 4) return vmlaq_lane_u16(a, b, vget_low_u16(c), lane & 0x3);
+  return vmlaq_lane_u16(a, b, vget_high_u16(c), (lane - 4) & 0x3);
+#endif
+}
+
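+// Illustrative sketch (editorial, not part of the upstream file): the shims
+// above let callers write lane-indexed multiplies once. On armv7,
+// vmul_lane_u16 only accepts a 4-lane vector, so lanes 4..7 of the q register
+// are reached through vget_high_u16, as in this hypothetical usage.
+#if 0
+inline uint16x4_t ScaleByTap5(const uint16x4_t src, const uint16x8_t taps) {
+  // Multiplies every element of |src| by lane 5 of |taps|. Compiles to
+  // vmul_laneq_u16 on aarch64 and to
+  // vmul_lane_u16(src, vget_high_u16(taps), 1) on armv7.
+  return VMulLaneQU16<5>(src, taps);
+}
+#endif  // 0
+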
+//------------------------------------------------------------------------------
+// Bit manipulation.
+
+// vshXX_n_XX() requires an immediate.
+template <int shift>
+inline uint8x8_t LeftShiftVector(const uint8x8_t vector) {
+  return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vector), shift));
+}
+
+template <int shift>
+inline uint8x8_t RightShiftVector(const uint8x8_t vector) {
+  return vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(vector), shift));
+}
+
+template <int shift>
+inline int8x8_t RightShiftVector(const int8x8_t vector) {
+  return vreinterpret_s8_u64(vshr_n_u64(vreinterpret_u64_s8(vector), shift));
+}
+
+// Shim vqtbl1_u8 for armv7.
+inline uint8x8_t VQTbl1U8(const uint8x16_t a, const uint8x8_t index) {
+#if defined(__aarch64__)
+  return vqtbl1_u8(a, index);
+#else
+  const uint8x8x2_t b = {vget_low_u8(a), vget_high_u8(a)};
+  return vtbl2_u8(b, index);
+#endif
+}
+
+// Shim vqtbl2_u8 for armv7.
+inline uint8x8_t VQTbl2U8(const uint8x16x2_t a, const uint8x8_t index) {
+#if defined(__aarch64__)
+  return vqtbl2_u8(a, index);
+#else
+  const uint8x8x4_t b = {vget_low_u8(a.val[0]), vget_high_u8(a.val[0]),
+                         vget_low_u8(a.val[1]), vget_high_u8(a.val[1])};
+  return vtbl4_u8(b, index);
+#endif
+}
+
+// Shim vqtbl2q_u8 for armv7.
+inline uint8x16_t VQTbl2QU8(const uint8x16x2_t a, const uint8x16_t index) {
+#if defined(__aarch64__)
+  return vqtbl2q_u8(a, index);
+#else
+  return vcombine_u8(VQTbl2U8(a, vget_low_u8(index)),
+                     VQTbl2U8(a, vget_high_u8(index)));
+#endif
+}
+
+// Shim vqtbl3_u8 for armv7.
+inline uint8x8_t VQTbl3U8(const uint8x16x3_t a, const uint8x8_t index) {
+#if defined(__aarch64__)
+  return vqtbl3_u8(a, index);
+#else
+  const uint8x8x4_t b = {vget_low_u8(a.val[0]), vget_high_u8(a.val[0]),
+                         vget_low_u8(a.val[1]), vget_high_u8(a.val[1])};
+  const uint8x8x2_t c = {vget_low_u8(a.val[2]), vget_high_u8(a.val[2])};
+  const uint8x8_t index_ext = vsub_u8(index, vdup_n_u8(32));
+  const uint8x8_t partial_lookup = vtbl4_u8(b, index);
+  return vtbx2_u8(partial_lookup, c, index_ext);
+#endif
+}
+
+// Shim vqtbl3q_u8 for armv7.
+inline uint8x16_t VQTbl3QU8(const uint8x16x3_t a, const uint8x16_t index) {
+#if defined(__aarch64__)
+  return vqtbl3q_u8(a, index);
+#else
+  return vcombine_u8(VQTbl3U8(a, vget_low_u8(index)),
+                     VQTbl3U8(a, vget_high_u8(index)));
+#endif
+}
+
+// Shim vqtbl1_s8 for armv7.
+inline int8x8_t VQTbl1S8(const int8x16_t a, const uint8x8_t index) {
+#if defined(__aarch64__)
+  return vqtbl1_s8(a, index);
+#else
+  const int8x8x2_t b = {vget_low_s8(a), vget_high_s8(a)};
+  return vtbl2_s8(b, vreinterpret_s8_u8(index));
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Saturation helpers.
+
+inline int16x4_t Clip3S16(int16x4_t val, int16x4_t low, int16x4_t high) {
+  return vmin_s16(vmax_s16(val, low), high);
+}
+
+inline int16x8_t Clip3S16(const int16x8_t val, const int16x8_t low,
+                          const int16x8_t high) {
+  return vminq_s16(vmaxq_s16(val, low), high);
+}
+
+inline uint16x8_t ConvertToUnsignedPixelU16(int16x8_t val, int bitdepth) {
+  const int16x8_t low = vdupq_n_s16(0);
+  const uint16x8_t high = vdupq_n_u16((1 << bitdepth) - 1);
+
+  return vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(val, low)), high);
+}
+
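+// Illustrative sketch (editorial, not part of the upstream file): Clip3S16
+// mirrors the scalar Clip3(value, low, high) idiom used throughout the
+// decoder, e.g. clamping eight 10-bit pixel sums at once.
+#if 0
+inline int16x8_t ClampTo10Bit(const int16x8_t val) {
+  // Equivalent to Clip3(val[i], 0, 1023) in each of the eight lanes.
+  return Clip3S16(val, vdupq_n_s16(0), vdupq_n_s16(1023));
+}
+#endif  // 0
+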
+//------------------------------------------------------------------------------
+// Interleave.
+
+// vzipN is exclusive to A64.
+inline uint8x8_t InterleaveLow8(const uint8x8_t a, const uint8x8_t b) {
+#if defined(__aarch64__)
+  return vzip1_u8(a, b);
+#else
+  // Discard |.val[1]|
+  return vzip_u8(a, b).val[0];
+#endif
+}
+
+inline uint8x8_t InterleaveLow32(const uint8x8_t a, const uint8x8_t b) {
+#if defined(__aarch64__)
+  return vreinterpret_u8_u32(
+      vzip1_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(b)));
+#else
+  // Discard |.val[1]|
+  return vreinterpret_u8_u32(
+      vzip_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(b)).val[0]);
+#endif
+}
+
+inline int8x8_t InterleaveLow32(const int8x8_t a, const int8x8_t b) {
+#if defined(__aarch64__)
+  return vreinterpret_s8_u32(
+      vzip1_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b)));
+#else
+  // Discard |.val[1]|
+  return vreinterpret_s8_u32(
+      vzip_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b)).val[0]);
+#endif
+}
+
+inline uint8x8_t InterleaveHigh32(const uint8x8_t a, const uint8x8_t b) {
+#if defined(__aarch64__)
+  return vreinterpret_u8_u32(
+      vzip2_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(b)));
+#else
+  // Discard |.val[0]|
+  return vreinterpret_u8_u32(
+      vzip_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(b)).val[1]);
+#endif
+}
+
+inline int8x8_t InterleaveHigh32(const int8x8_t a, const int8x8_t b) {
+#if defined(__aarch64__)
+  return vreinterpret_s8_u32(
+      vzip2_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b)));
+#else
+  // Discard |.val[0]|
+  return vreinterpret_s8_u32(
+      vzip_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b)).val[1]);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Sum.
+
+inline uint16_t SumVector(const uint8x8_t a) {
+#if defined(__aarch64__)
+  return vaddlv_u8(a);
+#else
+  const uint16x4_t c = vpaddl_u8(a);
+  const uint32x2_t d = vpaddl_u16(c);
+  const uint64x1_t e = vpaddl_u32(d);
+  return static_cast<uint16_t>(vget_lane_u64(e, 0));
+#endif  // defined(__aarch64__)
+}
+
+inline uint32_t SumVector(const uint32x2_t a) {
+#if defined(__aarch64__)
+  return vaddv_u32(a);
+#else
+  const uint64x1_t b = vpaddl_u32(a);
+  return vget_lane_u32(vreinterpret_u32_u64(b), 0);
+#endif  // defined(__aarch64__)
+}
+
+inline uint32_t SumVector(const uint32x4_t a) {
+#if defined(__aarch64__)
+  return vaddvq_u32(a);
+#else
+  const uint64x2_t b = vpaddlq_u32(a);
+  const uint64x1_t c = vadd_u64(vget_low_u64(b), vget_high_u64(b));
+  return static_cast<uint32_t>(vget_lane_u64(c, 0));
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Transpose.
+
+// Transpose 32 bit elements such that:
+// a: 00 01
+// b: 02 03
+// returns
+// val[0]: 00 02
+// val[1]: 01 03
+inline uint8x8x2_t Interleave32(const uint8x8_t a, const uint8x8_t b) {
+  const uint32x2_t a_32 = vreinterpret_u32_u8(a);
+  const uint32x2_t b_32 = vreinterpret_u32_u8(b);
+  const uint32x2x2_t c = vtrn_u32(a_32, b_32);
+  const uint8x8x2_t d = {vreinterpret_u8_u32(c.val[0]),
+                         vreinterpret_u8_u32(c.val[1])};
+  return d;
+}
+
+// Swap high and low 32 bit elements.
+inline uint8x8_t Transpose32(const uint8x8_t a) {
+  const uint32x2_t b = vrev64_u32(vreinterpret_u32_u8(a));
+  return vreinterpret_u8_u32(b);
+}
+
+// Swap high and low halves.
+inline uint16x8_t Transpose64(const uint16x8_t a) { return vextq_u16(a, a, 4); }
+
+// Implement vtrnq_s64().
+// Input:
+// a0: 00 01 02 03 04 05 06 07
+// a1: 16 17 18 19 20 21 22 23
+// Output:
+// b0.val[0]: 00 01 02 03 16 17 18 19
+// b0.val[1]: 04 05 06 07 20 21 22 23
+inline int16x8x2_t VtrnqS64(int32x4_t a0, int32x4_t a1) {
+  int16x8x2_t b0;
+  b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
+                           vreinterpret_s16_s32(vget_low_s32(a1)));
+  b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
+                           vreinterpret_s16_s32(vget_high_s32(a1)));
+  return b0;
+}
+
+inline uint16x8x2_t VtrnqU64(uint32x4_t a0, uint32x4_t a1) {
+  uint16x8x2_t b0;
+  b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
+                           vreinterpret_u16_u32(vget_low_u32(a1)));
+  b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
+                           vreinterpret_u16_u32(vget_high_u32(a1)));
+  return b0;
+}
+
+// Input:
+// 00 01 02 03
+// 10 11 12 13
+// 20 21 22 23
+// 30 31 32 33
+inline void Transpose4x4(uint16x4_t a[4]) {
+  // b:
+  // 00 10 02 12
+  // 01 11 03 13
+  const uint16x4x2_t b = vtrn_u16(a[0], a[1]);
+  // c:
+  // 20 30 22 32
+  // 21 31 23 33
+  const uint16x4x2_t c = vtrn_u16(a[2], a[3]);
+  // d:
+  // 00 10 20 30
+  // 02 12 22 32
+  const uint32x2x2_t d =
+      vtrn_u32(vreinterpret_u32_u16(b.val[0]), vreinterpret_u32_u16(c.val[0]));
+  // e:
+  // 01 11 21 31
+  // 03 13 23 33
+  const uint32x2x2_t e =
+      vtrn_u32(vreinterpret_u32_u16(b.val[1]), vreinterpret_u32_u16(c.val[1]));
+  a[0] = vreinterpret_u16_u32(d.val[0]);
+  a[1] = vreinterpret_u16_u32(e.val[0]);
+  a[2] = vreinterpret_u16_u32(d.val[1]);
+  a[3] = vreinterpret_u16_u32(e.val[1]);
+}
+
+// Input:
+// a: 00 01 02 03 10 11 12 13
+// b: 20 21 22 23 30 31 32 33
+// Output:
+// Note that columns [1] and [2] are transposed.
+// a: 00 10 20 30 02 12 22 32
+// b: 01 11 21 31 03 13 23 33
+inline void Transpose4x4(uint8x8_t* a, uint8x8_t* b) {
+  const uint16x4x2_t c =
+      vtrn_u16(vreinterpret_u16_u8(*a), vreinterpret_u16_u8(*b));
+  const uint32x2x2_t d =
+      vtrn_u32(vreinterpret_u32_u16(c.val[0]), vreinterpret_u32_u16(c.val[1]));
+  const uint8x8x2_t e =
+      vtrn_u8(vreinterpret_u8_u32(d.val[0]), vreinterpret_u8_u32(d.val[1]));
+  *a = e.val[0];
+  *b = e.val[1];
+}
+
+// 4x8 Input:
+// a[0]: 00 01 02 03 04 05 06 07
+// a[1]: 10 11 12 13 14 15 16 17
+// a[2]: 20 21 22 23 24 25 26 27
+// a[3]: 30 31 32 33 34 35 36 37
+// 8x4 Output:
+// a[0]: 00 10 20 30 04 14 24 34
+// a[1]: 01 11 21 31 05 15 25 35
+// a[2]: 02 12 22 32 06 16 26 36
+// a[3]: 03 13 23 33 07 17 27 37
+inline void Transpose4x8(uint16x8_t a[4]) {
+  // b0.val[0]: 00 10 02 12 04 14 06 16
+  // b0.val[1]: 01 11 03 13 05 15 07 17
+  // b1.val[0]: 20 30 22 32 24 34 26 36
+  // b1.val[1]: 21 31 23 33 25 35 27 37
+  const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
+  const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);
+
+  // c0.val[0]: 00 10 20 30 04 14 24 34
+  // c0.val[1]: 02 12 22 32 06 16 26 36
+  // c1.val[0]: 01 11 21 31 05 15 25 35
+  // c1.val[1]: 03 13 23 33 07 17 27 37
+  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
+                                    vreinterpretq_u32_u16(b1.val[0]));
+  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
+                                    vreinterpretq_u32_u16(b1.val[1]));
+
+  a[0] = vreinterpretq_u16_u32(c0.val[0]);
+  a[1] = vreinterpretq_u16_u32(c1.val[0]);
+  a[2] = vreinterpretq_u16_u32(c0.val[1]);
+  a[3] = vreinterpretq_u16_u32(c1.val[1]);
+}
+
+// Special transpose for loop filter.
+// 4x8 Input:
+// p_q:  p3 p2 p1 p0 q0 q1 q2 q3
+// a[0]: 00 01 02 03 04 05 06 07
+// a[1]: 10 11 12 13 14 15 16 17
+// a[2]: 20 21 22 23 24 25 26 27
+// a[3]: 30 31 32 33 34 35 36 37
+// 8x4 Output:
+// a[0]: 03 13 23 33 04 14 24 34  p0q0
+// a[1]: 02 12 22 32 05 15 25 35  p1q1
+// a[2]: 01 11 21 31 06 16 26 36  p2q2
+// a[3]: 00 10 20 30 07 17 27 37  p3q3
+// Direct reapplication of the function will reset the high halves, but
+// reverse the low halves:
+// p_q:  p0 p1 p2 p3 q0 q1 q2 q3
+// a[0]: 33 32 31 30 04 05 06 07
+// a[1]: 23 22 21 20 14 15 16 17
+// a[2]: 13 12 11 10 24 25 26 27
+// a[3]: 03 02 01 00 34 35 36 37
+// Simply reordering the inputs (3, 2, 1, 0) will reset the low halves, but
+// reverse the high halves.
+// The standard Transpose4x8 will produce the same reversals, but with the
+// order of the low halves also restored relative to the high halves. This is
+// preferable because it puts all values from the same source row back together,
+// but some post-processing is inevitable.
+inline void LoopFilterTranspose4x8(uint16x8_t a[4]) {
+  // b0.val[0]: 00 10 02 12 04 14 06 16
+  // b0.val[1]: 01 11 03 13 05 15 07 17
+  // b1.val[0]: 20 30 22 32 24 34 26 36
+  // b1.val[1]: 21 31 23 33 25 35 27 37
+  const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
+  const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);
+
+  // Reverse odd vectors to bring the appropriate items to the front of zips.
+  // b0.val[0]: 00 10 02 12 04 14 06 16
+  // r0       : 03 13 01 11 07 17 05 15
+  // b1.val[0]: 20 30 22 32 24 34 26 36
+  // r1       : 23 33 21 31 27 37 25 35
+  const uint32x4_t r0 = vrev64q_u32(vreinterpretq_u32_u16(b0.val[1]));
+  const uint32x4_t r1 = vrev64q_u32(vreinterpretq_u32_u16(b1.val[1]));
+
+  // Zip to complete the halves.
+  // c0.val[0]: 00 10 20 30 02 12 22 32  p3p1
+  // c0.val[1]: 04 14 24 34 06 16 26 36  q0q2
+  // c1.val[0]: 03 13 23 33 01 11 21 31  p0p2
+  // c1.val[1]: 07 17 27 37 05 15 25 35  q3q1
+  const uint32x4x2_t c0 = vzipq_u32(vreinterpretq_u32_u16(b0.val[0]),
+                                    vreinterpretq_u32_u16(b1.val[0]));
+  const uint32x4x2_t c1 = vzipq_u32(r0, r1);
+
+  // d0.val[0]: 00 10 20 30 07 17 27 37  p3q3
+  // d0.val[1]: 02 12 22 32 05 15 25 35  p1q1
+  // d1.val[0]: 03 13 23 33 04 14 24 34  p0q0
+  // d1.val[1]: 01 11 21 31 06 16 26 36  p2q2
+  const uint16x8x2_t d0 = VtrnqU64(c0.val[0], c1.val[1]);
+  // The third row of c comes first here to swap p2 with q0.
+  const uint16x8x2_t d1 = VtrnqU64(c1.val[0], c0.val[1]);
+
+  // 8x4 Output:
+  // a[0]: 03 13 23 33 04 14 24 34  p0q0
+  // a[1]: 02 12 22 32 05 15 25 35  p1q1
+  // a[2]: 01 11 21 31 06 16 26 36  p2q2
+  // a[3]: 00 10 20 30 07 17 27 37  p3q3
+  a[0] = d1.val[0];  // p0q0
+  a[1] = d0.val[1];  // p1q1
+  a[2] = d1.val[1];  // p2q2
+  a[3] = d0.val[0];  // p3q3
+}
+
+// Reversible if the x4 values are packed next to each other.
+// x4 input / x8 output:
+// a0: 00 01 02 03 40 41 42 43
+// a1: 10 11 12 13 50 51 52 53
+// a2: 20 21 22 23 60 61 62 63
+// a3: 30 31 32 33 70 71 72 73
+// x8 input / x4 output:
+// a0: 00 10 20 30 40 50 60 70
+// a1: 01 11 21 31 41 51 61 71
+// a2: 02 12 22 32 42 52 62 72
+// a3: 03 13 23 33 43 53 63 73
+inline void Transpose8x4(uint8x8_t* a0, uint8x8_t* a1, uint8x8_t* a2,
+                         uint8x8_t* a3) {
+  const uint8x8x2_t b0 = vtrn_u8(*a0, *a1);
+  const uint8x8x2_t b1 = vtrn_u8(*a2, *a3);
+
+  const uint16x4x2_t c0 =
+      vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0]));
+  const uint16x4x2_t c1 =
+      vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1]));
+
+  *a0 = vreinterpret_u8_u16(c0.val[0]);
+  *a1 = vreinterpret_u8_u16(c1.val[0]);
+  *a2 = vreinterpret_u8_u16(c0.val[1]);
+  *a3 = vreinterpret_u8_u16(c1.val[1]);
+}
+
+// Input:
+// a[0]: 00 01 02 03 04 05 06 07
+// a[1]: 10 11 12 13 14 15 16 17
+// a[2]: 20 21 22 23 24 25 26 27
+// a[3]: 30 31 32 33 34 35 36 37
+// a[4]: 40 41 42 43 44 45 46 47
+// a[5]: 50 51 52 53 54 55 56 57
+// a[6]: 60 61 62 63 64 65 66 67
+// a[7]: 70 71 72 73 74 75 76 77
+
+// Output:
+// a[0]: 00 10 20 30 40 50 60 70
+// a[1]: 01 11 21 31 41 51 61 71
+// a[2]: 02 12 22 32 42 52 62 72
+// a[3]: 03 13 23 33 43 53 63 73
+// a[4]: 04 14 24 34 44 54 64 74
+// a[5]: 05 15 25 35 45 55 65 75
+// a[6]: 06 16 26 36 46 56 66 76
+// a[7]: 07 17 27 37 47 57 67 77
+inline void Transpose8x8(int8x8_t a[8]) {
+  // Swap 8 bit elements. Goes from:
+  // a[0]: 00 01 02 03 04 05 06 07
+  // a[1]: 10 11 12 13 14 15 16 17
+  // a[2]: 20 21 22 23 24 25 26 27
+  // a[3]: 30 31 32 33 34 35 36 37
+  // a[4]: 40 41 42 43 44 45 46 47
+  // a[5]: 50 51 52 53 54 55 56 57
+  // a[6]: 60 61 62 63 64 65 66 67
+  // a[7]: 70 71 72 73 74 75 76 77
+  // to:
+  // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56
+  // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57
+  // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76
+  // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77
+  const int8x16x2_t b0 =
+      vtrnq_s8(vcombine_s8(a[0], a[4]), vcombine_s8(a[1], a[5]));
+  const int8x16x2_t b1 =
+      vtrnq_s8(vcombine_s8(a[2], a[6]), vcombine_s8(a[3], a[7]));
+
+  // Swap 16 bit elements resulting in:
+  // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74
+  // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76
+  // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75
+  // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77
+  const int16x8x2_t c0 = vtrnq_s16(vreinterpretq_s16_s8(b0.val[0]),
+                                   vreinterpretq_s16_s8(b1.val[0]));
+  const int16x8x2_t c1 = vtrnq_s16(vreinterpretq_s16_s8(b0.val[1]),
+                                   vreinterpretq_s16_s8(b1.val[1]));
+
+  // Unzip 32 bit elements resulting in:
+  // d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+  // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+  // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+  // d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+  const int32x4x2_t d0 = vuzpq_s32(vreinterpretq_s32_s16(c0.val[0]),
+                                   vreinterpretq_s32_s16(c1.val[0]));
+  const int32x4x2_t d1 = vuzpq_s32(vreinterpretq_s32_s16(c0.val[1]),
+                                   vreinterpretq_s32_s16(c1.val[1]));
+
+  a[0] = vreinterpret_s8_s32(vget_low_s32(d0.val[0]));
+  a[1] = vreinterpret_s8_s32(vget_high_s32(d0.val[0]));
+  a[2] = vreinterpret_s8_s32(vget_low_s32(d1.val[0]));
+  a[3] = vreinterpret_s8_s32(vget_high_s32(d1.val[0]));
+  a[4] = vreinterpret_s8_s32(vget_low_s32(d0.val[1]));
+  a[5] = vreinterpret_s8_s32(vget_high_s32(d0.val[1]));
+  a[6] = vreinterpret_s8_s32(vget_low_s32(d1.val[1]));
+  a[7] = vreinterpret_s8_s32(vget_high_s32(d1.val[1]));
+}
+
+// Unsigned.
+inline void Transpose8x8(uint8x8_t a[8]) {
+  const uint8x16x2_t b0 =
+      vtrnq_u8(vcombine_u8(a[0], a[4]), vcombine_u8(a[1], a[5]));
+  const uint8x16x2_t b1 =
+      vtrnq_u8(vcombine_u8(a[2], a[6]), vcombine_u8(a[3], a[7]));
+
+  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
+                                    vreinterpretq_u16_u8(b1.val[0]));
+  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
+                                    vreinterpretq_u16_u8(b1.val[1]));
+
+  const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
+                                    vreinterpretq_u32_u16(c1.val[0]));
+  const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
+                                    vreinterpretq_u32_u16(c1.val[1]));
+
+  a[0] = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
+  a[1] = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
+  a[2] = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
+  a[3] = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
+  a[4] = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
+  a[5] = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
+  a[6] = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
+  a[7] = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
+}
+
+inline void Transpose8x8(uint8x8_t in[8], uint8x16_t out[4]) {
+  const uint8x16x2_t a0 =
+      vtrnq_u8(vcombine_u8(in[0], in[4]), vcombine_u8(in[1], in[5]));
+  const uint8x16x2_t a1 =
+      vtrnq_u8(vcombine_u8(in[2], in[6]), vcombine_u8(in[3], in[7]));
+
+  const uint16x8x2_t b0 = vtrnq_u16(vreinterpretq_u16_u8(a0.val[0]),
+                                    vreinterpretq_u16_u8(a1.val[0]));
+  const uint16x8x2_t b1 = vtrnq_u16(vreinterpretq_u16_u8(a0.val[1]),
+                                    vreinterpretq_u16_u8(a1.val[1]));
+
+  const uint32x4x2_t c0 = vuzpq_u32(vreinterpretq_u32_u16(b0.val[0]),
+                                    vreinterpretq_u32_u16(b1.val[0]));
+  const uint32x4x2_t c1 = vuzpq_u32(vreinterpretq_u32_u16(b0.val[1]),
+                                    vreinterpretq_u32_u16(b1.val[1]));
+
+  out[0] = vreinterpretq_u8_u32(c0.val[0]);
+  out[1] = vreinterpretq_u8_u32(c1.val[0]);
+  out[2] = vreinterpretq_u8_u32(c0.val[1]);
+  out[3] = vreinterpretq_u8_u32(c1.val[1]);
+}
+
+// Input:
+// a[0]: 00 01 02 03 04 05 06 07
+// a[1]: 10 11 12 13 14 15 16 17
+// a[2]: 20 21 22 23 24 25 26 27
+// a[3]: 30 31 32 33 34 35 36 37
+// a[4]: 40 41 42 43 44 45 46 47
+// a[5]: 50 51 52 53 54 55 56 57
+// a[6]: 60 61 62 63 64 65 66 67
+// a[7]: 70 71 72 73 74 75 76 77
+
+// Output:
+// a[0]: 00 10 20 30 40 50 60 70
+// a[1]: 01 11 21 31 41 51 61 71
+// a[2]: 02 12 22 32 42 52 62 72
+// a[3]: 03 13 23 33 43 53 63 73
+// a[4]: 04 14 24 34 44 54 64 74
+// a[5]: 05 15 25 35 45 55 65 75
+// a[6]: 06 16 26 36 46 56 66 76
+// a[7]: 07 17 27 37 47 57 67 77
+inline void Transpose8x8(int16x8_t a[8]) {
+  const int16x8x2_t b0 = vtrnq_s16(a[0], a[1]);
+  const int16x8x2_t b1 = vtrnq_s16(a[2], a[3]);
+  const int16x8x2_t b2 = vtrnq_s16(a[4], a[5]);
+  const int16x8x2_t b3 = vtrnq_s16(a[6], a[7]);
+
+  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+                                   vreinterpretq_s32_s16(b1.val[0]));
+  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+                                   vreinterpretq_s32_s16(b1.val[1]));
+  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
+                                   vreinterpretq_s32_s16(b3.val[0]));
+  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
+                                   vreinterpretq_s32_s16(b3.val[1]));
+
+  const int16x8x2_t d0 = VtrnqS64(c0.val[0], c2.val[0]);
+  const int16x8x2_t d1 = VtrnqS64(c1.val[0], c3.val[0]);
+  const int16x8x2_t d2 = VtrnqS64(c0.val[1], c2.val[1]);
+  const int16x8x2_t d3 = VtrnqS64(c1.val[1], c3.val[1]);
+
+  a[0] = d0.val[0];
+  a[1] = d1.val[0];
+  a[2] = d2.val[0];
+  a[3] = d3.val[0];
+  a[4] = d0.val[1];
+  a[5] = d1.val[1];
+  a[6] = d2.val[1];
+  a[7] = d3.val[1];
+}
+
+// Unsigned.
+inline void Transpose8x8(uint16x8_t a[8]) {
+  const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
+  const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);
+  const uint16x8x2_t b2 = vtrnq_u16(a[4], a[5]);
+  const uint16x8x2_t b3 = vtrnq_u16(a[6], a[7]);
+
+  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
+                                    vreinterpretq_u32_u16(b1.val[0]));
+  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
+                                    vreinterpretq_u32_u16(b1.val[1]));
+  const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]),
+                                    vreinterpretq_u32_u16(b3.val[0]));
+  const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]),
+                                    vreinterpretq_u32_u16(b3.val[1]));
+
+  const uint16x8x2_t d0 = VtrnqU64(c0.val[0], c2.val[0]);
+  const uint16x8x2_t d1 = VtrnqU64(c1.val[0], c3.val[0]);
+  const uint16x8x2_t d2 = VtrnqU64(c0.val[1], c2.val[1]);
+  const uint16x8x2_t d3 = VtrnqU64(c1.val[1], c3.val[1]);
+
+  a[0] = d0.val[0];
+  a[1] = d1.val[0];
+  a[2] = d2.val[0];
+  a[3] = d3.val[0];
+  a[4] = d0.val[1];
+  a[5] = d1.val[1];
+  a[6] = d2.val[1];
+  a[7] = d3.val[1];
+}
+
+// Input:
+// a[0]: 00 01 02 03 04 05 06 07 80 81 82 83 84 85 86 87
+// a[1]: 10 11 12 13 14 15 16 17 90 91 92 93 94 95 96 97
+// a[2]: 20 21 22 23 24 25 26 27 a0 a1 a2 a3 a4 a5 a6 a7
+// a[3]: 30 31 32 33 34 35 36 37 b0 b1 b2 b3 b4 b5 b6 b7
+// a[4]: 40 41 42 43 44 45 46 47 c0 c1 c2 c3 c4 c5 c6 c7
+// a[5]: 50 51 52 53 54 55 56 57 d0 d1 d2 d3 d4 d5 d6 d7
+// a[6]: 60 61 62 63 64 65 66 67 e0 e1 e2 e3 e4 e5 e6 e7
+// a[7]: 70 71 72 73 74 75 76 77 f0 f1 f2 f3 f4 f5 f6 f7
+
+// Output:
+// a[0]: 00 10 20 30 40 50 60 70 80 90 a0 b0 c0 d0 e0 f0
+// a[1]: 01 11 21 31 41 51 61 71 81 91 a1 b1 c1 d1 e1 f1
+// a[2]: 02 12 22 32 42 52 62 72 82 92 a2 b2 c2 d2 e2 f2
+// a[3]: 03 13 23 33 43 53 63 73 83 93 a3 b3 c3 d3 e3 f3
+// a[4]: 04 14 24 34 44 54 64 74 84 94 a4 b4 c4 d4 e4 f4
+// a[5]: 05 15 25 35 45 55 65 75 85 95 a5 b5 c5 d5 e5 f5
+// a[6]: 06 16 26 36 46 56 66 76 86 96 a6 b6 c6 d6 e6 f6
+// a[7]: 07 17 27 37 47 57 67 77 87 97 a7 b7 c7 d7 e7 f7
+inline void Transpose8x16(uint8x16_t a[8]) {
+  // b0.val[0]: 00 10 02 12 04 14 06 16 80 90 82 92 84 94 86 96
+  // b0.val[1]: 01 11 03 13 05 15 07 17 81 91 83 93 85 95 87 97
+  // b1.val[0]: 20 30 22 32 24 34 26 36 a0 b0 a2 b2 a4 b4 a6 b6
+  // b1.val[1]: 21 31 23 33 25 35 27 37 a1 b1 a3 b3 a5 b5 a7 b7
+  // b2.val[0]: 40 50 42 52 44 54 46 56 c0 d0 c2 d2 c4 d4 c6 d6
+  // b2.val[1]: 41 51 43 53 45 55 47 57 c1 d1 c3 d3 c5 d5 c7 d7
+  // b3.val[0]: 60 70 62 72 64 74 66 76 e0 f0 e2 f2 e4 f4 e6 f6
+  // b3.val[1]: 61 71 63 73 65 75 67 77 e1 f1 e3 f3 e5 f5 e7 f7
+  const uint8x16x2_t b0 = vtrnq_u8(a[0], a[1]);
+  const uint8x16x2_t b1 = vtrnq_u8(a[2], a[3]);
+  const uint8x16x2_t b2 = vtrnq_u8(a[4], a[5]);
+  const uint8x16x2_t b3 = vtrnq_u8(a[6], a[7]);
+
+  // c0.val[0]: 00 10 20 30 04 14 24 34 80 90 a0 b0 84 94 a4 b4
+  // c0.val[1]: 02 12 22 32 06 16 26 36 82 92 a2 b2 86 96 a6 b6
+  // c1.val[0]: 01 11 21 31 05 15 25 35 81 91 a1 b1 85 95 a5 b5
+  // c1.val[1]: 03 13 23 33 07 17 27 37 83 93 a3 b3 87 97 a7 b7
+  // c2.val[0]: 40 50 60 70 44 54 64 74 c0 d0 e0 f0 c4 d4 e4 f4
+  // c2.val[1]: 42 52 62 72 46 56 66 76 c2 d2 e2 f2 c6 d6 e6 f6
+  // c3.val[0]: 41 51 61 71 45 55 65 75 c1 d1 e1 f1 c5 d5 e5 f5
+  // c3.val[1]: 43 53 63 73 47 57 67 77 c3 d3 e3 f3 c7 d7 e7 f7
+  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
+                                    vreinterpretq_u16_u8(b1.val[0]));
+  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
+                                    vreinterpretq_u16_u8(b1.val[1]));
+  const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
+                                    vreinterpretq_u16_u8(b3.val[0]));
+  const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
+                                    vreinterpretq_u16_u8(b3.val[1]));
+
+  // d0.val[0]: 00 10 20 30 40 50 60 70 80 90 a0 b0 c0 d0 e0 f0
+  // d0.val[1]: 04 14 24 34 44 54 64 74 84 94 a4 b4 c4 d4 e4 f4
+  // d1.val[0]: 01 11 21 31 41 51 61 71 81 91 a1 b1 c1 d1 e1 f1
+  // d1.val[1]: 05 15 25 35 45 55 65 75 85 95 a5 b5 c5 d5 e5 f5
+  // d2.val[0]: 02 12 22 32 42 52 62 72 82 92 a2 b2 c2 d2 e2 f2
+  // d2.val[1]: 06 16 26 36 46 56 66 76 86 96 a6 b6 c6 d6 e6 f6
+  // d3.val[0]: 03 13 23 33 43 53 63 73 83 93 a3 b3 c3 d3 e3 f3
+  // d3.val[1]: 07 17 27 37 47 57 67 77 87 97 a7 b7 c7 d7 e7 f7
+  const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
+                                    vreinterpretq_u32_u16(c2.val[0]));
+  const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
+                                    vreinterpretq_u32_u16(c3.val[0]));
+  const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
+                                    vreinterpretq_u32_u16(c2.val[1]));
+  const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
+                                    vreinterpretq_u32_u16(c3.val[1]));
+
+  a[0] = vreinterpretq_u8_u32(d0.val[0]);
+  a[1] = vreinterpretq_u8_u32(d1.val[0]);
+  a[2] = vreinterpretq_u8_u32(d2.val[0]);
+  a[3] = vreinterpretq_u8_u32(d3.val[0]);
+  a[4] = vreinterpretq_u8_u32(d0.val[1]);
+  a[5] = vreinterpretq_u8_u32(d1.val[1]);
+  a[6] = vreinterpretq_u8_u32(d2.val[1]);
+  a[7] = vreinterpretq_u8_u32(d3.val[1]);
+}
+
+inline int16x8_t ZeroExtend(const uint8x8_t in) {
+  return vreinterpretq_s16_u16(vmovl_u8(in));
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_ENABLE_NEON
+#endif  // LIBGAV1_SRC_DSP_ARM_COMMON_NEON_H_
diff --git a/src/dsp/arm/common_neon_test.cc b/src/dsp/arm/common_neon_test.cc
new file mode 100644
index 0000000..03aed19
--- /dev/null
+++ b/src/dsp/arm/common_neon_test.cc
@@ -0,0 +1,208 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "src/dsp/arm/common_neon.h" + +#include "gtest/gtest.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_ENABLE_NEON +#include + +#include "tests/block_utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr int kMaxBlockWidth = 16; +constexpr int kMaxBlockHeight = 16; + +template +class TransposeTest : public testing::Test { + public: + TransposeTest() { + for (int y = 0; y < kMaxBlockHeight; ++y) { + for (int x = 0; x < kMaxBlockWidth; ++x) { + src_block_[y][x] = y * 16 + x; + expected_transpose_[y][x] = x * 16 + y; + } + } + } + + TransposeTest(const TransposeTest&) = delete; + TransposeTest& operator=(const TransposeTest&) = delete; + ~TransposeTest() override = default; + + protected: + Pixel src_block_[kMaxBlockHeight][kMaxBlockWidth]; + Pixel expected_transpose_[kMaxBlockHeight][kMaxBlockWidth]; +}; + +using TransposeTestLowBitdepth = TransposeTest; + +TEST_F(TransposeTestLowBitdepth, Transpose4x4Test) { + uint8x8_t a = Load4<1>(src_block_[1], Load4(src_block_[0])); + uint8x8_t b = Load4<1>(src_block_[3], Load4(src_block_[2])); + Transpose4x4(&a, &b); + uint8_t output_4x4[4][4]; + StoreLo4(output_4x4[0], a); + StoreLo4(output_4x4[1], b); + StoreHi4(output_4x4[2], a); + StoreHi4(output_4x4[3], b); + EXPECT_TRUE(test_utils::CompareBlocks(expected_transpose_[0], output_4x4[0], + 4, 4, kMaxBlockWidth, 4, false)); +} + +TEST_F(TransposeTestLowBitdepth, Transpose8x4Test) { + uint8x8_t a0 = Load4<1>(src_block_[4], Load4(src_block_[0])); + uint8x8_t a1 = Load4<1>(src_block_[5], Load4(src_block_[1])); + uint8x8_t a2 = Load4<1>(src_block_[6], Load4(src_block_[2])); + uint8x8_t a3 = Load4<1>(src_block_[7], Load4(src_block_[3])); + Transpose8x4(&a0, &a1, &a2, &a3); + uint8_t output_8x4[4][8]; + vst1_u8(output_8x4[0], a0); + vst1_u8(output_8x4[1], a1); + vst1_u8(output_8x4[2], a2); + vst1_u8(output_8x4[3], a3); + EXPECT_TRUE(test_utils::CompareBlocks(expected_transpose_[0], output_8x4[0], + 8, 4, kMaxBlockWidth, 8, false)); +} + +TEST_F(TransposeTestLowBitdepth, Transpose8x8Test) { + uint8x8_t input_8x8[8]; + for (int i = 0; i < 8; ++i) { + input_8x8[i] = vld1_u8(src_block_[i]); + } + Transpose8x8(input_8x8); + uint8_t output_8x8[8][8]; + for (int i = 0; i < 8; ++i) { + vst1_u8(output_8x8[i], input_8x8[i]); + } + EXPECT_TRUE(test_utils::CompareBlocks(expected_transpose_[0], output_8x8[0], + 8, 8, kMaxBlockWidth, 8, false)); +} + +TEST_F(TransposeTestLowBitdepth, Transpose8x16Test) { + uint8x16_t input_8x16[8]; + for (int i = 0; i < 8; ++i) { + input_8x16[i] = + vcombine_u8(vld1_u8(src_block_[i]), vld1_u8(src_block_[i + 8])); + } + Transpose8x16(input_8x16); + uint8_t output_16x8[8][16]; + for (int i = 0; i < 8; ++i) { + vst1q_u8(output_16x8[i], input_8x16[i]); + } + EXPECT_TRUE(test_utils::CompareBlocks(expected_transpose_[0], output_16x8[0], + 16, 8, kMaxBlockWidth, 16, false)); +} + +using TransposeTestHighBitdepth = TransposeTest; + +TEST_F(TransposeTestHighBitdepth, Transpose4x4Test) { + uint16x4_t input_4x4[4]; + input_4x4[0] = vld1_u16(src_block_[0]); + input_4x4[1] = vld1_u16(src_block_[1]); + input_4x4[2] = vld1_u16(src_block_[2]); + input_4x4[3] = vld1_u16(src_block_[3]); + Transpose4x4(input_4x4); + uint16_t output_4x4[4][4]; + for (int i = 0; i < 4; ++i) { + vst1_u16(output_4x4[i], input_4x4[i]); + } + EXPECT_TRUE(test_utils::CompareBlocks(expected_transpose_[0], output_4x4[0], + 4, 4, kMaxBlockWidth, 4, false)); +} + +TEST_F(TransposeTestHighBitdepth, Transpose4x8Test) { + uint16x8_t input_4x8[4]; + for (int i = 0; i < 4; ++i) { + input_4x8[i] = 
+TEST_F(TransposeTestHighBitdepth, Transpose4x8Test) {
+  uint16x8_t input_4x8[4];
+  for (int i = 0; i < 4; ++i) {
+    input_4x8[i] = vld1q_u16(src_block_[i]);
+  }
+  Transpose4x8(input_4x8);
+  uint16_t output_4x8[4][8];
+  for (int i = 0; i < 4; ++i) {
+    vst1q_u16(output_4x8[i], input_4x8[i]);
+    memcpy(&expected_transpose_[i][4], &expected_transpose_[i + 4][0],
+           4 * sizeof(expected_transpose_[0][0]));
+  }
+  EXPECT_TRUE(test_utils::CompareBlocks(expected_transpose_[0], output_4x8[0],
+                                        8, 4, kMaxBlockWidth, 8, false));
+}
+
+TEST_F(TransposeTestHighBitdepth, LoopFilterTranspose4x8Test) {
+  uint16x8_t input_4x8[4];
+  for (int i = 0; i < 4; ++i) {
+    input_4x8[i] = vld1q_u16(src_block_[i]);
+  }
+  LoopFilterTranspose4x8(input_4x8);
+  uint16_t output_4x8[4][8];
+  for (int i = 0; i < 4; ++i) {
+    vst1q_u16(output_4x8[i], input_4x8[i]);
+  }
+  // a[0]: 03 13 23 33 04 14 24 34  p0q0
+  // a[1]: 02 12 22 32 05 15 25 35  p1q1
+  // a[2]: 01 11 21 31 06 16 26 36  p2q2
+  // a[3]: 00 10 20 30 07 17 27 37  p3q3
+  static constexpr uint16_t expected_output[4][8] = {
+      {0x03, 0x13, 0x23, 0x33, 0x04, 0x14, 0x24, 0x34},
+      {0x02, 0x12, 0x22, 0x32, 0x05, 0x15, 0x25, 0x35},
+      {0x01, 0x11, 0x21, 0x31, 0x06, 0x16, 0x26, 0x36},
+      {0x00, 0x10, 0x20, 0x30, 0x07, 0x17, 0x27, 0x37},
+  };
+  EXPECT_TRUE(test_utils::CompareBlocks(expected_output[0], output_4x8[0], 8, 4,
+                                        8, 8, false));
+}
+
+TEST_F(TransposeTestHighBitdepth, Transpose8x8Test) {
+  uint16x8_t input_8x8[8];
+  for (int i = 0; i < 8; ++i) {
+    input_8x8[i] = vld1q_u16(src_block_[i]);
+  }
+  Transpose8x8(input_8x8);
+  uint16_t output_8x8[8][8];
+  for (int i = 0; i < 8; ++i) {
+    vst1q_u16(output_8x8[i], input_8x8[i]);
+  }
+  EXPECT_TRUE(test_utils::CompareBlocks(expected_transpose_[0], output_8x8[0],
+                                        8, 8, kMaxBlockWidth, 8, false));
+}
+
+TEST_F(TransposeTestHighBitdepth, Transpose8x8SignedTest) {
+  int16x8_t input_8x8[8];
+  for (int i = 0; i < 8; ++i) {
+    input_8x8[i] = vreinterpretq_s16_u16(vld1q_u16(src_block_[i]));
+  }
+  Transpose8x8(input_8x8);
+  uint16_t output_8x8[8][8];
+  for (int i = 0; i < 8; ++i) {
+    vst1q_u16(output_8x8[i], vreinterpretq_u16_s16(input_8x8[i]));
+  }
+  EXPECT_TRUE(test_utils::CompareBlocks(expected_transpose_[0], output_8x8[0],
+                                        8, 8, kMaxBlockWidth, 8, false));
+}
+
+}  // namespace
+}  // namespace dsp
+}  // namespace libgav1
+
+#else  // !LIBGAV1_ENABLE_NEON
+
+TEST(CommonDspTest, NEON) {
+  GTEST_SKIP()
+      << "Build this module for Arm with NEON enabled to enable the tests.";
+}
+
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/convolve_10bit_neon.cc b/src/dsp/arm/convolve_10bit_neon.cc
new file mode 100644
index 0000000..b7205df
--- /dev/null
+++ b/src/dsp/arm/convolve_10bit_neon.cc
@@ -0,0 +1,3008 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "src/dsp/convolve.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10 +#include + +#include +#include +#include + +#include "src/dsp/arm/common_neon.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/compiler_attributes.h" +#include "src/utils/constants.h" + +namespace libgav1 { +namespace dsp { +namespace { + +// Include the constants and utility functions inside the anonymous namespace. +#include "src/dsp/convolve.inc" + +// Output of ConvolveTest.ShowRange below. +// Bitdepth: 10 Input range: [ 0, 1023] +// Horizontal base upscaled range: [ -28644, 94116] +// Horizontal halved upscaled range: [ -14322, 47085] +// Horizontal downscaled range: [ -7161, 23529] +// Vertical upscaled range: [-1317624, 2365176] +// Pixel output range: [ 0, 1023] +// Compound output range: [ 3988, 61532] + +template +int32x4x2_t SumOnePassTaps(const uint16x8_t* const src, + const int16x4_t* const taps) { + const auto* ssrc = reinterpret_cast(src); + int32x4x2_t sum; + if (filter_index < 2) { + // 6 taps. + sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]); + sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]); + sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[2]), taps[2]); + sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[3]), taps[3]); + sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[4]), taps[4]); + sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[5]), taps[5]); + + sum.val[1] = vmull_s16(vget_high_s16(ssrc[0]), taps[0]); + sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[1]), taps[1]); + sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[2]), taps[2]); + sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[3]), taps[3]); + sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[4]), taps[4]); + sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[5]), taps[5]); + } else if (filter_index == 2) { + // 8 taps. + sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]); + sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]); + sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[2]), taps[2]); + sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[3]), taps[3]); + sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[4]), taps[4]); + sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[5]), taps[5]); + sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[6]), taps[6]); + sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[7]), taps[7]); + + sum.val[1] = vmull_s16(vget_high_s16(ssrc[0]), taps[0]); + sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[1]), taps[1]); + sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[2]), taps[2]); + sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[3]), taps[3]); + sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[4]), taps[4]); + sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[5]), taps[5]); + sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[6]), taps[6]); + sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[7]), taps[7]); + } else if (filter_index == 3) { + // 2 taps. + sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]); + sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]); + + sum.val[1] = vmull_s16(vget_high_s16(ssrc[0]), taps[0]); + sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[1]), taps[1]); + } else { + // 4 taps. 
+    sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]);
+    sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]);
+    sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[2]), taps[2]);
+    sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[3]), taps[3]);
+
+    sum.val[1] = vmull_s16(vget_high_s16(ssrc[0]), taps[0]);
+    sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[1]), taps[1]);
+    sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[2]), taps[2]);
+    sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[3]), taps[3]);
+  }
+  return sum;
+}
+
+template <int filter_index>
+int32x4_t SumOnePassTaps(const uint16x4_t* const src,
+                         const int16x4_t* const taps) {
+  const auto* const ssrc = reinterpret_cast<const int16x4_t*>(src);
+  int32x4_t sum;
+  if (filter_index < 2) {
+    // 6 taps.
+    sum = vmull_s16(ssrc[0], taps[0]);
+    sum = vmlal_s16(sum, ssrc[1], taps[1]);
+    sum = vmlal_s16(sum, ssrc[2], taps[2]);
+    sum = vmlal_s16(sum, ssrc[3], taps[3]);
+    sum = vmlal_s16(sum, ssrc[4], taps[4]);
+    sum = vmlal_s16(sum, ssrc[5], taps[5]);
+  } else if (filter_index == 2) {
+    // 8 taps.
+    sum = vmull_s16(ssrc[0], taps[0]);
+    sum = vmlal_s16(sum, ssrc[1], taps[1]);
+    sum = vmlal_s16(sum, ssrc[2], taps[2]);
+    sum = vmlal_s16(sum, ssrc[3], taps[3]);
+    sum = vmlal_s16(sum, ssrc[4], taps[4]);
+    sum = vmlal_s16(sum, ssrc[5], taps[5]);
+    sum = vmlal_s16(sum, ssrc[6], taps[6]);
+    sum = vmlal_s16(sum, ssrc[7], taps[7]);
+  } else if (filter_index == 3) {
+    // 2 taps.
+    sum = vmull_s16(ssrc[0], taps[0]);
+    sum = vmlal_s16(sum, ssrc[1], taps[1]);
+  } else {
+    // 4 taps.
+    sum = vmull_s16(ssrc[0], taps[0]);
+    sum = vmlal_s16(sum, ssrc[1], taps[1]);
+    sum = vmlal_s16(sum, ssrc[2], taps[2]);
+    sum = vmlal_s16(sum, ssrc[3], taps[3]);
+  }
+  return sum;
+}
+
+template <int filter_index, bool is_compound, bool is_2d>
+void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src,
+                                 const ptrdiff_t src_stride,
+                                 void* LIBGAV1_RESTRICT const dest,
+                                 const ptrdiff_t pred_stride, const int width,
+                                 const int height,
+                                 const int16x4_t* const v_tap) {
+  auto* dest16 = static_cast<uint16_t*>(dest);
+  const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+  if (is_2d) {
+    int x = 0;
+    do {
+      const uint16_t* s = src + x;
+      int y = height;
+      do {  // Increasing loop counter x is better.
+        const uint16x8_t src_long = vld1q_u16(s);
+        const uint16x8_t src_long_hi = vld1q_u16(s + 8);
+        uint16x8_t v_src[8];
+        int32x4x2_t v_sum;
+        if (filter_index < 2) {
+          v_src[0] = src_long;
+          v_src[1] = vextq_u16(src_long, src_long_hi, 1);
+          v_src[2] = vextq_u16(src_long, src_long_hi, 2);
+          v_src[3] = vextq_u16(src_long, src_long_hi, 3);
+          v_src[4] = vextq_u16(src_long, src_long_hi, 4);
+          v_src[5] = vextq_u16(src_long, src_long_hi, 5);
+          v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 1);
+        } else if (filter_index == 2) {
+          v_src[0] = src_long;
+          v_src[1] = vextq_u16(src_long, src_long_hi, 1);
+          v_src[2] = vextq_u16(src_long, src_long_hi, 2);
+          v_src[3] = vextq_u16(src_long, src_long_hi, 3);
+          v_src[4] = vextq_u16(src_long, src_long_hi, 4);
+          v_src[5] = vextq_u16(src_long, src_long_hi, 5);
+          v_src[6] = vextq_u16(src_long, src_long_hi, 6);
+          v_src[7] = vextq_u16(src_long, src_long_hi, 7);
+          v_sum = SumOnePassTaps<filter_index>(v_src, v_tap);
+        } else if (filter_index == 3) {
+          v_src[0] = src_long;
+          v_src[1] = vextq_u16(src_long, src_long_hi, 1);
+          v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 3);
+        } else {  // filter_index > 3
+          v_src[0] = src_long;
+          v_src[1] = vextq_u16(src_long, src_long_hi, 1);
+          v_src[2] = vextq_u16(src_long, src_long_hi, 2);
+          v_src[3] = vextq_u16(src_long, src_long_hi, 3);
+          v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 2);
+        }
+
+        const int16x4_t d0 =
+            vqrshrn_n_s32(v_sum.val[0], kInterRoundBitsHorizontal - 1);
+        const int16x4_t d1 =
+            vqrshrn_n_s32(v_sum.val[1], kInterRoundBitsHorizontal - 1);
+        vst1_u16(&dest16[0], vreinterpret_u16_s16(d0));
+        vst1_u16(&dest16[4], vreinterpret_u16_s16(d1));
+        s += src_stride;
+        dest16 += 8;
+      } while (--y != 0);
+      x += 8;
+    } while (x < width);
+    return;
+  }
+  int y = height;
+  do {
+    int x = 0;
+    do {
+      const uint16x8_t src_long = vld1q_u16(src + x);
+      const uint16x8_t src_long_hi = vld1q_u16(src + x + 8);
+      uint16x8_t v_src[8];
+      int32x4x2_t v_sum;
+      if (filter_index < 2) {
+        v_src[0] = src_long;
+        v_src[1] = vextq_u16(src_long, src_long_hi, 1);
+        v_src[2] = vextq_u16(src_long, src_long_hi, 2);
+        v_src[3] = vextq_u16(src_long, src_long_hi, 3);
+        v_src[4] = vextq_u16(src_long, src_long_hi, 4);
+        v_src[5] = vextq_u16(src_long, src_long_hi, 5);
+        v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 1);
+      } else if (filter_index == 2) {
+        v_src[0] = src_long;
+        v_src[1] = vextq_u16(src_long, src_long_hi, 1);
+        v_src[2] = vextq_u16(src_long, src_long_hi, 2);
+        v_src[3] = vextq_u16(src_long, src_long_hi, 3);
+        v_src[4] = vextq_u16(src_long, src_long_hi, 4);
+        v_src[5] = vextq_u16(src_long, src_long_hi, 5);
+        v_src[6] = vextq_u16(src_long, src_long_hi, 6);
+        v_src[7] = vextq_u16(src_long, src_long_hi, 7);
+        v_sum = SumOnePassTaps<filter_index>(v_src, v_tap);
+      } else if (filter_index == 3) {
+        v_src[0] = src_long;
+        v_src[1] = vextq_u16(src_long, src_long_hi, 1);
+        v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 3);
+      } else {  // filter_index > 3
+        v_src[0] = src_long;
+        v_src[1] = vextq_u16(src_long, src_long_hi, 1);
+        v_src[2] = vextq_u16(src_long, src_long_hi, 2);
+        v_src[3] = vextq_u16(src_long, src_long_hi, 3);
+        v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 2);
+      }
+      if (is_compound) {
+        const int16x4_t v_compound_offset = vdup_n_s16(kCompoundOffset);
+        const int16x4_t d0 =
+            vqrshrn_n_s32(v_sum.val[0], kInterRoundBitsHorizontal - 1);
+        const int16x4_t d1 =
+            vqrshrn_n_s32(v_sum.val[1], kInterRoundBitsHorizontal - 1);
+        vst1_u16(&dest16[x],
+                 vreinterpret_u16_s16(vadd_s16(d0, v_compound_offset)));
+        vst1_u16(&dest16[x + 4],
+                 vreinterpret_u16_s16(vadd_s16(d1, v_compound_offset)));
+      } else {
+        // Normally the Horizontal pass does the downshift in two passes:
+        // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+        // kInterRoundBitsHorizontal). Each one uses a rounding shift.
+        // Combining them requires adding the rounding offset from the skipped
+        // shift.
+        const int32x4_t v_first_shift_rounding_bit =
+            vdupq_n_s32(1 << (kInterRoundBitsHorizontal - 2));
+        v_sum.val[0] = vaddq_s32(v_sum.val[0], v_first_shift_rounding_bit);
+        v_sum.val[1] = vaddq_s32(v_sum.val[1], v_first_shift_rounding_bit);
+        const uint16x4_t d0 = vmin_u16(
+            vqrshrun_n_s32(v_sum.val[0], kFilterBits - 1), v_max_bitdepth);
+        const uint16x4_t d1 = vmin_u16(
+            vqrshrun_n_s32(v_sum.val[1], kFilterBits - 1), v_max_bitdepth);
+        vst1_u16(&dest16[x], d0);
+        vst1_u16(&dest16[x + 4], d1);
+      }
+      x += 8;
+    } while (x < width);
+    src += src_stride;
+    dest16 += pred_stride;
+  } while (--y != 0);
+}
+
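+// Editorial sketch (not part of the upstream file): a scalar model of the
+// combined downshift used in the single-pass branches above, with the
+// constants spelled out for bitdepth 10 (kInterRoundBitsHorizontal == 3,
+// kFilterBits == 7, halved taps).
+#if 0
+inline int CombinedHorizontalShiftModel(const int sum) {
+  // Two-pass reference: a rounding shift by 2, then a rounding shift by 4.
+  // Single pass: pre-add the first rounding term (1 << 1), then round-shift
+  // by (kFilterBits - 1) == 6, which is what vqrshrun_n_s32 performs.
+  const int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+  return (sum + first_shift_rounding_bit + (1 << (kFilterBits - 2))) >>
+         (kFilterBits - 1);
+}
+#endif  // 0
+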
+template <int filter_index, bool is_compound, bool is_2d>
+void FilterHorizontalWidth4(const uint16_t* LIBGAV1_RESTRICT src,
+                            const ptrdiff_t src_stride,
+                            void* LIBGAV1_RESTRICT const dest,
+                            const ptrdiff_t pred_stride, const int height,
+                            const int16x4_t* const v_tap) {
+  auto* dest16 = static_cast<uint16_t*>(dest);
+  const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+  int y = height;
+  do {
+    const uint16x8_t v_zero = vdupq_n_u16(0);
+    uint16x4_t v_src[4];
+    int32x4_t v_sum;
+    const uint16x8_t src_long = vld1q_u16(src);
+    v_src[0] = vget_low_u16(src_long);
+    if (filter_index == 3) {
+      v_src[1] = vget_low_u16(vextq_u16(src_long, v_zero, 1));
+      v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 3);
+    } else {
+      v_src[1] = vget_low_u16(vextq_u16(src_long, v_zero, 1));
+      v_src[2] = vget_low_u16(vextq_u16(src_long, v_zero, 2));
+      v_src[3] = vget_low_u16(vextq_u16(src_long, v_zero, 3));
+      v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 2);
+    }
+    if (is_compound || is_2d) {
+      const int16x4_t d0 = vqrshrn_n_s32(v_sum, kInterRoundBitsHorizontal - 1);
+      if (is_compound && !is_2d) {
+        vst1_u16(&dest16[0], vreinterpret_u16_s16(
+                                 vadd_s16(d0, vdup_n_s16(kCompoundOffset))));
+      } else {
+        vst1_u16(&dest16[0], vreinterpret_u16_s16(d0));
+      }
+    } else {
+      const int32x4_t v_first_shift_rounding_bit =
+          vdupq_n_s32(1 << (kInterRoundBitsHorizontal - 2));
+      v_sum = vaddq_s32(v_sum, v_first_shift_rounding_bit);
+      const uint16x4_t d0 =
+          vmin_u16(vqrshrun_n_s32(v_sum, kFilterBits - 1), v_max_bitdepth);
+      vst1_u16(&dest16[0], d0);
+    }
+    src += src_stride;
+    dest16 += pred_stride;
+  } while (--y != 0);
+}
+
+template <int filter_index, bool is_2d>
+void FilterHorizontalWidth2(const uint16_t* LIBGAV1_RESTRICT src,
+                            const ptrdiff_t src_stride,
+                            void* LIBGAV1_RESTRICT const dest,
+                            const ptrdiff_t pred_stride, const int height,
+                            const int16x4_t* const v_tap) {
+  auto* dest16 = static_cast<uint16_t*>(dest);
+  const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+  int y = height >> 1;
+  do {
+    const int16x8_t v_zero = vdupq_n_s16(0);
+    const int16x8_t input0 = vreinterpretq_s16_u16(vld1q_u16(src));
+    const int16x8_t input1 =
+        vreinterpretq_s16_u16(vld1q_u16(src + src_stride));
+    const int16x8x2_t input = vzipq_s16(input0, input1);
+    int32x4_t v_sum;
+    if (filter_index == 3) {
+      v_sum = vmull_s16(vget_low_s16(input.val[0]), v_tap[3]);
+      v_sum = vmlal_s16(v_sum,
+                        vget_low_s16(vextq_s16(input.val[0], input.val[1], 2)),
+                        v_tap[4]);
+    } else {
+      v_sum = vmull_s16(vget_low_s16(input.val[0]), v_tap[2]);
+      v_sum = vmlal_s16(v_sum, vget_low_s16(vextq_s16(input.val[0], v_zero, 2)),
+                        v_tap[3]);
+      v_sum = vmlal_s16(v_sum, vget_low_s16(vextq_s16(input.val[0], v_zero, 4)),
+                        v_tap[4]);
+      v_sum = vmlal_s16(v_sum,
+                        vget_low_s16(vextq_s16(input.val[0], input.val[1], 6)),
+                        v_tap[5]);
+    }
+    if (is_2d) {
+      const uint16x4_t d0 = vreinterpret_u16_s16(
+          vqrshrn_n_s32(v_sum, kInterRoundBitsHorizontal - 1));
+      dest16[0] = vget_lane_u16(d0, 0);
+      dest16[1] = vget_lane_u16(d0, 2);
+      dest16 += pred_stride;
+      dest16[0] = vget_lane_u16(d0, 1);
+      dest16[1] = vget_lane_u16(d0, 3);
+      dest16 += pred_stride;
+    } else {
+      // Normally the Horizontal pass does the downshift in two passes:
+      // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+      // kInterRoundBitsHorizontal). Each one uses a rounding shift.
+      // Combining them requires adding the rounding offset from the skipped
+      // shift.
+      const int32x4_t v_first_shift_rounding_bit =
+          vdupq_n_s32(1 << (kInterRoundBitsHorizontal - 2));
+      v_sum = vaddq_s32(v_sum, v_first_shift_rounding_bit);
+      const uint16x4_t d0 =
+          vmin_u16(vqrshrun_n_s32(v_sum, kFilterBits - 1), v_max_bitdepth);
+      dest16[0] = vget_lane_u16(d0, 0);
+      dest16[1] = vget_lane_u16(d0, 2);
+      dest16 += pred_stride;
+      dest16[0] = vget_lane_u16(d0, 1);
+      dest16[1] = vget_lane_u16(d0, 3);
+      dest16 += pred_stride;
+    }
+    src += src_stride << 1;
+  } while (--y != 0);
+
+  // The 2d filters have an odd |height| because the horizontal pass
+  // generates context for the vertical pass.
+  if (is_2d) {
+    assert(height % 2 == 1);
+    const int16x8_t input = vreinterpretq_s16_u16(vld1q_u16(src));
+    int32x4_t v_sum;
+    if (filter_index == 3) {
+      v_sum = vmull_s16(vget_low_s16(input), v_tap[3]);
+      v_sum =
+          vmlal_s16(v_sum, vget_low_s16(vextq_s16(input, input, 1)), v_tap[4]);
+    } else {
+      v_sum = vmull_s16(vget_low_s16(input), v_tap[2]);
+      v_sum =
+          vmlal_s16(v_sum, vget_low_s16(vextq_s16(input, input, 1)), v_tap[3]);
+      v_sum =
+          vmlal_s16(v_sum, vget_low_s16(vextq_s16(input, input, 2)), v_tap[4]);
+      v_sum =
+          vmlal_s16(v_sum, vget_low_s16(vextq_s16(input, input, 3)), v_tap[5]);
+    }
+    const uint16x4_t d0 = vreinterpret_u16_s16(
+        vqrshrn_n_s32(v_sum, kInterRoundBitsHorizontal - 1));
+    Store2<0>(dest16, d0);
+  }
+}
+
+template <int filter_index, bool is_compound, bool is_2d>
+void FilterHorizontal(const uint16_t* LIBGAV1_RESTRICT const src,
+                      const ptrdiff_t src_stride,
+                      void* LIBGAV1_RESTRICT const dest,
+                      const ptrdiff_t pred_stride, const int width,
+                      const int height, const int16x4_t* const v_tap) {
+  assert(width < 8 || filter_index <= 3);
+  // Don't simplify the redundant if conditions with the template parameters,
+  // which helps the compiler generate compact code.
+  if (width >= 8 && filter_index <= 3) {
+    FilterHorizontalWidth8AndUp<filter_index, is_compound, is_2d>(
+        src, src_stride, dest, pred_stride, width, height, v_tap);
+    return;
+  }
+
+  // Horizontal passes only need to account for number of taps 2 and 4 when
+  // |width| <= 4.
+  assert(width <= 4);
+  assert(filter_index >= 3 && filter_index <= 5);
+  if (filter_index >= 3 && filter_index <= 5) {
+    if (width == 4) {
+      FilterHorizontalWidth4<filter_index, is_compound, is_2d>(
+          src, src_stride, dest, pred_stride, height, v_tap);
+      return;
+    }
+    assert(width == 2);
+    if (!is_compound) {
+      FilterHorizontalWidth2<filter_index, is_2d>(src, src_stride, dest,
+                                                  pred_stride, height, v_tap);
+    }
+  }
+}
+
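+// Editorial sketch (not part of the upstream file): FilterHorizontal is
+// instantiated once per filter_index; for example, a hypothetical 8-tap
+// (filter_index 2), non-compound, single-pass instantiation would look like
+// this.
+#if 0
+inline void EightTapHorizontalExample(const uint16_t* const src,
+                                      const ptrdiff_t src_stride,
+                                      uint16_t* const dest,
+                                      const ptrdiff_t dest_stride,
+                                      const int width, const int height,
+                                      const int16x4_t taps[8]) {
+  FilterHorizontal<2, /*is_compound=*/false, /*is_2d=*/false>(
+      src, src_stride, dest, dest_stride, width, height, taps);
+}
+#endif  // 0
+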
+  int16x4_t v_tap[kSubPixelTaps];
+  assert(filter_id != 0);
+
+  for (int k = 0; k < kSubPixelTaps; ++k) {
+    v_tap[k] = vdup_n_s16(kHalfSubPixelFilters[filter_index][filter_id][k]);
+  }
+
+  if (filter_index == 2) {  // 8 tap.
+    FilterHorizontal<2, is_compound, is_2d>(src, src_stride, dst, dst_stride,
+                                            width, height, v_tap);
+  } else if (filter_index == 1) {  // 6 tap.
+    FilterHorizontal<1, is_compound, is_2d>(src + 1, src_stride, dst,
+                                            dst_stride, width, height, v_tap);
+  } else if (filter_index == 0) {  // 6 tap.
+    FilterHorizontal<0, is_compound, is_2d>(src + 1, src_stride, dst,
+                                            dst_stride, width, height, v_tap);
+  } else if (filter_index == 4) {  // 4 tap.
+    FilterHorizontal<4, is_compound, is_2d>(src + 2, src_stride, dst,
+                                            dst_stride, width, height, v_tap);
+  } else if (filter_index == 5) {  // 4 tap.
+    FilterHorizontal<5, is_compound, is_2d>(src + 2, src_stride, dst,
+                                            dst_stride, width, height, v_tap);
+  } else {  // 2 tap.
+    FilterHorizontal<3, is_compound, is_2d>(src + 3, src_stride, dst,
+                                            dst_stride, width, height, v_tap);
+  }
+}
+
+void ConvolveHorizontal_NEON(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int /*vertical_filter_index*/, const int horizontal_filter_id,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
+  const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  // Set |src| to the outermost tap.
+  const auto* const src =
+      static_cast<const uint16_t*>(reference) - kHorizontalOffset;
+  auto* const dest = static_cast<uint16_t*>(prediction);
+  const ptrdiff_t src_stride = reference_stride >> 1;
+  const ptrdiff_t dst_stride = pred_stride >> 1;
+
+  DoHorizontalPass(src, src_stride, dest, dst_stride, width, height,
+                   horizontal_filter_id, filter_index);
+}
+
+void ConvolveCompoundHorizontal_NEON(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int /*vertical_filter_index*/, const int horizontal_filter_id,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) {
+  const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const auto* const src =
+      static_cast<const uint16_t*>(reference) - kHorizontalOffset;
+  auto* const dest = static_cast<uint16_t*>(prediction);
+  const ptrdiff_t src_stride = reference_stride >> 1;
+
+  DoHorizontalPass</*is_compound=*/true>(src, src_stride, dest, width, width,
+                                         height, horizontal_filter_id,
+                                         filter_index);
+}
+
+template <int filter_index, bool is_compound = false>
+void FilterVertical(const uint16_t* LIBGAV1_RESTRICT const src,
+                    const ptrdiff_t src_stride,
+                    void* LIBGAV1_RESTRICT const dst,
+                    const ptrdiff_t dst_stride, const int width,
+                    const int height, const int16x4_t* const taps) {
+  const int num_taps = GetNumTapsInFilter(filter_index);
+  const int next_row = num_taps - 1;
+  const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+  auto* const dst16 = static_cast<uint16_t*>(dst);
+  assert(width >= 8);
+
+  int x = 0;
+  do {
+    const uint16_t* src_x = src + x;
+    uint16x8_t srcs[8];
+    srcs[0] = vld1q_u16(src_x);
+    src_x += src_stride;
+    if (num_taps >= 4) {
+      srcs[1] = vld1q_u16(src_x);
+      src_x += src_stride;
+      srcs[2] = vld1q_u16(src_x);
+      src_x += src_stride;
+      if (num_taps >= 6) {
+        srcs[3] = vld1q_u16(src_x);
+        src_x += src_stride;
+        srcs[4] = vld1q_u16(src_x);
+        src_x += src_stride;
+        if (num_taps == 8) {
+          srcs[5] = vld1q_u16(src_x);
+          src_x += src_stride;
+          srcs[6] =
vld1q_u16(src_x); + src_x += src_stride; + } + } + } + + // Decreasing the y loop counter produces worse code with clang. + // Don't unroll this loop since it generates too much code and the decoder + // is even slower. + int y = 0; + do { + srcs[next_row] = vld1q_u16(src_x); + src_x += src_stride; + + const int32x4x2_t v_sum = SumOnePassTaps(srcs, taps); + if (is_compound) { + const int16x4_t v_compound_offset = vdup_n_s16(kCompoundOffset); + const int16x4_t d0 = + vqrshrn_n_s32(v_sum.val[0], kInterRoundBitsHorizontal - 1); + const int16x4_t d1 = + vqrshrn_n_s32(v_sum.val[1], kInterRoundBitsHorizontal - 1); + vst1_u16(dst16 + x + y * dst_stride, + vreinterpret_u16_s16(vadd_s16(d0, v_compound_offset))); + vst1_u16(dst16 + x + 4 + y * dst_stride, + vreinterpret_u16_s16(vadd_s16(d1, v_compound_offset))); + } else { + const uint16x4_t d0 = vmin_u16( + vqrshrun_n_s32(v_sum.val[0], kFilterBits - 1), v_max_bitdepth); + const uint16x4_t d1 = vmin_u16( + vqrshrun_n_s32(v_sum.val[1], kFilterBits - 1), v_max_bitdepth); + vst1_u16(dst16 + x + y * dst_stride, d0); + vst1_u16(dst16 + x + 4 + y * dst_stride, d1); + } + + srcs[0] = srcs[1]; + if (num_taps >= 4) { + srcs[1] = srcs[2]; + srcs[2] = srcs[3]; + if (num_taps >= 6) { + srcs[3] = srcs[4]; + srcs[4] = srcs[5]; + if (num_taps == 8) { + srcs[5] = srcs[6]; + srcs[6] = srcs[7]; + } + } + } + } while (++y < height); + x += 8; + } while (x < width); +} + +template +void FilterVertical4xH(const uint16_t* LIBGAV1_RESTRICT src, + const ptrdiff_t src_stride, + void* LIBGAV1_RESTRICT const dst, + const ptrdiff_t dst_stride, const int height, + const int16x4_t* const taps) { + const int num_taps = GetNumTapsInFilter(filter_index); + const int next_row = num_taps - 1; + const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1); + auto* dst16 = static_cast(dst); + + uint16x4_t srcs[9]; + srcs[0] = vld1_u16(src); + src += src_stride; + if (num_taps >= 4) { + srcs[1] = vld1_u16(src); + src += src_stride; + srcs[2] = vld1_u16(src); + src += src_stride; + if (num_taps >= 6) { + srcs[3] = vld1_u16(src); + src += src_stride; + srcs[4] = vld1_u16(src); + src += src_stride; + if (num_taps == 8) { + srcs[5] = vld1_u16(src); + src += src_stride; + srcs[6] = vld1_u16(src); + src += src_stride; + } + } + } + + int y = height; + do { + srcs[next_row] = vld1_u16(src); + src += src_stride; + srcs[num_taps] = vld1_u16(src); + src += src_stride; + + const int32x4_t v_sum = SumOnePassTaps(srcs, taps); + const int32x4_t v_sum_1 = SumOnePassTaps(srcs + 1, taps); + if (is_compound) { + const int16x4_t d0 = vqrshrn_n_s32(v_sum, kInterRoundBitsHorizontal - 1); + const int16x4_t d1 = + vqrshrn_n_s32(v_sum_1, kInterRoundBitsHorizontal - 1); + vst1_u16(dst16, + vreinterpret_u16_s16(vadd_s16(d0, vdup_n_s16(kCompoundOffset)))); + dst16 += dst_stride; + vst1_u16(dst16, + vreinterpret_u16_s16(vadd_s16(d1, vdup_n_s16(kCompoundOffset)))); + dst16 += dst_stride; + } else { + const uint16x4_t d0 = + vmin_u16(vqrshrun_n_s32(v_sum, kFilterBits - 1), v_max_bitdepth); + const uint16x4_t d1 = + vmin_u16(vqrshrun_n_s32(v_sum_1, kFilterBits - 1), v_max_bitdepth); + vst1_u16(dst16, d0); + dst16 += dst_stride; + vst1_u16(dst16, d1); + dst16 += dst_stride; + } + + srcs[0] = srcs[2]; + if (num_taps >= 4) { + srcs[1] = srcs[3]; + srcs[2] = srcs[4]; + if (num_taps >= 6) { + srcs[3] = srcs[5]; + srcs[4] = srcs[6]; + if (num_taps == 8) { + srcs[5] = srcs[7]; + srcs[6] = srcs[8]; + } + } + } + y -= 2; + } while (y != 0); +} + +template +void FilterVertical2xH(const uint16_t* LIBGAV1_RESTRICT src, + 
const ptrdiff_t src_stride, + void* LIBGAV1_RESTRICT const dst, + const ptrdiff_t dst_stride, const int height, + const int16x4_t* const taps) { + const int num_taps = GetNumTapsInFilter(filter_index); + const int next_row = num_taps - 1; + const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1); + auto* dst16 = static_cast(dst); + const uint16x4_t v_zero = vdup_n_u16(0); + + uint16x4_t srcs[9]; + srcs[0] = Load2<0>(src, v_zero); + src += src_stride; + if (num_taps >= 4) { + srcs[0] = Load2<1>(src, srcs[0]); + src += src_stride; + srcs[2] = Load2<0>(src, v_zero); + src += src_stride; + srcs[1] = vext_u16(srcs[0], srcs[2], 2); + if (num_taps >= 6) { + srcs[2] = Load2<1>(src, srcs[2]); + src += src_stride; + srcs[4] = Load2<0>(src, v_zero); + src += src_stride; + srcs[3] = vext_u16(srcs[2], srcs[4], 2); + if (num_taps == 8) { + srcs[4] = Load2<1>(src, srcs[4]); + src += src_stride; + srcs[6] = Load2<0>(src, v_zero); + src += src_stride; + srcs[5] = vext_u16(srcs[4], srcs[6], 2); + } + } + } + + int y = height; + do { + srcs[next_row - 1] = Load2<1>(src, srcs[next_row - 1]); + src += src_stride; + srcs[num_taps] = Load2<0>(src, v_zero); + src += src_stride; + srcs[next_row] = vext_u16(srcs[next_row - 1], srcs[num_taps], 2); + + const int32x4_t v_sum = SumOnePassTaps(srcs, taps); + const uint16x4_t d0 = + vmin_u16(vqrshrun_n_s32(v_sum, kFilterBits - 1), v_max_bitdepth); + Store2<0>(dst16, d0); + dst16 += dst_stride; + Store2<1>(dst16, d0); + dst16 += dst_stride; + + srcs[0] = srcs[2]; + if (num_taps >= 4) { + srcs[1] = srcs[3]; + srcs[2] = srcs[4]; + if (num_taps >= 6) { + srcs[3] = srcs[5]; + srcs[4] = srcs[6]; + if (num_taps == 8) { + srcs[5] = srcs[7]; + srcs[6] = srcs[8]; + } + } + } + y -= 2; + } while (y != 0); +} + +template +int16x8_t SimpleSum2DVerticalTaps(const int16x8_t* const src, + const int16x8_t taps) { + const int16x4_t taps_lo = vget_low_s16(taps); + const int16x4_t taps_hi = vget_high_s16(taps); + int32x4_t sum_lo, sum_hi; + if (num_taps == 8) { + sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 0); + sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 0); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 1); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 1); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 2); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_lo, 2); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_lo, 3); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_lo, 3); + + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[4]), taps_hi, 0); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[4]), taps_hi, 0); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[5]), taps_hi, 1); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[5]), taps_hi, 1); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[6]), taps_hi, 2); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[6]), taps_hi, 2); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[7]), taps_hi, 3); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[7]), taps_hi, 3); + } else if (num_taps == 6) { + sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 1); + sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 1); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 2); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 2); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 3); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_lo, 3); + + sum_lo = 
vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_hi, 0); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_hi, 0); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[4]), taps_hi, 1); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[4]), taps_hi, 1); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[5]), taps_hi, 2); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[5]), taps_hi, 2); + } else if (num_taps == 4) { + sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 2); + sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 2); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 3); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 3); + + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_hi, 0); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_hi, 0); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_hi, 1); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_hi, 1); + } else if (num_taps == 2) { + sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 3); + sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 3); + + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_hi, 0); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_hi, 0); + } + + if (is_compound) { + // Output is compound, so leave signed and do not saturate. Offset will + // accurately bring the value back into positive range. + return vcombine_s16( + vrshrn_n_s32(sum_lo, kInterRoundBitsCompoundVertical - 1), + vrshrn_n_s32(sum_hi, kInterRoundBitsCompoundVertical - 1)); + } + + // Output is pixel, so saturate to clip at 0. + return vreinterpretq_s16_u16( + vcombine_u16(vqrshrun_n_s32(sum_lo, kInterRoundBitsVertical - 1), + vqrshrun_n_s32(sum_hi, kInterRoundBitsVertical - 1))); +} + +template +void Filter2DVerticalWidth8AndUp(const int16_t* LIBGAV1_RESTRICT src, + void* LIBGAV1_RESTRICT const dst, + const ptrdiff_t dst_stride, const int width, + const int height, const int16x8_t taps) { + assert(width >= 8); + constexpr int next_row = num_taps - 1; + const uint16x8_t v_max_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1); + auto* const dst16 = static_cast(dst); + + int x = 0; + do { + int16x8_t srcs[9]; + srcs[0] = vld1q_s16(src); + src += 8; + if (num_taps >= 4) { + srcs[1] = vld1q_s16(src); + src += 8; + srcs[2] = vld1q_s16(src); + src += 8; + if (num_taps >= 6) { + srcs[3] = vld1q_s16(src); + src += 8; + srcs[4] = vld1q_s16(src); + src += 8; + if (num_taps == 8) { + srcs[5] = vld1q_s16(src); + src += 8; + srcs[6] = vld1q_s16(src); + src += 8; + } + } + } + + uint16_t* d16 = dst16 + x; + int y = height; + do { + srcs[next_row] = vld1q_s16(src); + src += 8; + srcs[next_row + 1] = vld1q_s16(src); + src += 8; + const int16x8_t sum0 = + SimpleSum2DVerticalTaps(srcs + 0, taps); + const int16x8_t sum1 = + SimpleSum2DVerticalTaps(srcs + 1, taps); + if (is_compound) { + const int16x8_t v_compound_offset = vdupq_n_s16(kCompoundOffset); + vst1q_u16(d16, + vreinterpretq_u16_s16(vaddq_s16(sum0, v_compound_offset))); + d16 += dst_stride; + vst1q_u16(d16, + vreinterpretq_u16_s16(vaddq_s16(sum1, v_compound_offset))); + d16 += dst_stride; + } else { + vst1q_u16(d16, vminq_u16(vreinterpretq_u16_s16(sum0), v_max_bitdepth)); + d16 += dst_stride; + vst1q_u16(d16, vminq_u16(vreinterpretq_u16_s16(sum1), v_max_bitdepth)); + d16 += dst_stride; + } + srcs[0] = srcs[2]; + if (num_taps >= 4) { + srcs[1] = srcs[3]; + srcs[2] = srcs[4]; + if (num_taps >= 6) { + srcs[3] = srcs[5]; + srcs[4] = srcs[6]; + if (num_taps == 8) { + srcs[5] = 
srcs[7]; + srcs[6] = srcs[8]; + } + } + } + y -= 2; + } while (y != 0); + x += 8; + } while (x < width); +} + +// Take advantage of |src_stride| == |width| to process two rows at a time. +template +void Filter2DVerticalWidth4(const int16_t* LIBGAV1_RESTRICT src, + void* LIBGAV1_RESTRICT const dst, + const ptrdiff_t dst_stride, const int height, + const int16x8_t taps) { + const uint16x8_t v_max_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1); + auto* dst16 = static_cast(dst); + + int16x8_t srcs[9]; + srcs[0] = vld1q_s16(src); + src += 8; + if (num_taps >= 4) { + srcs[2] = vld1q_s16(src); + src += 8; + srcs[1] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[2])); + if (num_taps >= 6) { + srcs[4] = vld1q_s16(src); + src += 8; + srcs[3] = vcombine_s16(vget_high_s16(srcs[2]), vget_low_s16(srcs[4])); + if (num_taps == 8) { + srcs[6] = vld1q_s16(src); + src += 8; + srcs[5] = vcombine_s16(vget_high_s16(srcs[4]), vget_low_s16(srcs[6])); + } + } + } + + int y = height; + do { + srcs[num_taps] = vld1q_s16(src); + src += 8; + srcs[num_taps - 1] = vcombine_s16(vget_high_s16(srcs[num_taps - 2]), + vget_low_s16(srcs[num_taps])); + + const int16x8_t sum = + SimpleSum2DVerticalTaps(srcs, taps); + if (is_compound) { + const int16x8_t v_compound_offset = vdupq_n_s16(kCompoundOffset); + vst1q_u16(dst16, + vreinterpretq_u16_s16(vaddq_s16(sum, v_compound_offset))); + dst16 += 4 << 1; + } else { + const uint16x8_t d0 = + vminq_u16(vreinterpretq_u16_s16(sum), v_max_bitdepth); + vst1_u16(dst16, vget_low_u16(d0)); + dst16 += dst_stride; + vst1_u16(dst16, vget_high_u16(d0)); + dst16 += dst_stride; + } + + srcs[0] = srcs[2]; + if (num_taps >= 4) { + srcs[1] = srcs[3]; + srcs[2] = srcs[4]; + if (num_taps >= 6) { + srcs[3] = srcs[5]; + srcs[4] = srcs[6]; + if (num_taps == 8) { + srcs[5] = srcs[7]; + srcs[6] = srcs[8]; + } + } + } + y -= 2; + } while (y != 0); +} + +// Take advantage of |src_stride| == |width| to process four rows at a time. +template +void Filter2DVerticalWidth2(const int16_t* LIBGAV1_RESTRICT src, + void* LIBGAV1_RESTRICT const dst, + const ptrdiff_t dst_stride, const int height, + const int16x8_t taps) { + constexpr int next_row = (num_taps < 6) ? 
4 : 8; + const uint16x8_t v_max_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1); + auto* dst16 = static_cast(dst); + + int16x8_t srcs[9]; + srcs[0] = vld1q_s16(src); + src += 8; + if (num_taps >= 6) { + srcs[4] = vld1q_s16(src); + src += 8; + srcs[1] = vextq_s16(srcs[0], srcs[4], 2); + if (num_taps == 8) { + srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4])); + srcs[3] = vextq_s16(srcs[0], srcs[4], 6); + } + } + + int y = height; + do { + srcs[next_row] = vld1q_s16(src); + src += 8; + if (num_taps == 2) { + srcs[1] = vextq_s16(srcs[0], srcs[4], 2); + } else if (num_taps == 4) { + srcs[1] = vextq_s16(srcs[0], srcs[4], 2); + srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4])); + srcs[3] = vextq_s16(srcs[0], srcs[4], 6); + } else if (num_taps == 6) { + srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4])); + srcs[3] = vextq_s16(srcs[0], srcs[4], 6); + srcs[5] = vextq_s16(srcs[4], srcs[8], 2); + } else if (num_taps == 8) { + srcs[5] = vextq_s16(srcs[4], srcs[8], 2); + srcs[6] = vcombine_s16(vget_high_s16(srcs[4]), vget_low_s16(srcs[8])); + srcs[7] = vextq_s16(srcs[4], srcs[8], 6); + } + const int16x8_t sum = + SimpleSum2DVerticalTaps(srcs, taps); + const uint16x8_t d0 = vminq_u16(vreinterpretq_u16_s16(sum), v_max_bitdepth); + Store2<0>(dst16, d0); + dst16 += dst_stride; + Store2<1>(dst16, d0); + // When |height| <= 4 the taps are restricted to 2 and 4 tap variants. + // Therefore we don't need to check this condition when |height| > 4. + if (num_taps <= 4 && height == 2) return; + dst16 += dst_stride; + Store2<2>(dst16, d0); + dst16 += dst_stride; + Store2<3>(dst16, d0); + dst16 += dst_stride; + + srcs[0] = srcs[4]; + if (num_taps == 6) { + srcs[1] = srcs[5]; + srcs[4] = srcs[8]; + } else if (num_taps == 8) { + srcs[1] = srcs[5]; + srcs[2] = srcs[6]; + srcs[3] = srcs[7]; + srcs[4] = srcs[8]; + } + + y -= 4; + } while (y != 0); +} + +template +void Filter2DVertical(const int16_t* LIBGAV1_RESTRICT const intermediate_result, + const int width, const int height, const int16x8_t taps, + void* LIBGAV1_RESTRICT const prediction, + const ptrdiff_t pred_stride) { + auto* const dest = static_cast(prediction); + if (width >= 8) { + Filter2DVerticalWidth8AndUp( + intermediate_result, dest, pred_stride, width, height, taps); + } else if (width == 4) { + Filter2DVerticalWidth4(intermediate_result, dest, + pred_stride, height, taps); + } else { + assert(width == 2); + Filter2DVerticalWidth2(intermediate_result, dest, + pred_stride, height, taps); + } +} + +void Convolve2D_NEON(const void* LIBGAV1_RESTRICT const reference, + const ptrdiff_t reference_stride, + const int horizontal_filter_index, + const int vertical_filter_index, + const int horizontal_filter_id, + const int vertical_filter_id, const int width, + const int height, void* LIBGAV1_RESTRICT const prediction, + const ptrdiff_t pred_stride) { + const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width); + const int vert_filter_index = GetFilterIndex(vertical_filter_index, height); + const int vertical_taps = GetNumTapsInFilter(vert_filter_index); + // The output of the horizontal filter is guaranteed to fit in 16 bits. + int16_t intermediate_result[kMaxSuperBlockSizeInPixels * + (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)]; +#if LIBGAV1_MSAN + // Quiet msan warnings. Set with random non-zero value to aid in debugging. 
+  memset(intermediate_result, 0x43, sizeof(intermediate_result));
+#endif
+  const int intermediate_height = height + vertical_taps - 1;
+  const ptrdiff_t src_stride = reference_stride >> 1;
+  const auto* const src = static_cast<const uint16_t*>(reference) -
+                          (vertical_taps / 2 - 1) * src_stride -
+                          kHorizontalOffset;
+  const ptrdiff_t dest_stride = pred_stride >> 1;
+
+  DoHorizontalPass</*is_compound=*/false, /*is_2d=*/true>(
+      src, src_stride, intermediate_result, width, width, intermediate_height,
+      horizontal_filter_id, horiz_filter_index);
+
+  assert(vertical_filter_id != 0);
+  const int16x8_t taps = vmovl_s8(
+      vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]));
+  if (vertical_taps == 8) {
+    Filter2DVertical<8>(intermediate_result, width, height, taps, prediction,
+                        dest_stride);
+  } else if (vertical_taps == 6) {
+    Filter2DVertical<6>(intermediate_result, width, height, taps, prediction,
+                        dest_stride);
+  } else if (vertical_taps == 4) {
+    Filter2DVertical<4>(intermediate_result, width, height, taps, prediction,
+                        dest_stride);
+  } else {  // |vertical_taps| == 2
+    Filter2DVertical<2>(intermediate_result, width, height, taps, prediction,
+                        dest_stride);
+  }
+}
+
+template <int vertical_taps>
+void Compound2DVertical(
+    const int16_t* LIBGAV1_RESTRICT const intermediate_result, const int width,
+    const int height, const int16x8_t taps,
+    void* LIBGAV1_RESTRICT const prediction) {
+  auto* const dest = static_cast<uint16_t*>(prediction);
+  if (width == 4) {
+    Filter2DVerticalWidth4<vertical_taps, /*is_compound=*/true>(
+        intermediate_result, dest, width, height, taps);
+  } else {
+    Filter2DVerticalWidth8AndUp<vertical_taps, /*is_compound=*/true>(
+        intermediate_result, dest, width, width, height, taps);
+  }
+}
+
+void ConvolveCompound2D_NEON(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int vertical_filter_index, const int horizontal_filter_id,
+    const int vertical_filter_id, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) {
+  // The output of the horizontal filter, i.e. the intermediate_result, is
+  // guaranteed to fit in int16_t.
+  int16_t
+      intermediate_result[(kMaxSuperBlockSizeInPixels *
+                           (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1))];
+
+  // Horizontal filter.
+  // Filter types used for width <= 4 are different from those for width > 4.
+  // When width > 4, the valid filter index range is always [0, 3].
+  // When width <= 4, the valid filter index range is always [4, 5].
+  // Similarly for height.
+  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+  const int intermediate_height = height + vertical_taps - 1;
+  const ptrdiff_t src_stride = reference_stride >> 1;
+  const auto* const src = static_cast<const uint16_t*>(reference) -
+                          (vertical_taps / 2 - 1) * src_stride -
+                          kHorizontalOffset;
+
+  DoHorizontalPass</*is_compound=*/true, /*is_2d=*/true>(
+      src, src_stride, intermediate_result, width, width, intermediate_height,
+      horizontal_filter_id, horiz_filter_index);
+
+  // Vertical filter.
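Throughout the compound paths here, the filtered value stays at intermediate precision and is biased by kCompoundOffset so a signed result can be stored in the uint16_t prediction buffer with a bare vreinterpret. A scalar sketch, not part of the patch; the offset value is an assumption inferred from ConvolveCompoundCopy_NEON further down, which computes ((1 << 10) + (1 << 9)) << 4 = (1 << 14) + (1 << 13):

    #include <cstdint>
    constexpr int kCompoundOffset = (1 << 14) + (1 << 13);  // assumed value
    inline uint16_t ToCompoundStorage(int16_t filtered) {
      // vadd_s16 followed by vreinterpret_u16_s16 in the NEON code.
      return static_cast<uint16_t>(filtered + kCompoundOffset);
    }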
+ assert(vertical_filter_id != 0); + const int16x8_t taps = vmovl_s8( + vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id])); + if (vertical_taps == 8) { + Compound2DVertical<8>(intermediate_result, width, height, taps, prediction); + } else if (vertical_taps == 6) { + Compound2DVertical<6>(intermediate_result, width, height, taps, prediction); + } else if (vertical_taps == 4) { + Compound2DVertical<4>(intermediate_result, width, height, taps, prediction); + } else { // |vertical_taps| == 2 + Compound2DVertical<2>(intermediate_result, width, height, taps, prediction); + } +} + +void ConvolveVertical_NEON( + const void* LIBGAV1_RESTRICT const reference, + const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, + const int vertical_filter_index, const int /*horizontal_filter_id*/, + const int vertical_filter_id, const int width, const int height, + void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) { + const int filter_index = GetFilterIndex(vertical_filter_index, height); + const int vertical_taps = GetNumTapsInFilter(filter_index); + const ptrdiff_t src_stride = reference_stride >> 1; + const auto* src = static_cast(reference) - + (vertical_taps / 2 - 1) * src_stride; + auto* const dest = static_cast(prediction); + const ptrdiff_t dest_stride = pred_stride >> 1; + assert(vertical_filter_id != 0); + + int16x4_t taps[8]; + for (int k = 0; k < kSubPixelTaps; ++k) { + taps[k] = + vdup_n_s16(kHalfSubPixelFilters[filter_index][vertical_filter_id][k]); + } + + if (filter_index == 0) { // 6 tap. + if (width == 2) { + FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height, + taps + 1); + } else if (width == 4) { + FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height, + taps + 1); + } else { + FilterVertical<0>(src, src_stride, dest, dest_stride, width, height, + taps + 1); + } + } else if ((static_cast(filter_index == 1) & + (static_cast(vertical_filter_id == 1) | + static_cast(vertical_filter_id == 7) | + static_cast(vertical_filter_id == 8) | + static_cast(vertical_filter_id == 9) | + static_cast(vertical_filter_id == 15))) != 0) { // 6 tap. + if (width == 2) { + FilterVertical2xH<1>(src, src_stride, dest, dest_stride, height, + taps + 1); + } else if (width == 4) { + FilterVertical4xH<1>(src, src_stride, dest, dest_stride, height, + taps + 1); + } else { + FilterVertical<1>(src, src_stride, dest, dest_stride, width, height, + taps + 1); + } + } else if (filter_index == 2) { // 8 tap. + if (width == 2) { + FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps); + } else if (width == 4) { + FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps); + } else { + FilterVertical<2>(src, src_stride, dest, dest_stride, width, height, + taps); + } + } else if (filter_index == 3) { // 2 tap. + if (width == 2) { + FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height, + taps + 3); + } else if (width == 4) { + FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height, + taps + 3); + } else { + FilterVertical<3>(src, src_stride, dest, dest_stride, width, height, + taps + 3); + } + } else { + // 4 tap. When |filter_index| == 1 the |vertical_filter_id| values listed + // below map to 4 tap filters. 
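A scalar sketch of that remapping, not part of the patch: when the outermost coefficients of a 6-tap kernel are zero, starting one row further in and summing four taps reproduces the 6-tap result. That is why this branch reuses the 4-tap kernels with taps + 2 and, for filter bank 1, first advances src by one row; the assert that follows enumerates exactly the qualifying ids.

    // Hypothetical helper: a 6-tap kernel with taps[0] == taps[5] == 0,
    // evaluated as a 4-tap kernel on a window shifted down by one sample.
    inline int SixTapAsFourTap(const int src[6], const int taps[6]) {
      return src[1] * taps[1] + src[2] * taps[2] + src[3] * taps[3] +
             src[4] * taps[4];
    }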
+ assert(filter_index == 5 || filter_index == 4 || + (filter_index == 1 && + (vertical_filter_id == 0 || vertical_filter_id == 2 || + vertical_filter_id == 3 || vertical_filter_id == 4 || + vertical_filter_id == 5 || vertical_filter_id == 6 || + vertical_filter_id == 10 || vertical_filter_id == 11 || + vertical_filter_id == 12 || vertical_filter_id == 13 || + vertical_filter_id == 14))); + // According to GetNumTapsInFilter() this has 6 taps but here we are + // treating it as though it has 4. + if (filter_index == 1) src += src_stride; + if (width == 2) { + FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height, + taps + 2); + } else if (width == 4) { + FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height, + taps + 2); + } else { + FilterVertical<5>(src, src_stride, dest, dest_stride, width, height, + taps + 2); + } + } +} + +void ConvolveCompoundVertical_NEON( + const void* LIBGAV1_RESTRICT const reference, + const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, + const int vertical_filter_index, const int /*horizontal_filter_id*/, + const int vertical_filter_id, const int width, const int height, + void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) { + const int filter_index = GetFilterIndex(vertical_filter_index, height); + const int vertical_taps = GetNumTapsInFilter(filter_index); + const ptrdiff_t src_stride = reference_stride >> 1; + const auto* src = static_cast(reference) - + (vertical_taps / 2 - 1) * src_stride; + auto* const dest = static_cast(prediction); + assert(vertical_filter_id != 0); + + int16x4_t taps[8]; + for (int k = 0; k < kSubPixelTaps; ++k) { + taps[k] = + vdup_n_s16(kHalfSubPixelFilters[filter_index][vertical_filter_id][k]); + } + + if (filter_index == 0) { // 6 tap. + if (width == 4) { + FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps + 1); + } else { + FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width, + width, height, taps + 1); + } + } else if ((static_cast(filter_index == 1) & + (static_cast(vertical_filter_id == 1) | + static_cast(vertical_filter_id == 7) | + static_cast(vertical_filter_id == 8) | + static_cast(vertical_filter_id == 9) | + static_cast(vertical_filter_id == 15))) != 0) { // 6 tap. + if (width == 4) { + FilterVertical4xH<1, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps + 1); + } else { + FilterVertical<1, /*is_compound=*/true>(src, src_stride, dest, width, + width, height, taps + 1); + } + } else if (filter_index == 2) { // 8 tap. + if (width == 4) { + FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); + } else { + FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width, + width, height, taps); + } + } else if (filter_index == 3) { // 2 tap. + if (width == 4) { + FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps + 3); + } else { + FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width, + width, height, taps + 3); + } + } else { + // 4 tap. When |filter_index| == 1 the |filter_id| values listed below map + // to 4 tap filters. 
+ assert(filter_index == 5 || filter_index == 4 || + (filter_index == 1 && + (vertical_filter_id == 2 || vertical_filter_id == 3 || + vertical_filter_id == 4 || vertical_filter_id == 5 || + vertical_filter_id == 6 || vertical_filter_id == 10 || + vertical_filter_id == 11 || vertical_filter_id == 12 || + vertical_filter_id == 13 || vertical_filter_id == 14))); + // According to GetNumTapsInFilter() this has 6 taps but here we are + // treating it as though it has 4. + if (filter_index == 1) src += src_stride; + if (width == 4) { + FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps + 2); + } else { + FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width, + width, height, taps + 2); + } + } +} + +void ConvolveCompoundCopy_NEON( + const void* const reference, const ptrdiff_t reference_stride, + const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/, + const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/, + const int width, const int height, void* const prediction, + const ptrdiff_t /*pred_stride*/) { + const auto* src = static_cast(reference); + const ptrdiff_t src_stride = reference_stride >> 1; + auto* dest = static_cast(prediction); + constexpr int final_shift = + kInterRoundBitsVertical - kInterRoundBitsCompoundVertical; + const uint16x8_t offset = + vdupq_n_u16((1 << kBitdepth10) + (1 << (kBitdepth10 - 1))); + + if (width >= 16) { + int y = height; + do { + int x = 0; + int w = width; + do { + const uint16x8_t v_src_lo = vld1q_u16(&src[x]); + const uint16x8_t v_src_hi = vld1q_u16(&src[x + 8]); + const uint16x8_t v_sum_lo = vaddq_u16(v_src_lo, offset); + const uint16x8_t v_sum_hi = vaddq_u16(v_src_hi, offset); + const uint16x8_t v_dest_lo = vshlq_n_u16(v_sum_lo, final_shift); + const uint16x8_t v_dest_hi = vshlq_n_u16(v_sum_hi, final_shift); + vst1q_u16(&dest[x], v_dest_lo); + vst1q_u16(&dest[x + 8], v_dest_hi); + x += 16; + w -= 16; + } while (w != 0); + src += src_stride; + dest += width; + } while (--y != 0); + } else if (width == 8) { + int y = height; + do { + const uint16x8_t v_src_lo = vld1q_u16(&src[0]); + const uint16x8_t v_src_hi = vld1q_u16(&src[src_stride]); + const uint16x8_t v_sum_lo = vaddq_u16(v_src_lo, offset); + const uint16x8_t v_sum_hi = vaddq_u16(v_src_hi, offset); + const uint16x8_t v_dest_lo = vshlq_n_u16(v_sum_lo, final_shift); + const uint16x8_t v_dest_hi = vshlq_n_u16(v_sum_hi, final_shift); + vst1q_u16(&dest[0], v_dest_lo); + vst1q_u16(&dest[8], v_dest_hi); + src += src_stride << 1; + dest += 16; + y -= 2; + } while (y != 0); + } else { // width == 4 + int y = height; + do { + const uint16x4_t v_src_lo = vld1_u16(&src[0]); + const uint16x4_t v_src_hi = vld1_u16(&src[src_stride]); + const uint16x4_t v_sum_lo = vadd_u16(v_src_lo, vget_low_u16(offset)); + const uint16x4_t v_sum_hi = vadd_u16(v_src_hi, vget_low_u16(offset)); + const uint16x4_t v_dest_lo = vshl_n_u16(v_sum_lo, final_shift); + const uint16x4_t v_dest_hi = vshl_n_u16(v_sum_hi, final_shift); + vst1_u16(&dest[0], v_dest_lo); + vst1_u16(&dest[4], v_dest_hi); + src += src_stride << 1; + dest += 8; + y -= 2; + } while (y != 0); + } +} + +inline void HalfAddHorizontal(const uint16_t* LIBGAV1_RESTRICT const src, + uint16_t* LIBGAV1_RESTRICT const dst) { + const uint16x8_t left = vld1q_u16(src); + const uint16x8_t right = vld1q_u16(src + 1); + vst1q_u16(dst, vrhaddq_u16(left, right)); +} + +inline void HalfAddHorizontal16(const uint16_t* LIBGAV1_RESTRICT const src, + uint16_t* LIBGAV1_RESTRICT const dst) { + 
HalfAddHorizontal(src, dst); + HalfAddHorizontal(src + 8, dst + 8); +} + +template +inline void IntraBlockCopyHorizontal(const uint16_t* LIBGAV1_RESTRICT src, + const ptrdiff_t src_stride, + const int height, + uint16_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t dst_stride) { + const ptrdiff_t src_remainder_stride = src_stride - (width - 16); + const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16); + + int y = height; + do { + HalfAddHorizontal16(src, dst); + if (width >= 32) { + src += 16; + dst += 16; + HalfAddHorizontal16(src, dst); + if (width >= 64) { + src += 16; + dst += 16; + HalfAddHorizontal16(src, dst); + src += 16; + dst += 16; + HalfAddHorizontal16(src, dst); + if (width == 128) { + src += 16; + dst += 16; + HalfAddHorizontal16(src, dst); + src += 16; + dst += 16; + HalfAddHorizontal16(src, dst); + src += 16; + dst += 16; + HalfAddHorizontal16(src, dst); + src += 16; + dst += 16; + HalfAddHorizontal16(src, dst); + } + } + } + src += src_remainder_stride; + dst += dst_remainder_stride; + } while (--y != 0); +} + +void ConvolveIntraBlockCopyHorizontal_NEON( + const void* LIBGAV1_RESTRICT const reference, + const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, + const int /*vertical_filter_index*/, const int /*subpixel_x*/, + const int /*subpixel_y*/, const int width, const int height, + void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) { + assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels); + assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels); + const auto* src = static_cast(reference); + auto* dest = static_cast(prediction); + const ptrdiff_t src_stride = reference_stride >> 1; + const ptrdiff_t dst_stride = pred_stride >> 1; + + if (width == 128) { + IntraBlockCopyHorizontal<128>(src, src_stride, height, dest, dst_stride); + } else if (width == 64) { + IntraBlockCopyHorizontal<64>(src, src_stride, height, dest, dst_stride); + } else if (width == 32) { + IntraBlockCopyHorizontal<32>(src, src_stride, height, dest, dst_stride); + } else if (width == 16) { + IntraBlockCopyHorizontal<16>(src, src_stride, height, dest, dst_stride); + } else if (width == 8) { + int y = height; + do { + HalfAddHorizontal(src, dest); + src += src_stride; + dest += dst_stride; + } while (--y != 0); + } else { // width == 4 + int y = height; + do { + uint16x4x2_t left; + uint16x4x2_t right; + left.val[0] = vld1_u16(src); + right.val[0] = vld1_u16(src + 1); + src += src_stride; + left.val[1] = vld1_u16(src); + right.val[1] = vld1_u16(src + 1); + src += src_stride; + + vst1_u16(dest, vrhadd_u16(left.val[0], right.val[0])); + dest += dst_stride; + vst1_u16(dest, vrhadd_u16(left.val[1], right.val[1])); + dest += dst_stride; + y -= 2; + } while (y != 0); + } +} + +template +inline void IntraBlockCopyVertical(const uint16_t* LIBGAV1_RESTRICT src, + const ptrdiff_t src_stride, const int height, + uint16_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t dst_stride) { + const ptrdiff_t src_remainder_stride = src_stride - (width - 8); + const ptrdiff_t dst_remainder_stride = dst_stride - (width - 8); + uint16x8_t row[8], below[8]; + + row[0] = vld1q_u16(src); + if (width >= 16) { + src += 8; + row[1] = vld1q_u16(src); + if (width >= 32) { + src += 8; + row[2] = vld1q_u16(src); + src += 8; + row[3] = vld1q_u16(src); + if (width == 64) { + src += 8; + row[4] = vld1q_u16(src); + src += 8; + row[5] = vld1q_u16(src); + src += 8; + row[6] = vld1q_u16(src); + src += 8; + row[7] = vld1q_u16(src); + } + } + } + src += src_remainder_stride; + + int y = height; + do { 
+ below[0] = vld1q_u16(src); + if (width >= 16) { + src += 8; + below[1] = vld1q_u16(src); + if (width >= 32) { + src += 8; + below[2] = vld1q_u16(src); + src += 8; + below[3] = vld1q_u16(src); + if (width == 64) { + src += 8; + below[4] = vld1q_u16(src); + src += 8; + below[5] = vld1q_u16(src); + src += 8; + below[6] = vld1q_u16(src); + src += 8; + below[7] = vld1q_u16(src); + } + } + } + src += src_remainder_stride; + + vst1q_u16(dst, vrhaddq_u16(row[0], below[0])); + row[0] = below[0]; + if (width >= 16) { + dst += 8; + vst1q_u16(dst, vrhaddq_u16(row[1], below[1])); + row[1] = below[1]; + if (width >= 32) { + dst += 8; + vst1q_u16(dst, vrhaddq_u16(row[2], below[2])); + row[2] = below[2]; + dst += 8; + vst1q_u16(dst, vrhaddq_u16(row[3], below[3])); + row[3] = below[3]; + if (width >= 64) { + dst += 8; + vst1q_u16(dst, vrhaddq_u16(row[4], below[4])); + row[4] = below[4]; + dst += 8; + vst1q_u16(dst, vrhaddq_u16(row[5], below[5])); + row[5] = below[5]; + dst += 8; + vst1q_u16(dst, vrhaddq_u16(row[6], below[6])); + row[6] = below[6]; + dst += 8; + vst1q_u16(dst, vrhaddq_u16(row[7], below[7])); + row[7] = below[7]; + } + } + } + dst += dst_remainder_stride; + } while (--y != 0); +} + +void ConvolveIntraBlockCopyVertical_NEON( + const void* LIBGAV1_RESTRICT const reference, + const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, + const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/, + const int /*vertical_filter_id*/, const int width, const int height, + void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) { + assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels); + assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels); + const auto* src = static_cast(reference); + auto* dest = static_cast(prediction); + const ptrdiff_t src_stride = reference_stride >> 1; + const ptrdiff_t dst_stride = pred_stride >> 1; + + if (width == 128) { + // Due to register pressure, process two 64xH. 
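For reference, the operation being split across the two 64-wide halves below reduces, per output pixel, to a rounded two-row average; vrhaddq_u16 performs the (a + b + 1) >> 1 step eight lanes at a time. An illustrative scalar equivalent, not part of the patch:

    #include <cstddef>
    #include <cstdint>
    // Scalar model of the vertical intra-copy: dst row y is the rounded
    // average of src rows y and y + 1, so each loaded row is reused once.
    inline void IntraCopyVerticalScalar(const uint16_t* src, ptrdiff_t stride,
                                        int width, int height, uint16_t* dst,
                                        ptrdiff_t dst_stride) {
      for (int y = 0; y < height; ++y) {
        for (int x = 0; x < width; ++x) {
          dst[x] = static_cast<uint16_t>((src[x] + src[stride + x] + 1) >> 1);
        }
        src += stride;
        dst += dst_stride;
      }
    }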
+ for (int i = 0; i < 2; ++i) { + IntraBlockCopyVertical<64>(src, src_stride, height, dest, dst_stride); + src += 64; + dest += 64; + } + } else if (width == 64) { + IntraBlockCopyVertical<64>(src, src_stride, height, dest, dst_stride); + } else if (width == 32) { + IntraBlockCopyVertical<32>(src, src_stride, height, dest, dst_stride); + } else if (width == 16) { + IntraBlockCopyVertical<16>(src, src_stride, height, dest, dst_stride); + } else if (width == 8) { + IntraBlockCopyVertical<8>(src, src_stride, height, dest, dst_stride); + } else { // width == 4 + uint16x4_t row = vld1_u16(src); + src += src_stride; + int y = height; + do { + const uint16x4_t below = vld1_u16(src); + src += src_stride; + vst1_u16(dest, vrhadd_u16(row, below)); + dest += dst_stride; + row = below; + } while (--y != 0); + } +} + +template +inline void IntraBlockCopy2D(const uint16_t* LIBGAV1_RESTRICT src, + const ptrdiff_t src_stride, const int height, + uint16_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t dst_stride) { + const ptrdiff_t src_remainder_stride = src_stride - (width - 8); + const ptrdiff_t dst_remainder_stride = dst_stride - (width - 8); + uint16x8_t row[16]; + row[0] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); + if (width >= 16) { + src += 8; + row[1] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); + if (width >= 32) { + src += 8; + row[2] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); + src += 8; + row[3] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); + if (width >= 64) { + src += 8; + row[4] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); + src += 8; + row[5] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); + src += 8; + row[6] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); + src += 8; + row[7] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); + if (width == 128) { + src += 8; + row[8] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); + src += 8; + row[9] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); + src += 8; + row[10] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); + src += 8; + row[11] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); + src += 8; + row[12] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); + src += 8; + row[13] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); + src += 8; + row[14] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); + src += 8; + row[15] = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); + } + } + } + } + src += src_remainder_stride; + + int y = height; + do { + const uint16x8_t below_0 = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); + vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[0], below_0), 2)); + row[0] = below_0; + if (width >= 16) { + src += 8; + dst += 8; + + const uint16x8_t below_1 = vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); + vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[1], below_1), 2)); + row[1] = below_1; + if (width >= 32) { + src += 8; + dst += 8; + + const uint16x8_t below_2 = + vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); + vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[2], below_2), 2)); + row[2] = below_2; + src += 8; + dst += 8; + + const uint16x8_t below_3 = + vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); + vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[3], below_3), 2)); + row[3] = below_3; + if (width >= 64) { + src += 8; + dst += 8; + + const uint16x8_t below_4 = + vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); + vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[4], below_4), 2)); + row[4] = below_4; + src += 8; + dst += 8; + + const uint16x8_t below_5 = + vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); + vst1q_u16(dst, 
vrshrq_n_u16(vaddq_u16(row[5], below_5), 2)); + row[5] = below_5; + src += 8; + dst += 8; + + const uint16x8_t below_6 = + vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); + vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[6], below_6), 2)); + row[6] = below_6; + src += 8; + dst += 8; + + const uint16x8_t below_7 = + vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); + vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[7], below_7), 2)); + row[7] = below_7; + if (width == 128) { + src += 8; + dst += 8; + + const uint16x8_t below_8 = + vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); + vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[8], below_8), 2)); + row[8] = below_8; + src += 8; + dst += 8; + + const uint16x8_t below_9 = + vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); + vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[9], below_9), 2)); + row[9] = below_9; + src += 8; + dst += 8; + + const uint16x8_t below_10 = + vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); + vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[10], below_10), 2)); + row[10] = below_10; + src += 8; + dst += 8; + + const uint16x8_t below_11 = + vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); + vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[11], below_11), 2)); + row[11] = below_11; + src += 8; + dst += 8; + + const uint16x8_t below_12 = + vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); + vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[12], below_12), 2)); + row[12] = below_12; + src += 8; + dst += 8; + + const uint16x8_t below_13 = + vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); + vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[13], below_13), 2)); + row[13] = below_13; + src += 8; + dst += 8; + + const uint16x8_t below_14 = + vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); + vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[14], below_14), 2)); + row[14] = below_14; + src += 8; + dst += 8; + + const uint16x8_t below_15 = + vaddq_u16(vld1q_u16(src), vld1q_u16(src + 1)); + vst1q_u16(dst, vrshrq_n_u16(vaddq_u16(row[15], below_15), 2)); + row[15] = below_15; + } + } + } + } + src += src_remainder_stride; + dst += dst_remainder_stride; + } while (--y != 0); +} + +void ConvolveIntraBlockCopy2D_NEON( + const void* LIBGAV1_RESTRICT const reference, + const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, + const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/, + const int /*vertical_filter_id*/, const int width, const int height, + void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) { + assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels); + assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels); + const auto* src = static_cast(reference); + auto* dest = static_cast(prediction); + const ptrdiff_t src_stride = reference_stride >> 1; + const ptrdiff_t dst_stride = pred_stride >> 1; + + // Note: allow vertical access to height + 1. Because this function is only + // for u/v plane of intra block copy, such access is guaranteed to be within + // the prediction block. 
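The dispatch below computes, per output pixel, the rounded average of a 2x2 neighborhood. The horizontal pair sums kept in row[] are formed once per source row and shared by the two output rows that need them, which is what the running row[n] = below_n swaps implement. An illustrative scalar equivalent, not part of the patch:

    #include <cstddef>
    #include <cstdint>
    // Scalar model of the 2D intra-copy: (tl + tr + bl + br + 2) >> 2, i.e.
    // two horizontal pair sums combined by vrshrq_n_u16(..., 2) above.
    inline uint16_t IntraCopy2DScalar(const uint16_t* src, ptrdiff_t stride,
                                      int x) {
      return static_cast<uint16_t>((src[x] + src[x + 1] + src[stride + x] +
                                    src[stride + x + 1] + 2) >>
                                   2);
    }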
+ + if (width == 128) { + IntraBlockCopy2D<128>(src, src_stride, height, dest, dst_stride); + } else if (width == 64) { + IntraBlockCopy2D<64>(src, src_stride, height, dest, dst_stride); + } else if (width == 32) { + IntraBlockCopy2D<32>(src, src_stride, height, dest, dst_stride); + } else if (width == 16) { + IntraBlockCopy2D<16>(src, src_stride, height, dest, dst_stride); + } else if (width == 8) { + IntraBlockCopy2D<8>(src, src_stride, height, dest, dst_stride); + } else { // width == 4 + uint16x4_t row0 = vadd_u16(vld1_u16(src), vld1_u16(src + 1)); + src += src_stride; + + int y = height; + do { + const uint16x4_t row1 = vadd_u16(vld1_u16(src), vld1_u16(src + 1)); + src += src_stride; + const uint16x4_t row2 = vadd_u16(vld1_u16(src), vld1_u16(src + 1)); + src += src_stride; + const uint16x4_t result_01 = vrshr_n_u16(vadd_u16(row0, row1), 2); + const uint16x4_t result_12 = vrshr_n_u16(vadd_u16(row1, row2), 2); + vst1_u16(dest, result_01); + dest += dst_stride; + vst1_u16(dest, result_12); + dest += dst_stride; + row0 = row2; + y -= 2; + } while (y != 0); + } +} + +// ----------------------------------------------------------------------------- +// Scaled Convolve + +// There are many opportunities for overreading in scaled convolve, because the +// range of starting points for filter windows is anywhere from 0 to 16 for 8 +// destination pixels, and the window sizes range from 2 to 8. To accommodate +// this range concisely, we use |grade_x| to mean the most steps in src that can +// be traversed in a single |step_x| increment, i.e. 1 or 2. When grade_x is 2, +// we are guaranteed to exceed 8 whole steps in src for every 8 |step_x| +// increments. The first load covers the initial elements of src_x, while the +// final load covers the taps. +template +inline uint8x16x3_t LoadSrcVals(const uint16_t* const src_x) { + uint8x16x3_t ret; + // When fractional step size is less than or equal to 1, the rightmost + // starting value for a filter may be at position 7. For an 8-tap filter, the + // rightmost value for the final tap may be at position 14. Therefore we load + // 2 vectors of eight 16-bit values. + ret.val[0] = vreinterpretq_u8_u16(vld1q_u16(src_x)); + ret.val[1] = vreinterpretq_u8_u16(vld1q_u16(src_x + 8)); +#if LIBGAV1_MSAN + // Initialize to quiet msan warnings when grade_x <= 1. + ret.val[2] = vdupq_n_u8(0); +#endif + if (grade_x > 1) { + // When fractional step size is greater than 1 (up to 2), the rightmost + // starting value for a filter may be at position 15. For an 8-tap filter, + // the rightmost value for the final tap may be at position 22. Therefore we + // load 3 vectors of eight 16-bit values. + ret.val[2] = vreinterpretq_u8_u16(vld1q_u16(src_x + 16)); + } + return ret; +} + +// Assemble 4 values corresponding to one tap position across multiple filters. +// This is a simple case because maximum offset is 8 and only smaller filters +// work on 4xH. +inline uint16x4_t PermuteSrcVals(const uint8x16x3_t src_bytes, + const uint8x8_t indices) { + const uint8x16x2_t src_bytes2 = {src_bytes.val[0], src_bytes.val[1]}; + return vreinterpret_u16_u8(VQTbl2U8(src_bytes2, indices)); +} + +// Assemble 8 values corresponding to one tap position across multiple filters. +// This requires a lot of workaround on A32 architectures, so it may be worth +// using an overall different algorithm for that architecture. 
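As background for the scaled kernels that follow, a scalar model of the horizontal pass may help: output n derives both its window start and its filter id from p = subpixel_x + n * step_x. The NEON code precomputes these per-lane offsets (index_steps, filter_indices) and then gathers samples with the byte tables described above. The sketch is not part of the patch; the constants are the libgav1 values implied by the "& 1023" and shift-by-6 in these kernels and should be treated as assumptions:

    #include <cstdint>
    constexpr int kScaleSubPixelBits = 10;  // assumed value
    constexpr int kFilterIndexShift = 6;    // assumed value
    constexpr int kSubPixelMask = 15;       // assumed value
    // |taps| is pre-transposed: taps[k][id] holds tap k of sub-pixel filter
    // |id|, mirroring the GetPositive2TapFilter()-style column tables.
    inline void ScaledHorizontalScalar(const uint16_t* src, int subpixel_x,
                                       int step_x, int width, int num_taps,
                                       const int8_t taps[][16], int* out) {
      const int ref_x = subpixel_x >> kScaleSubPixelBits;
      for (int n = 0; n < width; ++n) {
        const int p = subpixel_x + n * step_x;
        const int filter_id = (p >> kFilterIndexShift) & kSubPixelMask;
        const uint16_t* s = src + (p >> kScaleSubPixelBits) - ref_x;
        int sum = 0;
        for (int k = 0; k < num_taps; ++k) sum += taps[k][filter_id] * s[k];
        out[n] = sum;  // The NEON code then rounds this into |intermediate|.
      }
    }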
+template <int grade_x>
+inline uint16x8_t PermuteSrcVals(const uint8x16x3_t src_bytes,
+                                 const uint8x16_t indices) {
+  if (grade_x == 1) {
+    const uint8x16x2_t src_bytes2 = {src_bytes.val[0], src_bytes.val[1]};
+    return vreinterpretq_u16_u8(VQTbl2QU8(src_bytes2, indices));
+  }
+  return vreinterpretq_u16_u8(VQTbl3QU8(src_bytes, indices));
+}
+
+// Pre-transpose the 2 tap filters in |kAbsHalfSubPixelFilters|[3].
+// Although the taps need to be converted to 16-bit values, they must be
+// arranged by table lookup, which is more expensive for larger types than
+// lengthening in-loop. |tap_index| refers to the index within a kernel
+// applied to a single value.
+inline int8x16_t GetPositive2TapFilter(const int tap_index) {
+  assert(tap_index < 2);
+  alignas(
+      16) static constexpr int8_t kAbsHalfSubPixel2TapFilterColumns[2][16] = {
+      {64, 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4},
+      {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60}};
+
+  return vld1q_s8(kAbsHalfSubPixel2TapFilterColumns[tap_index]);
+}
+
+template <int grade_x>
+inline void ConvolveKernelHorizontal2Tap(
+    const uint16_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    const int width, const int subpixel_x, const int step_x,
+    const int intermediate_height, int16_t* LIBGAV1_RESTRICT intermediate) {
+  // Account for the 0-taps that precede the 2 nonzero taps in the spec.
+  const int kernel_offset = 3;
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  const int step_x8 = step_x << 3;
+  const int8x16_t filter_taps0 = GetPositive2TapFilter(0);
+  const int8x16_t filter_taps1 = GetPositive2TapFilter(1);
+  const uint16x8_t index_steps = vmulq_n_u16(
+      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+
+  int p = subpixel_x;
+  if (width <= 4) {
+    const uint16_t* src_y = src;
+    // Only add steps to the 10-bit truncated p to avoid overflow.
+    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+    const uint8x8_t filter_indices =
+        vand_u8(vshrn_n_u16(subpel_index_offsets, 6), filter_index_mask);
+    // Each lane of taps[k] corresponds to one output value along the row,
+    // containing kSubPixelFilters[filter_index][filter_id][k], where
+    // filter_id depends on x.
+    const int16x4_t taps[2] = {
+        vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps0, filter_indices))),
+        vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps1, filter_indices)))};
+    // Lower byte of Nth value is at position 2*N.
+    // Narrowing shift is not available here because the maximum shift
+    // parameter is 8.
+    const uint8x8_t src_indices0 = vshl_n_u8(
+        vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits)), 1);
+    // Upper byte of Nth value is at position 2*N+1.
+    const uint8x8_t src_indices1 = vadd_u8(src_indices0, vdup_n_u8(1));
+    // Only 4 values needed.
+    const uint8x8_t src_indices = InterleaveLow8(src_indices0, src_indices1);
+    const uint8x8_t src_lookup[2] = {src_indices,
+                                     vadd_u8(src_indices, vdup_n_u8(2))};
+
+    int y = intermediate_height;
+    do {
+      const uint16_t* src_x =
+          src_y + (p >> kScaleSubPixelBits) - ref_x + kernel_offset;
+      // Load a pool of samples to select from using stepped indices.
+      const uint8x16x3_t src_bytes = LoadSrcVals<1>(src_x);
+      // Each lane corresponds to a different filter kernel.
+ const uint16x4_t src[2] = {PermuteSrcVals(src_bytes, src_lookup[0]), + PermuteSrcVals(src_bytes, src_lookup[1])}; + + vst1_s16(intermediate, + vrshrn_n_s32(SumOnePassTaps(src, taps), + kInterRoundBitsHorizontal - 1)); + src_y = AddByteStride(src_y, src_stride); + intermediate += kIntermediateStride; + } while (--y != 0); + return; + } + + // |width| >= 8 + int16_t* intermediate_x = intermediate; + int x = 0; + do { + const uint16_t* src_x = + src + (p >> kScaleSubPixelBits) - ref_x + kernel_offset; + // Only add steps to the 10-bit truncated p to avoid overflow. + const uint16x8_t p_fraction = vdupq_n_u16(p & 1023); + const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction); + const uint8x8_t filter_indices = + vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift), + filter_index_mask); + // Each lane of lane of taps[k] corresponds to one output value along the + // row, containing kSubPixelFilters[filter_index][filter_id][k], where + // filter_id depends on x. + const int16x8_t taps[2] = { + vmovl_s8(VQTbl1S8(filter_taps0, filter_indices)), + vmovl_s8(VQTbl1S8(filter_taps1, filter_indices))}; + const int16x4_t taps_low[2] = {vget_low_s16(taps[0]), + vget_low_s16(taps[1])}; + const int16x4_t taps_high[2] = {vget_high_s16(taps[0]), + vget_high_s16(taps[1])}; + // Lower byte of Nth value is at position 2*N. + const uint8x8_t src_indices0 = vshl_n_u8( + vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits)), 1); + // Upper byte of Nth value is at position 2*N+1. + const uint8x8_t src_indices1 = vadd_u8(src_indices0, vdup_n_u8(1)); + const uint8x8x2_t src_indices_zip = vzip_u8(src_indices0, src_indices1); + const uint8x16_t src_indices = + vcombine_u8(src_indices_zip.val[0], src_indices_zip.val[1]); + const uint8x16_t src_lookup[2] = {src_indices, + vaddq_u8(src_indices, vdupq_n_u8(2))}; + + int y = intermediate_height; + do { + // Load a pool of samples to select from using stepped indices. + const uint8x16x3_t src_bytes = LoadSrcVals(src_x); + // Each lane corresponds to a different filter kernel. + const uint16x8_t src[2] = { + PermuteSrcVals(src_bytes, src_lookup[0]), + PermuteSrcVals(src_bytes, src_lookup[1])}; + const uint16x4_t src_low[2] = {vget_low_u16(src[0]), + vget_low_u16(src[1])}; + const uint16x4_t src_high[2] = {vget_high_u16(src[0]), + vget_high_u16(src[1])}; + + vst1_s16(intermediate_x, vrshrn_n_s32(SumOnePassTaps( + src_low, taps_low), + kInterRoundBitsHorizontal - 1)); + vst1_s16( + intermediate_x + 4, + vrshrn_n_s32(SumOnePassTaps(src_high, taps_high), + kInterRoundBitsHorizontal - 1)); + // Avoid right shifting the stride. + src_x = AddByteStride(src_x, src_stride); + intermediate_x += kIntermediateStride; + } while (--y != 0); + x += 8; + p += step_x8; + } while (x < width); +} + +// Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[5]. +inline int8x16_t GetPositive4TapFilter(const int tap_index) { + assert(tap_index < 4); + alignas( + 16) static constexpr int8_t kSubPixel4TapPositiveFilterColumns[4][16] = { + {0, 15, 13, 11, 10, 9, 8, 7, 6, 6, 5, 4, 3, 2, 2, 1}, + {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17}, + {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31}, + {0, 1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11, 13, 15}}; + + return vld1q_s8(kSubPixel4TapPositiveFilterColumns[tap_index]); +} + +// This filter is only possible when width <= 4. 
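A note on the kernel_offset values used by these kernels (3 for the 2-tap filter above, 2 for the 4-tap kernels below, 1 for the 6-tap kernel further down): the spec defines every sub-pixel filter as an 8-tap window with zeros outside the active taps, so each shorter kernel starts (8 - num_taps) / 2 samples into that window. A hypothetical helper capturing the pattern, not part of the patch:

    // Offset of the first nonzero tap inside the 8-tap window (an assumed
    // generalization of the hard-coded kernel_offset constants).
    constexpr int KernelOffset(int num_taps) { return (8 - num_taps) / 2; }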
+inline void ConvolveKernelHorizontalPositive4Tap( + const uint16_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride, + const int subpixel_x, const int step_x, const int intermediate_height, + int16_t* LIBGAV1_RESTRICT intermediate) { + // Account for the 0-taps that precede the 2 nonzero taps in the spec. + const int kernel_offset = 2; + const int ref_x = subpixel_x >> kScaleSubPixelBits; + const int8x16_t filter_taps0 = GetPositive4TapFilter(0); + const int8x16_t filter_taps1 = GetPositive4TapFilter(1); + const int8x16_t filter_taps2 = GetPositive4TapFilter(2); + const int8x16_t filter_taps3 = GetPositive4TapFilter(3); + const uint16x8_t index_steps = vmulq_n_u16( + vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast(step_x)); + const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask); + + int p = subpixel_x; + // Only add steps to the 10-bit truncated p to avoid overflow. + const uint16x8_t p_fraction = vdupq_n_u16(p & 1023); + const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction); + const uint8x8_t filter_indices = + vand_u8(vshrn_n_u16(subpel_index_offsets, 6), filter_index_mask); + // Each lane of lane of taps[k] corresponds to one output value along the row, + // containing kSubPixelFilters[filter_index][filter_id][k], where filter_id + // depends on x. + const int16x4_t taps[4] = { + vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps0, filter_indices))), + vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps1, filter_indices))), + vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps2, filter_indices))), + vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps3, filter_indices)))}; + // Lower byte of Nth value is at position 2*N. + // Narrowing shift is not available here because the maximum shift + // parameter is 8. + const uint8x8_t src_indices0 = vshl_n_u8( + vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits)), 1); + // Upper byte of Nth value is at position 2*N+1. + const uint8x8_t src_indices1 = vadd_u8(src_indices0, vdup_n_u8(1)); + // Only 4 values needed. + const uint8x8_t src_indices_base = InterleaveLow8(src_indices0, src_indices1); + + uint8x8_t src_lookup[4]; + const uint8x8_t two = vdup_n_u8(2); + src_lookup[0] = src_indices_base; + for (int i = 1; i < 4; ++i) { + src_lookup[i] = vadd_u8(src_lookup[i - 1], two); + } + + const uint16_t* src_y = + src + (p >> kScaleSubPixelBits) - ref_x + kernel_offset; + int y = intermediate_height; + do { + // Load a pool of samples to select from using stepped indices. + const uint8x16x3_t src_bytes = LoadSrcVals<1>(src_y); + // Each lane corresponds to a different filter kernel. + const uint16x4_t src[4] = {PermuteSrcVals(src_bytes, src_lookup[0]), + PermuteSrcVals(src_bytes, src_lookup[1]), + PermuteSrcVals(src_bytes, src_lookup[2]), + PermuteSrcVals(src_bytes, src_lookup[3])}; + + vst1_s16(intermediate, + vrshrn_n_s32(SumOnePassTaps(src, taps), + kInterRoundBitsHorizontal - 1)); + src_y = AddByteStride(src_y, src_stride); + intermediate += kIntermediateStride; + } while (--y != 0); +} + +// Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[4]. 
+inline int8x16_t GetSigned4TapFilter(const int tap_index) {
+  assert(tap_index < 4);
+  alignas(16) static constexpr int8_t
+      kAbsHalfSubPixel4TapSignedFilterColumns[4][16] = {
+          {-0, -2, -4, -5, -6, -6, -7, -6, -6, -5, -5, -5, -4, -3, -2, -1},
+          {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+          {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+          {-0, -1, -2, -3, -4, -5, -5, -5, -6, -6, -7, -6, -6, -5, -4, -2}};
+
+  return vld1q_s8(kAbsHalfSubPixel4TapSignedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width <= 4.
+inline void ConvolveKernelHorizontalSigned4Tap(
+    const uint16_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    const int subpixel_x, const int step_x, const int intermediate_height,
+    int16_t* LIBGAV1_RESTRICT intermediate) {
+  const int kernel_offset = 2;
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+  const int8x16_t filter_taps0 = GetSigned4TapFilter(0);
+  const int8x16_t filter_taps1 = GetSigned4TapFilter(1);
+  const int8x16_t filter_taps2 = GetSigned4TapFilter(2);
+  const int8x16_t filter_taps3 = GetSigned4TapFilter(3);
+  const uint16x8_t index_steps = vmulq_n_u16(
+      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+
+  const int p = subpixel_x;
+  // Only add steps to the 10-bit truncated p to avoid overflow.
+  const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+  const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+  const uint8x8_t filter_indices =
+      vand_u8(vshrn_n_u16(subpel_index_offsets, 6), filter_index_mask);
+  // Each lane of taps[k] corresponds to one output value along the row,
+  // containing kSubPixelFilters[filter_index][filter_id][k], where filter_id
+  // depends on x.
+  const int16x4_t taps[4] = {
+      vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps0, filter_indices))),
+      vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps1, filter_indices))),
+      vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps2, filter_indices))),
+      vget_low_s16(vmovl_s8(VQTbl1S8(filter_taps3, filter_indices)))};
+  // Lower byte of Nth value is at position 2*N.
+  // Narrowing shift is not available here because the maximum shift
+  // parameter is 8.
+  const uint8x8_t src_indices0 = vshl_n_u8(
+      vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits)), 1);
+  // Upper byte of Nth value is at position 2*N+1.
+  const uint8x8_t src_indices1 = vadd_u8(src_indices0, vdup_n_u8(1));
+  // Only 4 values needed.
+  const uint8x8_t src_indices_base =
+      InterleaveLow8(src_indices0, src_indices1);
+
+  uint8x8_t src_lookup[4];
+  const uint8x8_t two = vdup_n_u8(2);
+  src_lookup[0] = src_indices_base;
+  for (int i = 1; i < 4; ++i) {
+    src_lookup[i] = vadd_u8(src_lookup[i - 1], two);
+  }
+
+  const uint16_t* src_y =
+      src + (p >> kScaleSubPixelBits) - ref_x + kernel_offset;
+  int y = intermediate_height;
+  do {
+    // Load a pool of samples to select from using stepped indices.
+    const uint8x16x3_t src_bytes = LoadSrcVals<1>(src_y);
+    // Each lane corresponds to a different filter kernel.
+    const uint16x4_t src[4] = {PermuteSrcVals(src_bytes, src_lookup[0]),
+                               PermuteSrcVals(src_bytes, src_lookup[1]),
+                               PermuteSrcVals(src_bytes, src_lookup[2]),
+                               PermuteSrcVals(src_bytes, src_lookup[3])};
+
+    vst1_s16(intermediate,
+             vrshrn_n_s32(SumOnePassTaps</*filter_index=*/4>(src, taps),
+                          kInterRoundBitsHorizontal - 1));
+    src_y = AddByteStride(src_y, src_stride);
+    intermediate += kIntermediateStride;
+  } while (--y != 0);
+}
+
+// Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[0].
+inline int8x16_t GetSigned6TapFilter(const int tap_index) {
+  assert(tap_index < 6);
+  alignas(16) static constexpr int8_t
+      kAbsHalfSubPixel6TapSignedFilterColumns[6][16] = {
+          {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0},
+          {-0, -3, -5, -6, -7, -7, -8, -7, -7, -6, -6, -6, -5, -4, -2, -1},
+          {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+          {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+          {-0, -1, -2, -4, -5, -6, -6, -6, -7, -7, -8, -7, -7, -6, -5, -3},
+          {0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}};
+
+  return vld1q_s8(kAbsHalfSubPixel6TapSignedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width >= 8.
+template <int grade_x>
+inline void ConvolveKernelHorizontalSigned6Tap(
+    const uint16_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    const int width, const int subpixel_x, const int step_x,
+    const int intermediate_height,
+    int16_t* LIBGAV1_RESTRICT const intermediate) {
+  const int kernel_offset = 1;
+  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  const int step_x8 = step_x << 3;
+  int8x16_t filter_taps[6];
+  for (int i = 0; i < 6; ++i) {
+    filter_taps[i] = GetSigned6TapFilter(i);
+  }
+  const uint16x8_t index_steps = vmulq_n_u16(
+      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+
+  int16_t* intermediate_x = intermediate;
+  int x = 0;
+  int p = subpixel_x;
+  do {
+    const uint16_t* src_x =
+        src + (p >> kScaleSubPixelBits) - ref_x + kernel_offset;
+    // Only add steps to the 10-bit truncated p to avoid overflow.
+    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+    const uint8x8_t filter_indices =
+        vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+                filter_index_mask);
+
+    // Each lane of taps_(low|high)[k] corresponds to one output value
+    // along the row, containing kSubPixelFilters[filter_index][filter_id][k],
+    // where filter_id depends on x.
+    int16x4_t taps_low[6];
+    int16x4_t taps_high[6];
+    for (int i = 0; i < 6; ++i) {
+      const int16x8_t taps_i =
+          vmovl_s8(VQTbl1S8(filter_taps[i], filter_indices));
+      taps_low[i] = vget_low_s16(taps_i);
+      taps_high[i] = vget_high_s16(taps_i);
+    }
+
+    // Lower byte of Nth value is at position 2*N.
+    const uint8x8_t src_indices0 = vshl_n_u8(
+        vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits)), 1);
+    // Upper byte of Nth value is at position 2*N+1.
+    const uint8x8_t src_indices1 = vadd_u8(src_indices0, vdup_n_u8(1));
+    const uint8x8x2_t src_indices_zip = vzip_u8(src_indices0, src_indices1);
+    const uint8x16_t src_indices_base =
+        vcombine_u8(src_indices_zip.val[0], src_indices_zip.val[1]);
+
+    uint8x16_t src_lookup[6];
+    const uint8x16_t two = vdupq_n_u8(2);
+    src_lookup[0] = src_indices_base;
+    for (int i = 1; i < 6; ++i) {
+      src_lookup[i] = vaddq_u8(src_lookup[i - 1], two);
+    }
+
+    int y = intermediate_height;
+    do {
+      // Load a pool of samples to select from using stepped indices.
+      const uint8x16x3_t src_bytes = LoadSrcVals<grade_x>(src_x);
+
+      uint16x4_t src_low[6];
+      uint16x4_t src_high[6];
+      for (int i = 0; i < 6; ++i) {
+        const uint16x8_t src_i = PermuteSrcVals(src_bytes, src_lookup[i]);
+        src_low[i] = vget_low_u16(src_i);
+        src_high[i] = vget_high_u16(src_i);
+      }
+
+      vst1_s16(intermediate_x,
+               vrshrn_n_s32(SumOnePassTaps</*filter_index=*/0>(src_low,
+                                                               taps_low),
+                            kInterRoundBitsHorizontal - 1));
+      vst1_s16(
+          intermediate_x + 4,
+          vrshrn_n_s32(SumOnePassTaps</*filter_index=*/0>(src_high, taps_high),
+                       kInterRoundBitsHorizontal - 1));
+      // Avoid right shifting the stride.
+      src_x = AddByteStride(src_x, src_stride);
+      intermediate_x += kIntermediateStride;
+    } while (--y != 0);
+    x += 8;
+    p += step_x8;
+  } while (x < width);
+}
+
+// Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[1]. This filter
+// has mixed positive and negative outer taps depending on the filter id.
+inline int8x16_t GetMixed6TapFilter(const int tap_index) {
+  assert(tap_index < 6);
+  alignas(16) static constexpr int8_t
+      kAbsHalfSubPixel6TapMixedFilterColumns[6][16] = {
+          {0, 1, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0},
+          {0, 14, 13, 11, 10, 9, 8, 8, 7, 6, 5, 4, 3, 2, 2, 1},
+          {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
+          {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
+          {0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 13, 14},
+          {0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 1}};
+
+  return vld1q_s8(kAbsHalfSubPixel6TapMixedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width >= 8.
+template <int grade_x>
+inline void ConvolveKernelHorizontalMixed6Tap(
+    const uint16_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    const int width, const int subpixel_x, const int step_x,
+    const int intermediate_height,
+    int16_t* LIBGAV1_RESTRICT const intermediate) {
+  const int kernel_offset = 1;
+  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  const int step_x8 = step_x << 3;
+  int8x16_t filter_taps[6];
+  for (int i = 0; i < 6; ++i) {
+    filter_taps[i] = GetMixed6TapFilter(i);
+  }
+  const uint16x8_t index_steps = vmulq_n_u16(
+      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+
+  int16_t* intermediate_x = intermediate;
+  int x = 0;
+  int p = subpixel_x;
+  do {
+    const uint16_t* src_x =
+        src + (p >> kScaleSubPixelBits) - ref_x + kernel_offset;
+    // Only add steps to the 10-bit truncated p to avoid overflow.
+    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+
+    const uint8x8_t filter_indices =
+        vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+                filter_index_mask);
+    // Each lane of taps_(low|high)[k] corresponds to one output value
+    // along the row, containing kSubPixelFilters[filter_index][filter_id][k],
+    // where filter_id depends on x.
+    int16x4_t taps_low[6];
+    int16x4_t taps_high[6];
+    for (int i = 0; i < 6; ++i) {
+      const int16x8_t taps =
+          vmovl_s8(VQTbl1S8(filter_taps[i], filter_indices));
+      taps_low[i] = vget_low_s16(taps);
+      taps_high[i] = vget_high_s16(taps);
+    }
+
+    // Lower byte of Nth value is at position 2*N.
+    const uint8x8_t src_indices0 = vshl_n_u8(
+        vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits)), 1);
+    // Upper byte of Nth value is at position 2*N+1.
+    const uint8x8_t src_indices1 = vadd_u8(src_indices0, vdup_n_u8(1));
+    const uint8x8x2_t src_indices_zip = vzip_u8(src_indices0, src_indices1);
+    const uint8x16_t src_indices_base =
+        vcombine_u8(src_indices_zip.val[0], src_indices_zip.val[1]);
+
+    uint8x16_t src_lookup[6];
+    const uint8x16_t two = vdupq_n_u8(2);
+    src_lookup[0] = src_indices_base;
+    for (int i = 1; i < 6; ++i) {
+      src_lookup[i] = vaddq_u8(src_lookup[i - 1], two);
+    }
+
+    int y = intermediate_height;
+    do {
+      // Load a pool of samples to select from using stepped indices.
+      const uint8x16x3_t src_bytes = LoadSrcVals<grade_x>(src_x);
+
+      uint16x4_t src_low[6];
+      uint16x4_t src_high[6];
+      for (int i = 0; i < 6; ++i) {
+        const uint16x8_t src_i = PermuteSrcVals(src_bytes, src_lookup[i]);
+        src_low[i] = vget_low_u16(src_i);
+        src_high[i] = vget_high_u16(src_i);
+      }
+
+      vst1_s16(intermediate_x,
+               vrshrn_n_s32(SumOnePassTaps</*filter_index=*/1>(src_low,
+                                                               taps_low),
+                            kInterRoundBitsHorizontal - 1));
+      vst1_s16(
+          intermediate_x + 4,
+          vrshrn_n_s32(SumOnePassTaps</*filter_index=*/1>(src_high, taps_high),
+                       kInterRoundBitsHorizontal - 1));
+      // Avoid right shifting the stride.
+      src_x = AddByteStride(src_x, src_stride);
+      intermediate_x += kIntermediateStride;
+    } while (--y != 0);
+    x += 8;
+    p += step_x8;
+  } while (x < width);
+}
+
+// Pre-transpose the 8 tap filters in |kAbsHalfSubPixelFilters|[2].
+inline int8x16_t GetSigned8TapFilter(const int tap_index) {
+  assert(tap_index < 8);
+  alignas(16) static constexpr int8_t
+      kAbsHalfSubPixel8TapSignedFilterColumns[8][16] = {
+          {-0, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -0},
+          {0, 1, 3, 4, 5, 5, 5, 5, 6, 5, 4, 4, 3, 3, 2, 1},
+          {-0, -3, -6, -9, -11, -11, -12, -12, -12, -11, -10, -9, -7, -5, -3,
+           -1},
+          {64, 63, 62, 60, 58, 54, 50, 45, 40, 35, 30, 24, 19, 13, 8, 4},
+          {0, 4, 8, 13, 19, 24, 30, 35, 40, 45, 50, 54, 58, 60, 62, 63},
+          {-0, -1, -3, -5, -7, -9, -10, -11, -12, -12, -12, -11, -11, -9, -6,
+           -3},
+          {0, 1, 2, 3, 3, 4, 4, 5, 6, 5, 5, 5, 5, 4, 3, 1},
+          {-0, -0, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1}};
+
+  return vld1q_s8(kAbsHalfSubPixel8TapSignedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width >= 8.
+template <int grade_x>
+inline void ConvolveKernelHorizontalSigned8Tap(
+    const uint16_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    const int width, const int subpixel_x, const int step_x,
+    const int intermediate_height,
+    int16_t* LIBGAV1_RESTRICT const intermediate) {
+  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  const int step_x8 = step_x << 3;
+  int8x16_t filter_taps[8];
+  for (int i = 0; i < 8; ++i) {
+    filter_taps[i] = GetSigned8TapFilter(i);
+  }
+  const uint16x8_t index_steps = vmulq_n_u16(
+      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+  int16_t* intermediate_x = intermediate;
+  int x = 0;
+  int p = subpixel_x;
+  do {
+    const uint16_t* src_x = src + (p >> kScaleSubPixelBits) - ref_x;
+    // Only add steps to the 10-bit truncated p to avoid overflow.
+    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+
+    const uint8x8_t filter_indices =
+        vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+                filter_index_mask);
+
+    // Lower byte of Nth value is at position 2*N.
+    const uint8x8_t src_indices0 = vshl_n_u8(
+        vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits)), 1);
+    // Upper byte of Nth value is at position 2*N+1.
+    const uint8x8_t src_indices1 = vadd_u8(src_indices0, vdup_n_u8(1));
+    const uint8x8x2_t src_indices_zip = vzip_u8(src_indices0, src_indices1);
+    const uint8x16_t src_indices_base =
+        vcombine_u8(src_indices_zip.val[0], src_indices_zip.val[1]);
+
+    uint8x16_t src_lookup[8];
+    const uint8x16_t two = vdupq_n_u8(2);
+    src_lookup[0] = src_indices_base;
+    for (int i = 1; i < 8; ++i) {
+      src_lookup[i] = vaddq_u8(src_lookup[i - 1], two);
+    }
+    // Each lane of taps_(low|high)[k] corresponds to one output value
+    // along the row, containing kSubPixelFilters[filter_index][filter_id][k],
+    // where filter_id depends on x.
+    int16x4_t taps_low[8];
+    int16x4_t taps_high[8];
+    for (int i = 0; i < 8; ++i) {
+      const int16x8_t taps =
+          vmovl_s8(VQTbl1S8(filter_taps[i], filter_indices));
+      taps_low[i] = vget_low_s16(taps);
+      taps_high[i] = vget_high_s16(taps);
+    }
+
+    int y = intermediate_height;
+    do {
+      // Load a pool of samples to select from using stepped indices.
+      const uint8x16x3_t src_bytes = LoadSrcVals<grade_x>(src_x);
+
+      uint16x4_t src_low[8];
+      uint16x4_t src_high[8];
+      for (int i = 0; i < 8; ++i) {
+        const uint16x8_t src_i = PermuteSrcVals(src_bytes, src_lookup[i]);
+        src_low[i] = vget_low_u16(src_i);
+        src_high[i] = vget_high_u16(src_i);
+      }
+
+      vst1_s16(intermediate_x,
+               vrshrn_n_s32(SumOnePassTaps</*filter_index=*/2>(src_low,
+                                                               taps_low),
+                            kInterRoundBitsHorizontal - 1));
+      vst1_s16(
+          intermediate_x + 4,
+          vrshrn_n_s32(SumOnePassTaps</*filter_index=*/2>(src_high, taps_high),
+                       kInterRoundBitsHorizontal - 1));
+      // Avoid right shifting the stride.
+      src_x = AddByteStride(src_x, src_stride);
+      intermediate_x += kIntermediateStride;
+    } while (--y != 0);
+    x += 8;
+    p += step_x8;
+  } while (x < width);
+}
+
+// Process 16 bit inputs and output 32 bits.
+template <int num_taps, bool is_compound>
+inline int16x4_t Sum2DVerticalTaps4(const int16x4_t* const src,
+                                    const int16x8_t taps) {
+  const int16x4_t taps_lo = vget_low_s16(taps);
+  const int16x4_t taps_hi = vget_high_s16(taps);
+  int32x4_t sum;
+  if (num_taps == 8) {
+    sum = vmull_lane_s16(src[0], taps_lo, 0);
+    sum = vmlal_lane_s16(sum, src[1], taps_lo, 1);
+    sum = vmlal_lane_s16(sum, src[2], taps_lo, 2);
+    sum = vmlal_lane_s16(sum, src[3], taps_lo, 3);
+    sum = vmlal_lane_s16(sum, src[4], taps_hi, 0);
+    sum = vmlal_lane_s16(sum, src[5], taps_hi, 1);
+    sum = vmlal_lane_s16(sum, src[6], taps_hi, 2);
+    sum = vmlal_lane_s16(sum, src[7], taps_hi, 3);
+  } else if (num_taps == 6) {
+    sum = vmull_lane_s16(src[0], taps_lo, 1);
+    sum = vmlal_lane_s16(sum, src[1], taps_lo, 2);
+    sum = vmlal_lane_s16(sum, src[2], taps_lo, 3);
+    sum = vmlal_lane_s16(sum, src[3], taps_hi, 0);
+    sum = vmlal_lane_s16(sum, src[4], taps_hi, 1);
+    sum = vmlal_lane_s16(sum, src[5], taps_hi, 2);
+  } else if (num_taps == 4) {
+    sum = vmull_lane_s16(src[0], taps_lo, 2);
+    sum = vmlal_lane_s16(sum, src[1], taps_lo, 3);
+    sum = vmlal_lane_s16(sum, src[2], taps_hi, 0);
+    sum = vmlal_lane_s16(sum, src[3], taps_hi, 1);
+  } else if (num_taps == 2) {
+    sum = vmull_lane_s16(src[0], taps_lo, 3);
+    sum = vmlal_lane_s16(sum, src[1], taps_hi, 0);
+  }
+
+  if (is_compound) {
+    return vrshrn_n_s32(sum, kInterRoundBitsCompoundVertical - 1);
+  }
+
+  return vreinterpret_s16_u16(
+      vqrshrun_n_s32(sum, kInterRoundBitsVertical - 1));
+}
+
+template <int num_taps, int grade_y, int width, bool is_compound>
+void ConvolveVerticalScale2Or4xH(const int16_t* LIBGAV1_RESTRICT const src,
+                                 const int subpixel_y, const int filter_index,
+                                 const int step_y, const int height,
+                                 void* LIBGAV1_RESTRICT const dest,
+                                 const ptrdiff_t dest_stride) {
+  static_assert(width == 2 || width == 4, "");
+  // We increment stride with the 8-bit pointer and then reinterpret to avoid
+  // shifting |dest_stride|.
+  auto* dest_y = static_cast<uint16_t*>(dest);
+  // In compound mode, |dest_stride| is based on the size of uint16_t, rather
+  // than bytes.
+  auto* compound_dest_y = static_cast<uint16_t*>(dest);
+  // This stride always corresponds to int16_t.
+  constexpr ptrdiff_t src_stride = kIntermediateStride;
+  const int16_t* src_y = src;
+  int16x4_t s[num_taps + grade_y];
+
+  int p = subpixel_y & 1023;
+  int prev_p = p;
+  int y = height;
+  do {
+    for (int i = 0; i < num_taps; ++i) {
+      s[i] = vld1_s16(src_y + i * src_stride);
+    }
+    int filter_id = (p >> 6) & kSubPixelMask;
+    int16x8_t filter =
+        vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+    int16x4_t sums = Sum2DVerticalTaps4<num_taps, is_compound>(s, filter);
+    if (is_compound) {
+      assert(width != 2);
+      // This offset potentially overflows into the sign bit, but should yield
+      // the correct unsigned value.
+      const uint16x4_t result =
+          vreinterpret_u16_s16(vadd_s16(sums, vdup_n_s16(kCompoundOffset)));
+      vst1_u16(compound_dest_y, result);
+      compound_dest_y += dest_stride;
+    } else {
+      const uint16x4_t result = vmin_u16(vreinterpret_u16_s16(sums),
+                                         vdup_n_u16((1 << kBitdepth10) - 1));
+      if (width == 2) {
+        Store2<0>(dest_y, result);
+      } else {
+        vst1_u16(dest_y, result);
+      }
+      dest_y = AddByteStride(dest_y, dest_stride);
+    }
+    p += step_y;
+    const int p_diff =
+        (p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits);
+    prev_p = p;
+    // Here we load extra source in case it is needed. If |p_diff| == 0, these
+    // values will be unused, but it's faster to load than to branch.
+    s[num_taps] = vld1_s16(src_y + num_taps * src_stride);
+    if (grade_y > 1) {
+      s[num_taps + 1] = vld1_s16(src_y + (num_taps + 1) * src_stride);
+    }
+
+    filter_id = (p >> 6) & kSubPixelMask;
+    filter = vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+    sums = Sum2DVerticalTaps4<num_taps, is_compound>(&s[p_diff], filter);
+    if (is_compound) {
+      assert(width != 2);
+      const uint16x4_t result =
+          vreinterpret_u16_s16(vadd_s16(sums, vdup_n_s16(kCompoundOffset)));
+      vst1_u16(compound_dest_y, result);
+      compound_dest_y += dest_stride;
+    } else {
+      const uint16x4_t result = vmin_u16(vreinterpret_u16_s16(sums),
+                                         vdup_n_u16((1 << kBitdepth10) - 1));
+      if (width == 2) {
+        Store2<0>(dest_y, result);
+      } else {
+        vst1_u16(dest_y, result);
+      }
+      dest_y = AddByteStride(dest_y, dest_stride);
+    }
+    p += step_y;
+    src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+    prev_p = p;
+    y -= 2;
+  } while (y != 0);
+}
+
+template <int num_taps, int grade_y, bool is_compound>
+void ConvolveVerticalScale(const int16_t* LIBGAV1_RESTRICT const source,
+                           const int intermediate_height, const int width,
+                           const int subpixel_y, const int filter_index,
+                           const int step_y, const int height,
+                           void* LIBGAV1_RESTRICT const dest,
+                           const ptrdiff_t dest_stride) {
+  // This stride always corresponds to int16_t.
+  constexpr ptrdiff_t src_stride = kIntermediateStride;
+
+  int16x8_t s[num_taps + 2];
+
+  const int16_t* src = source;
+  int x = 0;
+  do {
+    const int16_t* src_y = src;
+    int p = subpixel_y & 1023;
+    int prev_p = p;
+    // We increment stride with the 8-bit pointer and then reinterpret to
+    // avoid shifting |dest_stride|.
+    auto* dest_y = static_cast<uint16_t*>(dest) + x;
+    // In compound mode, |dest_stride| is based on the size of uint16_t,
+    // rather than bytes.
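+    // Note on the bias used in the compound branches: vadd_s16/vaddq_s16 wrap
+    // modulo 2^16, so even when |sums| + kCompoundOffset crosses into the
+    // int16_t sign bit, reinterpreting the lanes as uint16_t recovers the
+    // intended unsigned value exactly.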
+    auto* compound_dest_y = static_cast<uint16_t*>(dest) + x;
+    int y = height;
+    do {
+      for (int i = 0; i < num_taps; ++i) {
+        s[i] = vld1q_s16(src_y + i * src_stride);
+      }
+      int filter_id = (p >> 6) & kSubPixelMask;
+      int16x8_t filter =
+          vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+      int16x8_t sums =
+          SimpleSum2DVerticalTaps<num_taps, is_compound>(s, filter);
+      if (is_compound) {
+        // This offset potentially overflows int16_t, but should yield the
+        // correct unsigned value.
+        const uint16x8_t result = vreinterpretq_u16_s16(
+            vaddq_s16(sums, vdupq_n_s16(kCompoundOffset)));
+        vst1q_u16(compound_dest_y, result);
+        compound_dest_y += dest_stride;
+      } else {
+        const uint16x8_t result = vminq_u16(
+            vreinterpretq_u16_s16(sums), vdupq_n_u16((1 << kBitdepth10) - 1));
+        vst1q_u16(dest_y, result);
+        dest_y = AddByteStride(dest_y, dest_stride);
+      }
+      p += step_y;
+      const int p_diff =
+          (p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits);
+      prev_p = p;
+      // Here we load extra source in case it is needed. If |p_diff| == 0,
+      // these values will be unused, but it's faster to load than to branch.
+      s[num_taps] = vld1q_s16(src_y + num_taps * src_stride);
+      if (grade_y > 1) {
+        s[num_taps + 1] = vld1q_s16(src_y + (num_taps + 1) * src_stride);
+      }
+
+      filter_id = (p >> 6) & kSubPixelMask;
+      filter =
+          vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+      sums = SimpleSum2DVerticalTaps<num_taps, is_compound>(&s[p_diff],
+                                                            filter);
+      if (is_compound) {
+        assert(width != 2);
+        const uint16x8_t result = vreinterpretq_u16_s16(
+            vaddq_s16(sums, vdupq_n_s16(kCompoundOffset)));
+        vst1q_u16(compound_dest_y, result);
+        compound_dest_y += dest_stride;
+      } else {
+        const uint16x8_t result = vminq_u16(
+            vreinterpretq_u16_s16(sums), vdupq_n_u16((1 << kBitdepth10) - 1));
+        vst1q_u16(dest_y, result);
+        dest_y = AddByteStride(dest_y, dest_stride);
+      }
+      p += step_y;
+      src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+      prev_p = p;
+
+      y -= 2;
+    } while (y != 0);
+    src += kIntermediateStride * intermediate_height;
+    x += 8;
+  } while (x < width);
+}
+
+template <bool is_compound>
+void ConvolveScale2D_NEON(const void* LIBGAV1_RESTRICT const reference,
+                          const ptrdiff_t reference_stride,
+                          const int horizontal_filter_index,
+                          const int vertical_filter_index, const int subpixel_x,
+                          const int subpixel_y, const int step_x,
+                          const int step_y, const int width, const int height,
+                          void* LIBGAV1_RESTRICT const prediction,
+                          const ptrdiff_t pred_stride) {
+  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+  assert(step_x <= 2048);
+  assert(step_y <= 2048);
+  const int num_vert_taps = GetNumTapsInFilter(vert_filter_index);
+  const int intermediate_height =
+      (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+       kScaleSubPixelBits) +
+      num_vert_taps;
+  int16_t intermediate_result[kIntermediateAllocWidth *
+                              (2 * kIntermediateAllocWidth + 8)];
+#if LIBGAV1_MSAN
+  // Quiet msan warnings. Set with random non-zero value to aid in debugging.
+  memset(intermediate_result, 0x54, sizeof(intermediate_result));
+#endif
+  // Horizontal filter.
+  // Filter types used for width <= 4 are different from those for width > 4.
+  // When width > 4, the valid filter index range is always [0, 3].
+  // When width <= 4, the valid filter index range is always [3, 5].
+  // The same applies to height and vertical filter index.
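+  // The switch below dispatches on the remapped index: 0 selects the signed
+  // 6-tap kernel, 1 the mixed 6-tap, 2 the signed 8-tap, 3 the 2-tap, and
+  // 4/5 the signed/positive 4-tap kernels that are only reachable when
+  // width <= 4.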
+  int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  int16_t* intermediate = intermediate_result;
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* src = static_cast<const uint16_t*>(reference);
+  const int vert_kernel_offset = (8 - num_vert_taps) / 2;
+  src = AddByteStride(src, vert_kernel_offset * src_stride);
+
+  // Derive the maximum value of |step_x| at which all source values fit in
+  // one 16-byte (8-value) load. Final index is src_x + |num_taps| - 1 < 16.
+  // step_x*7 is the final base subpel index for the shuffle mask for filter
+  // inputs in each iteration on large blocks. When step_x is large, we need a
+  // larger structure and use a larger table lookup in order to gather all
+  // filter inputs.
+  const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index);
+  // |num_taps| - 1 is the shuffle index of the final filter input.
+  const int kernel_start_ceiling = 16 - num_horiz_taps;
+  // This truncated quotient |grade_x_threshold| selects |step_x| such that:
+  // (step_x * 7) >> kScaleSubPixelBits < single load limit
+  const int grade_x_threshold =
+      (kernel_start_ceiling << kScaleSubPixelBits) / 7;
+
+  switch (filter_index) {
+    case 0:
+      if (step_x > grade_x_threshold) {
+        ConvolveKernelHorizontalSigned6Tap<2>(
+            src, src_stride, width, subpixel_x, step_x, intermediate_height,
+            intermediate);
+      } else {
+        ConvolveKernelHorizontalSigned6Tap<1>(
+            src, src_stride, width, subpixel_x, step_x, intermediate_height,
+            intermediate);
+      }
+      break;
+    case 1:
+      if (step_x > grade_x_threshold) {
+        ConvolveKernelHorizontalMixed6Tap<2>(src, src_stride, width,
+                                             subpixel_x, step_x,
+                                             intermediate_height,
+                                             intermediate);
+      } else {
+        ConvolveKernelHorizontalMixed6Tap<1>(src, src_stride, width,
+                                             subpixel_x, step_x,
+                                             intermediate_height,
+                                             intermediate);
+      }
+      break;
+    case 2:
+      if (step_x > grade_x_threshold) {
+        ConvolveKernelHorizontalSigned8Tap<2>(
+            src, src_stride, width, subpixel_x, step_x, intermediate_height,
+            intermediate);
+      } else {
+        ConvolveKernelHorizontalSigned8Tap<1>(
+            src, src_stride, width, subpixel_x, step_x, intermediate_height,
+            intermediate);
+      }
+      break;
+    case 3:
+      if (step_x > grade_x_threshold) {
+        ConvolveKernelHorizontal2Tap<2>(src, src_stride, width, subpixel_x,
+                                        step_x, intermediate_height,
+                                        intermediate);
+      } else {
+        ConvolveKernelHorizontal2Tap<1>(src, src_stride, width, subpixel_x,
+                                        step_x, intermediate_height,
+                                        intermediate);
+      }
+      break;
+    case 4:
+      assert(width <= 4);
+      ConvolveKernelHorizontalSigned4Tap(src, src_stride, subpixel_x, step_x,
+                                         intermediate_height, intermediate);
+      break;
+    default:
+      assert(filter_index == 5);
+      ConvolveKernelHorizontalPositive4Tap(src, src_stride, subpixel_x, step_x,
+                                           intermediate_height, intermediate);
+  }
+
+  // Vertical filter.
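+  // The second template argument below is |grade_y|. With kScaleSubPixelBits
+  // == 10, step_y <= 1024 advances the source position by at most one whole
+  // row per output row, so one row of extra context suffices; larger steps
+  // can skip a row and need two.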
+  filter_index = GetFilterIndex(vertical_filter_index, height);
+  intermediate = intermediate_result;
+  switch (filter_index) {
+    case 0:
+    case 1:
+      if (step_y <= 1024) {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale2Or4xH<6, 1, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale2Or4xH<6, 1, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<6, 1, is_compound>(
+              intermediate, intermediate_height, width, subpixel_y,
+              filter_index, step_y, height, prediction, pred_stride);
+        }
+      } else {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale2Or4xH<6, 2, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale2Or4xH<6, 2, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<6, 2, is_compound>(
+              intermediate, intermediate_height, width, subpixel_y,
+              filter_index, step_y, height, prediction, pred_stride);
+        }
+      }
+      break;
+    case 2:
+      if (step_y <= 1024) {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale2Or4xH<8, 1, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale2Or4xH<8, 1, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<8, 1, is_compound>(
+              intermediate, intermediate_height, width, subpixel_y,
+              filter_index, step_y, height, prediction, pred_stride);
+        }
+      } else {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale2Or4xH<8, 2, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale2Or4xH<8, 2, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<8, 2, is_compound>(
+              intermediate, intermediate_height, width, subpixel_y,
+              filter_index, step_y, height, prediction, pred_stride);
+        }
+      }
+      break;
+    case 3:
+      if (step_y <= 1024) {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale2Or4xH<2, 1, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale2Or4xH<2, 1, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<2, 1, is_compound>(
+              intermediate, intermediate_height, width, subpixel_y,
+              filter_index, step_y, height, prediction, pred_stride);
+        }
+      } else {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale2Or4xH<2, 2, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale2Or4xH<2, 2, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<2, 2, is_compound>(
+              intermediate, intermediate_height, width, subpixel_y,
+              filter_index, step_y, height, prediction, pred_stride);
+        }
+      }
+      break;
+    default:
+      assert(filter_index == 4 || filter_index == 5);
+      assert(height <= 4);
+      if (step_y <= 1024) {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale2Or4xH<4, 1, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale2Or4xH<4, 1, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<4, 1, is_compound>(
+              intermediate, intermediate_height, width, subpixel_y,
+              filter_index, step_y, height, prediction, pred_stride);
+        }
+      } else {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale2Or4xH<4, 2, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale2Or4xH<4, 2, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<4, 2, is_compound>(
+              intermediate, intermediate_height, width, subpixel_y,
+              filter_index, step_y, height, prediction, pred_stride);
+        }
+      }
+  }
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  dsp->convolve[0][0][0][1] = ConvolveHorizontal_NEON;
+  dsp->convolve[0][0][1][0] = ConvolveVertical_NEON;
+  dsp->convolve[0][0][1][1] = Convolve2D_NEON;
+
+  dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_NEON;
+  dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_NEON;
+  dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_NEON;
+  dsp->convolve[0][1][1][1] = ConvolveCompound2D_NEON;
+
+  dsp->convolve[1][0][0][1] = ConvolveIntraBlockCopyHorizontal_NEON;
+  dsp->convolve[1][0][1][0] = ConvolveIntraBlockCopyVertical_NEON;
+  dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_NEON;
+
+  dsp->convolve_scale[0] = ConvolveScale2D_NEON<false>;
+  dsp->convolve_scale[1] = ConvolveScale2D_NEON<true>;
+}
+
+}  // namespace
+
+void ConvolveInit10bpp_NEON() { Init10bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else  // !(LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10)
+
+namespace libgav1 {
+namespace dsp {
+
+void ConvolveInit10bpp_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
diff --git a/src/dsp/arm/convolve_neon.cc b/src/dsp/arm/convolve_neon.cc
new file mode 100644
index 0000000..5b80da2
--- /dev/null
+++ b/src/dsp/arm/convolve_neon.cc
@@ -0,0 +1,3099 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/convolve.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Include the constants and utility functions inside the anonymous namespace.
+#include "src/dsp/convolve.inc" + +// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and +// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final +// sum from outranging int16_t. +template +int16x8_t SumOnePassTaps(const uint8x8_t* const src, + const uint8x8_t* const taps) { + uint16x8_t sum; + if (filter_index == 0) { + // 6 taps. + - + + - + + sum = vmull_u8(src[0], taps[0]); + // Unsigned overflow will result in a valid int16_t value. + sum = vmlsl_u8(sum, src[1], taps[1]); + sum = vmlal_u8(sum, src[2], taps[2]); + sum = vmlal_u8(sum, src[3], taps[3]); + sum = vmlsl_u8(sum, src[4], taps[4]); + sum = vmlal_u8(sum, src[5], taps[5]); + } else if (filter_index == 1 && negative_outside_taps) { + // 6 taps. - + + + + - + // Set a base we can subtract from. + sum = vmull_u8(src[1], taps[1]); + sum = vmlsl_u8(sum, src[0], taps[0]); + sum = vmlal_u8(sum, src[2], taps[2]); + sum = vmlal_u8(sum, src[3], taps[3]); + sum = vmlal_u8(sum, src[4], taps[4]); + sum = vmlsl_u8(sum, src[5], taps[5]); + } else if (filter_index == 1) { + // 6 taps. All are positive. + sum = vmull_u8(src[0], taps[0]); + sum = vmlal_u8(sum, src[1], taps[1]); + sum = vmlal_u8(sum, src[2], taps[2]); + sum = vmlal_u8(sum, src[3], taps[3]); + sum = vmlal_u8(sum, src[4], taps[4]); + sum = vmlal_u8(sum, src[5], taps[5]); + } else if (filter_index == 2) { + // 8 taps. - + - + + - + - + sum = vmull_u8(src[1], taps[1]); + sum = vmlsl_u8(sum, src[0], taps[0]); + sum = vmlsl_u8(sum, src[2], taps[2]); + sum = vmlal_u8(sum, src[3], taps[3]); + sum = vmlal_u8(sum, src[4], taps[4]); + sum = vmlsl_u8(sum, src[5], taps[5]); + sum = vmlal_u8(sum, src[6], taps[6]); + sum = vmlsl_u8(sum, src[7], taps[7]); + } else if (filter_index == 3) { + // 2 taps. All are positive. + sum = vmull_u8(src[0], taps[0]); + sum = vmlal_u8(sum, src[1], taps[1]); + } else if (filter_index == 4) { + // 4 taps. - + + - + sum = vmull_u8(src[1], taps[1]); + sum = vmlsl_u8(sum, src[0], taps[0]); + sum = vmlal_u8(sum, src[2], taps[2]); + sum = vmlsl_u8(sum, src[3], taps[3]); + } else if (filter_index == 5) { + // 4 taps. All are positive. + sum = vmull_u8(src[0], taps[0]); + sum = vmlal_u8(sum, src[1], taps[1]); + sum = vmlal_u8(sum, src[2], taps[2]); + sum = vmlal_u8(sum, src[3], taps[3]); + } + return vreinterpretq_s16_u16(sum); +} + +template +void FilterHorizontalWidth8AndUp(const uint8_t* LIBGAV1_RESTRICT src, + const ptrdiff_t src_stride, + void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t pred_stride, const int width, + const int height, + const uint8x8_t* const v_tap) { + auto* dest8 = static_cast(dest); + auto* dest16 = static_cast(dest); + if (!is_2d) { + int y = height; + do { + int x = 0; + do { // Increasing loop counter x is better. 
+        const uint8x16_t src_long = vld1q_u8(src + x);
+        uint8x8_t v_src[8];
+        int16x8_t sum;
+        if (filter_index < 2) {
+          v_src[0] = vget_low_u8(src_long);
+          v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+          v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+          v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+          v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+          v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+          sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src,
+                                                                    v_tap + 1);
+        } else if (filter_index == 2) {
+          v_src[0] = vget_low_u8(src_long);
+          v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+          v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+          v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+          v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+          v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+          v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6));
+          v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7));
+          sum = SumOnePassTaps<filter_index, false>(v_src, v_tap);
+        } else if (filter_index == 3) {
+          v_src[0] = vget_low_u8(src_long);
+          v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+          sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 3);
+        } else if (filter_index > 3) {
+          v_src[0] = vget_low_u8(src_long);
+          v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+          v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+          v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+          sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 2);
+        }
+        if (is_compound) {
+          const uint16x8_t v_sum = vreinterpretq_u16_s16(
+              vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
+          vst1q_u16(&dest16[x], v_sum);
+        } else {
+          // Normally the Horizontal pass does the downshift in two passes:
+          // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+          // kInterRoundBitsHorizontal). Each one uses a rounding shift.
+          // Combining them requires adding the rounding offset from the
+          // skipped shift.
+          constexpr int first_shift_rounding_bit =
+              1 << (kInterRoundBitsHorizontal - 2);
+          sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
+          const uint8x8_t result = vqrshrun_n_s16(sum, kFilterBits - 1);
+          vst1_u8(&dest8[x], result);
+        }
+        x += 8;
+      } while (x < width);
+      src += src_stride;
+      dest8 += pred_stride;
+      dest16 += pred_stride;
+    } while (--y != 0);
+  } else {
+    int x = 0;
+    do {
+      const uint8_t* s = src + x;
+      int y = height;
+      do {  // Increasing loop counter x is better.
+        const uint8x16_t src_long = vld1q_u8(s);
+        uint8x8_t v_src[8];
+        int16x8_t sum;
+        if (filter_index < 2) {
+          v_src[0] = vget_low_u8(src_long);
+          v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+          v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+          v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+          v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+          v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+          sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src,
+                                                                    v_tap + 1);
+        } else if (filter_index == 2) {
+          v_src[0] = vget_low_u8(src_long);
+          v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+          v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+          v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+          v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+          v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+          v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6));
+          v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7));
+          sum = SumOnePassTaps<filter_index, false>(v_src, v_tap);
+        } else if (filter_index == 3) {
+          v_src[0] = vget_low_u8(src_long);
+          v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+          sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 3);
+        } else if (filter_index > 3) {
+          v_src[0] = vget_low_u8(src_long);
+          v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+          v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+          v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+          sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 2);
+        }
+        const uint16x8_t v_sum = vreinterpretq_u16_s16(
+            vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
+        vst1q_u16(dest16, v_sum);
+        s += src_stride;
+        dest16 += 8;
+      } while (--y != 0);
+      x += 8;
+    } while (x < width);
+  }
+}
+
+template <int filter_index, bool is_2d, bool is_compound>
+void FilterHorizontalWidth4(const uint8_t* LIBGAV1_RESTRICT src,
+                            const ptrdiff_t src_stride,
+                            void* LIBGAV1_RESTRICT const dest,
+                            const ptrdiff_t pred_stride, const int height,
+                            const uint8x8_t* const v_tap) {
+  auto* dest8 = static_cast<uint8_t*>(dest);
+  auto* dest16 = static_cast<uint16_t*>(dest);
+  int y = height;
+  do {
+    uint8x8_t v_src[4];
+    int16x8_t sum;
+    v_src[0] = vld1_u8(src);
+    if (filter_index == 3) {
+      v_src[1] = RightShiftVector<1 * 8>(v_src[0]);
+      sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 3);
+    } else {
+      v_src[1] = RightShiftVector<1 * 8>(v_src[0]);
+      v_src[2] = RightShiftVector<2 * 8>(v_src[0]);
+      v_src[3] = RightShiftVector<3 * 8>(v_src[0]);
+      sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 2);
+    }
+    if (is_2d || is_compound) {
+      const uint16x4_t v_sum = vreinterpret_u16_s16(
+          vrshr_n_s16(vget_low_s16(sum), kInterRoundBitsHorizontal - 1));
+      vst1_u16(dest16, v_sum);
+    } else {
+      constexpr int first_shift_rounding_bit =
+          1 << (kInterRoundBitsHorizontal - 2);
+      sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
+      const uint8x8_t result = vqrshrun_n_s16(sum, kFilterBits - 1);
+      StoreLo4(&dest8[0], result);
+    }
+    src += src_stride;
+    dest8 += pred_stride;
+    dest16 += pred_stride;
+  } while (--y != 0);
+}
+
+template <int filter_index, bool is_2d>
+void FilterHorizontalWidth2(const uint8_t* LIBGAV1_RESTRICT src,
+                            const ptrdiff_t src_stride,
+                            void* LIBGAV1_RESTRICT const dest,
+                            const ptrdiff_t pred_stride, const int height,
+                            const uint8x8_t* const v_tap) {
+  auto* dest8 = static_cast<uint8_t*>(dest);
+  auto* dest16 = static_cast<uint16_t*>(dest);
+  int y = height >> 1;
+  do {
+    const uint8x8_t input0 = vld1_u8(src);
+    const uint8x8_t input1 = vld1_u8(src + src_stride);
+    const uint8x8x2_t input = vzip_u8(input0, input1);
+    uint16x8_t sum;
+    if (filter_index == 3) {
+      // tap signs : + +
+      sum = vmull_u8(input.val[0], v_tap[3]);
+      sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 2), v_tap[4]);
+    } else if (filter_index == 4) {
+      // tap signs : - + + -
+      sum = vmull_u8(RightShiftVector<2 * 8>(input.val[0]), v_tap[3]);
+      sum = vmlsl_u8(sum, input.val[0], v_tap[2]);
+      sum = vmlal_u8(sum, RightShiftVector<4 * 8>(input.val[0]), v_tap[4]);
+      sum = vmlsl_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[5]);
+    } else {
+      // tap signs : + + + +
+      sum = vmull_u8(input.val[0], v_tap[2]);
+      sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input.val[0]), v_tap[3]);
+      sum = vmlal_u8(sum, RightShiftVector<4 * 8>(input.val[0]), v_tap[4]);
+      sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[5]);
+    }
+    int16x8_t s = vreinterpretq_s16_u16(sum);
+    if (is_2d) {
+      const uint16x8_t v_sum = vreinterpretq_u16_s16(
+          vrshrq_n_s16(s, kInterRoundBitsHorizontal - 1));
+      dest16[0] = vgetq_lane_u16(v_sum, 0);
+      dest16[1] = vgetq_lane_u16(v_sum, 2);
+      dest16 += pred_stride;
+      dest16[0] = vgetq_lane_u16(v_sum, 1);
+      dest16[1] = vgetq_lane_u16(v_sum, 3);
+      dest16 += pred_stride;
+    } else {
+      // Normally the Horizontal pass does the downshift in two passes:
+      // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+      // kInterRoundBitsHorizontal). Each one uses a rounding shift.
+      // Combining them requires adding the rounding offset from the skipped
+      // shift.
+      constexpr int first_shift_rounding_bit =
+          1 << (kInterRoundBitsHorizontal - 2);
+      s = vaddq_s16(s, vdupq_n_s16(first_shift_rounding_bit));
+      const uint8x8_t result = vqrshrun_n_s16(s, kFilterBits - 1);
+      dest8[0] = vget_lane_u8(result, 0);
+      dest8[1] = vget_lane_u8(result, 2);
+      dest8 += pred_stride;
+      dest8[0] = vget_lane_u8(result, 1);
+      dest8[1] = vget_lane_u8(result, 3);
+      dest8 += pred_stride;
+    }
+    src += src_stride << 1;
+  } while (--y != 0);
+
+  // The 2d filters have an odd |height| because the horizontal pass
+  // generates context for the vertical pass.
+  if (is_2d) {
+    assert(height % 2 == 1);
+    const uint8x8_t input = vld1_u8(src);
+    uint16x8_t sum;
+    if (filter_index == 3) {
+      sum = vmull_u8(input, v_tap[3]);
+      sum = vmlal_u8(sum, RightShiftVector<1 * 8>(input), v_tap[4]);
+    } else if (filter_index == 4) {
+      sum = vmull_u8(RightShiftVector<1 * 8>(input), v_tap[3]);
+      sum = vmlsl_u8(sum, input, v_tap[2]);
+      sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input), v_tap[4]);
+      sum = vmlsl_u8(sum, RightShiftVector<3 * 8>(input), v_tap[5]);
+    } else {
+      assert(filter_index == 5);
+      sum = vmull_u8(input, v_tap[2]);
+      sum = vmlal_u8(sum, RightShiftVector<1 * 8>(input), v_tap[3]);
+      sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input), v_tap[4]);
+      sum = vmlal_u8(sum, RightShiftVector<3 * 8>(input), v_tap[5]);
+    }
+    // |sum| contains an int16_t value.
+    sum = vreinterpretq_u16_s16(vrshrq_n_s16(vreinterpretq_s16_u16(sum),
+                                             kInterRoundBitsHorizontal - 1));
+    Store2<0>(dest16, sum);
+  }
+}
+
+template <int filter_index, bool negative_outside_taps, bool is_2d,
+          bool is_compound>
+void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT const src,
+                      const ptrdiff_t src_stride,
+                      void* LIBGAV1_RESTRICT const dest,
+                      const ptrdiff_t pred_stride, const int width,
+                      const int height, const uint8x8_t* const v_tap) {
+  assert(width < 8 || filter_index <= 3);
+  // Don't simplify the redundant if conditions with the template parameters,
+  // which helps the compiler generate compact code.
+  if (width >= 8 && filter_index <= 3) {
+    FilterHorizontalWidth8AndUp<filter_index, negative_outside_taps, is_2d,
+                                is_compound>(src, src_stride, dest,
+                                             pred_stride, width, height,
+                                             v_tap);
+    return;
+  }
+
+  // Horizontal passes only need to account for number of taps 2 and 4 when
+  // |width| <= 4.
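+  // GetFilterIndex() remaps the filter index into [3, 5] whenever
+  // |width| <= 4, so only the 2-tap and 4-tap kernels can reach this point;
+  // the asserts below document that contract.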
+  assert(width <= 4);
+  assert(filter_index >= 3 && filter_index <= 5);
+  if (filter_index >= 3 && filter_index <= 5) {
+    if (width == 4) {
+      FilterHorizontalWidth4<filter_index, is_2d, is_compound>(
+          src, src_stride, dest, pred_stride, height, v_tap);
+      return;
+    }
+    assert(width == 2);
+    if (!is_compound) {
+      FilterHorizontalWidth2<filter_index, is_2d>(src, src_stride, dest,
+                                                  pred_stride, height, v_tap);
+    }
+  }
+}
+
+// Process 16 bit inputs and output 32 bits.
+template <int num_taps, bool is_compound>
+inline int16x4_t Sum2DVerticalTaps4(const int16x4_t* const src,
+                                    const int16x8_t taps) {
+  const int16x4_t taps_lo = vget_low_s16(taps);
+  const int16x4_t taps_hi = vget_high_s16(taps);
+  int32x4_t sum;
+  if (num_taps == 8) {
+    sum = vmull_lane_s16(src[0], taps_lo, 0);
+    sum = vmlal_lane_s16(sum, src[1], taps_lo, 1);
+    sum = vmlal_lane_s16(sum, src[2], taps_lo, 2);
+    sum = vmlal_lane_s16(sum, src[3], taps_lo, 3);
+    sum = vmlal_lane_s16(sum, src[4], taps_hi, 0);
+    sum = vmlal_lane_s16(sum, src[5], taps_hi, 1);
+    sum = vmlal_lane_s16(sum, src[6], taps_hi, 2);
+    sum = vmlal_lane_s16(sum, src[7], taps_hi, 3);
+  } else if (num_taps == 6) {
+    sum = vmull_lane_s16(src[0], taps_lo, 1);
+    sum = vmlal_lane_s16(sum, src[1], taps_lo, 2);
+    sum = vmlal_lane_s16(sum, src[2], taps_lo, 3);
+    sum = vmlal_lane_s16(sum, src[3], taps_hi, 0);
+    sum = vmlal_lane_s16(sum, src[4], taps_hi, 1);
+    sum = vmlal_lane_s16(sum, src[5], taps_hi, 2);
+  } else if (num_taps == 4) {
+    sum = vmull_lane_s16(src[0], taps_lo, 2);
+    sum = vmlal_lane_s16(sum, src[1], taps_lo, 3);
+    sum = vmlal_lane_s16(sum, src[2], taps_hi, 0);
+    sum = vmlal_lane_s16(sum, src[3], taps_hi, 1);
+  } else if (num_taps == 2) {
+    sum = vmull_lane_s16(src[0], taps_lo, 3);
+    sum = vmlal_lane_s16(sum, src[1], taps_hi, 0);
+  }
+
+  if (is_compound) {
+    return vqrshrn_n_s32(sum, kInterRoundBitsCompoundVertical - 1);
+  }
+
+  return vqrshrn_n_s32(sum, kInterRoundBitsVertical - 1);
+}
+
+template <int num_taps, bool is_compound>
+int16x8_t SimpleSum2DVerticalTaps(const int16x8_t* const src,
+                                  const int16x8_t taps) {
+  const int16x4_t taps_lo = vget_low_s16(taps);
+  const int16x4_t taps_hi = vget_high_s16(taps);
+  int32x4_t sum_lo, sum_hi;
+  if (num_taps == 8) {
+    sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 0);
+    sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 0);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 1);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 1);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 2);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_lo, 2);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_lo, 3);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_lo, 3);
+
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[4]), taps_hi, 0);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[4]), taps_hi, 0);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[5]), taps_hi, 1);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[5]), taps_hi, 1);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[6]), taps_hi, 2);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[6]), taps_hi, 2);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[7]), taps_hi, 3);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[7]), taps_hi, 3);
+  } else if (num_taps == 6) {
+    sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 1);
+    sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 1);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 2);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 2);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 3);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_lo, 3);
+
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_hi, 0);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_hi, 0);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[4]), taps_hi, 1);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[4]), taps_hi, 1);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[5]), taps_hi, 2);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[5]), taps_hi, 2);
+  } else if (num_taps == 4) {
+    sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 2);
+    sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 2);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 3);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 3);
+
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_hi, 0);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_hi, 0);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_hi, 1);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_hi, 1);
+  } else if (num_taps == 2) {
+    sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 3);
+    sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 3);
+
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_hi, 0);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_hi, 0);
+  }
+
+  if (is_compound) {
+    return vcombine_s16(
+        vqrshrn_n_s32(sum_lo, kInterRoundBitsCompoundVertical - 1),
+        vqrshrn_n_s32(sum_hi, kInterRoundBitsCompoundVertical - 1));
+  }
+
+  return vcombine_s16(vqrshrn_n_s32(sum_lo, kInterRoundBitsVertical - 1),
+                      vqrshrn_n_s32(sum_hi, kInterRoundBitsVertical - 1));
+}
+
+template <int num_taps, bool is_compound>
+void Filter2DVerticalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src,
+                                 void* LIBGAV1_RESTRICT const dst,
+                                 const ptrdiff_t dst_stride, const int width,
+                                 const int height, const int16x8_t taps) {
+  assert(width >= 8);
+  constexpr int next_row = num_taps - 1;
+  auto* const dst8 = static_cast<uint8_t*>(dst);
+  auto* const dst16 = static_cast<uint16_t*>(dst);
+
+  int x = 0;
+  do {
+    int16x8_t srcs[9];
+    srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src));
+    src += 8;
+    if (num_taps >= 4) {
+      srcs[1] = vreinterpretq_s16_u16(vld1q_u16(src));
+      src += 8;
+      srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src));
+      src += 8;
+      if (num_taps >= 6) {
+        srcs[3] = vreinterpretq_s16_u16(vld1q_u16(src));
+        src += 8;
+        srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src));
+        src += 8;
+        if (num_taps == 8) {
+          srcs[5] = vreinterpretq_s16_u16(vld1q_u16(src));
+          src += 8;
+          srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src));
+          src += 8;
+        }
+      }
+    }
+
+    uint8_t* d8 = dst8 + x;
+    uint16_t* d16 = dst16 + x;
+    int y = height;
+    do {
+      srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src));
+      src += 8;
+      srcs[next_row + 1] = vreinterpretq_s16_u16(vld1q_u16(src));
+      src += 8;
+      const int16x8_t sum0 =
+          SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs + 0, taps);
+      const int16x8_t sum1 =
+          SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs + 1, taps);
+      if (is_compound) {
+        vst1q_u16(d16, vreinterpretq_u16_s16(sum0));
+        d16 += dst_stride;
+        vst1q_u16(d16, vreinterpretq_u16_s16(sum1));
+        d16 += dst_stride;
+      } else {
+        vst1_u8(d8, vqmovun_s16(sum0));
+        d8 += dst_stride;
+        vst1_u8(d8, vqmovun_s16(sum1));
+        d8 += dst_stride;
+      }
+      srcs[0] = srcs[2];
+      if (num_taps >= 4) {
+        srcs[1] = srcs[3];
+        srcs[2] = srcs[4];
+        if (num_taps >= 6) {
+          srcs[3] = srcs[5];
+          srcs[4] = srcs[6];
+          if (num_taps == 8) {
+            srcs[5] = srcs[7];
+            srcs[6] = srcs[8];
+          }
+        }
+      }
+      y -= 2;
+    } while (y != 0);
+    x += 8;
+  } while (x < width);
+}
+
+// Take advantage of |src_stride| == |width| to process two rows at a time.
+template <int num_taps, bool is_compound>
+void Filter2DVerticalWidth4(const uint16_t* LIBGAV1_RESTRICT src,
+                            void* LIBGAV1_RESTRICT const dst,
+                            const ptrdiff_t dst_stride, const int height,
+                            const int16x8_t taps) {
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+
+  int16x8_t srcs[9];
+  srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src));
+  src += 8;
+  if (num_taps >= 4) {
+    srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src));
+    src += 8;
+    srcs[1] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[2]));
+    if (num_taps >= 6) {
+      srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src));
+      src += 8;
+      srcs[3] = vcombine_s16(vget_high_s16(srcs[2]), vget_low_s16(srcs[4]));
+      if (num_taps == 8) {
+        srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src));
+        src += 8;
+        srcs[5] = vcombine_s16(vget_high_s16(srcs[4]), vget_low_s16(srcs[6]));
+      }
+    }
+  }
+
+  int y = height;
+  do {
+    srcs[num_taps] = vreinterpretq_s16_u16(vld1q_u16(src));
+    src += 8;
+    srcs[num_taps - 1] = vcombine_s16(vget_high_s16(srcs[num_taps - 2]),
+                                      vget_low_s16(srcs[num_taps]));
+
+    const int16x8_t sum =
+        SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+    if (is_compound) {
+      const uint16x8_t results = vreinterpretq_u16_s16(sum);
+      vst1q_u16(dst16, results);
+      dst16 += 4 << 1;
+    } else {
+      const uint8x8_t results = vqmovun_s16(sum);
+
+      StoreLo4(dst8, results);
+      dst8 += dst_stride;
+      StoreHi4(dst8, results);
+      dst8 += dst_stride;
+    }
+
+    srcs[0] = srcs[2];
+    if (num_taps >= 4) {
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      if (num_taps >= 6) {
+        srcs[3] = srcs[5];
+        srcs[4] = srcs[6];
+        if (num_taps == 8) {
+          srcs[5] = srcs[7];
+          srcs[6] = srcs[8];
+        }
+      }
+    }
+    y -= 2;
+  } while (y != 0);
+}
+
+// Take advantage of |src_stride| == |width| to process four rows at a time.
+template <int num_taps>
+void Filter2DVerticalWidth2(const uint16_t* LIBGAV1_RESTRICT src,
+                            void* LIBGAV1_RESTRICT const dst,
+                            const ptrdiff_t dst_stride, const int height,
+                            const int16x8_t taps) {
+  constexpr int next_row = (num_taps < 6) ? 4 : 8;
+
+  auto* dst8 = static_cast<uint8_t*>(dst);
+
+  int16x8_t srcs[9];
+  srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src));
+  src += 8;
+  if (num_taps >= 6) {
+    srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src));
+    src += 8;
+    srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
+    if (num_taps == 8) {
+      srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
+      srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
+    }
+  }
+
+  int y = 0;
+  do {
+    srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src));
+    src += 8;
+    if (num_taps == 2) {
+      srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
+    } else if (num_taps == 4) {
+      srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
+      srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
+      srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
+    } else if (num_taps == 6) {
+      srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
+      srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
+      srcs[5] = vextq_s16(srcs[4], srcs[8], 2);
+    } else if (num_taps == 8) {
+      srcs[5] = vextq_s16(srcs[4], srcs[8], 2);
+      srcs[6] = vcombine_s16(vget_high_s16(srcs[4]), vget_low_s16(srcs[8]));
+      srcs[7] = vextq_s16(srcs[4], srcs[8], 6);
+    }
+
+    const int16x8_t sum =
+        SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps);
+    const uint8x8_t results = vqmovun_s16(sum);
+
+    Store2<0>(dst8, results);
+    dst8 += dst_stride;
+    Store2<1>(dst8, results);
+    // When |height| <= 4 the taps are restricted to 2 and 4 tap variants.
+    // Therefore we don't need to check this condition when |height| > 4.
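+    // For example, a 2-high block with a 2- or 4-tap filter computes all of
+    // its outputs in this first batch, so it must return here after storing
+    // rows 0 and 1, before the stores for rows 2 and 3 run past the block.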
+    if (num_taps <= 4 && height == 2) return;
+    dst8 += dst_stride;
+    Store2<2>(dst8, results);
+    dst8 += dst_stride;
+    Store2<3>(dst8, results);
+    dst8 += dst_stride;
+
+    srcs[0] = srcs[4];
+    if (num_taps == 6) {
+      srcs[1] = srcs[5];
+      srcs[4] = srcs[8];
+    } else if (num_taps == 8) {
+      srcs[1] = srcs[5];
+      srcs[2] = srcs[6];
+      srcs[3] = srcs[7];
+      srcs[4] = srcs[8];
+    }
+
+    y += 4;
+  } while (y < height);
+}
+
+template <bool is_2d, bool is_compound>
+LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
+    const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride,
+    const int width, const int height, const int filter_id,
+    const int filter_index) {
+  // Duplicate the absolute value for each tap. Negative taps are corrected
+  // by using the vmlsl_u8 instruction. Positive taps use vmlal_u8.
+  uint8x8_t v_tap[kSubPixelTaps];
+  assert(filter_id != 0);
+
+  for (int k = 0; k < kSubPixelTaps; ++k) {
+    v_tap[k] = vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][filter_id][k]);
+  }
+
+  if (filter_index == 2) {  // 8 tap.
+    FilterHorizontal<2, true, is_2d, is_compound>(
+        src, src_stride, dst, dst_stride, width, height, v_tap);
+  } else if (filter_index == 1) {  // 6 tap.
+    // Check if outside taps are positive.
+    if ((filter_id == 1) | (filter_id == 15)) {
+      FilterHorizontal<1, false, is_2d, is_compound>(
+          src + 1, src_stride, dst, dst_stride, width, height, v_tap);
+    } else {
+      FilterHorizontal<1, true, is_2d, is_compound>(
+          src + 1, src_stride, dst, dst_stride, width, height, v_tap);
+    }
+  } else if (filter_index == 0) {  // 6 tap.
+    FilterHorizontal<0, true, is_2d, is_compound>(
+        src + 1, src_stride, dst, dst_stride, width, height, v_tap);
+  } else if (filter_index == 4) {  // 4 tap.
+    FilterHorizontal<4, true, is_2d, is_compound>(
+        src + 2, src_stride, dst, dst_stride, width, height, v_tap);
+  } else if (filter_index == 5) {  // 4 tap.
+    FilterHorizontal<5, true, is_2d, is_compound>(
+        src + 2, src_stride, dst, dst_stride, width, height, v_tap);
+  } else {  // 2 tap.
+    FilterHorizontal<3, true, is_2d, is_compound>(
+        src + 3, src_stride, dst, dst_stride, width, height, v_tap);
+  }
+}
+
+template <int num_taps>
+void Filter2DVertical(
+    const uint16_t* LIBGAV1_RESTRICT const intermediate_result,
+    const int width, const int height, const int16x8_t taps,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
+  auto* const dest = static_cast<uint8_t*>(prediction);
+  if (width >= 8) {
+    Filter2DVerticalWidth8AndUp<num_taps, /*is_compound=*/false>(
+        intermediate_result, dest, pred_stride, width, height, taps);
+  } else if (width == 4) {
+    Filter2DVerticalWidth4<num_taps, /*is_compound=*/false>(
+        intermediate_result, dest, pred_stride, height, taps);
+  } else {
+    assert(width == 2);
+    Filter2DVerticalWidth2<num_taps>(intermediate_result, dest, pred_stride,
+                                     height, taps);
+  }
+}
+
+void Convolve2D_NEON(const void* LIBGAV1_RESTRICT const reference,
+                     const ptrdiff_t reference_stride,
+                     const int horizontal_filter_index,
+                     const int vertical_filter_index,
+                     const int horizontal_filter_id,
+                     const int vertical_filter_id, const int width,
+                     const int height, void* LIBGAV1_RESTRICT const prediction,
+                     const ptrdiff_t pred_stride) {
+  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+
+  // The output of the horizontal filter is guaranteed to fit in 16 bits.
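+  // Informal worked bound: with half filters the largest-magnitude kernel is
+  // the 8-tap set at filter_id 8, |taps| = {2, 6, 12, 40, 40, 12, 6, 2}, so a
+  // row sum is at most 255 * 120 = 30600 < 2^15 even before the rounding
+  // shift of kInterRoundBitsHorizontal - 1.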
+  uint16_t
+      intermediate_result[kMaxSuperBlockSizeInPixels *
+                          (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+#if LIBGAV1_MSAN
+  // Quiet msan warnings. Set with random non-zero value to aid in debugging.
+  memset(intermediate_result, 0x33, sizeof(intermediate_result));
+#endif
+  const int intermediate_height = height + vertical_taps - 1;
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* const src = static_cast<const uint8_t*>(reference) -
+                          (vertical_taps / 2 - 1) * src_stride -
+                          kHorizontalOffset;
+
+  DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result, width,
+                                   width, intermediate_height,
+                                   horizontal_filter_id, horiz_filter_index);
+
+  // Vertical filter.
+  assert(vertical_filter_id != 0);
+  const int16x8_t taps = vmovl_s8(
+      vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]));
+  if (vertical_taps == 8) {
+    Filter2DVertical<8>(intermediate_result, width, height, taps, prediction,
+                        pred_stride);
+  } else if (vertical_taps == 6) {
+    Filter2DVertical<6>(intermediate_result, width, height, taps, prediction,
+                        pred_stride);
+  } else if (vertical_taps == 4) {
+    Filter2DVertical<4>(intermediate_result, width, height, taps, prediction,
+                        pred_stride);
+  } else {  // |vertical_taps| == 2
+    Filter2DVertical<2>(intermediate_result, width, height, taps, prediction,
+                        pred_stride);
+  }
+}
+
+// There are many opportunities for overreading in scaled convolve, because the
+// range of starting points for filter windows is anywhere from 0 to 16 for 8
+// destination pixels, and the window sizes range from 2 to 8. To accommodate
+// this range concisely, we use |grade_x| to mean the most steps in src that can
+// be traversed in a single |step_x| increment, i.e. 1 or 2. When grade_x is 2,
+// we are guaranteed to exceed 8 whole steps in src for every 8 |step_x|
+// increments. The first load covers the initial elements of src_x, while the
+// final load covers the taps.
+template <int grade_x>
+inline uint8x8x3_t LoadSrcVals(const uint8_t* const src_x) {
+  uint8x8x3_t ret;
+  const uint8x16_t src_val = vld1q_u8(src_x);
+  ret.val[0] = vget_low_u8(src_val);
+  ret.val[1] = vget_high_u8(src_val);
+#if LIBGAV1_MSAN
+  // Initialize to quiet msan warnings when grade_x <= 1.
+  ret.val[2] = vdup_n_u8(0);
+#endif
+  if (grade_x > 1) {
+    ret.val[2] = vld1_u8(src_x + 16);
+  }
+  return ret;
+}
+
+// Pre-transpose the 2 tap filters in |kAbsHalfSubPixelFilters|[3].
+inline uint8x16_t GetPositive2TapFilter(const int tap_index) {
+  assert(tap_index < 2);
+  alignas(
+      16) static constexpr uint8_t kAbsHalfSubPixel2TapFilterColumns[2][16] = {
+      {64, 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4},
+      {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60}};
+
+  return vld1q_u8(kAbsHalfSubPixel2TapFilterColumns[tap_index]);
+}
+
+template <int grade_x>
+inline void ConvolveKernelHorizontal2Tap(
+    const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    const int width, const int subpixel_x, const int step_x,
+    const int intermediate_height, int16_t* LIBGAV1_RESTRICT intermediate) {
+  // Account for the 0-taps that precede the 2 nonzero taps.
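+  // For example, filter_id 4 expands to the 8-tap row {0, 0, 0, 48, 16, 0,
+  // 0, 0} in half-tap units (64 - 4 * id and 4 * id from the table above):
+  // the nonzero taps occupy positions 3 and 4, hence |kernel_offset| is 3.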
+  const int kernel_offset = 3;
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  const int step_x8 = step_x << 3;
+  const uint8x16_t filter_taps0 = GetPositive2TapFilter(0);
+  const uint8x16_t filter_taps1 = GetPositive2TapFilter(1);
+  const uint16x8_t index_steps = vmulq_n_u16(
+      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+
+  int p = subpixel_x;
+  if (width <= 4) {
+    const uint8_t* src_x =
+        &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+    // Only add steps to the 10-bit truncated p to avoid overflow.
+    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+    const uint8x8_t filter_indices =
+        vand_u8(vshrn_n_u16(subpel_index_offsets, 6), filter_index_mask);
+    // This is a special case. The 2-tap filter has no negative taps, so we
+    // can use unsigned values.
+    // For each x, a lane of tapsK has
+    // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
+    // on x.
+    const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices),
+                               VQTbl1U8(filter_taps1, filter_indices)};
+    int y = intermediate_height;
+    do {
+      // Load a pool of samples to select from using stepped indices.
+      const uint8x16_t src_vals = vld1q_u8(src_x);
+      const uint8x8_t src_indices =
+          vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+
+      // For each x, a lane of srcK contains src_x[k].
+      const uint8x8_t src[2] = {
+          VQTbl1U8(src_vals, src_indices),
+          VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(1)))};
+
+      vst1q_s16(intermediate,
+                vrshrq_n_s16(SumOnePassTaps</*filter_index=*/3>(src, taps),
+                             kInterRoundBitsHorizontal - 1));
+      src_x += src_stride;
+      intermediate += kIntermediateStride;
+    } while (--y != 0);
+    return;
+  }
+
+  // |width| >= 8
+  int x = 0;
+  do {
+    const uint8_t* src_x =
+        &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+    // Only add steps to the 10-bit truncated p to avoid overflow.
+    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+    const uint8x8_t filter_indices =
+        vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+                filter_index_mask);
+    // This is a special case. The 2-tap filter has no negative taps, so we
+    // can use unsigned values.
+    // For each x, a lane of tapsK has
+    // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
+    // on x.
+    const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices),
+                               VQTbl1U8(filter_taps1, filter_indices)};
+    int y = intermediate_height;
+    do {
+      // Load a pool of samples to select from using stepped indices.
+      const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
+      const uint8x8_t src_indices =
+          vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+
+      // For each x, a lane of srcK contains src_x[k].
+      const uint8x8_t src[2] = {
+          vtbl3_u8(src_vals, src_indices),
+          vtbl3_u8(src_vals, vadd_u8(src_indices, vdup_n_u8(1)))};
+
+      vst1q_s16(intermediate,
+                vrshrq_n_s16(SumOnePassTaps</*filter_index=*/3>(src, taps),
+                             kInterRoundBitsHorizontal - 1));
+      src_x += src_stride;
+      intermediate += kIntermediateStride;
+    } while (--y != 0);
+    x += 8;
+    p += step_x8;
+  } while (x < width);
+}
+
+// Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[5].
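+// "Pre-transposed" means the table stores one tap per row with one filter_id
+// per lane, so a single VQTbl1U8() lookup keyed on the per-pixel filter_ids
+// gathers tap k for eight pixels at once instead of loading eight 4-tap rows.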
+inline uint8x16_t GetPositive4TapFilter(const int tap_index) {
+  assert(tap_index < 4);
+  alignas(
+      16) static constexpr uint8_t kSubPixel4TapPositiveFilterColumns[4][16] = {
+      {0, 15, 13, 11, 10, 9, 8, 7, 6, 6, 5, 4, 3, 2, 2, 1},
+      {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
+      {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
+      {0, 1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11, 13, 15}};
+
+  return vld1q_u8(kSubPixel4TapPositiveFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width <= 4.
+void ConvolveKernelHorizontalPositive4Tap(
+    const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    const int subpixel_x, const int step_x, const int intermediate_height,
+    int16_t* LIBGAV1_RESTRICT intermediate) {
+  const int kernel_offset = 2;
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+  const uint8x16_t filter_taps0 = GetPositive4TapFilter(0);
+  const uint8x16_t filter_taps1 = GetPositive4TapFilter(1);
+  const uint8x16_t filter_taps2 = GetPositive4TapFilter(2);
+  const uint8x16_t filter_taps3 = GetPositive4TapFilter(3);
+  const uint16x8_t index_steps = vmulq_n_u16(
+      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+  const int p = subpixel_x;
+  // First filter is special, just a 128 tap on the center.
+  const uint8_t* src_x =
+      &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+  // Only add steps to the 10-bit truncated p to avoid overflow.
+  const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+  const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+  const uint8x8_t filter_indices = vand_u8(
+      vshrn_n_u16(subpel_index_offsets, kFilterIndexShift), filter_index_mask);
+  // Note that filter_id depends on x.
+  // For each x, tapsK has kSubPixelFilters[filter_index][filter_id][k].
+  const uint8x8_t taps[4] = {VQTbl1U8(filter_taps0, filter_indices),
+                             VQTbl1U8(filter_taps1, filter_indices),
+                             VQTbl1U8(filter_taps2, filter_indices),
+                             VQTbl1U8(filter_taps3, filter_indices)};
+
+  const uint8x8_t src_indices =
+      vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+  int y = intermediate_height;
+  do {
+    // Load a pool of samples to select from using stepped index vectors.
+    const uint8x16_t src_vals = vld1q_u8(src_x);
+
+    // For each x, srcK contains src_x[k] where k=1.
+    // Whereas taps come from different arrays, src pixels are drawn from the
+    // same contiguous line.
+    const uint8x8_t src[4] = {
+        VQTbl1U8(src_vals, src_indices),
+        VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(1))),
+        VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(2))),
+        VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(3)))};
+
+    vst1q_s16(intermediate,
+              vrshrq_n_s16(SumOnePassTaps</*filter_index=*/5>(src, taps),
+                           kInterRoundBitsHorizontal - 1));
+
+    src_x += src_stride;
+    intermediate += kIntermediateStride;
+  } while (--y != 0);
+}
+
+// Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[4].
+inline uint8x16_t GetSigned4TapFilter(const int tap_index) {
+  assert(tap_index < 4);
+  alignas(16) static constexpr uint8_t
+      kAbsHalfSubPixel4TapSignedFilterColumns[4][16] = {
+          {0, 2, 4, 5, 6, 6, 7, 6, 6, 5, 5, 5, 4, 3, 2, 1},
+          {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+          {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+          {0, 1, 2, 3, 4, 5, 5, 5, 6, 6, 7, 6, 6, 5, 4, 2}};
+
+  return vld1q_u8(kAbsHalfSubPixel4TapSignedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width <= 4.
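+// (Presumably because GetFilterIndex() only remaps a filter to indices 4 and
+// 5 when the relevant block dimension is 4 or less; see the filter index
+// range comments in ConvolveScale2D_NEON below.)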
+inline void ConvolveKernelHorizontalSigned4Tap(
+    const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    const int subpixel_x, const int step_x, const int intermediate_height,
+    int16_t* LIBGAV1_RESTRICT intermediate) {
+  const int kernel_offset = 2;
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+  const uint8x16_t filter_taps0 = GetSigned4TapFilter(0);
+  const uint8x16_t filter_taps1 = GetSigned4TapFilter(1);
+  const uint8x16_t filter_taps2 = GetSigned4TapFilter(2);
+  const uint8x16_t filter_taps3 = GetSigned4TapFilter(3);
+  const uint16x4_t index_steps = vmul_n_u16(vcreate_u16(0x0003000200010000),
+                                            static_cast<uint16_t>(step_x));
+
+  const int p = subpixel_x;
+  const uint8_t* src_x =
+      &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+  // Only add steps to the 10-bit truncated p to avoid overflow.
+  const uint16x4_t p_fraction = vdup_n_u16(p & 1023);
+  const uint16x4_t subpel_index_offsets = vadd_u16(index_steps, p_fraction);
+  const uint8x8_t filter_index_offsets = vshrn_n_u16(
+      vcombine_u16(subpel_index_offsets, vdup_n_u16(0)), kFilterIndexShift);
+  const uint8x8_t filter_indices =
+      vand_u8(filter_index_offsets, filter_index_mask);
+  // Note that filter_id depends on x.
+  // For each x, tapsK has kSubPixelFilters[filter_index][filter_id][k].
+  const uint8x8_t taps[4] = {VQTbl1U8(filter_taps0, filter_indices),
+                             VQTbl1U8(filter_taps1, filter_indices),
+                             VQTbl1U8(filter_taps2, filter_indices),
+                             VQTbl1U8(filter_taps3, filter_indices)};
+
+  const uint8x8_t src_indices_base =
+      vshr_n_u8(filter_index_offsets, kScaleSubPixelBits - kFilterIndexShift);
+
+  const uint8x8_t src_indices[4] = {src_indices_base,
+                                    vadd_u8(src_indices_base, vdup_n_u8(1)),
+                                    vadd_u8(src_indices_base, vdup_n_u8(2)),
+                                    vadd_u8(src_indices_base, vdup_n_u8(3))};
+
+  int y = intermediate_height;
+  do {
+    // Load a pool of samples to select from using stepped indices.
+    const uint8x16_t src_vals = vld1q_u8(src_x);
+
+    // For each x, srcK contains src_x[k] where k=1.
+    // Whereas taps come from different arrays, src pixels are drawn from the
+    // same contiguous line.
+    const uint8x8_t src[4] = {
+        VQTbl1U8(src_vals, src_indices[0]), VQTbl1U8(src_vals, src_indices[1]),
+        VQTbl1U8(src_vals, src_indices[2]), VQTbl1U8(src_vals, src_indices[3])};
+
+    vst1q_s16(intermediate,
+              vrshrq_n_s16(SumOnePassTaps</*filter_index=*/4>(src, taps),
+                           kInterRoundBitsHorizontal - 1));
+    src_x += src_stride;
+    intermediate += kIntermediateStride;
+  } while (--y != 0);
+}
+
+// Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[0].
+inline uint8x16_t GetSigned6TapFilter(const int tap_index) {
+  assert(tap_index < 6);
+  alignas(16) static constexpr uint8_t
+      kAbsHalfSubPixel6TapSignedFilterColumns[6][16] = {
+          {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0},
+          {0, 3, 5, 6, 7, 7, 8, 7, 7, 6, 6, 6, 5, 4, 2, 1},
+          {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+          {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+          {0, 1, 2, 4, 5, 6, 6, 6, 7, 7, 8, 7, 7, 6, 5, 3},
+          {0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}};
+
+  return vld1q_u8(kAbsHalfSubPixel6TapSignedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width >= 8.
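+// (Presumably because GetFilterIndex() keeps the index in [0, 3] for
+// dimensions above 4, so the 6 and 8 tap kernels only ever run the 8-wide
+// column loop.)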
+template <int grade_x>
+inline void ConvolveKernelHorizontalSigned6Tap(
+    const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    const int width, const int subpixel_x, const int step_x,
+    const int intermediate_height,
+    int16_t* LIBGAV1_RESTRICT const intermediate) {
+  const int kernel_offset = 1;
+  const uint8x8_t one = vdup_n_u8(1);
+  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  const int step_x8 = step_x << 3;
+  uint8x16_t filter_taps[6];
+  for (int i = 0; i < 6; ++i) {
+    filter_taps[i] = GetSigned6TapFilter(i);
+  }
+  const uint16x8_t index_steps = vmulq_n_u16(
+      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+
+  int16_t* intermediate_x = intermediate;
+  int x = 0;
+  int p = subpixel_x;
+  do {
+    // Avoid overloading outside the reference boundaries. This means
+    // |trailing_width| can be up to 24.
+    const uint8_t* src_x =
+        &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+    // Only add steps to the 10-bit truncated p to avoid overflow.
+    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+    const uint8x8_t src_indices =
+        vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+    uint8x8_t src_lookup[6];
+    src_lookup[0] = src_indices;
+    for (int i = 1; i < 6; ++i) {
+      src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
+    }
+
+    const uint8x8_t filter_indices =
+        vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+                filter_index_mask);
+    // For each x, a lane of taps[k] has
+    // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
+    // on x.
+    uint8x8_t taps[6];
+    for (int i = 0; i < 6; ++i) {
+      taps[i] = VQTbl1U8(filter_taps[i], filter_indices);
+    }
+    int y = intermediate_height;
+    do {
+      // Load a pool of samples to select from using stepped indices.
+      const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
+
+      const uint8x8_t src[6] = {
+          vtbl3_u8(src_vals, src_lookup[0]), vtbl3_u8(src_vals, src_lookup[1]),
+          vtbl3_u8(src_vals, src_lookup[2]), vtbl3_u8(src_vals, src_lookup[3]),
+          vtbl3_u8(src_vals, src_lookup[4]), vtbl3_u8(src_vals, src_lookup[5])};
+
+      vst1q_s16(intermediate_x,
+                vrshrq_n_s16(SumOnePassTaps</*filter_index=*/0>(src, taps),
+                             kInterRoundBitsHorizontal - 1));
+      src_x += src_stride;
+      intermediate_x += kIntermediateStride;
+    } while (--y != 0);
+    x += 8;
+    p += step_x8;
+  } while (x < width);
+}
+
+// Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[1]. This filter
+// has mixed positive and negative outer taps which are handled in
+// GetMixed6TapFilter().
+inline uint8x16_t GetPositive6TapFilter(const int tap_index) {
+  assert(tap_index < 6);
+  alignas(16) static constexpr uint8_t
+      kAbsHalfSubPixel6TapPositiveFilterColumns[4][16] = {
+          {0, 14, 13, 11, 10, 9, 8, 8, 7, 6, 5, 4, 3, 2, 2, 1},
+          {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
+          {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
+          {0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 13, 14}};
+
+  return vld1q_u8(kAbsHalfSubPixel6TapPositiveFilterColumns[tap_index]);
+}
+
+inline int8x16_t GetMixed6TapFilter(const int tap_index) {
+  assert(tap_index < 2);
+  alignas(
+      16) static constexpr int8_t kHalfSubPixel6TapMixedFilterColumns[2][16] = {
+      {0, 1, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0},
+      {0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 1}};
+
+  return vld1q_s8(kHalfSubPixel6TapMixedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width >= 8.
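+// Worked example from the two tables above: for filter_id 1 the outer taps
+// are (+1, 0) from the mixed table and the inner taps are (14, 31, 17, 1),
+// which together sum to 64, the half-filter DC gain.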
+template <int grade_x>
+inline void ConvolveKernelHorizontalMixed6Tap(
+    const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    const int width, const int subpixel_x, const int step_x,
+    const int intermediate_height,
+    int16_t* LIBGAV1_RESTRICT const intermediate) {
+  const int kernel_offset = 1;
+  const uint8x8_t one = vdup_n_u8(1);
+  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  const int step_x8 = step_x << 3;
+  uint8x8_t taps[4];
+  int16x8_t mixed_taps[2];
+  uint8x16_t positive_filter_taps[4];
+  for (int i = 0; i < 4; ++i) {
+    positive_filter_taps[i] = GetPositive6TapFilter(i);
+  }
+  int8x16_t mixed_filter_taps[2];
+  mixed_filter_taps[0] = GetMixed6TapFilter(0);
+  mixed_filter_taps[1] = GetMixed6TapFilter(1);
+  const uint16x8_t index_steps = vmulq_n_u16(
+      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+
+  int16_t* intermediate_x = intermediate;
+  int x = 0;
+  int p = subpixel_x;
+  do {
+    const uint8_t* src_x =
+        &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+    // Only add steps to the 10-bit truncated p to avoid overflow.
+    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+    const uint8x8_t src_indices =
+        vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+    uint8x8_t src_lookup[6];
+    src_lookup[0] = src_indices;
+    for (int i = 1; i < 6; ++i) {
+      src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
+    }
+
+    const uint8x8_t filter_indices =
+        vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+                filter_index_mask);
+    // For each x, a lane of taps[k] has
+    // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
+    // on x.
+    for (int i = 0; i < 4; ++i) {
+      taps[i] = VQTbl1U8(positive_filter_taps[i], filter_indices);
+    }
+    mixed_taps[0] = vmovl_s8(VQTbl1S8(mixed_filter_taps[0], filter_indices));
+    mixed_taps[1] = vmovl_s8(VQTbl1S8(mixed_filter_taps[1], filter_indices));
+
+    int y = intermediate_height;
+    do {
+      // Load a pool of samples to select from using stepped indices.
+      const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
+
+      int16x8_t sum_mixed = vmulq_s16(
+          mixed_taps[0], ZeroExtend(vtbl3_u8(src_vals, src_lookup[0])));
+      sum_mixed = vmlaq_s16(sum_mixed, mixed_taps[1],
+                            ZeroExtend(vtbl3_u8(src_vals, src_lookup[5])));
+      uint16x8_t sum = vreinterpretq_u16_s16(sum_mixed);
+      sum = vmlal_u8(sum, taps[0], vtbl3_u8(src_vals, src_lookup[1]));
+      sum = vmlal_u8(sum, taps[1], vtbl3_u8(src_vals, src_lookup[2]));
+      sum = vmlal_u8(sum, taps[2], vtbl3_u8(src_vals, src_lookup[3]));
+      sum = vmlal_u8(sum, taps[3], vtbl3_u8(src_vals, src_lookup[4]));
+
+      vst1q_s16(intermediate_x, vrshrq_n_s16(vreinterpretq_s16_u16(sum),
+                                             kInterRoundBitsHorizontal - 1));
+      src_x += src_stride;
+      intermediate_x += kIntermediateStride;
+    } while (--y != 0);
+    x += 8;
+    p += step_x8;
+  } while (x < width);
+}
+
+// Pre-transpose the 8 tap filters in |kAbsHalfSubPixelFilters|[2].
+inline uint8x16_t GetSigned8TapFilter(const int tap_index) {
+  assert(tap_index < 8);
+  alignas(16) static constexpr uint8_t
+      kAbsHalfSubPixel8TapSignedFilterColumns[8][16] = {
+          {0, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0},
+          {0, 1, 3, 4, 5, 5, 5, 5, 6, 5, 4, 4, 3, 3, 2, 1},
+          {0, 3, 6, 9, 11, 11, 12, 12, 12, 11, 10, 9, 7, 5, 3, 1},
+          {64, 63, 62, 60, 58, 54, 50, 45, 40, 35, 30, 24, 19, 13, 8, 4},
+          {0, 4, 8, 13, 19, 24, 30, 35, 40, 45, 50, 54, 58, 60, 62, 63},
+          {0, 1, 3, 5, 7, 9, 10, 11, 12, 12, 12, 11, 11, 9, 6, 3},
+          {0, 1, 2, 3, 3, 4, 4, 5, 6, 5, 5, 5, 5, 4, 3, 1},
+          {0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1}};
+
+  return vld1q_u8(kAbsHalfSubPixel8TapSignedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width >= 8.
+template <int grade_x>
+inline void ConvolveKernelHorizontalSigned8Tap(
+    const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    const int width, const int subpixel_x, const int step_x,
+    const int intermediate_height,
+    int16_t* LIBGAV1_RESTRICT const intermediate) {
+  const uint8x8_t one = vdup_n_u8(1);
+  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  const int step_x8 = step_x << 3;
+  uint8x8_t taps[8];
+  uint8x16_t filter_taps[8];
+  for (int i = 0; i < 8; ++i) {
+    filter_taps[i] = GetSigned8TapFilter(i);
+  }
+  const uint16x8_t index_steps = vmulq_n_u16(
+      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+
+  int16_t* intermediate_x = intermediate;
+  int x = 0;
+  int p = subpixel_x;
+  do {
+    const uint8_t* src_x = &src[(p >> kScaleSubPixelBits) - ref_x];
+    // Only add steps to the 10-bit truncated p to avoid overflow.
+    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+    const uint8x8_t src_indices =
+        vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+    uint8x8_t src_lookup[8];
+    src_lookup[0] = src_indices;
+    for (int i = 1; i < 8; ++i) {
+      src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
+    }
+
+    const uint8x8_t filter_indices =
+        vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+                filter_index_mask);
+    // For each x, a lane of taps[k] has
+    // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
+    // on x.
+    for (int i = 0; i < 8; ++i) {
+      taps[i] = VQTbl1U8(filter_taps[i], filter_indices);
+    }
+
+    int y = intermediate_height;
+    do {
+      // Load a pool of samples to select from using stepped indices.
+      const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
+
+      const uint8x8_t src[8] = {
+          vtbl3_u8(src_vals, src_lookup[0]), vtbl3_u8(src_vals, src_lookup[1]),
+          vtbl3_u8(src_vals, src_lookup[2]), vtbl3_u8(src_vals, src_lookup[3]),
+          vtbl3_u8(src_vals, src_lookup[4]), vtbl3_u8(src_vals, src_lookup[5]),
+          vtbl3_u8(src_vals, src_lookup[6]), vtbl3_u8(src_vals, src_lookup[7])};
+
+      vst1q_s16(intermediate_x,
+                vrshrq_n_s16(SumOnePassTaps</*filter_index=*/2>(src, taps),
+                             kInterRoundBitsHorizontal - 1));
+      src_x += src_stride;
+      intermediate_x += kIntermediateStride;
+    } while (--y != 0);
+    x += 8;
+    p += step_x8;
+  } while (x < width);
+}
+
+// This function handles blocks of width 2 or 4.
+template <int num_taps, int grade_y, int width, bool is_compound>
+void ConvolveVerticalScale4xH(const int16_t* LIBGAV1_RESTRICT const src,
+                              const int subpixel_y, const int filter_index,
+                              const int step_y, const int height,
+                              void* LIBGAV1_RESTRICT const dest,
+                              const ptrdiff_t dest_stride) {
+  constexpr ptrdiff_t src_stride = kIntermediateStride;
+  const int16_t* src_y = src;
+  // |dest| is 16-bit in compound mode, Pixel otherwise.
+  auto* dest16_y = static_cast<uint16_t*>(dest);
+  auto* dest_y = static_cast<uint8_t*>(dest);
+  int16x4_t s[num_taps + grade_y];
+
+  int p = subpixel_y & 1023;
+  int prev_p = p;
+  int y = height;
+  do {
+    for (int i = 0; i < num_taps; ++i) {
+      s[i] = vld1_s16(src_y + i * src_stride);
+    }
+    int filter_id = (p >> 6) & kSubPixelMask;
+    int16x8_t filter =
+        vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+    int16x4_t sums = Sum2DVerticalTaps4<num_taps, is_compound>(s, filter);
+    if (is_compound) {
+      assert(width != 2);
+      const uint16x4_t result = vreinterpret_u16_s16(sums);
+      vst1_u16(dest16_y, result);
+    } else {
+      const uint8x8_t result = vqmovun_s16(vcombine_s16(sums, sums));
+      if (width == 2) {
+        Store2<0>(dest_y, result);
+      } else {
+        StoreLo4(dest_y, result);
+      }
+    }
+    p += step_y;
+    const int p_diff =
+        (p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits);
+    prev_p = p;
+    // Here we load extra source in case it is needed. If |p_diff| == 0, these
+    // values will be unused, but it's faster to load than to branch.
+    s[num_taps] = vld1_s16(src_y + num_taps * src_stride);
+    if (grade_y > 1) {
+      s[num_taps + 1] = vld1_s16(src_y + (num_taps + 1) * src_stride);
+    }
+    dest16_y += dest_stride;
+    dest_y += dest_stride;
+
+    filter_id = (p >> 6) & kSubPixelMask;
+    filter = vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+    sums = Sum2DVerticalTaps4<num_taps, is_compound>(&s[p_diff], filter);
+    if (is_compound) {
+      assert(width != 2);
+      const uint16x4_t result = vreinterpret_u16_s16(sums);
+      vst1_u16(dest16_y, result);
+    } else {
+      const uint8x8_t result = vqmovun_s16(vcombine_s16(sums, sums));
+      if (width == 2) {
+        Store2<0>(dest_y, result);
+      } else {
+        StoreLo4(dest_y, result);
+      }
+    }
+    p += step_y;
+    src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+    prev_p = p;
+    dest16_y += dest_stride;
+    dest_y += dest_stride;
+    y -= 2;
+  } while (y != 0);
+}
+
+template <int num_taps, int grade_y, bool is_compound>
+inline void ConvolveVerticalScale(const int16_t* LIBGAV1_RESTRICT const source,
+                                  const int intermediate_height,
+                                  const int width, const int subpixel_y,
+                                  const int filter_index, const int step_y,
+                                  const int height,
+                                  void* LIBGAV1_RESTRICT const dest,
+                                  const ptrdiff_t dest_stride) {
+  constexpr ptrdiff_t src_stride = kIntermediateStride;
+  // A possible improvement is to use arithmetic to decide how many times to
+  // apply filters to same source before checking whether to load new srcs.
+  // However, this will only improve performance with very small step sizes.
+  int16x8_t s[num_taps + grade_y];
+  // |dest| is 16-bit in compound mode, Pixel otherwise.
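+  // |grade_y| is the vertical analogue of |grade_x|: with step_y <= 1024 at
+  // most one new intermediate row is consumed per output row (grade_y == 1);
+  // with 1024 < step_y <= 2048 up to two rows are consumed (grade_y == 2).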
+  uint16_t* dest16_y;
+  uint8_t* dest_y;
+  const int16_t* src = source;
+
+  int x = 0;
+  do {
+    const int16_t* src_y = src;
+    dest16_y = static_cast<uint16_t*>(dest) + x;
+    dest_y = static_cast<uint8_t*>(dest) + x;
+    int p = subpixel_y & 1023;
+    int prev_p = p;
+    int y = height;
+    do {
+      for (int i = 0; i < num_taps; ++i) {
+        s[i] = vld1q_s16(src_y + i * src_stride);
+      }
+      int filter_id = (p >> 6) & kSubPixelMask;
+      int16x8_t filter =
+          vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+      int16x8_t sum = SimpleSum2DVerticalTaps<num_taps, is_compound>(s, filter);
+      if (is_compound) {
+        vst1q_u16(dest16_y, vreinterpretq_u16_s16(sum));
+      } else {
+        vst1_u8(dest_y, vqmovun_s16(sum));
+      }
+      p += step_y;
+      const int p_diff =
+          (p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits);
+      // |grade_y| > 1 always means p_diff > 0, so load vectors that may be
+      // needed. Otherwise, we only need to load one vector because |p_diff|
+      // can't exceed 1.
+      s[num_taps] = vld1q_s16(src_y + num_taps * src_stride);
+      if (grade_y > 1) {
+        s[num_taps + 1] = vld1q_s16(src_y + (num_taps + 1) * src_stride);
+      }
+      dest16_y += dest_stride;
+      dest_y += dest_stride;
+
+      filter_id = (p >> 6) & kSubPixelMask;
+      filter = vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+      sum = SimpleSum2DVerticalTaps<num_taps, is_compound>(&s[p_diff], filter);
+      if (is_compound) {
+        vst1q_u16(dest16_y, vreinterpretq_u16_s16(sum));
+      } else {
+        vst1_u8(dest_y, vqmovun_s16(sum));
+      }
+      p += step_y;
+      src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+      prev_p = p;
+      dest16_y += dest_stride;
+      dest_y += dest_stride;
+      y -= 2;
+    } while (y != 0);
+    src += kIntermediateStride * intermediate_height;
+    x += 8;
+  } while (x < width);
+}
+
+template <bool is_compound>
+void ConvolveScale2D_NEON(const void* LIBGAV1_RESTRICT const reference,
+                          const ptrdiff_t reference_stride,
+                          const int horizontal_filter_index,
+                          const int vertical_filter_index, const int subpixel_x,
+                          const int subpixel_y, const int step_x,
+                          const int step_y, const int width, const int height,
+                          void* LIBGAV1_RESTRICT const prediction,
+                          const ptrdiff_t pred_stride) {
+  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+  assert(step_x <= 2048);
+  assert(step_y <= 2048);
+  const int num_vert_taps = GetNumTapsInFilter(vert_filter_index);
+  const int intermediate_height =
+      (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+       kScaleSubPixelBits) +
+      num_vert_taps;
+  // The output of the horizontal filter, i.e. the intermediate_result, is
+  // guaranteed to fit in int16_t.
+  int16_t intermediate_result[kIntermediateAllocWidth *
+                              (2 * kIntermediateAllocWidth + 8)];
+#if LIBGAV1_MSAN
+  // Quiet msan warnings. Set with random non-zero value to aid in debugging.
+  memset(intermediate_result, 0x44, sizeof(intermediate_result));
+#endif
+  // Horizontal filter.
+  // Filter types used for width <= 4 are different from those for width > 4.
+  // When width > 4, the valid filter index range is always [0, 3].
+  // When width <= 4, the valid filter index range is always [3, 5].
+  // Similarly for height.
+  int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  int16_t* intermediate = intermediate_result;
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* src = static_cast<const uint8_t*>(reference);
+  const int vert_kernel_offset = (8 - num_vert_taps) / 2;
+  src += vert_kernel_offset * src_stride;
+
+  // Derive the maximum value of |step_x| at which all source values fit in one
+  // 16-byte load. Final index is src_x + |num_taps| - 1 < 16
+  // step_x*7 is the final base subpel index for the shuffle mask for filter
+  // inputs in each iteration on large blocks. When step_x is large, we need a
+  // larger structure and use a larger table lookup in order to gather all
+  // filter inputs.
+  // |num_taps| - 1 is the shuffle index of the final filter input.
+  const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index);
+  const int kernel_start_ceiling = 16 - num_horiz_taps;
+  // This truncated quotient |grade_x_threshold| selects |step_x| such that:
+  // (step_x * 7) >> kScaleSubPixelBits < single load limit
+  const int grade_x_threshold =
+      (kernel_start_ceiling << kScaleSubPixelBits) / 7;
+  switch (filter_index) {
+    case 0:
+      if (step_x > grade_x_threshold) {
+        ConvolveKernelHorizontalSigned6Tap<2>(
+            src, src_stride, width, subpixel_x, step_x, intermediate_height,
+            intermediate);
+      } else {
+        ConvolveKernelHorizontalSigned6Tap<1>(
+            src, src_stride, width, subpixel_x, step_x, intermediate_height,
+            intermediate);
+      }
+      break;
+    case 1:
+      if (step_x > grade_x_threshold) {
+        ConvolveKernelHorizontalMixed6Tap<2>(src, src_stride, width, subpixel_x,
+                                             step_x, intermediate_height,
+                                             intermediate);
+
+      } else {
+        ConvolveKernelHorizontalMixed6Tap<1>(src, src_stride, width, subpixel_x,
+                                             step_x, intermediate_height,
+                                             intermediate);
+      }
+      break;
+    case 2:
+      if (step_x > grade_x_threshold) {
+        ConvolveKernelHorizontalSigned8Tap<2>(
+            src, src_stride, width, subpixel_x, step_x, intermediate_height,
+            intermediate);
+      } else {
+        ConvolveKernelHorizontalSigned8Tap<1>(
+            src, src_stride, width, subpixel_x, step_x, intermediate_height,
+            intermediate);
+      }
+      break;
+    case 3:
+      if (step_x > grade_x_threshold) {
+        ConvolveKernelHorizontal2Tap<2>(src, src_stride, width, subpixel_x,
+                                        step_x, intermediate_height,
+                                        intermediate);
+      } else {
+        ConvolveKernelHorizontal2Tap<1>(src, src_stride, width, subpixel_x,
+                                        step_x, intermediate_height,
+                                        intermediate);
+      }
+      break;
+    case 4:
+      assert(width <= 4);
+      ConvolveKernelHorizontalSigned4Tap(src, src_stride, subpixel_x, step_x,
+                                         intermediate_height, intermediate);
+      break;
+    default:
+      assert(filter_index == 5);
+      ConvolveKernelHorizontalPositive4Tap(src, src_stride, subpixel_x, step_x,
+                                           intermediate_height, intermediate);
+  }
+  // Vertical filter.
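+  // Note that |filter_index| is re-derived from the block height, so a block
+  // may pair, say, an 8 tap horizontal filter with a 2 tap vertical filter.
+  // Each case below then dispatches on |step_y| (grade_y of 1 or 2) and on
+  // the block width.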
+  filter_index = GetFilterIndex(vertical_filter_index, height);
+  intermediate = intermediate_result;
+
+  switch (filter_index) {
+    case 0:
+    case 1:
+      if (step_y <= 1024) {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale4xH<6, 1, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale4xH<6, 1, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<6, 1, is_compound>(
+              intermediate, intermediate_height, width, subpixel_y,
+              filter_index, step_y, height, prediction, pred_stride);
+        }
+      } else {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale4xH<6, 2, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale4xH<6, 2, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<6, 2, is_compound>(
+              intermediate, intermediate_height, width, subpixel_y,
+              filter_index, step_y, height, prediction, pred_stride);
+        }
+      }
+      break;
+    case 2:
+      if (step_y <= 1024) {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale4xH<8, 1, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale4xH<8, 1, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<8, 1, is_compound>(
+              intermediate, intermediate_height, width, subpixel_y,
+              filter_index, step_y, height, prediction, pred_stride);
+        }
+      } else {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale4xH<8, 2, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale4xH<8, 2, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<8, 2, is_compound>(
+              intermediate, intermediate_height, width, subpixel_y,
+              filter_index, step_y, height, prediction, pred_stride);
+        }
+      }
+      break;
+    case 3:
+      if (step_y <= 1024) {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale4xH<2, 1, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale4xH<2, 1, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<2, 1, is_compound>(
+              intermediate, intermediate_height, width, subpixel_y,
+              filter_index, step_y, height, prediction, pred_stride);
+        }
+      } else {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale4xH<2, 2, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale4xH<2, 2, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<2, 2, is_compound>(
+              intermediate, intermediate_height, width, subpixel_y,
+              filter_index, step_y, height, prediction, pred_stride);
+        }
+      }
+      break;
+    case 4:
+    default:
+      assert(filter_index == 4 || filter_index == 5);
+      assert(height <= 4);
+      if (step_y <= 1024) {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale4xH<4, 1, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale4xH<4, 1, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<4, 1, is_compound>(
+              intermediate, intermediate_height, width, subpixel_y,
+              filter_index, step_y, height, prediction, pred_stride);
+        }
+      } else {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale4xH<4, 2, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale4xH<4, 2, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<4, 2, is_compound>(
+              intermediate, intermediate_height, width, subpixel_y,
+              filter_index, step_y, height, prediction, pred_stride);
+        }
+      }
+  }
+}
+
+void ConvolveHorizontal_NEON(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int /*vertical_filter_index*/, const int horizontal_filter_id,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
+  const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  // Set |src| to the outermost tap.
+  const auto* const src =
+      static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+  auto* const dest = static_cast<uint8_t*>(prediction);
+
+  DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
+                   horizontal_filter_id, filter_index);
+}
+
+// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
+// Vertical calculations.
+uint16x8_t Compound1DShift(const int16x8_t sum) {
+  return vreinterpretq_u16_s16(
+      vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
+}
+
+template <int filter_index, bool is_compound = false,
+          bool negative_outside_taps = false>
+void FilterVertical(const uint8_t* LIBGAV1_RESTRICT const src,
+                    const ptrdiff_t src_stride,
+                    void* LIBGAV1_RESTRICT const dst,
+                    const ptrdiff_t dst_stride, const int width,
+                    const int height, const uint8x8_t* const taps) {
+  const int num_taps = GetNumTapsInFilter(filter_index);
+  const int next_row = num_taps - 1;
+  auto* const dst8 = static_cast<uint8_t*>(dst);
+  auto* const dst16 = static_cast<uint16_t*>(dst);
+  assert(width >= 8);
+
+  int x = 0;
+  do {
+    const uint8_t* src_x = src + x;
+    uint8x8_t srcs[8];
+    srcs[0] = vld1_u8(src_x);
+    src_x += src_stride;
+    if (num_taps >= 4) {
+      srcs[1] = vld1_u8(src_x);
+      src_x += src_stride;
+      srcs[2] = vld1_u8(src_x);
+      src_x += src_stride;
+      if (num_taps >= 6) {
+        srcs[3] = vld1_u8(src_x);
+        src_x += src_stride;
+        srcs[4] = vld1_u8(src_x);
+        src_x += src_stride;
+        if (num_taps == 8) {
+          srcs[5] = vld1_u8(src_x);
+          src_x += src_stride;
+          srcs[6] = vld1_u8(src_x);
+          src_x += src_stride;
+        }
+      }
+    }
+
+    // Decreasing the y loop counter produces worse code with clang.
+    // Don't unroll this loop since it generates too much code and the decoder
+    // is even slower.
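+    // In compound mode the rounded 16-bit sums from Compound1DShift() are
+    // stored as-is so a later pass can average or distance-weight two
+    // predictions; only the non-compound path narrows to 8-bit pixels here.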
+    int y = 0;
+    do {
+      srcs[next_row] = vld1_u8(src_x);
+      src_x += src_stride;
+
+      const int16x8_t sums =
+          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+      if (is_compound) {
+        const uint16x8_t results = Compound1DShift(sums);
+        vst1q_u16(dst16 + x + y * dst_stride, results);
+      } else {
+        const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+        vst1_u8(dst8 + x + y * dst_stride, results);
+      }
+
+      srcs[0] = srcs[1];
+      if (num_taps >= 4) {
+        srcs[1] = srcs[2];
+        srcs[2] = srcs[3];
+        if (num_taps >= 6) {
+          srcs[3] = srcs[4];
+          srcs[4] = srcs[5];
+          if (num_taps == 8) {
+            srcs[5] = srcs[6];
+            srcs[6] = srcs[7];
+          }
+        }
+      }
+    } while (++y < height);
+    x += 8;
+  } while (x < width);
+}
+
+template <int filter_index, bool is_compound = false,
+          bool negative_outside_taps = false>
+void FilterVertical4xH(const uint8_t* LIBGAV1_RESTRICT src,
+                       const ptrdiff_t src_stride,
+                       void* LIBGAV1_RESTRICT const dst,
+                       const ptrdiff_t dst_stride, const int height,
+                       const uint8x8_t* const taps) {
+  const int num_taps = GetNumTapsInFilter(filter_index);
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+
+  uint8x8_t srcs[9];
+
+  if (num_taps == 2) {
+    srcs[2] = vdup_n_u8(0);
+
+    srcs[0] = Load4(src);
+    src += src_stride;
+
+    int y = height;
+    do {
+      srcs[0] = Load4<1>(src, srcs[0]);
+      src += src_stride;
+      srcs[2] = Load4<0>(src, srcs[2]);
+      src += src_stride;
+      srcs[1] = vext_u8(srcs[0], srcs[2], 4);
+
+      const int16x8_t sums =
+          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+      if (is_compound) {
+        const uint16x8_t results = Compound1DShift(sums);
+
+        vst1q_u16(dst16, results);
+        dst16 += 4 << 1;
+      } else {
+        const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+        StoreLo4(dst8, results);
+        dst8 += dst_stride;
+        StoreHi4(dst8, results);
+        dst8 += dst_stride;
+      }
+
+      srcs[0] = srcs[2];
+      y -= 2;
+    } while (y != 0);
+  } else if (num_taps == 4) {
+    srcs[4] = vdup_n_u8(0);
+
+    srcs[0] = Load4(src);
+    src += src_stride;
+    srcs[0] = Load4<1>(src, srcs[0]);
+    src += src_stride;
+    srcs[2] = Load4(src);
+    src += src_stride;
+    srcs[1] = vext_u8(srcs[0], srcs[2], 4);
+
+    int y = height;
+    do {
+      srcs[2] = Load4<1>(src, srcs[2]);
+      src += src_stride;
+      srcs[4] = Load4<0>(src, srcs[4]);
+      src += src_stride;
+      srcs[3] = vext_u8(srcs[2], srcs[4], 4);
+
+      const int16x8_t sums =
+          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+      if (is_compound) {
+        const uint16x8_t results = Compound1DShift(sums);
+
+        vst1q_u16(dst16, results);
+        dst16 += 4 << 1;
+      } else {
+        const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+        StoreLo4(dst8, results);
+        dst8 += dst_stride;
+        StoreHi4(dst8, results);
+        dst8 += dst_stride;
+      }
+
+      srcs[0] = srcs[2];
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      y -= 2;
+    } while (y != 0);
+  } else if (num_taps == 6) {
+    srcs[6] = vdup_n_u8(0);
+
+    srcs[0] = Load4(src);
+    src += src_stride;
+    srcs[0] = Load4<1>(src, srcs[0]);
+    src += src_stride;
+    srcs[2] = Load4(src);
+    src += src_stride;
+    srcs[1] = vext_u8(srcs[0], srcs[2], 4);
+    srcs[2] = Load4<1>(src, srcs[2]);
+    src += src_stride;
+    srcs[4] = Load4(src);
+    src += src_stride;
+    srcs[3] = vext_u8(srcs[2], srcs[4], 4);
+
+    int y = height;
+    do {
+      srcs[4] = Load4<1>(src, srcs[4]);
+      src += src_stride;
+      srcs[6] = Load4<0>(src, srcs[6]);
+      src += src_stride;
+      srcs[5] = vext_u8(srcs[4], srcs[6], 4);
+
+      const int16x8_t sums =
+          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+      if (is_compound) {
+        const uint16x8_t results = Compound1DShift(sums);
+
+        vst1q_u16(dst16, results);
+        dst16 += 4 << 1;
+      } else {
+        const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+        StoreLo4(dst8, results);
+        dst8 += dst_stride;
+        StoreHi4(dst8, results);
+        dst8 += dst_stride;
+      }
+
+      srcs[0] = srcs[2];
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      srcs[3] = srcs[5];
+      srcs[4] = srcs[6];
+      y -= 2;
+    } while (y != 0);
+  } else if (num_taps == 8) {
+    srcs[8] = vdup_n_u8(0);
+
+    srcs[0] = Load4(src);
+    src += src_stride;
+    srcs[0] = Load4<1>(src, srcs[0]);
+    src += src_stride;
+    srcs[2] = Load4(src);
+    src += src_stride;
+    srcs[1] = vext_u8(srcs[0], srcs[2], 4);
+    srcs[2] = Load4<1>(src, srcs[2]);
+    src += src_stride;
+    srcs[4] = Load4(src);
+    src += src_stride;
+    srcs[3] = vext_u8(srcs[2], srcs[4], 4);
+    srcs[4] = Load4<1>(src, srcs[4]);
+    src += src_stride;
+    srcs[6] = Load4(src);
+    src += src_stride;
+    srcs[5] = vext_u8(srcs[4], srcs[6], 4);
+
+    int y = height;
+    do {
+      srcs[6] = Load4<1>(src, srcs[6]);
+      src += src_stride;
+      srcs[8] = Load4<0>(src, srcs[8]);
+      src += src_stride;
+      srcs[7] = vext_u8(srcs[6], srcs[8], 4);
+
+      const int16x8_t sums =
+          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+      if (is_compound) {
+        const uint16x8_t results = Compound1DShift(sums);
+
+        vst1q_u16(dst16, results);
+        dst16 += 4 << 1;
+      } else {
+        const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+        StoreLo4(dst8, results);
+        dst8 += dst_stride;
+        StoreHi4(dst8, results);
+        dst8 += dst_stride;
+      }
+
+      srcs[0] = srcs[2];
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      srcs[3] = srcs[5];
+      srcs[4] = srcs[6];
+      srcs[5] = srcs[7];
+      srcs[6] = srcs[8];
+      y -= 2;
+    } while (y != 0);
+  }
+}
+
+template <int filter_index, bool negative_outside_taps = false>
+void FilterVertical2xH(const uint8_t* LIBGAV1_RESTRICT src,
+                       const ptrdiff_t src_stride,
+                       void* LIBGAV1_RESTRICT const dst,
+                       const ptrdiff_t dst_stride, const int height,
+                       const uint8x8_t* const taps) {
+  const int num_taps = GetNumTapsInFilter(filter_index);
+  auto* dst8 = static_cast<uint8_t*>(dst);
+
+  uint8x8_t srcs[9];
+
+  if (num_taps == 2) {
+    srcs[2] = vdup_n_u8(0);
+
+    srcs[0] = Load2(src);
+    src += src_stride;
+
+    int y = 0;
+    do {
+      srcs[0] = Load2<1>(src, srcs[0]);
+      src += src_stride;
+      srcs[0] = Load2<2>(src, srcs[0]);
+      src += src_stride;
+      srcs[0] = Load2<3>(src, srcs[0]);
+      src += src_stride;
+      srcs[2] = Load2<0>(src, srcs[2]);
+      src += src_stride;
+      srcs[1] = vext_u8(srcs[0], srcs[2], 2);
+
+      // This uses srcs[0]..srcs[1].
+      const int16x8_t sums =
+          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+      const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+      Store2<0>(dst8, results);
+      dst8 += dst_stride;
+      Store2<1>(dst8, results);
+      if (height == 2) return;
+      dst8 += dst_stride;
+      Store2<2>(dst8, results);
+      dst8 += dst_stride;
+      Store2<3>(dst8, results);
+      dst8 += dst_stride;
+
+      srcs[0] = srcs[2];
+      y += 4;
+    } while (y < height);
+  } else if (num_taps == 4) {
+    srcs[4] = vdup_n_u8(0);
+
+    srcs[0] = Load2(src);
+    src += src_stride;
+    srcs[0] = Load2<1>(src, srcs[0]);
+    src += src_stride;
+    srcs[0] = Load2<2>(src, srcs[0]);
+    src += src_stride;
+
+    int y = 0;
+    do {
+      srcs[0] = Load2<3>(src, srcs[0]);
+      src += src_stride;
+      srcs[4] = Load2<0>(src, srcs[4]);
+      src += src_stride;
+      srcs[1] = vext_u8(srcs[0], srcs[4], 2);
+      srcs[4] = Load2<1>(src, srcs[4]);
+      src += src_stride;
+      srcs[2] = vext_u8(srcs[0], srcs[4], 4);
+      srcs[4] = Load2<2>(src, srcs[4]);
+      src += src_stride;
+      srcs[3] = vext_u8(srcs[0], srcs[4], 6);
+
+      // This uses srcs[0]..srcs[3].
+      const int16x8_t sums =
+          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+      const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+      Store2<0>(dst8, results);
+      dst8 += dst_stride;
+      Store2<1>(dst8, results);
+      if (height == 2) return;
+      dst8 += dst_stride;
+      Store2<2>(dst8, results);
+      dst8 += dst_stride;
+      Store2<3>(dst8, results);
+      dst8 += dst_stride;
+
+      srcs[0] = srcs[4];
+      y += 4;
+    } while (y < height);
+  } else if (num_taps == 6) {
+    // During the vertical pass the number of taps is restricted when
+    // |height| <= 4.
+    assert(height > 4);
+    srcs[8] = vdup_n_u8(0);
+
+    srcs[0] = Load2(src);
+    src += src_stride;
+    srcs[0] = Load2<1>(src, srcs[0]);
+    src += src_stride;
+    srcs[0] = Load2<2>(src, srcs[0]);
+    src += src_stride;
+    srcs[0] = Load2<3>(src, srcs[0]);
+    src += src_stride;
+    srcs[4] = Load2(src);
+    src += src_stride;
+    srcs[1] = vext_u8(srcs[0], srcs[4], 2);
+
+    int y = 0;
+    do {
+      srcs[4] = Load2<1>(src, srcs[4]);
+      src += src_stride;
+      srcs[2] = vext_u8(srcs[0], srcs[4], 4);
+      srcs[4] = Load2<2>(src, srcs[4]);
+      src += src_stride;
+      srcs[3] = vext_u8(srcs[0], srcs[4], 6);
+      srcs[4] = Load2<3>(src, srcs[4]);
+      src += src_stride;
+      srcs[8] = Load2<0>(src, srcs[8]);
+      src += src_stride;
+      srcs[5] = vext_u8(srcs[4], srcs[8], 2);
+
+      // This uses srcs[0]..srcs[5].
+      const int16x8_t sums =
+          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+      const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+      Store2<0>(dst8, results);
+      dst8 += dst_stride;
+      Store2<1>(dst8, results);
+      dst8 += dst_stride;
+      Store2<2>(dst8, results);
+      dst8 += dst_stride;
+      Store2<3>(dst8, results);
+      dst8 += dst_stride;
+
+      srcs[0] = srcs[4];
+      srcs[1] = srcs[5];
+      srcs[4] = srcs[8];
+      y += 4;
+    } while (y < height);
+  } else if (num_taps == 8) {
+    // During the vertical pass the number of taps is restricted when
+    // |height| <= 4.
+    assert(height > 4);
+    srcs[8] = vdup_n_u8(0);
+
+    srcs[0] = Load2(src);
+    src += src_stride;
+    srcs[0] = Load2<1>(src, srcs[0]);
+    src += src_stride;
+    srcs[0] = Load2<2>(src, srcs[0]);
+    src += src_stride;
+    srcs[0] = Load2<3>(src, srcs[0]);
+    src += src_stride;
+    srcs[4] = Load2(src);
+    src += src_stride;
+    srcs[1] = vext_u8(srcs[0], srcs[4], 2);
+    srcs[4] = Load2<1>(src, srcs[4]);
+    src += src_stride;
+    srcs[2] = vext_u8(srcs[0], srcs[4], 4);
+    srcs[4] = Load2<2>(src, srcs[4]);
+    src += src_stride;
+    srcs[3] = vext_u8(srcs[0], srcs[4], 6);
+
+    int y = 0;
+    do {
+      srcs[4] = Load2<3>(src, srcs[4]);
+      src += src_stride;
+      srcs[8] = Load2<0>(src, srcs[8]);
+      src += src_stride;
+      srcs[5] = vext_u8(srcs[4], srcs[8], 2);
+      srcs[8] = Load2<1>(src, srcs[8]);
+      src += src_stride;
+      srcs[6] = vext_u8(srcs[4], srcs[8], 4);
+      srcs[8] = Load2<2>(src, srcs[8]);
+      src += src_stride;
+      srcs[7] = vext_u8(srcs[4], srcs[8], 6);
+
+      // This uses srcs[0]..srcs[7].
+      const int16x8_t sums =
+          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+      const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+      Store2<0>(dst8, results);
+      dst8 += dst_stride;
+      Store2<1>(dst8, results);
+      dst8 += dst_stride;
+      Store2<2>(dst8, results);
+      dst8 += dst_stride;
+      Store2<3>(dst8, results);
+      dst8 += dst_stride;
+
+      srcs[0] = srcs[4];
+      srcs[1] = srcs[5];
+      srcs[2] = srcs[6];
+      srcs[3] = srcs[7];
+      srcs[4] = srcs[8];
+      y += 4;
+    } while (y < height);
+  }
+}
+
+// This function is a simplified version of Convolve2D_C.
+// It is called when it is single prediction mode, where only vertical
+// filtering is required.
+// The output is the single prediction of the block, clipped to valid pixel
+// range.
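+// The source pointer is backed up by (vertical_taps / 2 - 1) rows so the
+// filter window is centered on the destination row; the same adjustment
+// appears in the 2D and compound vertical paths.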
+void ConvolveVertical_NEON(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int vertical_filter_index, const int /*horizontal_filter_id*/,
+    const int vertical_filter_id, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
+  const int filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps = GetNumTapsInFilter(filter_index);
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* src = static_cast<const uint8_t*>(reference) -
+                    (vertical_taps / 2 - 1) * src_stride;
+  auto* const dest = static_cast<uint8_t*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride;
+  assert(vertical_filter_id != 0);
+
+  uint8x8_t taps[8];
+  for (int k = 0; k < kSubPixelTaps; ++k) {
+    taps[k] =
+        vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][vertical_filter_id][k]);
+  }
+
+  if (filter_index == 0) {  // 6 tap.
+    if (width == 2) {
+      FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height,
+                           taps + 1);
+    } else if (width == 4) {
+      FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height,
+                           taps + 1);
+    } else {
+      FilterVertical<0>(src, src_stride, dest, dest_stride, width, height,
+                        taps + 1);
+    }
+  } else if ((static_cast<int>(filter_index == 1) &
+              (static_cast<int>(vertical_filter_id == 1) |
+               static_cast<int>(vertical_filter_id == 15))) != 0) {  // 5 tap.
+    if (width == 2) {
+      FilterVertical2xH<1>(src, src_stride, dest, dest_stride, height,
+                           taps + 1);
+    } else if (width == 4) {
+      FilterVertical4xH<1>(src, src_stride, dest, dest_stride, height,
+                           taps + 1);
+    } else {
+      FilterVertical<1>(src, src_stride, dest, dest_stride, width, height,
+                        taps + 1);
+    }
+  } else if ((static_cast<int>(filter_index == 1) &
+              (static_cast<int>(vertical_filter_id == 7) |
+               static_cast<int>(vertical_filter_id == 8) |
+               static_cast<int>(vertical_filter_id == 9))) !=
+             0) {  // 6 tap with weird negative taps.
+    if (width == 2) {
+      FilterVertical2xH<1,
+                        /*negative_outside_taps=*/true>(
+          src, src_stride, dest, dest_stride, height, taps + 1);
+    } else if (width == 4) {
+      FilterVertical4xH<1, /*is_compound=*/false,
+                        /*negative_outside_taps=*/true>(
+          src, src_stride, dest, dest_stride, height, taps + 1);
+    } else {
+      FilterVertical<1, /*is_compound=*/false, /*negative_outside_taps=*/true>(
+          src, src_stride, dest, dest_stride, width, height, taps + 1);
+    }
+  } else if (filter_index == 2) {  // 8 tap.
+    if (width == 2) {
+      FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
+    } else if (width == 4) {
+      FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
+    } else {
+      FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
+                        taps);
+    }
+  } else if (filter_index == 3) {  // 2 tap.
+    if (width == 2) {
+      FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height,
+                           taps + 3);
+    } else if (width == 4) {
+      FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height,
+                           taps + 3);
+    } else {
+      FilterVertical<3>(src, src_stride, dest, dest_stride, width, height,
+                        taps + 3);
+    }
+  } else if (filter_index == 4) {  // 4 tap.
+    // Outside taps are negative.
+    if (width == 2) {
+      FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height,
+                           taps + 2);
+    } else if (width == 4) {
+      FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height,
+                           taps + 2);
+    } else {
+      FilterVertical<4>(src, src_stride, dest, dest_stride, width, height,
+                        taps + 2);
+    }
+  } else {
+    // 4 tap. When |filter_index| == 1 the |vertical_filter_id| values listed
+    // below map to 4 tap filters.
+    assert(filter_index == 5 ||
+           (filter_index == 1 &&
+            (vertical_filter_id == 2 || vertical_filter_id == 3 ||
+             vertical_filter_id == 4 || vertical_filter_id == 5 ||
+             vertical_filter_id == 6 || vertical_filter_id == 10 ||
+             vertical_filter_id == 11 || vertical_filter_id == 12 ||
+             vertical_filter_id == 13 || vertical_filter_id == 14)));
+    // According to GetNumTapsInFilter() this has 6 taps but here we are
+    // treating it as though it has 4.
+    if (filter_index == 1) src += src_stride;
+    if (width == 2) {
+      FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height,
+                           taps + 2);
+    } else if (width == 4) {
+      FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height,
+                           taps + 2);
+    } else {
+      FilterVertical<5>(src, src_stride, dest, dest_stride, width, height,
+                        taps + 2);
+    }
+  }
+}
+
+void ConvolveCompoundCopy_NEON(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) {
+  const auto* src = static_cast<const uint8_t*>(reference);
+  const ptrdiff_t src_stride = reference_stride;
+  auto* dest = static_cast<uint16_t*>(prediction);
+  constexpr int final_shift =
+      kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
+
+  if (width >= 16) {
+    int y = height;
+    do {
+      int x = 0;
+      do {
+        const uint8x16_t v_src = vld1q_u8(&src[x]);
+        const uint16x8_t v_dest_lo =
+            vshll_n_u8(vget_low_u8(v_src), final_shift);
+        const uint16x8_t v_dest_hi =
+            vshll_n_u8(vget_high_u8(v_src), final_shift);
+        vst1q_u16(&dest[x], v_dest_lo);
+        x += 8;
+        vst1q_u16(&dest[x], v_dest_hi);
+        x += 8;
+      } while (x < width);
+      src += src_stride;
+      dest += width;
+    } while (--y != 0);
+  } else if (width == 8) {
+    int y = height;
+    do {
+      const uint8x8_t v_src = vld1_u8(&src[0]);
+      const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift);
+      vst1q_u16(&dest[0], v_dest);
+      src += src_stride;
+      dest += width;
+    } while (--y != 0);
+  } else {  // width == 4
+    uint8x8_t v_src = vdup_n_u8(0);
+
+    int y = height;
+    do {
+      v_src = Load4<0>(&src[0], v_src);
+      src += src_stride;
+      v_src = Load4<1>(&src[0], v_src);
+      src += src_stride;
+      const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift);
+      vst1q_u16(&dest[0], v_dest);
+      dest += 4 << 1;
+      y -= 2;
+    } while (y != 0);
+  }
+}
+
+void ConvolveCompoundVertical_NEON(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int vertical_filter_index, const int /*horizontal_filter_id*/,
+    const int vertical_filter_id, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) {
+  const int filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps = GetNumTapsInFilter(filter_index);
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* src = static_cast<const uint8_t*>(reference) -
+                    (vertical_taps / 2 - 1) * src_stride;
+  auto* const dest = static_cast<uint16_t*>(prediction);
+  assert(vertical_filter_id != 0);
+
+  uint8x8_t taps[8];
+  for (int k = 0; k < kSubPixelTaps; ++k) {
+    taps[k] =
+        vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][vertical_filter_id][k]);
+  }
+
+  if (filter_index == 0) {  // 6 tap.
+    if (width == 4) {
+      FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps + 1);
+    } else {
+      FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps + 1);
+    }
+  } else if ((static_cast<int>(filter_index == 1) &
+              (static_cast<int>(vertical_filter_id == 1) |
+               static_cast<int>(vertical_filter_id == 15))) != 0) {  // 5 tap.
+    if (width == 4) {
+      FilterVertical4xH<1, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps + 1);
+    } else {
+      FilterVertical<1, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps + 1);
+    }
+  } else if ((static_cast<int>(filter_index == 1) &
+              (static_cast<int>(vertical_filter_id == 7) |
+               static_cast<int>(vertical_filter_id == 8) |
+               static_cast<int>(vertical_filter_id == 9))) !=
+             0) {  // 6 tap with weird negative taps.
+    if (width == 4) {
+      FilterVertical4xH<1, /*is_compound=*/true,
+                        /*negative_outside_taps=*/true>(src, src_stride, dest,
+                                                        4, height, taps + 1);
+    } else {
+      FilterVertical<1, /*is_compound=*/true, /*negative_outside_taps=*/true>(
+          src, src_stride, dest, width, width, height, taps + 1);
+    }
+  } else if (filter_index == 2) {  // 8 tap.
+    if (width == 4) {
+      FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps);
+    } else {
+      FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps);
+    }
+  } else if (filter_index == 3) {  // 2 tap.
+    if (width == 4) {
+      FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps + 3);
+    } else {
+      FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps + 3);
+    }
+  } else if (filter_index == 4) {  // 4 tap.
+    if (width == 4) {
+      FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps + 2);
+    } else {
+      FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps + 2);
+    }
+  } else {
+    // 4 tap. When |filter_index| == 1 the |filter_id| values listed below map
+    // to 4 tap filters.
+    assert(filter_index == 5 ||
+           (filter_index == 1 &&
+            (vertical_filter_id == 2 || vertical_filter_id == 3 ||
+             vertical_filter_id == 4 || vertical_filter_id == 5 ||
+             vertical_filter_id == 6 || vertical_filter_id == 10 ||
+             vertical_filter_id == 11 || vertical_filter_id == 12 ||
+             vertical_filter_id == 13 || vertical_filter_id == 14)));
+    // According to GetNumTapsInFilter() this has 6 taps but here we are
+    // treating it as though it has 4.
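+    // For these |vertical_filter_id|s the outermost taps of the 6 tap filter
+    // are zero, so the effective window is 4 taps and the source can start
+    // one row later, matching the |taps + 2| offset passed below.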
+    if (filter_index == 1) src += src_stride;
+    if (width == 4) {
+      FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps + 2);
+    } else {
+      FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps + 2);
+    }
+  }
+}
+
+void ConvolveCompoundHorizontal_NEON(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int /*vertical_filter_index*/, const int horizontal_filter_id,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) {
+  const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const auto* const src =
+      static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+  auto* const dest = static_cast<uint16_t*>(prediction);
+
+  DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>(
+      src, reference_stride, dest, width, width, height, horizontal_filter_id,
+      filter_index);
+}
+
+template <int vertical_taps>
+void Compound2DVertical(
+    const uint16_t* LIBGAV1_RESTRICT const intermediate_result, const int width,
+    const int height, const int16x8_t taps,
+    void* LIBGAV1_RESTRICT const prediction) {
+  auto* const dest = static_cast<uint16_t*>(prediction);
+  if (width == 4) {
+    Filter2DVerticalWidth4<vertical_taps, /*is_compound=*/true>(
+        intermediate_result, dest, width, height, taps);
+  } else {
+    Filter2DVerticalWidth8AndUp<vertical_taps, /*is_compound=*/true>(
+        intermediate_result, dest, width, width, height, taps);
+  }
+}
+
+void ConvolveCompound2D_NEON(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int vertical_filter_index, const int horizontal_filter_id,
+    const int vertical_filter_id, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t /*pred_stride*/) {
+  // The output of the horizontal filter, i.e. the intermediate_result, is
+  // guaranteed to fit in int16_t.
+  uint16_t
+      intermediate_result[kMaxSuperBlockSizeInPixels *
+                          (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+
+  // Horizontal filter.
+  // Filter types used for width <= 4 are different from those for width > 4.
+  // When width > 4, the valid filter index range is always [0, 3].
+  // When width <= 4, the valid filter index range is always [4, 5].
+  // Similarly for height.
+  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+  const int intermediate_height = height + vertical_taps - 1;
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* const src = static_cast<const uint8_t*>(reference) -
+                          (vertical_taps / 2 - 1) * src_stride -
+                          kHorizontalOffset;
+  DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
+      src, src_stride, intermediate_result, width, width, intermediate_height,
+      horizontal_filter_id, horiz_filter_index);
+
+  // Vertical filter.
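+  // Sizing example: for a 16x16 block with an 8 tap vertical filter, the
+  // horizontal pass above produced intermediate_height = 16 + 8 - 1 = 23 rows,
+  // from which the vertical pass below derives its 16 output rows.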
+ assert(vertical_filter_id != 0); + const int16x8_t taps = vmovl_s8( + vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id])); + if (vertical_taps == 8) { + Compound2DVertical<8>(intermediate_result, width, height, taps, prediction); + } else if (vertical_taps == 6) { + Compound2DVertical<6>(intermediate_result, width, height, taps, prediction); + } else if (vertical_taps == 4) { + Compound2DVertical<4>(intermediate_result, width, height, taps, prediction); + } else { // |vertical_taps| == 2 + Compound2DVertical<2>(intermediate_result, width, height, taps, prediction); + } +} + +inline void HalfAddHorizontal(const uint8_t* LIBGAV1_RESTRICT const src, + uint8_t* LIBGAV1_RESTRICT const dst) { + const uint8x16_t left = vld1q_u8(src); + const uint8x16_t right = vld1q_u8(src + 1); + vst1q_u8(dst, vrhaddq_u8(left, right)); +} + +template +inline void IntraBlockCopyHorizontal(const uint8_t* LIBGAV1_RESTRICT src, + const ptrdiff_t src_stride, + const int height, + uint8_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t dst_stride) { + const ptrdiff_t src_remainder_stride = src_stride - (width - 16); + const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16); + + int y = height; + do { + HalfAddHorizontal(src, dst); + if (width >= 32) { + src += 16; + dst += 16; + HalfAddHorizontal(src, dst); + if (width >= 64) { + src += 16; + dst += 16; + HalfAddHorizontal(src, dst); + src += 16; + dst += 16; + HalfAddHorizontal(src, dst); + if (width == 128) { + src += 16; + dst += 16; + HalfAddHorizontal(src, dst); + src += 16; + dst += 16; + HalfAddHorizontal(src, dst); + src += 16; + dst += 16; + HalfAddHorizontal(src, dst); + src += 16; + dst += 16; + HalfAddHorizontal(src, dst); + } + } + } + src += src_remainder_stride; + dst += dst_remainder_stride; + } while (--y != 0); +} + +void ConvolveIntraBlockCopyHorizontal_NEON( + const void* LIBGAV1_RESTRICT const reference, + const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, + const int /*vertical_filter_index*/, const int /*subpixel_x*/, + const int /*subpixel_y*/, const int width, const int height, + void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) { + assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels); + assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels); + const auto* src = static_cast(reference); + auto* dest = static_cast(prediction); + + if (width == 128) { + IntraBlockCopyHorizontal<128>(src, reference_stride, height, dest, + pred_stride); + } else if (width == 64) { + IntraBlockCopyHorizontal<64>(src, reference_stride, height, dest, + pred_stride); + } else if (width == 32) { + IntraBlockCopyHorizontal<32>(src, reference_stride, height, dest, + pred_stride); + } else if (width == 16) { + IntraBlockCopyHorizontal<16>(src, reference_stride, height, dest, + pred_stride); + } else if (width == 8) { + int y = height; + do { + const uint8x8_t left = vld1_u8(src); + const uint8x8_t right = vld1_u8(src + 1); + vst1_u8(dest, vrhadd_u8(left, right)); + + src += reference_stride; + dest += pred_stride; + } while (--y != 0); + } else { // width == 4 + uint8x8_t left = vdup_n_u8(0); + uint8x8_t right = vdup_n_u8(0); + int y = height; + do { + left = Load4<0>(src, left); + right = Load4<0>(src + 1, right); + src += reference_stride; + left = Load4<1>(src, left); + right = Load4<1>(src + 1, right); + src += reference_stride; + + const uint8x8_t result = vrhadd_u8(left, right); + + StoreLo4(dest, result); + dest += pred_stride; + StoreHi4(dest, result); + dest += pred_stride; + y -= 2; + } 
while (y != 0); + } +} + +template +inline void IntraBlockCopyVertical(const uint8_t* LIBGAV1_RESTRICT src, + const ptrdiff_t src_stride, const int height, + uint8_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t dst_stride) { + const ptrdiff_t src_remainder_stride = src_stride - (width - 16); + const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16); + uint8x16_t row[8], below[8]; + + row[0] = vld1q_u8(src); + if (width >= 32) { + src += 16; + row[1] = vld1q_u8(src); + if (width >= 64) { + src += 16; + row[2] = vld1q_u8(src); + src += 16; + row[3] = vld1q_u8(src); + if (width == 128) { + src += 16; + row[4] = vld1q_u8(src); + src += 16; + row[5] = vld1q_u8(src); + src += 16; + row[6] = vld1q_u8(src); + src += 16; + row[7] = vld1q_u8(src); + } + } + } + src += src_remainder_stride; + + int y = height; + do { + below[0] = vld1q_u8(src); + if (width >= 32) { + src += 16; + below[1] = vld1q_u8(src); + if (width >= 64) { + src += 16; + below[2] = vld1q_u8(src); + src += 16; + below[3] = vld1q_u8(src); + if (width == 128) { + src += 16; + below[4] = vld1q_u8(src); + src += 16; + below[5] = vld1q_u8(src); + src += 16; + below[6] = vld1q_u8(src); + src += 16; + below[7] = vld1q_u8(src); + } + } + } + src += src_remainder_stride; + + vst1q_u8(dst, vrhaddq_u8(row[0], below[0])); + row[0] = below[0]; + if (width >= 32) { + dst += 16; + vst1q_u8(dst, vrhaddq_u8(row[1], below[1])); + row[1] = below[1]; + if (width >= 64) { + dst += 16; + vst1q_u8(dst, vrhaddq_u8(row[2], below[2])); + row[2] = below[2]; + dst += 16; + vst1q_u8(dst, vrhaddq_u8(row[3], below[3])); + row[3] = below[3]; + if (width >= 128) { + dst += 16; + vst1q_u8(dst, vrhaddq_u8(row[4], below[4])); + row[4] = below[4]; + dst += 16; + vst1q_u8(dst, vrhaddq_u8(row[5], below[5])); + row[5] = below[5]; + dst += 16; + vst1q_u8(dst, vrhaddq_u8(row[6], below[6])); + row[6] = below[6]; + dst += 16; + vst1q_u8(dst, vrhaddq_u8(row[7], below[7])); + row[7] = below[7]; + } + } + } + dst += dst_remainder_stride; + } while (--y != 0); +} + +void ConvolveIntraBlockCopyVertical_NEON( + const void* LIBGAV1_RESTRICT const reference, + const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, + const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/, + const int /*vertical_filter_id*/, const int width, const int height, + void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) { + assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels); + assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels); + const auto* src = static_cast(reference); + auto* dest = static_cast(prediction); + + if (width == 128) { + IntraBlockCopyVertical<128>(src, reference_stride, height, dest, + pred_stride); + } else if (width == 64) { + IntraBlockCopyVertical<64>(src, reference_stride, height, dest, + pred_stride); + } else if (width == 32) { + IntraBlockCopyVertical<32>(src, reference_stride, height, dest, + pred_stride); + } else if (width == 16) { + IntraBlockCopyVertical<16>(src, reference_stride, height, dest, + pred_stride); + } else if (width == 8) { + uint8x8_t row, below; + row = vld1_u8(src); + src += reference_stride; + + int y = height; + do { + below = vld1_u8(src); + src += reference_stride; + + vst1_u8(dest, vrhadd_u8(row, below)); + dest += pred_stride; + + row = below; + } while (--y != 0); + } else { // width == 4 + uint8x8_t row = Load4(src); + uint8x8_t below = vdup_n_u8(0); + src += reference_stride; + + int y = height; + do { + below = Load4<0>(src, below); + src += reference_stride; + + StoreLo4(dest, 
vrhadd_u8(row, below)); + dest += pred_stride; + + row = below; + } while (--y != 0); + } +} + +template +inline void IntraBlockCopy2D(const uint8_t* LIBGAV1_RESTRICT src, + const ptrdiff_t src_stride, const int height, + uint8_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t dst_stride) { + const ptrdiff_t src_remainder_stride = src_stride - (width - 8); + const ptrdiff_t dst_remainder_stride = dst_stride - (width - 8); + uint16x8_t row[16]; + row[0] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); + if (width >= 16) { + src += 8; + row[1] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); + if (width >= 32) { + src += 8; + row[2] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); + src += 8; + row[3] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); + if (width >= 64) { + src += 8; + row[4] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); + src += 8; + row[5] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); + src += 8; + row[6] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); + src += 8; + row[7] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); + if (width == 128) { + src += 8; + row[8] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); + src += 8; + row[9] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); + src += 8; + row[10] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); + src += 8; + row[11] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); + src += 8; + row[12] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); + src += 8; + row[13] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); + src += 8; + row[14] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); + src += 8; + row[15] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); + } + } + } + } + src += src_remainder_stride; + + int y = height; + do { + const uint16x8_t below_0 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); + vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[0], below_0), 2)); + row[0] = below_0; + if (width >= 16) { + src += 8; + dst += 8; + + const uint16x8_t below_1 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); + vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[1], below_1), 2)); + row[1] = below_1; + if (width >= 32) { + src += 8; + dst += 8; + + const uint16x8_t below_2 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); + vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[2], below_2), 2)); + row[2] = below_2; + src += 8; + dst += 8; + + const uint16x8_t below_3 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); + vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[3], below_3), 2)); + row[3] = below_3; + if (width >= 64) { + src += 8; + dst += 8; + + const uint16x8_t below_4 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); + vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[4], below_4), 2)); + row[4] = below_4; + src += 8; + dst += 8; + + const uint16x8_t below_5 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); + vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[5], below_5), 2)); + row[5] = below_5; + src += 8; + dst += 8; + + const uint16x8_t below_6 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); + vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[6], below_6), 2)); + row[6] = below_6; + src += 8; + dst += 8; + + const uint16x8_t below_7 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); + vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[7], below_7), 2)); + row[7] = below_7; + if (width == 128) { + src += 8; + dst += 8; + + const uint16x8_t below_8 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); + vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[8], below_8), 2)); + row[8] = below_8; + src += 8; + dst += 8; + + const uint16x8_t below_9 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1)); + vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[9], below_9), 2)); + row[9] = below_9; + src += 8; + dst += 8; + + const uint16x8_t below_10 = + vaddl_u8(vld1_u8(src), 
vld1_u8(src + 1));
+            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[10], below_10), 2));
+            row[10] = below_10;
+            src += 8;
+            dst += 8;
+
+            const uint16x8_t below_11 =
+                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[11], below_11), 2));
+            row[11] = below_11;
+            src += 8;
+            dst += 8;
+
+            const uint16x8_t below_12 =
+                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[12], below_12), 2));
+            row[12] = below_12;
+            src += 8;
+            dst += 8;
+
+            const uint16x8_t below_13 =
+                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[13], below_13), 2));
+            row[13] = below_13;
+            src += 8;
+            dst += 8;
+
+            const uint16x8_t below_14 =
+                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[14], below_14), 2));
+            row[14] = below_14;
+            src += 8;
+            dst += 8;
+
+            const uint16x8_t below_15 =
+                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[15], below_15), 2));
+            row[15] = below_15;
+          }
+        }
+      }
+    }
+    src += src_remainder_stride;
+    dst += dst_remainder_stride;
+  } while (--y != 0);
+}
+
+void ConvolveIntraBlockCopy2D_NEON(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
+  assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels);
+  assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels);
+  const auto* src = static_cast<const uint8_t*>(reference);
+  auto* dest = static_cast<uint8_t*>(prediction);
+  // Note: allow vertical access to height + 1. Because this function is only
+  // for u/v plane of intra block copy, such access is guaranteed to be within
+  // the prediction block.
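+  // The 2D filter below is a bilinear average with rounding,
+  //   out = (tl + tr + bl + br + 2) >> 2,
+  // assembled from vaddl_u8 (horizontal pair sums widened to 16 bits) and
+  // vrshrn_n_u16 (rounded narrowing shift by 2).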
+
+  if (width == 128) {
+    IntraBlockCopy2D<128>(src, reference_stride, height, dest, pred_stride);
+  } else if (width == 64) {
+    IntraBlockCopy2D<64>(src, reference_stride, height, dest, pred_stride);
+  } else if (width == 32) {
+    IntraBlockCopy2D<32>(src, reference_stride, height, dest, pred_stride);
+  } else if (width == 16) {
+    IntraBlockCopy2D<16>(src, reference_stride, height, dest, pred_stride);
+  } else if (width == 8) {
+    IntraBlockCopy2D<8>(src, reference_stride, height, dest, pred_stride);
+  } else {  // width == 4
+    uint8x8_t left = Load4(src);
+    uint8x8_t right = Load4(src + 1);
+    src += reference_stride;
+
+    uint16x4_t row = vget_low_u16(vaddl_u8(left, right));
+
+    int y = height;
+    do {
+      left = Load4<0>(src, left);
+      right = Load4<0>(src + 1, right);
+      src += reference_stride;
+      left = Load4<1>(src, left);
+      right = Load4<1>(src + 1, right);
+      src += reference_stride;
+
+      const uint16x8_t below = vaddl_u8(left, right);
+
+      const uint8x8_t result = vrshrn_n_u16(
+          vaddq_u16(vcombine_u16(row, vget_low_u16(below)), below), 2);
+      StoreLo4(dest, result);
+      dest += pred_stride;
+      StoreHi4(dest, result);
+      dest += pred_stride;
+
+      row = vget_high_u16(below);
+      y -= 2;
+    } while (y != 0);
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  dsp->convolve[0][0][0][1] = ConvolveHorizontal_NEON;
+  dsp->convolve[0][0][1][0] = ConvolveVertical_NEON;
+  dsp->convolve[0][0][1][1] = Convolve2D_NEON;
+
+  dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_NEON;
+  dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_NEON;
+  dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_NEON;
+  dsp->convolve[0][1][1][1] = ConvolveCompound2D_NEON;
+
+  dsp->convolve[1][0][0][1] = ConvolveIntraBlockCopyHorizontal_NEON;
+  dsp->convolve[1][0][1][0] = ConvolveIntraBlockCopyVertical_NEON;
+  dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_NEON;
+
+  dsp->convolve_scale[0] = ConvolveScale2D_NEON</*is_compound=*/false>;
+  dsp->convolve_scale[1] = ConvolveScale2D_NEON</*is_compound=*/true>;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+void ConvolveInit_NEON() { low_bitdepth::Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void ConvolveInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/convolve_neon.h b/src/dsp/arm/convolve_neon.h
new file mode 100644
index 0000000..9c67bc9
--- /dev/null
+++ b/src/dsp/arm/convolve_neon.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_CONVOLVE_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_CONVOLVE_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::convolve. This function is not thread-safe.
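+// A call-site sketch (an illustration, not part of this header): the library's
+// dsp initialization is expected to invoke these once under CPU detection,
+// e.g. dsp::DspInit() selecting the NEON entry points when available.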
+void ConvolveInit_NEON(); +void ConvolveInit10bpp_NEON(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_ENABLE_NEON +#define LIBGAV1_Dsp8bpp_ConvolveHorizontal LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_ConvolveVertical LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_Convolve2D LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_ConvolveCompoundCopy LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_ConvolveCompoundVertical LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_ConvolveCompound2D LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopyHorizontal LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopyVertical LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopy2D LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_ConvolveScale2D LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp10bpp_ConvolveHorizontal LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_ConvolveVertical LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_Convolve2D LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp10bpp_ConvolveCompoundCopy LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_ConvolveCompoundHorizontal LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_ConvolveCompoundVertical LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_ConvolveCompound2D LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp10bpp_ConvolveIntraBlockCopyHorizontal LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_ConvolveIntraBlockCopyVertical LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_ConvolveIntraBlockCopy2D LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp10bpp_ConvolveScale2D LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_ConvolveCompoundScale2D LIBGAV1_CPU_NEON +#endif // LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_SRC_DSP_ARM_CONVOLVE_NEON_H_ diff --git a/src/dsp/arm/distance_weighted_blend_neon.cc b/src/dsp/arm/distance_weighted_blend_neon.cc new file mode 100644 index 0000000..7d287c8 --- /dev/null +++ b/src/dsp/arm/distance_weighted_blend_neon.cc @@ -0,0 +1,357 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/distance_weighted_blend.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_ENABLE_NEON + +#include + +#include +#include +#include + +#include "src/dsp/arm/common_neon.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" + +namespace libgav1 { +namespace dsp { + +constexpr int kInterPostRoundBit = 4; + +namespace low_bitdepth { +namespace { + +inline int16x8_t ComputeWeightedAverage8(const int16x8_t pred0, + const int16x8_t pred1, + const int16x4_t weights[2]) { + // TODO(https://issuetracker.google.com/issues/150325685): Investigate range. 
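+  // The two weights sum to 16, so the weighted sum below carries 4 extra bits
+  // on top of the compound prediction precision; vqrshrn_n_s32 with
+  // kInterPostRoundBit + 4 performs the single rounded shift, e.g.
+  // (p0 * 9 + p1 * 7 + 128) >> 8.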
+ const int32x4_t wpred0_lo = vmull_s16(weights[0], vget_low_s16(pred0)); + const int32x4_t wpred0_hi = vmull_s16(weights[0], vget_high_s16(pred0)); + const int32x4_t blended_lo = + vmlal_s16(wpred0_lo, weights[1], vget_low_s16(pred1)); + const int32x4_t blended_hi = + vmlal_s16(wpred0_hi, weights[1], vget_high_s16(pred1)); + + return vcombine_s16(vqrshrn_n_s32(blended_lo, kInterPostRoundBit + 4), + vqrshrn_n_s32(blended_hi, kInterPostRoundBit + 4)); +} + +template +inline void DistanceWeightedBlendSmall_NEON( + const int16_t* LIBGAV1_RESTRICT prediction_0, + const int16_t* LIBGAV1_RESTRICT prediction_1, const int16x4_t weights[2], + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) { + auto* dst = static_cast(dest); + constexpr int step = 16 / width; + + for (int y = 0; y < height; y += step) { + const int16x8_t src_00 = vld1q_s16(prediction_0); + const int16x8_t src_10 = vld1q_s16(prediction_1); + prediction_0 += 8; + prediction_1 += 8; + const int16x8_t res0 = ComputeWeightedAverage8(src_00, src_10, weights); + + const int16x8_t src_01 = vld1q_s16(prediction_0); + const int16x8_t src_11 = vld1q_s16(prediction_1); + prediction_0 += 8; + prediction_1 += 8; + const int16x8_t res1 = ComputeWeightedAverage8(src_01, src_11, weights); + + const uint8x8_t result0 = vqmovun_s16(res0); + const uint8x8_t result1 = vqmovun_s16(res1); + if (width == 4) { + StoreLo4(dst, result0); + dst += dest_stride; + StoreHi4(dst, result0); + dst += dest_stride; + StoreLo4(dst, result1); + dst += dest_stride; + StoreHi4(dst, result1); + dst += dest_stride; + } else { + assert(width == 8); + vst1_u8(dst, result0); + dst += dest_stride; + vst1_u8(dst, result1); + dst += dest_stride; + } + } +} + +inline void DistanceWeightedBlendLarge_NEON( + const int16_t* LIBGAV1_RESTRICT prediction_0, + const int16_t* LIBGAV1_RESTRICT prediction_1, const int16x4_t weights[2], + const int width, const int height, void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t dest_stride) { + auto* dst = static_cast(dest); + + int y = height; + do { + int x = 0; + do { + const int16x8_t src0_lo = vld1q_s16(prediction_0 + x); + const int16x8_t src1_lo = vld1q_s16(prediction_1 + x); + const int16x8_t res_lo = + ComputeWeightedAverage8(src0_lo, src1_lo, weights); + + const int16x8_t src0_hi = vld1q_s16(prediction_0 + x + 8); + const int16x8_t src1_hi = vld1q_s16(prediction_1 + x + 8); + const int16x8_t res_hi = + ComputeWeightedAverage8(src0_hi, src1_hi, weights); + + const uint8x16_t result = + vcombine_u8(vqmovun_s16(res_lo), vqmovun_s16(res_hi)); + vst1q_u8(dst + x, result); + x += 16; + } while (x < width); + dst += dest_stride; + prediction_0 += width; + prediction_1 += width; + } while (--y != 0); +} + +inline void DistanceWeightedBlend_NEON( + const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, const uint8_t weight_0, + const uint8_t weight_1, const int width, const int height, + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int16x4_t weights[2] = {vdup_n_s16(weight_0), vdup_n_s16(weight_1)}; + // TODO(johannkoenig): Investigate the branching. May be fine to call with a + // variable height. 
+ if (width == 4) { + if (height == 4) { + DistanceWeightedBlendSmall_NEON<4, 4>(pred_0, pred_1, weights, dest, + dest_stride); + } else if (height == 8) { + DistanceWeightedBlendSmall_NEON<4, 8>(pred_0, pred_1, weights, dest, + dest_stride); + } else { + assert(height == 16); + DistanceWeightedBlendSmall_NEON<4, 16>(pred_0, pred_1, weights, dest, + dest_stride); + } + return; + } + + if (width == 8) { + switch (height) { + case 4: + DistanceWeightedBlendSmall_NEON<8, 4>(pred_0, pred_1, weights, dest, + dest_stride); + return; + case 8: + DistanceWeightedBlendSmall_NEON<8, 8>(pred_0, pred_1, weights, dest, + dest_stride); + return; + case 16: + DistanceWeightedBlendSmall_NEON<8, 16>(pred_0, pred_1, weights, dest, + dest_stride); + return; + default: + assert(height == 32); + DistanceWeightedBlendSmall_NEON<8, 32>(pred_0, pred_1, weights, dest, + dest_stride); + + return; + } + } + + DistanceWeightedBlendLarge_NEON(pred_0, pred_1, weights, width, height, dest, + dest_stride); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + dsp->distance_weighted_blend = DistanceWeightedBlend_NEON; +} + +} // namespace +} // namespace low_bitdepth + +//------------------------------------------------------------------------------ +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +inline uint16x4x2_t ComputeWeightedAverage8(const uint16x4x2_t pred0, + const uint16x4x2_t pred1, + const uint16x4_t weights[2]) { + const uint32x4_t wpred0_lo = vmull_u16(weights[0], pred0.val[0]); + const uint32x4_t wpred0_hi = vmull_u16(weights[0], pred0.val[1]); + const uint32x4_t blended_lo = vmlal_u16(wpred0_lo, weights[1], pred1.val[0]); + const uint32x4_t blended_hi = vmlal_u16(wpred0_hi, weights[1], pred1.val[1]); + const int32x4_t offset = vdupq_n_s32(kCompoundOffset * 16); + const int32x4_t res_lo = vsubq_s32(vreinterpretq_s32_u32(blended_lo), offset); + const int32x4_t res_hi = vsubq_s32(vreinterpretq_s32_u32(blended_hi), offset); + const uint16x4_t bd_max = vdup_n_u16((1 << kBitdepth10) - 1); + // Clip the result at (1 << bd) - 1. + uint16x4x2_t result; + result.val[0] = + vmin_u16(vqrshrun_n_s32(res_lo, kInterPostRoundBit + 4), bd_max); + result.val[1] = + vmin_u16(vqrshrun_n_s32(res_hi, kInterPostRoundBit + 4), bd_max); + return result; +} + +inline uint16x4x4_t ComputeWeightedAverage8(const uint16x4x4_t pred0, + const uint16x4x4_t pred1, + const uint16x4_t weights[2]) { + const int32x4_t offset = vdupq_n_s32(kCompoundOffset * 16); + const uint32x4_t wpred0 = vmull_u16(weights[0], pred0.val[0]); + const uint32x4_t wpred1 = vmull_u16(weights[0], pred0.val[1]); + const uint32x4_t blended0 = vmlal_u16(wpred0, weights[1], pred1.val[0]); + const uint32x4_t blended1 = vmlal_u16(wpred1, weights[1], pred1.val[1]); + const int32x4_t res0 = vsubq_s32(vreinterpretq_s32_u32(blended0), offset); + const int32x4_t res1 = vsubq_s32(vreinterpretq_s32_u32(blended1), offset); + const uint32x4_t wpred2 = vmull_u16(weights[0], pred0.val[2]); + const uint32x4_t wpred3 = vmull_u16(weights[0], pred0.val[3]); + const uint32x4_t blended2 = vmlal_u16(wpred2, weights[1], pred1.val[2]); + const uint32x4_t blended3 = vmlal_u16(wpred3, weights[1], pred1.val[3]); + const int32x4_t res2 = vsubq_s32(vreinterpretq_s32_u32(blended2), offset); + const int32x4_t res3 = vsubq_s32(vreinterpretq_s32_u32(blended3), offset); + const uint16x4_t bd_max = vdup_n_u16((1 << kBitdepth10) - 1); + // Clip the result at (1 << bd) - 1. 
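+  // Each compound prediction is biased by kCompoundOffset, and the weights
+  // sum to 16, so the weighted sums above carry a bias of kCompoundOffset * 16;
+  // that is exactly what the |offset| subtraction removes before narrowing.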
+ uint16x4x4_t result; + result.val[0] = + vmin_u16(vqrshrun_n_s32(res0, kInterPostRoundBit + 4), bd_max); + result.val[1] = + vmin_u16(vqrshrun_n_s32(res1, kInterPostRoundBit + 4), bd_max); + result.val[2] = + vmin_u16(vqrshrun_n_s32(res2, kInterPostRoundBit + 4), bd_max); + result.val[3] = + vmin_u16(vqrshrun_n_s32(res3, kInterPostRoundBit + 4), bd_max); + + return result; +} + +// We could use vld1_u16_x2, but for compatibility reasons, use this function +// instead. The compiler optimizes to the correct instruction. +inline uint16x4x2_t LoadU16x4_x2(uint16_t const* ptr) { + uint16x4x2_t x; + // gcc/clang (64 bit) optimizes the following to ldp. + x.val[0] = vld1_u16(ptr); + x.val[1] = vld1_u16(ptr + 4); + return x; +} + +// We could use vld1_u16_x4, but for compatibility reasons, use this function +// instead. The compiler optimizes to a pair of vld1_u16_x2, which showed better +// performance in the speed tests. +inline uint16x4x4_t LoadU16x4_x4(uint16_t const* ptr) { + uint16x4x4_t x; + x.val[0] = vld1_u16(ptr); + x.val[1] = vld1_u16(ptr + 4); + x.val[2] = vld1_u16(ptr + 8); + x.val[3] = vld1_u16(ptr + 12); + return x; +} + +void DistanceWeightedBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + const uint8_t weight_0, const uint8_t weight_1, + const int width, const int height, + void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t dest_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + auto* dst = static_cast(dest); + const ptrdiff_t dst_stride = dest_stride / sizeof(dst[0]); + const uint16x4_t weights[2] = {vdup_n_u16(weight_0), vdup_n_u16(weight_1)}; + + if (width == 4) { + int y = height; + do { + const uint16x4x2_t src0 = LoadU16x4_x2(pred_0); + const uint16x4x2_t src1 = LoadU16x4_x2(pred_1); + const uint16x4x2_t res = ComputeWeightedAverage8(src0, src1, weights); + vst1_u16(dst, res.val[0]); + vst1_u16(dst + dst_stride, res.val[1]); + dst += dst_stride << 1; + pred_0 += 8; + pred_1 += 8; + y -= 2; + } while (y != 0); + } else if (width == 8) { + int y = height; + do { + const uint16x4x4_t src0 = LoadU16x4_x4(pred_0); + const uint16x4x4_t src1 = LoadU16x4_x4(pred_1); + const uint16x4x4_t res = ComputeWeightedAverage8(src0, src1, weights); + vst1_u16(dst, res.val[0]); + vst1_u16(dst + 4, res.val[1]); + vst1_u16(dst + dst_stride, res.val[2]); + vst1_u16(dst + dst_stride + 4, res.val[3]); + dst += dst_stride << 1; + pred_0 += 16; + pred_1 += 16; + y -= 2; + } while (y != 0); + } else { + int y = height; + do { + int x = 0; + do { + const uint16x4x4_t src0 = LoadU16x4_x4(pred_0 + x); + const uint16x4x4_t src1 = LoadU16x4_x4(pred_1 + x); + const uint16x4x4_t res = ComputeWeightedAverage8(src0, src1, weights); + vst1_u16(dst + x, res.val[0]); + vst1_u16(dst + x + 4, res.val[1]); + vst1_u16(dst + x + 8, res.val[2]); + vst1_u16(dst + x + 12, res.val[3]); + x += 16; + } while (x < width); + dst += dst_stride; + pred_0 += width; + pred_1 += width; + } while (--y != 0); + } +} + +void Init10bpp() { + Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + dsp->distance_weighted_blend = DistanceWeightedBlend_NEON; +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void DistanceWeightedBlendInit_NEON() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_ENABLE_NEON + +namespace libgav1 { 
+namespace dsp { + +void DistanceWeightedBlendInit_NEON() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_ENABLE_NEON diff --git a/src/dsp/arm/distance_weighted_blend_neon.h b/src/dsp/arm/distance_weighted_blend_neon.h new file mode 100644 index 0000000..94a799c --- /dev/null +++ b/src/dsp/arm/distance_weighted_blend_neon.h @@ -0,0 +1,41 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_ARM_DISTANCE_WEIGHTED_BLEND_NEON_H_ +#define LIBGAV1_SRC_DSP_ARM_DISTANCE_WEIGHTED_BLEND_NEON_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::distance_weighted_blend. This function is not thread-safe. +void DistanceWeightedBlendInit_NEON(); + +} // namespace dsp +} // namespace libgav1 + +// If NEON is enabled signal the NEON implementation should be used instead of +// normal C. +#if LIBGAV1_ENABLE_NEON +#define LIBGAV1_Dsp8bpp_DistanceWeightedBlend LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp10bpp_DistanceWeightedBlend LIBGAV1_CPU_NEON + +#endif // LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_SRC_DSP_ARM_DISTANCE_WEIGHTED_BLEND_NEON_H_ diff --git a/src/dsp/arm/film_grain_neon.cc b/src/dsp/arm/film_grain_neon.cc new file mode 100644 index 0000000..0b1b481 --- /dev/null +++ b/src/dsp/arm/film_grain_neon.cc @@ -0,0 +1,1479 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/film_grain.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_ENABLE_NEON +#include + +#include +#include +#include +#include +#include +#include + +#include "src/dsp/arm/common_neon.h" +#include "src/dsp/arm/film_grain_neon.h" +#include "src/dsp/common.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/dsp/film_grain_common.h" +#include "src/utils/common.h" +#include "src/utils/compiler_attributes.h" +#include "src/utils/logging.h" +#include "src/utils/memory.h" + +namespace libgav1 { +namespace dsp { +namespace film_grain { +namespace { + +// These functions are overloaded for both possible sizes in order to simplify +// loading and storing to and from intermediate value types from within a +// template function. 
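+// Reinterpreting the widened unsigned sources as signed is safe: 8bpp and
+// 10bpp sample values leave the sign bit of the 16-bit lanes clear.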
+inline int16x8_t GetSignedSource8(const int8_t* src) {
+  return vmovl_s8(vld1_s8(src));
+}
+
+inline int16x8_t GetSignedSource8(const uint8_t* src) {
+  return ZeroExtend(vld1_u8(src));
+}
+
+inline int16x8_t GetSignedSource8Msan(const uint8_t* src, int /*valid_range*/) {
+  // TODO(b/194217060): restore |valid_range| usage after correcting call sites
+  // causing test vector failures.
+  return ZeroExtend(Load1MsanU8(src, 0));
+}
+
+inline void StoreUnsigned8(uint8_t* dest, const uint16x8_t data) {
+  vst1_u8(dest, vmovn_u16(data));
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+inline int16x8_t GetSignedSource8(const int16_t* src) { return vld1q_s16(src); }
+
+inline int16x8_t GetSignedSource8(const uint16_t* src) {
+  return vreinterpretq_s16_u16(vld1q_u16(src));
+}
+
+inline int16x8_t GetSignedSource8Msan(const uint16_t* src,
+                                      int /*valid_range*/) {
+  // TODO(b/194217060): restore |valid_range| usage after correcting call sites
+  // causing test vector failures.
+  return vreinterpretq_s16_u16(Load1QMsanU16(src, 0));
+}
+
+inline void StoreUnsigned8(uint16_t* dest, const uint16x8_t data) {
+  vst1q_u16(dest, data);
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+// Each element in |sum| represents one destination value's running
+// autoregression formula. The fixed source values in |grain_lo| and |grain_hi|
+// allow for a sliding window in successive calls to this function.
+template <int position_offset>
+inline int32x4x2_t AccumulateWeightedGrain(const int16x8_t grain_lo,
+                                           const int16x8_t grain_hi,
+                                           int16_t coeff, int32x4x2_t sum) {
+  const int16x8_t grain = vextq_s16(grain_lo, grain_hi, position_offset);
+  sum.val[0] = vmlal_n_s16(sum.val[0], vget_low_s16(grain), coeff);
+  sum.val[1] = vmlal_n_s16(sum.val[1], vget_high_s16(grain), coeff);
+  return sum;
+}
+
+// Because the autoregressive filter requires the output of each pixel to
+// compute pixels that come after in the row, we have to finish the calculations
+// one at a time.
+template <int bitdepth, int auto_regression_coeff_lag, int lane>
+inline void WriteFinalAutoRegression(int8_t* LIBGAV1_RESTRICT grain_cursor,
+                                     int32x4x2_t sum,
+                                     const int8_t* LIBGAV1_RESTRICT coeffs,
+                                     int pos, int shift) {
+  int32_t result = vgetq_lane_s32(sum.val[lane >> 2], lane & 3);
+
+  for (int delta_col = -auto_regression_coeff_lag; delta_col < 0; ++delta_col) {
+    result += grain_cursor[lane + delta_col] * coeffs[pos];
+    ++pos;
+  }
+  grain_cursor[lane] =
+      Clip3(grain_cursor[lane] + RightShiftWithRounding(result, shift),
+            GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template <int bitdepth, int auto_regression_coeff_lag, int lane>
+inline void WriteFinalAutoRegression(int16_t* LIBGAV1_RESTRICT grain_cursor,
+                                     int32x4x2_t sum,
+                                     const int8_t* LIBGAV1_RESTRICT coeffs,
+                                     int pos, int shift) {
+  int32_t result = vgetq_lane_s32(sum.val[lane >> 2], lane & 3);
+
+  for (int delta_col = -auto_regression_coeff_lag; delta_col < 0; ++delta_col) {
+    result += grain_cursor[lane + delta_col] * coeffs[pos];
+    ++pos;
+  }
+  grain_cursor[lane] =
+      Clip3(grain_cursor[lane] + RightShiftWithRounding(result, shift),
+            GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+// Because the autoregressive filter requires the output of each pixel to
+// compute pixels that come after in the row, we have to finish the calculations
+// one at a time.
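+// As a scalar sketch of that recurrence (illustrative, not upstream code):
+//   grain[x] = Clip3(grain[x] + RightShiftWithRounding(
+//                        sum_k(coeffs[k] * grain[neighbor_k]), shift),
+//                    GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
+// with neighbors drawn from the rows above and the samples to the left.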
+template <int bitdepth, int auto_regression_coeff_lag, int lane>
+inline void WriteFinalAutoRegressionChroma(
+    int8_t* LIBGAV1_RESTRICT u_grain_cursor,
+    int8_t* LIBGAV1_RESTRICT v_grain_cursor, int32x4x2_t sum_u,
+    int32x4x2_t sum_v, const int8_t* LIBGAV1_RESTRICT coeffs_u,
+    const int8_t* LIBGAV1_RESTRICT coeffs_v, int pos, int shift) {
+  WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
+      u_grain_cursor, sum_u, coeffs_u, pos, shift);
+  WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
+      v_grain_cursor, sum_v, coeffs_v, pos, shift);
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template <int bitdepth, int auto_regression_coeff_lag, int lane>
+inline void WriteFinalAutoRegressionChroma(
+    int16_t* LIBGAV1_RESTRICT u_grain_cursor,
+    int16_t* LIBGAV1_RESTRICT v_grain_cursor, int32x4x2_t sum_u,
+    int32x4x2_t sum_v, const int8_t* LIBGAV1_RESTRICT coeffs_u,
+    const int8_t* LIBGAV1_RESTRICT coeffs_v, int pos, int shift) {
+  WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
+      u_grain_cursor, sum_u, coeffs_u, pos, shift);
+  WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
+      v_grain_cursor, sum_v, coeffs_v, pos, shift);
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+inline void SetZero(int32x4x2_t* v) {
+  v->val[0] = vdupq_n_s32(0);
+  v->val[1] = vdupq_n_s32(0);
+}
+
+// Computes subsampled luma for use with chroma, by averaging in the x direction
+// or y direction when applicable.
+int16x8_t GetSubsampledLuma(const int8_t* const luma, int subsampling_x,
+                            int subsampling_y, ptrdiff_t stride) {
+  if (subsampling_y != 0) {
+    assert(subsampling_x != 0);
+    const int8x16_t src0 = vld1q_s8(luma);
+    const int8x16_t src1 = vld1q_s8(luma + stride);
+    const int16x8_t ret0 = vcombine_s16(vpaddl_s8(vget_low_s8(src0)),
+                                        vpaddl_s8(vget_high_s8(src0)));
+    const int16x8_t ret1 = vcombine_s16(vpaddl_s8(vget_low_s8(src1)),
+                                        vpaddl_s8(vget_high_s8(src1)));
+    return vrshrq_n_s16(vaddq_s16(ret0, ret1), 2);
+  }
+  if (subsampling_x != 0) {
+    const int8x16_t src = vld1q_s8(luma);
+    return vrshrq_n_s16(
+        vcombine_s16(vpaddl_s8(vget_low_s8(src)), vpaddl_s8(vget_high_s8(src))),
+        1);
+  }
+  return vmovl_s8(vld1_s8(luma));
+}
+
+// For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed.
+inline uint16x8_t GetAverageLuma(const uint8_t* const luma, int subsampling_x) {
+  if (subsampling_x != 0) {
+    const uint8x16_t src = vld1q_u8(luma);
+    return vrshrq_n_u16(vpaddlq_u8(src), 1);
+  }
+  return vmovl_u8(vld1_u8(luma));
+}
+
+inline uint16x8_t GetAverageLumaMsan(const uint8_t* const luma,
+                                     int subsampling_x, int /*valid_range*/) {
+  if (subsampling_x != 0) {
+    // TODO(b/194217060): restore |valid_range| usage after correcting call
+    // sites causing test vector failures.
+    const uint8x16_t src = Load1QMsanU8(luma, 0);
+
+    return vrshrq_n_u16(vpaddlq_u8(src), 1);
+  }
+  // TODO(b/194217060): restore |valid_range| usage after correcting call sites
+  // causing test vector failures.
+  return vmovl_u8(Load1MsanU8(luma, 0));
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+// Computes subsampled luma for use with chroma, by averaging in the x direction
+// or y direction when applicable.
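+// For 4:2:0 this is a box average: one chroma-aligned value from samples
+// a, b (row 0) and c, d (row 1) is (a + b + c + d + 2) >> 2, implemented below
+// with pairwise adds (vpadd_s16) and a rounded shift (vrshrq_n_s16(..., 2)).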
+int16x8_t GetSubsampledLuma(const int16_t* const luma, int subsampling_x,
+                            int subsampling_y, ptrdiff_t stride) {
+  if (subsampling_y != 0) {
+    assert(subsampling_x != 0);
+    int16x8_t src0_lo = vld1q_s16(luma);
+    int16x8_t src0_hi = vld1q_s16(luma + 8);
+    const int16x8_t src1_lo = vld1q_s16(luma + stride);
+    const int16x8_t src1_hi = vld1q_s16(luma + stride + 8);
+    const int16x8_t src0 =
+        vcombine_s16(vpadd_s16(vget_low_s16(src0_lo), vget_high_s16(src0_lo)),
+                     vpadd_s16(vget_low_s16(src0_hi), vget_high_s16(src0_hi)));
+    const int16x8_t src1 =
+        vcombine_s16(vpadd_s16(vget_low_s16(src1_lo), vget_high_s16(src1_lo)),
+                     vpadd_s16(vget_low_s16(src1_hi), vget_high_s16(src1_hi)));
+    return vrshrq_n_s16(vaddq_s16(src0, src1), 2);
+  }
+  if (subsampling_x != 0) {
+    const int16x8_t src_lo = vld1q_s16(luma);
+    const int16x8_t src_hi = vld1q_s16(luma + 8);
+    const int16x8_t ret =
+        vcombine_s16(vpadd_s16(vget_low_s16(src_lo), vget_high_s16(src_lo)),
+                     vpadd_s16(vget_low_s16(src_hi), vget_high_s16(src_hi)));
+    return vrshrq_n_s16(ret, 1);
+  }
+  return vld1q_s16(luma);
+}
+
+// For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed.
+inline uint16x8_t GetAverageLuma(const uint16_t* const luma,
+                                 int subsampling_x) {
+  if (subsampling_x != 0) {
+    const uint16x8x2_t src = vld2q_u16(luma);
+    return vrhaddq_u16(src.val[0], src.val[1]);
+  }
+  return vld1q_u16(luma);
+}
+
+inline uint16x8_t GetAverageLumaMsan(const uint16_t* const luma,
+                                     int subsampling_x, int /*valid_range*/) {
+  if (subsampling_x != 0) {
+    // TODO(b/194217060): restore |valid_range| usage after correcting call
+    // sites causing test vector failures.
+    const uint16x8x2_t src = Load2QMsanU16(luma, 0);
+    return vrhaddq_u16(src.val[0], src.val[1]);
+  }
+  // TODO(b/194217060): restore |valid_range| usage after correcting call sites
+  // causing test vector failures.
+  return Load1QMsanU16(luma, 0);
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+template <int bitdepth, typename GrainType, int auto_regression_coeff_lag,
+          bool use_luma>
+void ApplyAutoRegressiveFilterToChromaGrains_NEON(
+    const FilmGrainParams& params,
+    const void* LIBGAV1_RESTRICT luma_grain_buffer, int subsampling_x,
+    int subsampling_y, void* LIBGAV1_RESTRICT u_grain_buffer,
+    void* LIBGAV1_RESTRICT v_grain_buffer) {
+  static_assert(auto_regression_coeff_lag <= 3, "Invalid autoregression lag.");
+  const auto* luma_grain = static_cast<const GrainType*>(luma_grain_buffer);
+  auto* u_grain = static_cast<GrainType*>(u_grain_buffer);
+  auto* v_grain = static_cast<GrainType*>(v_grain_buffer);
+  const int auto_regression_shift = params.auto_regression_shift;
+  const int chroma_width =
+      (subsampling_x == 0) ? kMaxChromaWidth : kMinChromaWidth;
+  const int chroma_height =
+      (subsampling_y == 0) ? kMaxChromaHeight : kMinChromaHeight;
+  // When |chroma_width| == 44, we write 8 at a time from x in [3, 34],
+  // leaving [35, 40] to write at the end.
+  const int chroma_width_remainder =
+      (chroma_width - 2 * kAutoRegressionBorder) & 7;
+
+  int y = kAutoRegressionBorder;
+  luma_grain += kLumaWidth * y;
+  u_grain += chroma_width * y;
+  v_grain += chroma_width * y;
+  do {
+    // Each row is computed 8 values at a time in the following loop. At the
+    // end of the loop, 4 values remain to write. They are given a special
+    // reduced iteration at the end.
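+    // Remainder arithmetic: with x subsampling, chroma_width == 44 and
+    // kAutoRegressionBorder == 3, so chroma_width_remainder == (44 - 6) & 7
+    // == 6; without subsampling, chroma_width == 82 leaves (82 - 6) & 7 == 4.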
+    int x = kAutoRegressionBorder;
+    int luma_x = kAutoRegressionBorder;
+    do {
+      int pos = 0;
+      int32x4x2_t sum_u;
+      int32x4x2_t sum_v;
+      SetZero(&sum_u);
+      SetZero(&sum_v);
+
+      if (auto_regression_coeff_lag > 0) {
+        for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
+             ++delta_row) {
+          // These loads may overflow to the next row, but they are never called
+          // on the final row of a grain block. Therefore, they will never
+          // exceed the block boundaries.
+          // Note: this could be slightly optimized to a single load in 8bpp,
+          // but requires making a special first iteration and accumulate
+          // function that takes an int8x16_t.
+          const int16x8_t u_grain_lo =
+              GetSignedSource8(u_grain + x + delta_row * chroma_width -
+                               auto_regression_coeff_lag);
+          const int16x8_t u_grain_hi =
+              GetSignedSource8(u_grain + x + delta_row * chroma_width -
+                               auto_regression_coeff_lag + 8);
+          const int16x8_t v_grain_lo =
+              GetSignedSource8(v_grain + x + delta_row * chroma_width -
+                               auto_regression_coeff_lag);
+          const int16x8_t v_grain_hi =
+              GetSignedSource8(v_grain + x + delta_row * chroma_width -
+                               auto_regression_coeff_lag + 8);
+#define ACCUMULATE_WEIGHTED_GRAIN(offset)                                   \
+  sum_u = AccumulateWeightedGrain<offset>(                                  \
+      u_grain_lo, u_grain_hi, params.auto_regression_coeff_u[pos], sum_u);  \
+  sum_v = AccumulateWeightedGrain<offset>(                                  \
+      v_grain_lo, v_grain_hi, params.auto_regression_coeff_v[pos++], sum_v)
+
+          ACCUMULATE_WEIGHTED_GRAIN(0);
+          ACCUMULATE_WEIGHTED_GRAIN(1);
+          ACCUMULATE_WEIGHTED_GRAIN(2);
+          // The horizontal |auto_regression_coeff_lag| loop is replaced with
+          // if-statements to give vextq_s16 an immediate param.
+          if (auto_regression_coeff_lag > 1) {
+            ACCUMULATE_WEIGHTED_GRAIN(3);
+            ACCUMULATE_WEIGHTED_GRAIN(4);
+          }
+          if (auto_regression_coeff_lag > 2) {
+            assert(auto_regression_coeff_lag == 3);
+            ACCUMULATE_WEIGHTED_GRAIN(5);
+            ACCUMULATE_WEIGHTED_GRAIN(6);
+          }
+        }
+      }
+
+      if (use_luma) {
+        const int16x8_t luma = GetSubsampledLuma(
+            luma_grain + luma_x, subsampling_x, subsampling_y, kLumaWidth);
+
+        // Luma samples get the final coefficient in the formula, but are best
+        // computed all at once before the final row.
+        const int coeff_u =
+            params.auto_regression_coeff_u[pos + auto_regression_coeff_lag];
+        const int coeff_v =
+            params.auto_regression_coeff_v[pos + auto_regression_coeff_lag];
+
+        sum_u.val[0] = vmlal_n_s16(sum_u.val[0], vget_low_s16(luma), coeff_u);
+        sum_u.val[1] = vmlal_n_s16(sum_u.val[1], vget_high_s16(luma), coeff_u);
+        sum_v.val[0] = vmlal_n_s16(sum_v.val[0], vget_low_s16(luma), coeff_v);
+        sum_v.val[1] = vmlal_n_s16(sum_v.val[1], vget_high_s16(luma), coeff_v);
+      }
+      // At this point in the filter, the source addresses and destination
+      // addresses overlap. Because this is an auto-regressive filter, the
+      // higher lanes cannot be computed without the results of the lower lanes.
+      // Each call to WriteFinalAutoRegression incorporates preceding values
+      // on the final row, and writes a single sample. This allows the next
+      // pixel's value to be computed in the next call.
+#define WRITE_AUTO_REGRESSION_RESULT(lane)                                    \
+  WriteFinalAutoRegressionChroma<bitdepth, auto_regression_coeff_lag, lane>(  \
+      u_grain + x, v_grain + x, sum_u, sum_v, params.auto_regression_coeff_u, \
+      params.auto_regression_coeff_v, pos, auto_regression_shift)
+
+      WRITE_AUTO_REGRESSION_RESULT(0);
+      WRITE_AUTO_REGRESSION_RESULT(1);
+      WRITE_AUTO_REGRESSION_RESULT(2);
+      WRITE_AUTO_REGRESSION_RESULT(3);
+      WRITE_AUTO_REGRESSION_RESULT(4);
+      WRITE_AUTO_REGRESSION_RESULT(5);
+      WRITE_AUTO_REGRESSION_RESULT(6);
+      WRITE_AUTO_REGRESSION_RESULT(7);
+
+      x += 8;
+      luma_x += 8 << subsampling_x;
+    } while (x < chroma_width - kAutoRegressionBorder - chroma_width_remainder);
+
+    // This is the "final iteration" of the above loop over width. We fill in
+    // the remainder of the width, which is less than 8.
+    int pos = 0;
+    int32x4x2_t sum_u;
+    int32x4x2_t sum_v;
+    SetZero(&sum_u);
+    SetZero(&sum_v);
+
+    for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
+         ++delta_row) {
+      // These loads may overflow to the next row, but they are never called on
+      // the final row of a grain block. Therefore, they will never exceed the
+      // block boundaries.
+      const int16x8_t u_grain_lo = GetSignedSource8(
+          u_grain + x + delta_row * chroma_width - auto_regression_coeff_lag);
+      const int16x8_t u_grain_hi =
+          GetSignedSource8(u_grain + x + delta_row * chroma_width -
+                           auto_regression_coeff_lag + 8);
+      const int16x8_t v_grain_lo = GetSignedSource8(
+          v_grain + x + delta_row * chroma_width - auto_regression_coeff_lag);
+      const int16x8_t v_grain_hi =
+          GetSignedSource8(v_grain + x + delta_row * chroma_width -
+                           auto_regression_coeff_lag + 8);
+
+      ACCUMULATE_WEIGHTED_GRAIN(0);
+      ACCUMULATE_WEIGHTED_GRAIN(1);
+      ACCUMULATE_WEIGHTED_GRAIN(2);
+      // The horizontal |auto_regression_coeff_lag| loop is replaced with
+      // if-statements to give vextq_s16 an immediate param.
+      if (auto_regression_coeff_lag > 1) {
+        ACCUMULATE_WEIGHTED_GRAIN(3);
+        ACCUMULATE_WEIGHTED_GRAIN(4);
+      }
+      if (auto_regression_coeff_lag > 2) {
+        assert(auto_regression_coeff_lag == 3);
+        ACCUMULATE_WEIGHTED_GRAIN(5);
+        ACCUMULATE_WEIGHTED_GRAIN(6);
+      }
+    }
+
+    if (use_luma) {
+      const int16x8_t luma = GetSubsampledLuma(
+          luma_grain + luma_x, subsampling_x, subsampling_y, kLumaWidth);
+
+      // Luma samples get the final coefficient in the formula, but are best
+      // computed all at once before the final row.
+      const int coeff_u =
+          params.auto_regression_coeff_u[pos + auto_regression_coeff_lag];
+      const int coeff_v =
+          params.auto_regression_coeff_v[pos + auto_regression_coeff_lag];
+
+      sum_u.val[0] = vmlal_n_s16(sum_u.val[0], vget_low_s16(luma), coeff_u);
+      sum_u.val[1] = vmlal_n_s16(sum_u.val[1], vget_high_s16(luma), coeff_u);
+      sum_v.val[0] = vmlal_n_s16(sum_v.val[0], vget_low_s16(luma), coeff_v);
+      sum_v.val[1] = vmlal_n_s16(sum_v.val[1], vget_high_s16(luma), coeff_v);
+    }
+
+    WRITE_AUTO_REGRESSION_RESULT(0);
+    WRITE_AUTO_REGRESSION_RESULT(1);
+    WRITE_AUTO_REGRESSION_RESULT(2);
+    WRITE_AUTO_REGRESSION_RESULT(3);
+    if (chroma_width_remainder == 6) {
+      WRITE_AUTO_REGRESSION_RESULT(4);
+      WRITE_AUTO_REGRESSION_RESULT(5);
+    }
+
+    luma_grain += kLumaWidth << subsampling_y;
+    u_grain += chroma_width;
+    v_grain += chroma_width;
+  } while (++y < chroma_height);
+#undef ACCUMULATE_WEIGHTED_GRAIN
+#undef WRITE_AUTO_REGRESSION_RESULT
+}
+
+// Applies an auto-regressive filter to the white noise in luma_grain.
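+// Callers are expected to instantiate |auto_regression_coeff_lag| only for
+// lags 1-3 (a lag of 0 means the noise is left unfiltered), matching the
+// vextq_s16 immediates supplied through ACCUMULATE_WEIGHTED_GRAIN.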
+template <int bitdepth, typename GrainType, int auto_regression_coeff_lag>
+void ApplyAutoRegressiveFilterToLumaGrain_NEON(const FilmGrainParams& params,
+                                               void* luma_grain_buffer) {
+  static_assert(auto_regression_coeff_lag > 0, "");
+  const int8_t* const auto_regression_coeff_y = params.auto_regression_coeff_y;
+  const uint8_t auto_regression_shift = params.auto_regression_shift;
+
+  int y = kAutoRegressionBorder;
+  auto* luma_grain =
+      static_cast<GrainType*>(luma_grain_buffer) + kLumaWidth * y;
+  do {
+    // Each row is computed 8 values at a time in the following loop. At the
+    // end of the loop, 4 values remain to write. They are given a special
+    // reduced iteration at the end.
+    int x = kAutoRegressionBorder;
+    do {
+      int pos = 0;
+      int32x4x2_t sum;
+      SetZero(&sum);
+      for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
+           ++delta_row) {
+        // These loads may overflow to the next row, but they are never called
+        // on the final row of a grain block. Therefore, they will never exceed
+        // the block boundaries.
+        const int16x8_t src_grain_lo =
+            GetSignedSource8(luma_grain + x + delta_row * kLumaWidth -
+                             auto_regression_coeff_lag);
+        const int16x8_t src_grain_hi =
+            GetSignedSource8(luma_grain + x + delta_row * kLumaWidth -
+                             auto_regression_coeff_lag + 8);
+
+        // A pictorial representation of the auto-regressive filter for
+        // various values of params.auto_regression_coeff_lag. The letter 'O'
+        // represents the current sample. (The filter always operates on the
+        // current sample with filter coefficient 1.) The letters 'X'
+        // represent the neighboring samples that the filter operates on, below
+        // their corresponding "offset" number.
+        //
+        // params.auto_regression_coeff_lag == 3:
+        //   0 1 2 3 4 5 6
+        //   X X X X X X X
+        //   X X X X X X X
+        //   X X X X X X X
+        //   X X X O
+        // params.auto_regression_coeff_lag == 2:
+        //     0 1 2 3 4
+        //     X X X X X
+        //     X X X X X
+        //     X X O
+        // params.auto_regression_coeff_lag == 1:
+        //       0 1 2
+        //       X X X
+        //       X O
+        // params.auto_regression_coeff_lag == 0:
+        //         O
+        // The function relies on the caller to skip the call in the 0 lag
+        // case.
+
+#define ACCUMULATE_WEIGHTED_GRAIN(offset)                           \
+  sum = AccumulateWeightedGrain<offset>(src_grain_lo, src_grain_hi, \
+                                        auto_regression_coeff_y[pos++], sum)
+        ACCUMULATE_WEIGHTED_GRAIN(0);
+        ACCUMULATE_WEIGHTED_GRAIN(1);
+        ACCUMULATE_WEIGHTED_GRAIN(2);
+        // The horizontal |auto_regression_coeff_lag| loop is replaced with
+        // if-statements to give vextq_s16 an immediate param.
+        if (auto_regression_coeff_lag > 1) {
+          ACCUMULATE_WEIGHTED_GRAIN(3);
+          ACCUMULATE_WEIGHTED_GRAIN(4);
+        }
+        if (auto_regression_coeff_lag > 2) {
+          assert(auto_regression_coeff_lag == 3);
+          ACCUMULATE_WEIGHTED_GRAIN(5);
+          ACCUMULATE_WEIGHTED_GRAIN(6);
+        }
+      }
+      // At this point in the filter, the source addresses and destination
+      // addresses overlap. Because this is an auto-regressive filter, the
+      // higher lanes cannot be computed without the results of the lower lanes.
+      // Each call to WriteFinalAutoRegression incorporates preceding values
+      // on the final row, and writes a single sample. This allows the next
+      // pixel's value to be computed in the next call.
+#define WRITE_AUTO_REGRESSION_RESULT(lane)                             \
+  WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>( \
+      luma_grain + x, sum, auto_regression_coeff_y, pos,               \
+      auto_regression_shift)
+
+      WRITE_AUTO_REGRESSION_RESULT(0);
+      WRITE_AUTO_REGRESSION_RESULT(1);
+      WRITE_AUTO_REGRESSION_RESULT(2);
+      WRITE_AUTO_REGRESSION_RESULT(3);
+      WRITE_AUTO_REGRESSION_RESULT(4);
+      WRITE_AUTO_REGRESSION_RESULT(5);
+      WRITE_AUTO_REGRESSION_RESULT(6);
+      WRITE_AUTO_REGRESSION_RESULT(7);
+      x += 8;
+      // Leave the final four pixels for the special iteration below.
+    } while (x < kLumaWidth - kAutoRegressionBorder - 4);
+
+    // Final 4 pixels in the row.
+    int pos = 0;
+    int32x4x2_t sum;
+    SetZero(&sum);
+    for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
+         ++delta_row) {
+      const int16x8_t src_grain_lo = GetSignedSource8(
+          luma_grain + x + delta_row * kLumaWidth - auto_regression_coeff_lag);
+      const int16x8_t src_grain_hi =
+          GetSignedSource8(luma_grain + x + delta_row * kLumaWidth -
+                           auto_regression_coeff_lag + 8);
+
+      ACCUMULATE_WEIGHTED_GRAIN(0);
+      ACCUMULATE_WEIGHTED_GRAIN(1);
+      ACCUMULATE_WEIGHTED_GRAIN(2);
+      // The horizontal |auto_regression_coeff_lag| loop is replaced with
+      // if-statements to give vextq_s16 an immediate param.
+      if (auto_regression_coeff_lag > 1) {
+        ACCUMULATE_WEIGHTED_GRAIN(3);
+        ACCUMULATE_WEIGHTED_GRAIN(4);
+      }
+      if (auto_regression_coeff_lag > 2) {
+        assert(auto_regression_coeff_lag == 3);
+        ACCUMULATE_WEIGHTED_GRAIN(5);
+        ACCUMULATE_WEIGHTED_GRAIN(6);
+      }
+    }
+    // delta_row == 0
+    WRITE_AUTO_REGRESSION_RESULT(0);
+    WRITE_AUTO_REGRESSION_RESULT(1);
+    WRITE_AUTO_REGRESSION_RESULT(2);
+    WRITE_AUTO_REGRESSION_RESULT(3);
+    luma_grain += kLumaWidth;
+  } while (++y < kLumaHeight);
+
+#undef WRITE_AUTO_REGRESSION_RESULT
+#undef ACCUMULATE_WEIGHTED_GRAIN
+}
+
+template <int bitdepth>
+void InitializeScalingLookupTable_NEON(int num_points,
+                                       const uint8_t point_value[],
+                                       const uint8_t point_scaling[],
+                                       int16_t* scaling_lut,
+                                       const int scaling_lut_length) {
+  static_assert(bitdepth < kBitdepth12,
+                "NEON Scaling lookup table only supports 8bpp and 10bpp.");
+  if (num_points == 0) {
+    memset(scaling_lut, 0, sizeof(scaling_lut[0]) * scaling_lut_length);
+    return;
+  }
+  static_assert(sizeof(scaling_lut[0]) == 2, "");
+  Memset(scaling_lut, point_scaling[0],
+         std::max(static_cast<int>(point_value[0]), 1)
+             << (bitdepth - kBitdepth8));
+  const int32x4_t steps = vmovl_s16(vcreate_s16(0x0003000200010000));
+  const int32x4_t rounding = vdupq_n_s32(32768);
+  for (int i = 0; i < num_points - 1; ++i) {
+    const int delta_y = point_scaling[i + 1] - point_scaling[i];
+    const int delta_x = point_value[i + 1] - point_value[i];
+    // |delta| corresponds to b, for the function y = a + b*x.
+    const int delta = delta_y * ((65536 + (delta_x >> 1)) / delta_x);
+    const int delta4 = delta << 2;
+    // vmull_n_u16 will not work here because |delta| typically exceeds the
+    // range of uint16_t.
+    int32x4_t upscaled_points0 = vmlaq_n_s32(rounding, steps, delta);
+    const int32x4_t line_increment4 = vdupq_n_s32(delta4);
+    // Get the second set of 4 points by adding 4 steps to the first set.
+    int32x4_t upscaled_points1 = vaddq_s32(upscaled_points0, line_increment4);
+    // We obtain the next set of 8 points by adding 8 steps to each of the
+    // current 8 points.
+    const int32x4_t line_increment8 = vshlq_n_s32(line_increment4, 1);
+    const int16x8_t base_point = vdupq_n_s16(point_scaling[i]);
+    int x = 0;
+    // Derive and write 8 values (or 32 values, for 10bpp).
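+    // Worked example of the fixed-point slope: with delta_y == 40 and
+    // delta_x == 64, delta == 40 * ((65536 + 32) / 64) == 40960, so step x
+    // contributes (32768 + x * 40960) >> 16 above the base point, i.e.
+    // x * 40 / 64 rounded to nearest.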
+    do {
+      const int16x4_t interp_points0 = vshrn_n_s32(upscaled_points0, 16);
+      const int16x4_t interp_points1 = vshrn_n_s32(upscaled_points1, 16);
+      const int16x8_t interp_points =
+          vcombine_s16(interp_points0, interp_points1);
+      // The spec guarantees that the max value of |point_value[i]| + x is 255.
+      // Writing 8 values starting at the final table entry leaves 7 values of
+      // required padding.
+      const int16x8_t full_interp = vaddq_s16(interp_points, base_point);
+      const int x_base = (point_value[i] + x) << (bitdepth - kBitdepth8);
+      if (bitdepth == kBitdepth10) {
+        const int16x8_t next_val = vaddq_s16(
+            base_point,
+            vdupq_n_s16((vgetq_lane_s32(upscaled_points1, 3) + delta) >> 16));
+        const int16x8_t start = full_interp;
+        const int16x8_t end = vextq_s16(full_interp, next_val, 1);
+        // lut[i << 2] = start;
+        // lut[(i << 2) + 1] = start + RightShiftWithRounding(end - start, 2)
+        // lut[(i << 2) + 2] = start +
+        //                     RightShiftWithRounding(2 * (end - start), 2)
+        // lut[(i << 2) + 3] = start +
+        //                     RightShiftWithRounding(3 * (end - start), 2)
+        const int16x8_t delta = vsubq_s16(end, start);
+        const int16x8_t double_delta = vshlq_n_s16(delta, 1);
+        const int16x8_t delta2 = vrshrq_n_s16(double_delta, 2);
+        const int16x8_t delta3 =
+            vrshrq_n_s16(vaddq_s16(delta, double_delta), 2);
+        const int16x8x4_t result = {
+            start, vaddq_s16(start, vrshrq_n_s16(delta, 2)),
+            vaddq_s16(start, delta2), vaddq_s16(start, delta3)};
+        vst4q_s16(&scaling_lut[x_base], result);
+      } else {
+        vst1q_s16(&scaling_lut[x_base], full_interp);
+      }
+      upscaled_points0 = vaddq_s32(upscaled_points0, line_increment8);
+      upscaled_points1 = vaddq_s32(upscaled_points1, line_increment8);
+      x += 8;
+    } while (x < delta_x);
+  }
+  const int16_t last_point_value = point_value[num_points - 1];
+  const int x_base = last_point_value << (bitdepth - kBitdepth8);
+  Memset(&scaling_lut[x_base], point_scaling[num_points - 1],
+         scaling_lut_length - x_base);
+  if (bitdepth == kBitdepth10 && x_base > 0) {
+    const int start = scaling_lut[x_base - 4];
+    const int end = point_scaling[num_points - 1];
+    const int delta = end - start;
+    scaling_lut[x_base - 3] = start + RightShiftWithRounding(delta, 2);
+    scaling_lut[x_base - 2] = start + RightShiftWithRounding(2 * delta, 2);
+    scaling_lut[x_base - 1] = start + RightShiftWithRounding(3 * delta, 2);
+  }
+}
+
+inline int16x8_t Clip3(const int16x8_t value, const int16x8_t low,
+                       const int16x8_t high) {
+  const int16x8_t clipped_to_ceiling = vminq_s16(high, value);
+  return vmaxq_s16(low, clipped_to_ceiling);
+}
+
+template <int bitdepth, typename Pixel>
+inline int16x8_t GetScalingFactors(
+    const int16_t scaling_lut[kScalingLookupTableSize], const Pixel* source) {
+  int16_t start_vals[8];
+  static_assert(bitdepth <= kBitdepth10,
+                "NEON Film Grain is not yet implemented for 12bpp.");
+  for (int i = 0; i < 8; ++i) {
+    assert(source[i] < kScalingLookupTableSize << (bitdepth - 2));
+    start_vals[i] = scaling_lut[source[i]];
+  }
+  return vld1q_s16(start_vals);
+}
+
+template <int bitdepth>
+inline int16x8_t ScaleNoise(const int16x8_t noise, const int16x8_t scaling,
+                            const int16x8_t scaling_shift_vect) {
+  if (bitdepth == kBitdepth8) {
+    const int16x8_t upscaled_noise = vmulq_s16(noise, scaling);
+    return vrshlq_s16(upscaled_noise, scaling_shift_vect);
+  }
+  // Scaling shift is in the range [8, 11]. The doubling multiply returning
+  // high half is equivalent to a right shift by 15, so |scaling_shift_vect|
+  // should provide a left shift equal to 15 - s, where s is the original
+  // shift parameter.
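+  // Concretely, vqrdmulhq_s16(a, b) computes roughly
+  // (2 * a * b + (1 << 15)) >> 16, so multiplying by scaling << (15 - s)
+  // below evaluates to about (noise * scaling + (1 << (s - 1))) >> s, the
+  // same rounded shift as the 8bpp path, without widening to 32 bits.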
+  const int16x8_t scaling_up = vshlq_s16(scaling, scaling_shift_vect);
+  return vqrdmulhq_s16(noise, scaling_up);
+}
+
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageLuma_NEON(
+    const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_luma,
+    int scaling_shift, int width, int height, int start_height,
+    const int16_t* scaling_lut_y, const void* source_plane_y,
+    ptrdiff_t source_stride_y, void* dest_plane_y, ptrdiff_t dest_stride_y) {
+  const auto* noise_image =
+      static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+  const auto* in_y_row = static_cast<const Pixel*>(source_plane_y);
+  source_stride_y /= sizeof(Pixel);
+  auto* out_y_row = static_cast<Pixel*>(dest_plane_y);
+  dest_stride_y /= sizeof(Pixel);
+  const int16x8_t floor = vdupq_n_s16(min_value);
+  const int16x8_t ceiling = vdupq_n_s16(max_luma);
+  // In 8bpp, the maximum upscaled noise is 127*255 = 0x7E81, which is safe
+  // for 16 bit signed integers. In higher bitdepths, however, we have to
+  // expand to 32 to protect the sign bit.
+  const int16x8_t scaling_shift_vect = vdupq_n_s16(
+      (bitdepth == kBitdepth10) ? 15 - scaling_shift : -scaling_shift);
+
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      // This operation on the unsigned input is safe in 8bpp because the
+      // vector is widened before it is reinterpreted.
+      const int16x8_t orig0 = GetSignedSource8(&in_y_row[x]);
+      const int16x8_t scaling0 =
+          GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]);
+      int16x8_t noise =
+          GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x]));
+
+      noise = ScaleNoise<bitdepth>(noise, scaling0, scaling_shift_vect);
+      const int16x8_t combined0 = vaddq_s16(orig0, noise);
+      // In 8bpp, when params_.clip_to_restricted_range == false, we can
+      // replace clipping with vqmovun_s16, but it's not likely to be worth
+      // copying the function for just that case, though the gain would be
+      // very small.
+      StoreUnsigned8(&out_y_row[x],
+                     vreinterpretq_u16_s16(Clip3(combined0, floor, ceiling)));
+      x += 8;
+
+      // This operation on the unsigned input is safe in 8bpp because the
+      // vector is widened before it is reinterpreted.
+      const int16x8_t orig1 = GetSignedSource8(&in_y_row[x]);
+      const int16x8_t scaling1 = GetScalingFactors<bitdepth, Pixel>(
+          scaling_lut_y, &in_y_row[std::min(x, width)]);
+      noise = GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x]));
+
+      noise = ScaleNoise<bitdepth>(noise, scaling1, scaling_shift_vect);
+      const int16x8_t combined1 = vaddq_s16(orig1, noise);
+      // In 8bpp, when params_.clip_to_restricted_range == false, we can
+      // replace clipping with vqmovun_s16, but it's not likely to be worth
+      // copying the function for just that case, though the gain would be
+      // very small.
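+      // Overall, each of the two 8-lane blocks in this loop body is the
+      // scalar blend
+      //   out = Clip3(orig + ((noise * scaling_lut_y[orig]) >> scaling_shift),
+      //               min_value, max_luma)
+      // applied to 16 luma pixels per iteration.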
+ StoreUnsigned8(&out_y_row[x], + vreinterpretq_u16_s16(Clip3(combined1, floor, ceiling))); + x += 8; + } while (x < width); + in_y_row += source_stride_y; + out_y_row += dest_stride_y; + } while (++y < height); +} + +template +inline int16x8_t BlendChromaValsWithCfl( + const Pixel* LIBGAV1_RESTRICT average_luma_buffer, + const int16_t* LIBGAV1_RESTRICT scaling_lut, + const Pixel* LIBGAV1_RESTRICT chroma_cursor, + const GrainType* LIBGAV1_RESTRICT noise_image_cursor, + const int16x8_t scaling_shift_vect) { + const int16x8_t scaling = + GetScalingFactors(scaling_lut, average_luma_buffer); + const int16x8_t orig = GetSignedSource8(chroma_cursor); + int16x8_t noise = GetSignedSource8(noise_image_cursor); + noise = ScaleNoise(noise, scaling, scaling_shift_vect); + return vaddq_s16(orig, noise); +} + +template +LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_NEON( + const Array2D& noise_image, int min_value, int max_chroma, + int width, int height, int start_height, int subsampling_x, + int subsampling_y, int scaling_shift, + const int16_t* LIBGAV1_RESTRICT scaling_lut, + const Pixel* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y, + const Pixel* in_chroma_row, ptrdiff_t source_stride_chroma, + Pixel* out_chroma_row, ptrdiff_t dest_stride) { + const int16x8_t floor = vdupq_n_s16(min_value); + const int16x8_t ceiling = vdupq_n_s16(max_chroma); + Pixel luma_buffer[16]; + memset(luma_buffer, 0, sizeof(luma_buffer)); + // In 8bpp, the maximum upscaled noise is 127*255 = 0x7E81, which is safe + // for 16 bit signed integers. In higher bitdepths, however, we have to + // expand to 32 to protect the sign bit. + const int16x8_t scaling_shift_vect = vdupq_n_s16( + (bitdepth == kBitdepth10) ? 15 - scaling_shift : -scaling_shift); + + const int chroma_height = (height + subsampling_y) >> subsampling_y; + const int chroma_width = (width + subsampling_x) >> subsampling_x; + const int safe_chroma_width = chroma_width & ~7; + + // Writing to this buffer avoids the cost of doing 8 lane lookups in a row + // in GetScalingFactors. + Pixel average_luma_buffer[8]; + assert(start_height % 2 == 0); + start_height >>= subsampling_y; + int y = 0; + do { + int x = 0; + do { + const int luma_x = x << subsampling_x; + const uint16x8_t average_luma = + GetAverageLuma(&in_y_row[luma_x], subsampling_x); + StoreUnsigned8(average_luma_buffer, average_luma); + + const int16x8_t blended = + BlendChromaValsWithCfl( + average_luma_buffer, scaling_lut, &in_chroma_row[x], + &(noise_image[y + start_height][x]), scaling_shift_vect); + + // In 8bpp, when params_.clip_to_restricted_range == false, we can replace + // clipping with vqmovun_s16, but it's not likely to be worth copying the + // function for just that case. 
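+      // Scalar equivalent of the CfL blend stored below:
+      //   out = Clip3(chroma + ((noise * scaling_lut[average_luma]) >> shift),
+      //               min_value, max_chroma)
+      // where |average_luma| is the rounded average over the collocated
+      // (possibly subsampled) luma pixels computed above.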
+ StoreUnsigned8(&out_chroma_row[x], + vreinterpretq_u16_s16(Clip3(blended, floor, ceiling))); + x += 8; + } while (x < safe_chroma_width); + + if (x < chroma_width) { + const int luma_x = x << subsampling_x; + const int valid_range_pixels = width - luma_x; + const int valid_range_bytes = valid_range_pixels * sizeof(in_y_row[0]); + memcpy(luma_buffer, &in_y_row[luma_x], valid_range_bytes); + luma_buffer[valid_range_pixels] = in_y_row[width - 1]; + const uint16x8_t average_luma = GetAverageLumaMsan( + luma_buffer, subsampling_x, valid_range_bytes + sizeof(in_y_row[0])); + + StoreUnsigned8(average_luma_buffer, average_luma); + + const int16x8_t blended = + BlendChromaValsWithCfl( + average_luma_buffer, scaling_lut, &in_chroma_row[x], + &(noise_image[y + start_height][x]), scaling_shift_vect); + // In 8bpp, when params_.clip_to_restricted_range == false, we can replace + // clipping with vqmovun_s16, but it's not likely to be worth copying the + // function for just that case. + StoreUnsigned8(&out_chroma_row[x], + vreinterpretq_u16_s16(Clip3(blended, floor, ceiling))); + } + + in_y_row += source_stride_y << subsampling_y; + in_chroma_row += source_stride_chroma; + out_chroma_row += dest_stride; + } while (++y < chroma_height); +} + +// This function is for the case params_.chroma_scaling_from_luma == true. +// This further implies that scaling_lut_u == scaling_lut_v == scaling_lut_y. +template +void BlendNoiseWithImageChromaWithCfl_NEON( + Plane plane, const FilmGrainParams& params, + const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma, + int width, int height, int start_height, int subsampling_x, + int subsampling_y, const int16_t* LIBGAV1_RESTRICT scaling_lut, + const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y, + const void* source_plane_uv, ptrdiff_t source_stride_uv, + void* dest_plane_uv, ptrdiff_t dest_stride_uv) { + const auto* noise_image = + static_cast*>(noise_image_ptr); + const auto* in_y = static_cast(source_plane_y); + source_stride_y /= sizeof(Pixel); + + const auto* in_uv = static_cast(source_plane_uv); + source_stride_uv /= sizeof(Pixel); + auto* out_uv = static_cast(dest_plane_uv); + dest_stride_uv /= sizeof(Pixel); + // Looping over one plane at a time is faster in higher resolutions, despite + // re-computing luma. + BlendChromaPlaneWithCfl_NEON( + noise_image[plane], min_value, max_chroma, width, height, start_height, + subsampling_x, subsampling_y, params.chroma_scaling, scaling_lut, in_y, + source_stride_y, in_uv, source_stride_uv, out_uv, dest_stride_uv); +} + +} // namespace + +namespace low_bitdepth { +namespace { + +inline int16x8_t BlendChromaValsNoCfl( + const int16_t* LIBGAV1_RESTRICT scaling_lut, const int16x8_t orig, + const int8_t* LIBGAV1_RESTRICT noise_image_cursor, + const int16x8_t& average_luma, const int16x8_t& scaling_shift_vect, + const int16x8_t& offset, int luma_multiplier, int chroma_multiplier) { + uint8_t merged_buffer[8]; + const int16x8_t weighted_luma = vmulq_n_s16(average_luma, luma_multiplier); + const int16x8_t weighted_chroma = vmulq_n_s16(orig, chroma_multiplier); + // Maximum value of |combined_u| is 127*255 = 0x7E81. + const int16x8_t combined = vhaddq_s16(weighted_luma, weighted_chroma); + // Maximum value of u_offset is (255 << 5) = 0x1FE0. + // 0x7E81 + 0x1FE0 = 0x9E61, therefore another halving add is required. 
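+  // Shift accounting for |merged| below: the two halving adds contribute a
+  // net >> 2 and vqshrun_n_s16 a further >> 4, so with
+  // offset == chroma_offset << 5 the result is effectively
+  //   (average_luma * luma_multiplier + orig * chroma_multiplier +
+  //    (chroma_offset << 6)) >> 6,
+  // saturated to [0, 255] and then used to index the scaling LUT.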
+ const uint8x8_t merged = vqshrun_n_s16(vhaddq_s16(offset, combined), 4); + vst1_u8(merged_buffer, merged); + const int16x8_t scaling = + GetScalingFactors(scaling_lut, merged_buffer); + int16x8_t noise = GetSignedSource8(noise_image_cursor); + noise = ScaleNoise(noise, scaling, scaling_shift_vect); + return vaddq_s16(orig, noise); +} + +LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_NEON( + const Array2D& noise_image, int min_value, int max_chroma, + int width, int height, int start_height, int subsampling_x, + int subsampling_y, int scaling_shift, int chroma_offset, + int chroma_multiplier, int luma_multiplier, + const int16_t* LIBGAV1_RESTRICT scaling_lut, + const uint8_t* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y, + const uint8_t* in_chroma_row, ptrdiff_t source_stride_chroma, + uint8_t* out_chroma_row, ptrdiff_t dest_stride) { + const int16x8_t floor = vdupq_n_s16(min_value); + const int16x8_t ceiling = vdupq_n_s16(max_chroma); + // In 8bpp, the maximum upscaled noise is 127*255 = 0x7E81, which is safe + // for 16 bit signed integers. In higher bitdepths, however, we have to + // expand to 32 to protect the sign bit. + const int16x8_t scaling_shift_vect = vdupq_n_s16(-scaling_shift); + + const int chroma_height = (height + subsampling_y) >> subsampling_y; + const int chroma_width = (width + subsampling_x) >> subsampling_x; + const int safe_chroma_width = chroma_width & ~7; + uint8_t luma_buffer[16]; +#if LIBGAV1_MSAN + // Quiet msan warnings. + memset(luma_buffer, 0, sizeof(luma_buffer)); +#endif + const int16x8_t offset = vdupq_n_s16(chroma_offset << 5); + + start_height >>= subsampling_y; + int y = 0; + do { + int x = 0; + do { + const int luma_x = x << subsampling_x; + const int valid_range = width - luma_x; + + const int16x8_t orig_chroma = GetSignedSource8(&in_chroma_row[x]); + const int16x8_t average_luma = vreinterpretq_s16_u16( + GetAverageLumaMsan(&in_y_row[luma_x], subsampling_x, valid_range)); + const int16x8_t blended = BlendChromaValsNoCfl( + scaling_lut, orig_chroma, &(noise_image[y + start_height][x]), + average_luma, scaling_shift_vect, offset, luma_multiplier, + chroma_multiplier); + // In 8bpp, when params_.clip_to_restricted_range == false, we can + // replace clipping with vqmovun_s16, but the gain would be small. + StoreUnsigned8(&out_chroma_row[x], + vreinterpretq_u16_s16(Clip3(blended, floor, ceiling))); + + x += 8; + } while (x < safe_chroma_width); + + if (x < chroma_width) { + // Begin right edge iteration. Same as the normal iterations, but the + // |average_luma| computation requires a duplicated luma value at the + // end. 
+ const int luma_x = x << subsampling_x; + const int valid_range_pixels = width - luma_x; + const int valid_range_bytes = valid_range_pixels * sizeof(in_y_row[0]); + memcpy(luma_buffer, &in_y_row[luma_x], valid_range_bytes); + luma_buffer[valid_range_pixels] = in_y_row[width - 1]; + const int valid_range_chroma_bytes = + (chroma_width - x) * sizeof(in_chroma_row[0]); + + const int16x8_t orig_chroma = + GetSignedSource8Msan(&in_chroma_row[x], valid_range_chroma_bytes); + const int16x8_t average_luma = vreinterpretq_s16_u16(GetAverageLumaMsan( + luma_buffer, subsampling_x, valid_range_bytes + sizeof(in_y_row[0]))); + const int16x8_t blended = BlendChromaValsNoCfl( + scaling_lut, orig_chroma, &(noise_image[y + start_height][x]), + average_luma, scaling_shift_vect, offset, luma_multiplier, + chroma_multiplier); + StoreUnsigned8(&out_chroma_row[x], + vreinterpretq_u16_s16(Clip3(blended, floor, ceiling))); + // End of right edge iteration. + } + + in_y_row += source_stride_y << subsampling_y; + in_chroma_row += source_stride_chroma; + out_chroma_row += dest_stride; + } while (++y < chroma_height); +} + +// This function is for the case params_.chroma_scaling_from_luma == false. +void BlendNoiseWithImageChroma8bpp_NEON( + Plane plane, const FilmGrainParams& params, + const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma, + int width, int height, int start_height, int subsampling_x, + int subsampling_y, const int16_t* LIBGAV1_RESTRICT scaling_lut, + const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y, + const void* source_plane_uv, ptrdiff_t source_stride_uv, + void* dest_plane_uv, ptrdiff_t dest_stride_uv) { + assert(plane == kPlaneU || plane == kPlaneV); + const auto* noise_image = + static_cast*>(noise_image_ptr); + const auto* in_y = static_cast(source_plane_y); + const auto* in_uv = static_cast(source_plane_uv); + auto* out_uv = static_cast(dest_plane_uv); + + const int offset = (plane == kPlaneU) ? params.u_offset : params.v_offset; + const int luma_multiplier = + (plane == kPlaneU) ? params.u_luma_multiplier : params.v_luma_multiplier; + const int multiplier = + (plane == kPlaneU) ? params.u_multiplier : params.v_multiplier; + BlendChromaPlane8bpp_NEON(noise_image[plane], min_value, max_chroma, width, + height, start_height, subsampling_x, subsampling_y, + params.chroma_scaling, offset, multiplier, + luma_multiplier, scaling_lut, in_y, source_stride_y, + in_uv, source_stride_uv, out_uv, dest_stride_uv); +} + +inline void WriteOverlapLine8bpp_NEON( + const int8_t* LIBGAV1_RESTRICT noise_stripe_row, + const int8_t* LIBGAV1_RESTRICT noise_stripe_row_prev, int plane_width, + const int8x8_t grain_coeff, const int8x8_t old_coeff, + int8_t* LIBGAV1_RESTRICT noise_image_row) { + int x = 0; + do { + // Note that these reads may exceed noise_stripe_row's width by up to 7 + // bytes. + const int8x8_t source_grain = vld1_s8(noise_stripe_row + x); + const int8x8_t source_old = vld1_s8(noise_stripe_row_prev + x); + const int16x8_t weighted_grain = vmull_s8(grain_coeff, source_grain); + const int16x8_t grain = vmlal_s8(weighted_grain, old_coeff, source_old); + // Note that this write may exceed noise_image_row's width by up to 7 bytes. 
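+    // The store below applies the stripe overlap blend
+    //   noise_image[y][x] = (grain_coeff * grain + old_coeff * old + 16) >> 5
+    // with saturation to int8; the caller supplies the coefficient pairs
+    // (17, 27), (27, 17) and (22, 23) depending on the overlap row.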
+ vst1_s8(noise_image_row + x, vqrshrn_n_s16(grain, 5)); + x += 8; + } while (x < plane_width); +} + +void ConstructNoiseImageOverlap8bpp_NEON( + const void* LIBGAV1_RESTRICT noise_stripes_buffer, int width, int height, + int subsampling_x, int subsampling_y, + void* LIBGAV1_RESTRICT noise_image_buffer) { + const auto* noise_stripes = + static_cast*>(noise_stripes_buffer); + auto* noise_image = static_cast*>(noise_image_buffer); + const int plane_width = (width + subsampling_x) >> subsampling_x; + const int plane_height = (height + subsampling_y) >> subsampling_y; + const int stripe_height = 32 >> subsampling_y; + const int stripe_mask = stripe_height - 1; + int y = stripe_height; + int luma_num = 1; + if (subsampling_y == 0) { + const int8x8_t first_row_grain_coeff = vdup_n_s8(17); + const int8x8_t first_row_old_coeff = vdup_n_s8(27); + const int8x8_t second_row_grain_coeff = first_row_old_coeff; + const int8x8_t second_row_old_coeff = first_row_grain_coeff; + for (; y < (plane_height & ~stripe_mask); ++luma_num, y += stripe_height) { + const int8_t* noise_stripe = (*noise_stripes)[luma_num]; + const int8_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1]; + WriteOverlapLine8bpp_NEON( + noise_stripe, &noise_stripe_prev[32 * plane_width], plane_width, + first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]); + + WriteOverlapLine8bpp_NEON(&noise_stripe[plane_width], + &noise_stripe_prev[(32 + 1) * plane_width], + plane_width, second_row_grain_coeff, + second_row_old_coeff, (*noise_image)[y + 1]); + } + // Either one partial stripe remains (remaining_height > 0), + // OR image is less than one stripe high (remaining_height < 0), + // OR all stripes are completed (remaining_height == 0). + const int remaining_height = plane_height - y; + if (remaining_height <= 0) { + return; + } + const int8_t* noise_stripe = (*noise_stripes)[luma_num]; + const int8_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1]; + WriteOverlapLine8bpp_NEON( + noise_stripe, &noise_stripe_prev[32 * plane_width], plane_width, + first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]); + + if (remaining_height > 1) { + WriteOverlapLine8bpp_NEON(&noise_stripe[plane_width], + &noise_stripe_prev[(32 + 1) * plane_width], + plane_width, second_row_grain_coeff, + second_row_old_coeff, (*noise_image)[y + 1]); + } + } else { // subsampling_y == 1 + const int8x8_t first_row_grain_coeff = vdup_n_s8(22); + const int8x8_t first_row_old_coeff = vdup_n_s8(23); + for (; y < plane_height; ++luma_num, y += stripe_height) { + const int8_t* noise_stripe = (*noise_stripes)[luma_num]; + const int8_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1]; + WriteOverlapLine8bpp_NEON( + noise_stripe, &noise_stripe_prev[16 * plane_width], plane_width, + first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]); + } + } +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + + // LumaAutoRegressionFunc + dsp->film_grain.luma_auto_regression[0] = + ApplyAutoRegressiveFilterToLumaGrain_NEON; + dsp->film_grain.luma_auto_regression[1] = + ApplyAutoRegressiveFilterToLumaGrain_NEON; + dsp->film_grain.luma_auto_regression[2] = + ApplyAutoRegressiveFilterToLumaGrain_NEON; + + // ChromaAutoRegressionFunc[use_luma][auto_regression_coeff_lag] + // Chroma autoregression should never be called when lag is 0 and use_luma + // is false. 
+  dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+  dsp->film_grain.chroma_auto_regression[0][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 1,
+                                                   false>;
+  dsp->film_grain.chroma_auto_regression[0][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 2,
+                                                   false>;
+  dsp->film_grain.chroma_auto_regression[0][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 3,
+                                                   false>;
+  dsp->film_grain.chroma_auto_regression[1][0] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 0,
+                                                   true>;
+  dsp->film_grain.chroma_auto_regression[1][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 1,
+                                                   true>;
+  dsp->film_grain.chroma_auto_regression[1][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 2,
+                                                   true>;
+  dsp->film_grain.chroma_auto_regression[1][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 3,
+                                                   true>;
+
+  dsp->film_grain.construct_noise_image_overlap =
+      ConstructNoiseImageOverlap8bpp_NEON;
+
+  dsp->film_grain.initialize_scaling_lut =
+      InitializeScalingLookupTable_NEON<kBitdepth8>;
+
+  dsp->film_grain.blend_noise_luma =
+      BlendNoiseWithImageLuma_NEON<kBitdepth8, int8_t, uint8_t>;
+  dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma8bpp_NEON;
+  dsp->film_grain.blend_noise_chroma[1] =
+      BlendNoiseWithImageChromaWithCfl_NEON<kBitdepth8, int8_t, uint8_t>;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+inline void WriteOverlapLine10bpp_NEON(
+    const int16_t* LIBGAV1_RESTRICT noise_stripe_row,
+    const int16_t* LIBGAV1_RESTRICT noise_stripe_row_prev, int plane_width,
+    const int16x8_t grain_coeff, const int16x8_t old_coeff,
+    int16_t* LIBGAV1_RESTRICT noise_image_row) {
+  int x = 0;
+  do {
+    // Note that these reads may exceed noise_stripe_row's width by up to 7
+    // values.
+    const int16x8_t source_grain = vld1q_s16(noise_stripe_row + x);
+    const int16x8_t source_old = vld1q_s16(noise_stripe_row_prev + x);
+    // Maximum product is 511 * 27 = 0x35E5.
+    const int16x8_t weighted_grain = vmulq_s16(grain_coeff, source_grain);
+    // Maximum sum is 511 * (22 + 23) = 0x59D3.
+    const int16x8_t grain_sum =
+        vmlaq_s16(weighted_grain, old_coeff, source_old);
+    // Note that this write may exceed noise_image_row's width by up to 7
+    // values.
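+    // Unlike the 8bpp path, a saturating narrow (vqrshrn) is not enough
+    // here: 10bpp grain does not fit in 8 bits, so the blended value is
+    // clipped explicitly to the 10bpp grain range below.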
+ const int16x8_t grain = Clip3S16(vrshrq_n_s16(grain_sum, 5), + vdupq_n_s16(GetGrainMin()), + vdupq_n_s16(GetGrainMax())); + vst1q_s16(noise_image_row + x, grain); + x += 8; + } while (x < plane_width); +} + +void ConstructNoiseImageOverlap10bpp_NEON( + const void* LIBGAV1_RESTRICT noise_stripes_buffer, int width, int height, + int subsampling_x, int subsampling_y, + void* LIBGAV1_RESTRICT noise_image_buffer) { + const auto* noise_stripes = + static_cast*>(noise_stripes_buffer); + auto* noise_image = static_cast*>(noise_image_buffer); + const int plane_width = (width + subsampling_x) >> subsampling_x; + const int plane_height = (height + subsampling_y) >> subsampling_y; + const int stripe_height = 32 >> subsampling_y; + const int stripe_mask = stripe_height - 1; + int y = stripe_height; + int luma_num = 1; + if (subsampling_y == 0) { + const int16x8_t first_row_grain_coeff = vdupq_n_s16(17); + const int16x8_t first_row_old_coeff = vdupq_n_s16(27); + const int16x8_t second_row_grain_coeff = first_row_old_coeff; + const int16x8_t second_row_old_coeff = first_row_grain_coeff; + for (; y < (plane_height & ~stripe_mask); ++luma_num, y += stripe_height) { + const int16_t* noise_stripe = (*noise_stripes)[luma_num]; + const int16_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1]; + WriteOverlapLine10bpp_NEON( + noise_stripe, &noise_stripe_prev[32 * plane_width], plane_width, + first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]); + + WriteOverlapLine10bpp_NEON(&noise_stripe[plane_width], + &noise_stripe_prev[(32 + 1) * plane_width], + plane_width, second_row_grain_coeff, + second_row_old_coeff, (*noise_image)[y + 1]); + } + // Either one partial stripe remains (remaining_height > 0), + // OR image is less than one stripe high (remaining_height < 0), + // OR all stripes are completed (remaining_height == 0). 
+ const int remaining_height = plane_height - y; + if (remaining_height <= 0) { + return; + } + const int16_t* noise_stripe = (*noise_stripes)[luma_num]; + const int16_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1]; + WriteOverlapLine10bpp_NEON( + noise_stripe, &noise_stripe_prev[32 * plane_width], plane_width, + first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]); + + if (remaining_height > 1) { + WriteOverlapLine10bpp_NEON(&noise_stripe[plane_width], + &noise_stripe_prev[(32 + 1) * plane_width], + plane_width, second_row_grain_coeff, + second_row_old_coeff, (*noise_image)[y + 1]); + } + } else { // subsampling_y == 1 + const int16x8_t first_row_grain_coeff = vdupq_n_s16(22); + const int16x8_t first_row_old_coeff = vdupq_n_s16(23); + for (; y < plane_height; ++luma_num, y += stripe_height) { + const int16_t* noise_stripe = (*noise_stripes)[luma_num]; + const int16_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1]; + WriteOverlapLine10bpp_NEON( + noise_stripe, &noise_stripe_prev[16 * plane_width], plane_width, + first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]); + } + } +} + +inline int16x8_t BlendChromaValsNoCfl( + const int16_t* LIBGAV1_RESTRICT scaling_lut, const int16x8_t orig, + const int16_t* LIBGAV1_RESTRICT noise_image_cursor, + const int16x8_t& average_luma, const int16x8_t& scaling_shift_vect, + const int32x4_t& offset, int luma_multiplier, int chroma_multiplier) { + uint16_t merged_buffer[8]; + const int32x4_t weighted_luma_low = + vmull_n_s16(vget_low_s16(average_luma), luma_multiplier); + const int32x4_t weighted_luma_high = + vmull_n_s16(vget_high_s16(average_luma), luma_multiplier); + // Maximum value of combined is 127 * 1023 = 0x1FB81. + const int32x4_t combined_low = + vmlal_n_s16(weighted_luma_low, vget_low_s16(orig), chroma_multiplier); + const int32x4_t combined_high = + vmlal_n_s16(weighted_luma_high, vget_high_s16(orig), chroma_multiplier); + // Maximum value of offset is (255 << 8) = 0xFF00. Offset may be negative. 
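+  // The single vqshrun_n_s32(..., 6) below plays the role of the 8bpp path's
+  // two halving adds plus >> 4: the caller passes
+  // offset == chroma_offset << (6 + 2), so the result is effectively
+  //   (average_luma * luma_multiplier + orig * chroma_multiplier +
+  //    (chroma_offset << 8)) >> 6,
+  // saturated to non-negative values and capped at the 10bpp maximum of 1023.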
+ const uint16x4_t merged_low = + vqshrun_n_s32(vaddq_s32(offset, combined_low), 6); + const uint16x4_t merged_high = + vqshrun_n_s32(vaddq_s32(offset, combined_high), 6); + const uint16x8_t max_pixel = vdupq_n_u16((1 << kBitdepth10) - 1); + vst1q_u16(merged_buffer, + vminq_u16(vcombine_u16(merged_low, merged_high), max_pixel)); + const int16x8_t scaling = + GetScalingFactors(scaling_lut, merged_buffer); + const int16x8_t noise = GetSignedSource8(noise_image_cursor); + const int16x8_t scaled_noise = + ScaleNoise(noise, scaling, scaling_shift_vect); + return vaddq_s16(orig, scaled_noise); +} + +LIBGAV1_ALWAYS_INLINE void BlendChromaPlane10bpp_NEON( + const Array2D& noise_image, int min_value, int max_chroma, + int width, int height, int start_height, int subsampling_x, + int subsampling_y, int scaling_shift, int chroma_offset, + int chroma_multiplier, int luma_multiplier, + const int16_t* LIBGAV1_RESTRICT scaling_lut, + const uint16_t* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y, + const uint16_t* in_chroma_row, ptrdiff_t source_stride_chroma, + uint16_t* out_chroma_row, ptrdiff_t dest_stride) { + const int16x8_t floor = vdupq_n_s16(min_value); + const int16x8_t ceiling = vdupq_n_s16(max_chroma); + const int16x8_t scaling_shift_vect = vdupq_n_s16(15 - scaling_shift); + + const int chroma_height = (height + subsampling_y) >> subsampling_y; + const int chroma_width = (width + subsampling_x) >> subsampling_x; + const int safe_chroma_width = chroma_width & ~7; + uint16_t luma_buffer[16]; +#if LIBGAV1_MSAN + // TODO(b/194217060): This can be removed if the range calculations below are + // fixed. + memset(luma_buffer, 0, sizeof(luma_buffer)); +#endif + // Offset is added before downshifting in order to take advantage of + // saturation, so it has to be upscaled by 6 bits, plus 2 bits for 10bpp. + const int32x4_t offset = vdupq_n_s32(chroma_offset << (6 + 2)); + + start_height >>= subsampling_y; + int y = 0; + do { + int x = 0; + do { + const int luma_x = x << subsampling_x; + const int16x8_t average_luma = vreinterpretq_s16_u16( + GetAverageLuma(&in_y_row[luma_x], subsampling_x)); + const int16x8_t orig_chroma = GetSignedSource8(&in_chroma_row[x]); + const int16x8_t blended = BlendChromaValsNoCfl( + scaling_lut, orig_chroma, &(noise_image[y + start_height][x]), + average_luma, scaling_shift_vect, offset, luma_multiplier, + chroma_multiplier); + StoreUnsigned8(&out_chroma_row[x], + vreinterpretq_u16_s16(Clip3(blended, floor, ceiling))); + + x += 8; + } while (x < safe_chroma_width); + + if (x < chroma_width) { + // Begin right edge iteration. Same as the normal iterations, but the + // |average_luma| computation requires a duplicated luma value at the + // end. 
+ const int luma_x = x << subsampling_x; + const int valid_range_pixels = width - luma_x; + const int valid_range_bytes = valid_range_pixels * sizeof(in_y_row[0]); + memcpy(luma_buffer, &in_y_row[luma_x], valid_range_bytes); + luma_buffer[valid_range_pixels] = in_y_row[width - 1]; + const int valid_range_chroma_bytes = + (chroma_width - x) * sizeof(in_chroma_row[0]); + const int16x8_t orig_chroma = + GetSignedSource8Msan(&in_chroma_row[x], valid_range_chroma_bytes); + + const int16x8_t average_luma = vreinterpretq_s16_u16(GetAverageLumaMsan( + luma_buffer, subsampling_x, valid_range_bytes + sizeof(in_y_row[0]))); + const int16x8_t blended = BlendChromaValsNoCfl( + scaling_lut, orig_chroma, &(noise_image[y + start_height][x]), + average_luma, scaling_shift_vect, offset, luma_multiplier, + chroma_multiplier); + StoreUnsigned8(&out_chroma_row[x], + vreinterpretq_u16_s16(Clip3(blended, floor, ceiling))); + // End of right edge iteration. + } + + in_y_row = AddByteStride(in_y_row, source_stride_y << subsampling_y); + in_chroma_row = AddByteStride(in_chroma_row, source_stride_chroma); + out_chroma_row = AddByteStride(out_chroma_row, dest_stride); + } while (++y < chroma_height); +} + +// This function is for the case params_.chroma_scaling_from_luma == false. +void BlendNoiseWithImageChroma10bpp_NEON( + Plane plane, const FilmGrainParams& params, + const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma, + int width, int height, int start_height, int subsampling_x, + int subsampling_y, const int16_t* LIBGAV1_RESTRICT scaling_lut, + const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y, + const void* source_plane_uv, ptrdiff_t source_stride_uv, + void* dest_plane_uv, ptrdiff_t dest_stride_uv) { + assert(plane == kPlaneU || plane == kPlaneV); + const auto* noise_image = + static_cast*>(noise_image_ptr); + const auto* in_y = static_cast(source_plane_y); + const auto* in_uv = static_cast(source_plane_uv); + auto* out_uv = static_cast(dest_plane_uv); + + const int offset = (plane == kPlaneU) ? params.u_offset : params.v_offset; + const int luma_multiplier = + (plane == kPlaneU) ? params.u_luma_multiplier : params.v_luma_multiplier; + const int multiplier = + (plane == kPlaneU) ? params.u_multiplier : params.v_multiplier; + BlendChromaPlane10bpp_NEON( + noise_image[plane], min_value, max_chroma, width, height, start_height, + subsampling_x, subsampling_y, params.chroma_scaling, offset, multiplier, + luma_multiplier, scaling_lut, in_y, source_stride_y, in_uv, + source_stride_uv, out_uv, dest_stride_uv); +} + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + + // LumaAutoRegressionFunc + dsp->film_grain.luma_auto_regression[0] = + ApplyAutoRegressiveFilterToLumaGrain_NEON; + dsp->film_grain.luma_auto_regression[1] = + ApplyAutoRegressiveFilterToLumaGrain_NEON; + dsp->film_grain.luma_auto_regression[2] = + ApplyAutoRegressiveFilterToLumaGrain_NEON; + + // ChromaAutoRegressionFunc[use_luma][auto_regression_coeff_lag][subsampling] + // Chroma autoregression should never be called when lag is 0 and use_luma + // is false. 
+  dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+  dsp->film_grain.chroma_auto_regression[0][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 1,
+                                                   false>;
+  dsp->film_grain.chroma_auto_regression[0][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 2,
+                                                   false>;
+  dsp->film_grain.chroma_auto_regression[0][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 3,
+                                                   false>;
+  dsp->film_grain.chroma_auto_regression[1][0] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 0,
+                                                   true>;
+  dsp->film_grain.chroma_auto_regression[1][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 1,
+                                                   true>;
+  dsp->film_grain.chroma_auto_regression[1][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 2,
+                                                   true>;
+  dsp->film_grain.chroma_auto_regression[1][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 3,
+                                                   true>;
+
+  dsp->film_grain.construct_noise_image_overlap =
+      ConstructNoiseImageOverlap10bpp_NEON;
+
+  dsp->film_grain.initialize_scaling_lut =
+      InitializeScalingLookupTable_NEON<kBitdepth10>;
+
+  // TODO(b/194442742): reenable this function after segfault under armv7 ASan
+  // is fixed.
+  // dsp->film_grain.blend_noise_luma =
+  //     BlendNoiseWithImageLuma_NEON<kBitdepth10, int16_t, uint16_t>;
+  dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma10bpp_NEON;
+  dsp->film_grain.blend_noise_chroma[1] =
+      BlendNoiseWithImageChromaWithCfl_NEON<kBitdepth10, int16_t, uint16_t>;
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+}  // namespace film_grain
+
+void FilmGrainInit_NEON() {
+  film_grain::low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  film_grain::high_bitdepth::Init10bpp();
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else  // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void FilmGrainInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/film_grain_neon.h b/src/dsp/arm/film_grain_neon.h
new file mode 100644
index 0000000..3ba2eef
--- /dev/null
+++ b/src/dsp/arm/film_grain_neon.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_FILM_GRAIN_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_FILM_GRAIN_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initialize members of Dsp::film_grain. This function is not thread-safe.
+void FilmGrainInit_NEON(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_ENABLE_NEON +#define LIBGAV1_Dsp8bpp_FilmGrainAutoregressionLuma LIBGAV1_DSP_NEON +#define LIBGAV1_Dsp10bpp_FilmGrainAutoregressionLuma LIBGAV1_DSP_NEON +#define LIBGAV1_Dsp8bpp_FilmGrainAutoregressionChroma LIBGAV1_DSP_NEON +#define LIBGAV1_Dsp10bpp_FilmGrainAutoregressionChroma LIBGAV1_DSP_NEON +#define LIBGAV1_Dsp8bpp_FilmGrainConstructNoiseImageOverlap LIBGAV1_DSP_NEON +#define LIBGAV1_Dsp10bpp_FilmGrainConstructNoiseImageOverlap LIBGAV1_DSP_NEON +#define LIBGAV1_Dsp8bpp_FilmGrainInitializeScalingLutFunc LIBGAV1_DSP_NEON +#define LIBGAV1_Dsp10bpp_FilmGrainInitializeScalingLutFunc LIBGAV1_DSP_NEON +#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseLuma LIBGAV1_DSP_NEON +// TODO(b/194442742): reenable this function after segfault under armv7 ASan is +// fixed. +// #define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseLuma LIBGAV1_DSP_NEON +#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChroma LIBGAV1_DSP_NEON +#define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseChroma LIBGAV1_DSP_NEON +#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChromaWithCfl LIBGAV1_DSP_NEON +#define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseChromaWithCfl LIBGAV1_DSP_NEON +#endif // LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_SRC_DSP_ARM_FILM_GRAIN_NEON_H_ diff --git a/src/dsp/arm/intra_edge_neon.cc b/src/dsp/arm/intra_edge_neon.cc new file mode 100644 index 0000000..9b20e29 --- /dev/null +++ b/src/dsp/arm/intra_edge_neon.cc @@ -0,0 +1,523 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/intra_edge.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_ENABLE_NEON + +#include + +#include +#include + +#include "src/dsp/arm/common_neon.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" + +namespace libgav1 { +namespace dsp { +namespace { + +// Simplified version of intra_edge.cc:kKernels[][]. Only |strength| 1 and 2 are +// required. +constexpr int kKernelsNEON[3][2] = {{4, 8}, {5, 6}}; + +} // namespace + +namespace low_bitdepth { +namespace { + +void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) { + assert(strength == 1 || strength == 2 || strength == 3); + const int kernel_index = strength - 1; + auto* const dst_buffer = static_cast(buffer); + + // The first element is not written out (but it is input) so the number of + // elements written is |size| - 1. + if (size == 1) return; + + const uint8x16_t v_index = vcombine_u8(vcreate_u8(0x0706050403020100), + vcreate_u8(0x0f0e0d0c0b0a0908)); + // |strength| 1 and 2 use a 3 tap filter. + if (strength < 3) { + // The last value requires extending the buffer (duplicating + // |dst_buffer[size - 1]). Calculate it here to avoid extra processing in + // neon. 
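+    // kKernelsNEON holds the two distinct taps of each symmetric 3-tap
+    // kernel: strength 1 is (4, 8, 4) / 16 and strength 2 is (5, 6, 5) / 16,
+    // so each output is (k0 * left + k1 * center + k0 * right + 8) >> 4.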
+ const uint8_t last_val = RightShiftWithRounding( + kKernelsNEON[kernel_index][0] * dst_buffer[size - 2] + + kKernelsNEON[kernel_index][1] * dst_buffer[size - 1] + + kKernelsNEON[kernel_index][0] * dst_buffer[size - 1], + 4); + + const uint8x8_t krn1 = vdup_n_u8(kKernelsNEON[kernel_index][1]); + + // The first value we need gets overwritten by the output from the + // previous iteration. + uint8x16_t src_0 = vld1q_u8(dst_buffer); + int i = 1; + + // Process blocks until there are less than 16 values remaining. + for (; i < size - 15; i += 16) { + // Loading these at the end of the block with |src_0| will read past the + // end of |top_row_data[160]|, the source of |buffer|. + const uint8x16_t src_1 = vld1q_u8(dst_buffer + i); + const uint8x16_t src_2 = vld1q_u8(dst_buffer + i + 1); + uint16x8_t sum_lo = vaddl_u8(vget_low_u8(src_0), vget_low_u8(src_2)); + sum_lo = vmulq_n_u16(sum_lo, kKernelsNEON[kernel_index][0]); + sum_lo = vmlal_u8(sum_lo, vget_low_u8(src_1), krn1); + uint16x8_t sum_hi = vaddl_u8(vget_high_u8(src_0), vget_high_u8(src_2)); + sum_hi = vmulq_n_u16(sum_hi, kKernelsNEON[kernel_index][0]); + sum_hi = vmlal_u8(sum_hi, vget_high_u8(src_1), krn1); + + const uint8x16_t result = + vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4)); + + // Load the next row before overwriting. This loads an extra 15 values + // past |size| on the trailing iteration. + src_0 = vld1q_u8(dst_buffer + i + 15); + + vst1q_u8(dst_buffer + i, result); + } + + // The last output value |last_val| was already calculated so if + // |remainder| == 1 then we don't have to do anything. + const int remainder = (size - 1) & 0xf; + if (remainder > 1) { + const uint8x16_t src_1 = vld1q_u8(dst_buffer + i); + const uint8x16_t src_2 = vld1q_u8(dst_buffer + i + 1); + + uint16x8_t sum_lo = vaddl_u8(vget_low_u8(src_0), vget_low_u8(src_2)); + sum_lo = vmulq_n_u16(sum_lo, kKernelsNEON[kernel_index][0]); + sum_lo = vmlal_u8(sum_lo, vget_low_u8(src_1), krn1); + uint16x8_t sum_hi = vaddl_u8(vget_high_u8(src_0), vget_high_u8(src_2)); + sum_hi = vmulq_n_u16(sum_hi, kKernelsNEON[kernel_index][0]); + sum_hi = vmlal_u8(sum_hi, vget_high_u8(src_1), krn1); + + const uint8x16_t result = + vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4)); + const uint8x16_t v_remainder = vdupq_n_u8(remainder); + // Create over write mask. + const uint8x16_t mask = vcleq_u8(v_remainder, v_index); + const uint8x16_t dst_remainder = vbslq_u8(mask, src_1, result); + vst1q_u8(dst_buffer + i, dst_remainder); + } + + dst_buffer[size - 1] = last_val; + return; + } + + assert(strength == 3); + // 5 tap filter. The first element requires duplicating |buffer[0]| and the + // last two elements require duplicating |buffer[size - 1]|. + uint8_t special_vals[3]; + special_vals[0] = RightShiftWithRounding( + (dst_buffer[0] << 1) + (dst_buffer[0] << 2) + (dst_buffer[1] << 2) + + (dst_buffer[2] << 2) + (dst_buffer[3] << 1), + 4); + // Clamp index for very small |size| values. 
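+  // The strength 3 path applies the symmetric 5-tap kernel
+  // (2, 4, 4, 4, 2) / 16; the << 1 and << 2 terms in these special-case
+  // computations are the 2x and 4x taps written out explicitly.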
+ const int first_index_min = std::max(size - 4, 0); + const int second_index_min = std::max(size - 3, 0); + const int third_index_min = std::max(size - 2, 0); + special_vals[1] = RightShiftWithRounding( + (dst_buffer[first_index_min] << 1) + (dst_buffer[second_index_min] << 2) + + (dst_buffer[third_index_min] << 2) + (dst_buffer[size - 1] << 2) + + (dst_buffer[size - 1] << 1), + 4); + special_vals[2] = RightShiftWithRounding( + (dst_buffer[second_index_min] << 1) + (dst_buffer[third_index_min] << 2) + + // x << 2 + x << 2 == x << 3 + (dst_buffer[size - 1] << 3) + (dst_buffer[size - 1] << 1), + 4); + + // The first two values we need get overwritten by the output from the + // previous iteration. + uint8x16_t src_0 = vld1q_u8(dst_buffer - 1); + uint8x16_t src_1 = vld1q_u8(dst_buffer); + int i = 1; + + for (; i < size - 15; i += 16) { + // Loading these at the end of the block with |src_[01]| will read past + // the end of |top_row_data[160]|, the source of |buffer|. + const uint8x16_t src_2 = vld1q_u8(dst_buffer + i); + const uint8x16_t src_3 = vld1q_u8(dst_buffer + i + 1); + const uint8x16_t src_4 = vld1q_u8(dst_buffer + i + 2); + + uint16x8_t sum_lo = + vshlq_n_u16(vaddl_u8(vget_low_u8(src_0), vget_low_u8(src_4)), 1); + const uint16x8_t sum_123_lo = vaddw_u8( + vaddl_u8(vget_low_u8(src_1), vget_low_u8(src_2)), vget_low_u8(src_3)); + sum_lo = vaddq_u16(sum_lo, vshlq_n_u16(sum_123_lo, 2)); + + uint16x8_t sum_hi = + vshlq_n_u16(vaddl_u8(vget_high_u8(src_0), vget_high_u8(src_4)), 1); + const uint16x8_t sum_123_hi = + vaddw_u8(vaddl_u8(vget_high_u8(src_1), vget_high_u8(src_2)), + vget_high_u8(src_3)); + sum_hi = vaddq_u16(sum_hi, vshlq_n_u16(sum_123_hi, 2)); + + const uint8x16_t result = + vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4)); + + src_0 = vld1q_u8(dst_buffer + i + 14); + src_1 = vld1q_u8(dst_buffer + i + 15); + + vst1q_u8(dst_buffer + i, result); + } + + const int remainder = (size - 1) & 0xf; + // Like the 3 tap but if there are two remaining values we have already + // calculated them. + if (remainder > 2) { + const uint8x16_t src_2 = vld1q_u8(dst_buffer + i); + const uint8x16_t src_3 = vld1q_u8(dst_buffer + i + 1); + const uint8x16_t src_4 = vld1q_u8(dst_buffer + i + 2); + + uint16x8_t sum_lo = + vshlq_n_u16(vaddl_u8(vget_low_u8(src_0), vget_low_u8(src_4)), 1); + const uint16x8_t sum_123_lo = vaddw_u8( + vaddl_u8(vget_low_u8(src_1), vget_low_u8(src_2)), vget_low_u8(src_3)); + sum_lo = vaddq_u16(sum_lo, vshlq_n_u16(sum_123_lo, 2)); + + uint16x8_t sum_hi = + vshlq_n_u16(vaddl_u8(vget_high_u8(src_0), vget_high_u8(src_4)), 1); + const uint16x8_t sum_123_hi = + vaddw_u8(vaddl_u8(vget_high_u8(src_1), vget_high_u8(src_2)), + vget_high_u8(src_3)); + sum_hi = vaddq_u16(sum_hi, vshlq_n_u16(sum_123_hi, 2)); + + const uint8x16_t result = + vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4)); + const uint8x16_t v_remainder = vdupq_n_u8(remainder); + // Create over write mask. + const uint8x16_t mask = vcleq_u8(v_remainder, v_index); + const uint8x16_t dst_remainder = vbslq_u8(mask, src_2, result); + vst1q_u8(dst_buffer + i, dst_remainder); + } + + dst_buffer[1] = special_vals[0]; + // Avoid overwriting |dst_buffer[0]|. 
+ if (size > 2) dst_buffer[size - 2] = special_vals[1]; + dst_buffer[size - 1] = special_vals[2]; +} + +// (-|src0| + |src1| * 9 + |src2| * 9 - |src3|) >> 4 +uint8x8_t Upsample(const uint8x8_t src0, const uint8x8_t src1, + const uint8x8_t src2, const uint8x8_t src3) { + const uint16x8_t middle = vmulq_n_u16(vaddl_u8(src1, src2), 9); + const uint16x8_t ends = vaddl_u8(src0, src3); + const int16x8_t sum = + vsubq_s16(vreinterpretq_s16_u16(middle), vreinterpretq_s16_u16(ends)); + return vqrshrun_n_s16(sum, 4); +} + +void IntraEdgeUpsampler_NEON(void* buffer, const int size) { + assert(size % 4 == 0 && size <= 16); + auto* const pixel_buffer = static_cast(buffer); + // This is OK because we don't read this value for |size| 4 or 8 but if we + // write |pixel_buffer[size]| and then vld() it, that seems to introduce + // some latency. + pixel_buffer[-2] = pixel_buffer[-1]; + if (size == 4) { + // This uses one load and two vtbl() which is better than 4x Load{Lo,Hi}4(). + const uint8x8_t src = vld1_u8(pixel_buffer - 1); + // The outside values are negated so put those in the same vector. + const uint8x8_t src03 = vtbl1_u8(src, vcreate_u8(0x0404030202010000)); + // Reverse |src1| and |src2| so we can use |src2| for the interleave at the + // end. + const uint8x8_t src21 = vtbl1_u8(src, vcreate_u8(0x0302010004030201)); + + const uint16x8_t middle = vmull_u8(src21, vdup_n_u8(9)); + const int16x8_t half_sum = vsubq_s16( + vreinterpretq_s16_u16(middle), vreinterpretq_s16_u16(vmovl_u8(src03))); + const int16x4_t sum = + vadd_s16(vget_low_s16(half_sum), vget_high_s16(half_sum)); + const uint8x8_t result = vqrshrun_n_s16(vcombine_s16(sum, sum), 4); + + vst1_u8(pixel_buffer - 1, InterleaveLow8(result, src21)); + return; + } + if (size == 8) { + // Likewise, one load + multiple vtbls seems preferred to multiple loads. + const uint8x16_t src = vld1q_u8(pixel_buffer - 1); + const uint8x8_t src0 = VQTbl1U8(src, vcreate_u8(0x0605040302010000)); + const uint8x8_t src1 = vget_low_u8(src); + const uint8x8_t src2 = VQTbl1U8(src, vcreate_u8(0x0807060504030201)); + const uint8x8_t src3 = VQTbl1U8(src, vcreate_u8(0x0808070605040302)); + + const uint8x8x2_t output = {Upsample(src0, src1, src2, src3), src2}; + vst2_u8(pixel_buffer - 1, output); + return; + } + assert(size == 12 || size == 16); + // Extend the input borders to avoid branching later. 
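+  // Upsample() computes the new odd-position samples with the kernel
+  //   out = saturate((-p[-1] + 9 * p[0] + 9 * p[1] - p[2] + 8) >> 4)
+  // while the original samples are interleaved back unchanged by the
+  // vst2-style stores; the duplicated border pixels feed the outer -1 taps.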
+ pixel_buffer[size] = pixel_buffer[size - 1]; + const uint8x16_t src0 = vld1q_u8(pixel_buffer - 2); + const uint8x16_t src1 = vld1q_u8(pixel_buffer - 1); + const uint8x16_t src2 = vld1q_u8(pixel_buffer); + const uint8x16_t src3 = vld1q_u8(pixel_buffer + 1); + + const uint8x8_t result_lo = Upsample(vget_low_u8(src0), vget_low_u8(src1), + vget_low_u8(src2), vget_low_u8(src3)); + + const uint8x8x2_t output_lo = {result_lo, vget_low_u8(src2)}; + vst2_u8(pixel_buffer - 1, output_lo); + + const uint8x8_t result_hi = Upsample(vget_high_u8(src0), vget_high_u8(src1), + vget_high_u8(src2), vget_high_u8(src3)); + + if (size == 12) { + vst1_u8(pixel_buffer + 15, InterleaveLow8(result_hi, vget_high_u8(src2))); + } else /* size == 16 */ { + const uint8x8x2_t output_hi = {result_hi, vget_high_u8(src2)}; + vst2_u8(pixel_buffer + 15, output_hi); + } +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + dsp->intra_edge_filter = IntraEdgeFilter_NEON; + dsp->intra_edge_upsampler = IntraEdgeUpsampler_NEON; +} + +} // namespace +} // namespace low_bitdepth + +//------------------------------------------------------------------------------ +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +const uint16_t kRemainderMask[8][8] = { + {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}, + {0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000}, + {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000}, +}; + +void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) { + assert(strength == 1 || strength == 2 || strength == 3); + const int kernel_index = strength - 1; + auto* const dst_buffer = static_cast(buffer); + + // The first element is not written out (but it is input) so the number of + // elements written is |size| - 1. + if (size == 1) return; + + // |strength| 1 and 2 use a 3 tap filter. + if (strength < 3) { + // The last value requires extending the buffer (duplicating + // |dst_buffer[size - 1]). Calculate it here to avoid extra processing in + // neon. + const uint16_t last_val = RightShiftWithRounding( + kKernelsNEON[kernel_index][0] * dst_buffer[size - 2] + + kKernelsNEON[kernel_index][1] * dst_buffer[size - 1] + + kKernelsNEON[kernel_index][0] * dst_buffer[size - 1], + 4); + + const uint16_t krn0 = kKernelsNEON[kernel_index][0]; + const uint16_t krn1 = kKernelsNEON[kernel_index][1]; + + // The first value we need gets overwritten by the output from the + // previous iteration. + uint16x8_t src_0 = vld1q_u16(dst_buffer); + int i = 1; + + // Process blocks until there are less than 16 values remaining. + for (; i < size - 7; i += 8) { + // Loading these at the end of the block with |src_0| will read past the + // end of |top_row_data[160]|, the source of |buffer|. + const uint16x8_t src_1 = vld1q_u16(dst_buffer + i); + const uint16x8_t src_2 = vld1q_u16(dst_buffer + i + 1); + const uint16x8_t sum_02 = vmulq_n_u16(vaddq_u16(src_0, src_2), krn0); + const uint16x8_t sum = vmlaq_n_u16(sum_02, src_1, krn1); + const uint16x8_t result = vrshrq_n_u16(sum, 4); + // Load the next row before overwriting. 
This loads an extra 7 values + // past |size| on the trailing iteration. + src_0 = vld1q_u16(dst_buffer + i + 7); + vst1q_u16(dst_buffer + i, result); + } + + // The last output value |last_val| was already calculated so if + // |remainder| == 1 then we don't have to do anything. + const int remainder = (size - 1) & 0x7; + if (remainder > 1) { + const uint16x8_t src_1 = vld1q_u16(dst_buffer + i); + const uint16x8_t src_2 = vld1q_u16(dst_buffer + i + 1); + const uint16x8_t sum_02 = vmulq_n_u16(vaddq_u16(src_0, src_2), krn0); + const uint16x8_t sum = vmlaq_n_u16(sum_02, src_1, krn1); + const uint16x8_t result = vrshrq_n_u16(sum, 4); + const uint16x8_t mask = vld1q_u16(kRemainderMask[remainder]); + const uint16x8_t dst_remainder = vbslq_u16(mask, result, src_1); + vst1q_u16(dst_buffer + i, dst_remainder); + } + + dst_buffer[size - 1] = last_val; + return; + } + + assert(strength == 3); + // 5 tap filter. The first element requires duplicating |buffer[0]| and the + // last two elements require duplicating |buffer[size - 1]|. + uint16_t special_vals[3]; + special_vals[0] = RightShiftWithRounding( + (dst_buffer[0] << 1) + (dst_buffer[0] << 2) + (dst_buffer[1] << 2) + + (dst_buffer[2] << 2) + (dst_buffer[3] << 1), + 4); + // Clamp index for very small |size| values. + const int first_index_min = std::max(size - 4, 0); + const int second_index_min = std::max(size - 3, 0); + const int third_index_min = std::max(size - 2, 0); + special_vals[1] = RightShiftWithRounding( + (dst_buffer[first_index_min] << 1) + (dst_buffer[second_index_min] << 2) + + (dst_buffer[third_index_min] << 2) + (dst_buffer[size - 1] << 2) + + (dst_buffer[size - 1] << 1), + 4); + special_vals[2] = RightShiftWithRounding( + (dst_buffer[second_index_min] << 1) + (dst_buffer[third_index_min] << 2) + + // x << 2 + x << 2 == x << 3 + (dst_buffer[size - 1] << 3) + (dst_buffer[size - 1] << 1), + 4); + + // The first two values we need get overwritten by the output from the + // previous iteration. + uint16x8_t src_0 = vld1q_u16(dst_buffer - 1); + uint16x8_t src_1 = vld1q_u16(dst_buffer); + int i = 1; + + for (; i < size - 7; i += 8) { + // Loading these at the end of the block with |src_[01]| will read past + // the end of |top_row_data[160]|, the source of |buffer|. + const uint16x8_t src_2 = vld1q_u16(dst_buffer + i); + const uint16x8_t src_3 = vld1q_u16(dst_buffer + i + 1); + const uint16x8_t src_4 = vld1q_u16(dst_buffer + i + 2); + const uint16x8_t sum_04 = vshlq_n_u16(vaddq_u16(src_0, src_4), 1); + const uint16x8_t sum_123 = vaddq_u16(vaddq_u16(src_1, src_2), src_3); + const uint16x8_t sum = vaddq_u16(sum_04, vshlq_n_u16(sum_123, 2)); + const uint16x8_t result = vrshrq_n_u16(sum, 4); + + // Load the next before overwriting. + src_0 = vld1q_u16(dst_buffer + i + 6); + src_1 = vld1q_u16(dst_buffer + i + 7); + + vst1q_u16(dst_buffer + i, result); + } + + const int remainder = (size - 1) & 0x7; + // Like the 3 tap but if there are two remaining values we have already + // calculated them. 
+ if (remainder > 2) { + const uint16x8_t src_2 = vld1q_u16(dst_buffer + i); + const uint16x8_t src_3 = vld1q_u16(dst_buffer + i + 1); + const uint16x8_t src_4 = vld1q_u16(dst_buffer + i + 2); + const uint16x8_t sum_04 = vshlq_n_u16(vaddq_u16(src_0, src_4), 1); + const uint16x8_t sum_123 = vaddq_u16(vaddq_u16(src_1, src_2), src_3); + const uint16x8_t sum = vaddq_u16(sum_04, vshlq_n_u16(sum_123, 2)); + const uint16x8_t result = vrshrq_n_u16(sum, 4); + const uint16x8_t mask = vld1q_u16(kRemainderMask[remainder]); + const uint16x8_t dst_remainder = vbslq_u16(mask, result, src_2); + vst1q_u16(dst_buffer + i, dst_remainder); + } + + dst_buffer[1] = special_vals[0]; + // Avoid overwriting |dst_buffer[0]|. + if (size > 2) dst_buffer[size - 2] = special_vals[1]; + dst_buffer[size - 1] = special_vals[2]; +} + +void IntraEdgeUpsampler_NEON(void* buffer, const int size) { + assert(size % 4 == 0 && size <= 16); + auto* const pixel_buffer = static_cast(buffer); + + // Extend first/last samples + pixel_buffer[-2] = pixel_buffer[-1]; + pixel_buffer[size] = pixel_buffer[size - 1]; + + const int16x8_t src_lo = vreinterpretq_s16_u16(vld1q_u16(pixel_buffer - 2)); + const int16x8_t src_hi = + vreinterpretq_s16_u16(vld1q_u16(pixel_buffer - 2 + 8)); + const int16x8_t src9_hi = vaddq_s16(src_hi, vshlq_n_s16(src_hi, 3)); + const int16x8_t src9_lo = vaddq_s16(src_lo, vshlq_n_s16(src_lo, 3)); + + int16x8_t sum_lo = vsubq_s16(vextq_s16(src9_lo, src9_hi, 1), src_lo); + sum_lo = vaddq_s16(sum_lo, vextq_s16(src9_lo, src9_hi, 2)); + sum_lo = vsubq_s16(sum_lo, vextq_s16(src_lo, src_hi, 3)); + sum_lo = vrshrq_n_s16(sum_lo, 4); + + uint16x8x2_t result_lo; + result_lo.val[0] = + vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(sum_lo, vdupq_n_s16(0))), + vdupq_n_u16((1 << kBitdepth10) - 1)); + result_lo.val[1] = vreinterpretq_u16_s16(vextq_s16(src_lo, src_hi, 2)); + + if (size > 8) { + const int16x8_t src_hi_extra = + vreinterpretq_s16_u16(vld1q_u16(pixel_buffer + 16 - 2)); + const int16x8_t src9_hi_extra = + vaddq_s16(src_hi_extra, vshlq_n_s16(src_hi_extra, 3)); + + int16x8_t sum_hi = vsubq_s16(vextq_s16(src9_hi, src9_hi_extra, 1), src_hi); + sum_hi = vaddq_s16(sum_hi, vextq_s16(src9_hi, src9_hi_extra, 2)); + sum_hi = vsubq_s16(sum_hi, vextq_s16(src_hi, src_hi_extra, 3)); + sum_hi = vrshrq_n_s16(sum_hi, 4); + + uint16x8x2_t result_hi; + result_hi.val[0] = + vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(sum_hi, vdupq_n_s16(0))), + vdupq_n_u16((1 << kBitdepth10) - 1)); + result_hi.val[1] = + vreinterpretq_u16_s16(vextq_s16(src_hi, src_hi_extra, 2)); + vst2q_u16(pixel_buffer - 1, result_lo); + vst2q_u16(pixel_buffer + 15, result_hi); + } else { + vst2q_u16(pixel_buffer - 1, result_lo); + } +} + +void Init10bpp() { + Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + dsp->intra_edge_filter = IntraEdgeFilter_NEON; + dsp->intra_edge_upsampler = IntraEdgeUpsampler_NEON; +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void IntraEdgeInit_NEON() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_ENABLE_NEON +namespace libgav1 { +namespace dsp { + +void IntraEdgeInit_NEON() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_ENABLE_NEON diff --git a/src/dsp/arm/intra_edge_neon.h b/src/dsp/arm/intra_edge_neon.h new file mode 100644 index 0000000..28e3494 --- /dev/null +++ b/src/dsp/arm/intra_edge_neon.h @@ -0,0 +1,42 @@ +/* + * 
+/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_ARM_INTRA_EDGE_NEON_H_ +#define LIBGAV1_SRC_DSP_ARM_INTRA_EDGE_NEON_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::intra_edge_filter and Dsp::intra_edge_upsampler. This +// function is not thread-safe. +void IntraEdgeInit_NEON(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_ENABLE_NEON +#define LIBGAV1_Dsp8bpp_IntraEdgeFilter LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_IntraEdgeUpsampler LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp10bpp_IntraEdgeFilter LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_IntraEdgeUpsampler LIBGAV1_CPU_NEON + +#endif // LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_SRC_DSP_ARM_INTRA_EDGE_NEON_H_ diff --git a/src/dsp/arm/intrapred_cfl_neon.cc b/src/dsp/arm/intrapred_cfl_neon.cc new file mode 100644 index 0000000..ad39947 --- /dev/null +++ b/src/dsp/arm/intrapred_cfl_neon.cc @@ -0,0 +1,1327 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/intrapred_cfl.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_ENABLE_NEON + +#include <arm_neon.h> + +#include <algorithm> +#include <cassert> +#include <cstdint> + +#include "src/dsp/arm/common_neon.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" + +namespace libgav1 { +namespace dsp { + +// Divide by the number of elements. +inline uint32_t Average(const uint32_t sum, const int width, const int height) { + return RightShiftWithRounding(sum, FloorLog2(width) + FloorLog2(height)); +}
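+ +// For example, a 16x8 block uses a shift of FloorLog2(16) + FloorLog2(8) = 7, +// so Average() returns (sum + 64) >> 7, i.e. a rounding division by the 128 +// elements of the block.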
+ +// Subtract |val| from every element in |a|. +inline void BlockSubtract(const uint32_t val, + int16_t a[kCflLumaBufferStride][kCflLumaBufferStride], + const int width, const int height) { + assert(val <= INT16_MAX); + const int16x8_t val_v = vdupq_n_s16(static_cast<int16_t>(val)); + + for (int y = 0; y < height; ++y) { + if (width == 4) { + const int16x4_t b = vld1_s16(a[y]); + vst1_s16(a[y], vsub_s16(b, vget_low_s16(val_v))); + } else if (width == 8) { + const int16x8_t b = vld1q_s16(a[y]); + vst1q_s16(a[y], vsubq_s16(b, val_v)); + } else if (width == 16) { + const int16x8_t b = vld1q_s16(a[y]); + const int16x8_t c = vld1q_s16(a[y] + 8); + vst1q_s16(a[y], vsubq_s16(b, val_v)); + vst1q_s16(a[y] + 8, vsubq_s16(c, val_v)); + } else /* block_width == 32 */ { + const int16x8_t b = vld1q_s16(a[y]); + const int16x8_t c = vld1q_s16(a[y] + 8); + const int16x8_t d = vld1q_s16(a[y] + 16); + const int16x8_t e = vld1q_s16(a[y] + 24); + vst1q_s16(a[y], vsubq_s16(b, val_v)); + vst1q_s16(a[y] + 8, vsubq_s16(c, val_v)); + vst1q_s16(a[y] + 16, vsubq_s16(d, val_v)); + vst1q_s16(a[y] + 24, vsubq_s16(e, val_v)); + } + } +} + +namespace low_bitdepth { +namespace { + +template <int block_width, int block_height> +void CflSubsampler420_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride) { + const auto* src = static_cast<const uint8_t*>(source); + uint32_t sum; + if (block_width == 4) { + assert(max_luma_width >= 8); + uint32x2_t running_sum = vdup_n_u32(0); + + for (int y = 0; y < block_height; ++y) { + const uint8x8_t row0 = vld1_u8(src); + const uint8x8_t row1 = vld1_u8(src + stride); + + uint16x4_t sum_row = vpadal_u8(vpaddl_u8(row0), row1); + sum_row = vshl_n_u16(sum_row, 1); + running_sum = vpadal_u16(running_sum, sum_row); + vst1_s16(luma[y], vreinterpret_s16_u16(sum_row)); + + if (y << 1 < max_luma_height - 2) { + // Once this threshold is reached the loop could be simplified. + src += stride << 1; + } + } + + sum = SumVector(running_sum); + } else if (block_width == 8) { + const uint16x8_t x_index = {0, 2, 4, 6, 8, 10, 12, 14}; + const uint16x8_t x_max_index = + vdupq_n_u16(max_luma_width == 8 ? max_luma_width - 2 : 16); + const uint16x8_t x_mask = vcltq_u16(x_index, x_max_index); + + uint32x4_t running_sum = vdupq_n_u32(0); + + for (int y = 0; y < block_height; ++y) { + const uint8x16_t row0 = vld1q_u8(src); + const uint8x16_t row1 = vld1q_u8(src + stride); + const uint16x8_t sum_row = vpadalq_u8(vpaddlq_u8(row0), row1); + const uint16x8_t sum_row_shifted = vshlq_n_u16(sum_row, 1); + + // Dup the 2x2 sum at the max luma offset. + const uint16x8_t max_luma_sum = + vdupq_lane_u16(vget_low_u16(sum_row_shifted), 3); + const uint16x8_t final_sum_row = + vbslq_u16(x_mask, sum_row_shifted, max_luma_sum); + vst1q_s16(luma[y], vreinterpretq_s16_u16(final_sum_row)); + + running_sum = vpadalq_u16(running_sum, final_sum_row); + + if (y << 1 < max_luma_height - 2) { + src += stride << 1; + } + } + + sum = SumVector(running_sum); + } else /* block_width >= 16 */ { + const uint16x8_t x_max_index = vdupq_n_u16(max_luma_width - 2); + uint32x4_t running_sum = vdupq_n_u32(0); + + for (int y = 0; y < block_height; ++y) { + // Calculate the 2x2 sum at the max_luma offset + const uint8_t a00 = src[max_luma_width - 2]; + const uint8_t a01 = src[max_luma_width - 1]; + const uint8_t a10 = src[max_luma_width - 2 + stride]; + const uint8_t a11 = src[max_luma_width - 1 + stride]; + // Dup the 2x2 sum at the max luma offset.
+ const uint16x8_t max_luma_sum = + vdupq_n_u16(static_cast<uint16_t>((a00 + a01 + a10 + a11) << 1)); + uint16x8_t x_index = {0, 2, 4, 6, 8, 10, 12, 14}; + + ptrdiff_t src_x_offset = 0; + for (int x = 0; x < block_width; x += 8, src_x_offset += 16) { + const uint16x8_t x_mask = vcltq_u16(x_index, x_max_index); + const uint8x16_t row0 = vld1q_u8(src + src_x_offset); + const uint8x16_t row1 = vld1q_u8(src + src_x_offset + stride); + const uint16x8_t sum_row = vpadalq_u8(vpaddlq_u8(row0), row1); + const uint16x8_t sum_row_shifted = vshlq_n_u16(sum_row, 1); + const uint16x8_t final_sum_row = + vbslq_u16(x_mask, sum_row_shifted, max_luma_sum); + vst1q_s16(luma[y] + x, vreinterpretq_s16_u16(final_sum_row)); + + running_sum = vpadalq_u16(running_sum, final_sum_row); + x_index = vaddq_u16(x_index, vdupq_n_u16(16)); + } + + if (y << 1 < max_luma_height - 2) { + src += stride << 1; + } + } + sum = SumVector(running_sum); + } + + const uint32_t average = Average(sum, block_width, block_height); + BlockSubtract(average, luma, block_width, block_height); +} + +template <int block_width, int block_height> +void CflSubsampler444_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride) { + const auto* src = static_cast<const uint8_t*>(source); + uint32_t sum; + if (block_width == 4) { + assert(max_luma_width >= 4); + assert(max_luma_height <= block_height); + assert((max_luma_height % 2) == 0); + uint32x4_t running_sum = vdupq_n_u32(0); + uint8x8_t row = vdup_n_u8(0); + + uint16x8_t row_shifted; + int y = 0; + do { + row = Load4<0>(src, row); + row = Load4<1>(src + stride, row); + if (y < (max_luma_height - 1)) { + src += stride << 1; + } + + row_shifted = vshll_n_u8(row, 3); + running_sum = vpadalq_u16(running_sum, row_shifted); + vst1_s16(luma[y], vreinterpret_s16_u16(vget_low_u16(row_shifted))); + vst1_s16(luma[y + 1], vreinterpret_s16_u16(vget_high_u16(row_shifted))); + y += 2; + } while (y < max_luma_height); + + row_shifted = + vcombine_u16(vget_high_u16(row_shifted), vget_high_u16(row_shifted)); + for (; y < block_height; y += 2) { + running_sum = vpadalq_u16(running_sum, row_shifted); + vst1_s16(luma[y], vreinterpret_s16_u16(vget_low_u16(row_shifted))); + vst1_s16(luma[y + 1], vreinterpret_s16_u16(vget_high_u16(row_shifted))); + } + + sum = SumVector(running_sum); + } else if (block_width == 8) { + const uint8x8_t x_index = {0, 1, 2, 3, 4, 5, 6, 7}; + const uint8x8_t x_max_index = vdup_n_u8(max_luma_width - 1); + const uint8x8_t x_mask = vclt_u8(x_index, x_max_index); + + uint32x4_t running_sum = vdupq_n_u32(0); + + for (int y = 0; y < block_height; ++y) { + const uint8x8_t x_max = vdup_n_u8(src[max_luma_width - 1]); + const uint8x8_t row = vbsl_u8(x_mask, vld1_u8(src), x_max); + + const uint16x8_t row_shifted = vshll_n_u8(row, 3); + running_sum = vpadalq_u16(running_sum, row_shifted); + vst1q_s16(luma[y], vreinterpretq_s16_u16(row_shifted)); + + if (y < max_luma_height - 1) { + src += stride; + } + } + + sum = SumVector(running_sum); + } else /* block_width >= 16 */ { + const uint8x16_t x_max_index = vdupq_n_u8(max_luma_width - 1); + uint32x4_t running_sum = vdupq_n_u32(0); + + for (int y = 0; y < block_height; ++y) { + uint8x16_t x_index = {0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15}; + const uint8x16_t x_max = vdupq_n_u8(src[max_luma_width - 1]); + for (int x = 0; x < block_width; x += 16) { + const uint8x16_t x_mask = vcltq_u8(x_index, x_max_index); + const uint8x16_t row = vbslq_u8(x_mask, vld1q_u8(src + x), x_max);
+ + const uint16x8_t row_shifted_low = vshll_n_u8(vget_low_u8(row), 3); + const uint16x8_t row_shifted_high = vshll_n_u8(vget_high_u8(row), 3); + running_sum = vpadalq_u16(running_sum, row_shifted_low); + running_sum = vpadalq_u16(running_sum, row_shifted_high); + vst1q_s16(luma[y] + x, vreinterpretq_s16_u16(row_shifted_low)); + vst1q_s16(luma[y] + x + 8, vreinterpretq_s16_u16(row_shifted_high)); + + x_index = vaddq_u8(x_index, vdupq_n_u8(16)); + } + if (y < max_luma_height - 1) { + src += stride; + } + } + sum = SumVector(running_sum); + } + + const uint32_t average = Average(sum, block_width, block_height); + BlockSubtract(average, luma, block_width, block_height); +} + +// Saturate |dc + ((alpha * luma) >> 6)| to uint8_t. +inline uint8x8_t Combine8(const int16x8_t luma, const int alpha, + const int16x8_t dc) { + const int16x8_t la = vmulq_n_s16(luma, alpha); + // Subtract the sign bit to round towards zero. + const int16x8_t sub_sign = vsraq_n_s16(la, la, 15); + // Shift and accumulate. + const int16x8_t result = vrsraq_n_s16(dc, sub_sign, 6); + return vqmovun_s16(result); +} + +// The range of luma/alpha is not really important because it gets saturated to +// uint8_t. Saturated int16_t >> 6 outranges uint8_t. +template <int block_height> +inline void CflIntraPredictor4xN_NEON( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int alpha) { + auto* dst = static_cast<uint8_t*>(dest); + const int16x8_t dc = vdupq_n_s16(dst[0]); + for (int y = 0; y < block_height; y += 2) { + const int16x4_t luma_row0 = vld1_s16(luma[y]); + const int16x4_t luma_row1 = vld1_s16(luma[y + 1]); + const uint8x8_t sum = + Combine8(vcombine_s16(luma_row0, luma_row1), alpha, dc); + StoreLo4(dst, sum); + dst += stride; + StoreHi4(dst, sum); + dst += stride; + } +} + +template <int block_height> +inline void CflIntraPredictor8xN_NEON( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int alpha) { + auto* dst = static_cast<uint8_t*>(dest); + const int16x8_t dc = vdupq_n_s16(dst[0]); + for (int y = 0; y < block_height; ++y) { + const int16x8_t luma_row = vld1q_s16(luma[y]); + const uint8x8_t sum = Combine8(luma_row, alpha, dc); + vst1_u8(dst, sum); + dst += stride; + } +} + +template <int block_height> +inline void CflIntraPredictor16xN_NEON( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int alpha) { + auto* dst = static_cast<uint8_t*>(dest); + const int16x8_t dc = vdupq_n_s16(dst[0]); + for (int y = 0; y < block_height; ++y) { + const int16x8_t luma_row_0 = vld1q_s16(luma[y]); + const int16x8_t luma_row_1 = vld1q_s16(luma[y] + 8); + const uint8x8_t sum_0 = Combine8(luma_row_0, alpha, dc); + const uint8x8_t sum_1 = Combine8(luma_row_1, alpha, dc); + vst1_u8(dst, sum_0); + vst1_u8(dst + 8, sum_1); + dst += stride; + } +} + +template <int block_height> +inline void CflIntraPredictor32xN_NEON( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int alpha) { + auto* dst = static_cast<uint8_t*>(dest); + const int16x8_t dc = vdupq_n_s16(dst[0]); + for (int y = 0; y < block_height; ++y) { + const int16x8_t luma_row_0 = vld1q_s16(luma[y]); + const int16x8_t luma_row_1 = vld1q_s16(luma[y] + 8); + const int16x8_t luma_row_2 = vld1q_s16(luma[y] + 16); + const int16x8_t luma_row_3 = vld1q_s16(luma[y] + 24); + const uint8x8_t sum_0 = Combine8(luma_row_0, alpha, dc); + const uint8x8_t sum_1 =
Combine8(luma_row_1, alpha, dc); + const uint8x8_t sum_2 = Combine8(luma_row_2, alpha, dc); + const uint8x8_t sum_3 = Combine8(luma_row_3, alpha, dc); + vst1_u8(dst, sum_0); + vst1_u8(dst + 8, sum_1); + vst1_u8(dst + 16, sum_2); + vst1_u8(dst + 24, sum_3); + dst += stride; + } +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] = + CflSubsampler420_NEON<4, 4>; + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] = + CflSubsampler420_NEON<4, 8>; + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] = + CflSubsampler420_NEON<4, 16>; + + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] = + CflSubsampler420_NEON<8, 4>; + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] = + CflSubsampler420_NEON<8, 8>; + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] = + CflSubsampler420_NEON<8, 16>; + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] = + CflSubsampler420_NEON<8, 32>; + + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] = + CflSubsampler420_NEON<16, 4>; + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] = + CflSubsampler420_NEON<16, 8>; + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] = + CflSubsampler420_NEON<16, 16>; + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] = + CflSubsampler420_NEON<16, 32>; + + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] = + CflSubsampler420_NEON<32, 8>; + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] = + CflSubsampler420_NEON<32, 16>; + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] = + CflSubsampler420_NEON<32, 32>; + + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] = + CflSubsampler444_NEON<4, 4>; + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] = + CflSubsampler444_NEON<4, 8>; + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] = + CflSubsampler444_NEON<4, 16>; + + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] = + CflSubsampler444_NEON<8, 4>; + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] = + CflSubsampler444_NEON<8, 8>; + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] = + CflSubsampler444_NEON<8, 16>; + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] = + CflSubsampler444_NEON<8, 32>; + + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] = + CflSubsampler444_NEON<16, 4>; + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] = + CflSubsampler444_NEON<16, 8>; + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] = + CflSubsampler444_NEON<16, 16>; + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] = + CflSubsampler444_NEON<16, 32>; + + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] = + CflSubsampler444_NEON<32, 8>; + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] = + CflSubsampler444_NEON<32, 16>; + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] = + CflSubsampler444_NEON<32, 32>; + + dsp->cfl_intra_predictors[kTransformSize4x4] = CflIntraPredictor4xN_NEON<4>; + dsp->cfl_intra_predictors[kTransformSize4x8] = CflIntraPredictor4xN_NEON<8>; + dsp->cfl_intra_predictors[kTransformSize4x16] = CflIntraPredictor4xN_NEON<16>; + + dsp->cfl_intra_predictors[kTransformSize8x4] = CflIntraPredictor8xN_NEON<4>; + dsp->cfl_intra_predictors[kTransformSize8x8] = CflIntraPredictor8xN_NEON<8>; 
+ dsp->cfl_intra_predictors[kTransformSize8x16] = CflIntraPredictor8xN_NEON<16>; + dsp->cfl_intra_predictors[kTransformSize8x32] = CflIntraPredictor8xN_NEON<32>; + + dsp->cfl_intra_predictors[kTransformSize16x4] = CflIntraPredictor16xN_NEON<4>; + dsp->cfl_intra_predictors[kTransformSize16x8] = CflIntraPredictor16xN_NEON<8>; + dsp->cfl_intra_predictors[kTransformSize16x16] = + CflIntraPredictor16xN_NEON<16>; + dsp->cfl_intra_predictors[kTransformSize16x32] = + CflIntraPredictor16xN_NEON<32>; + + dsp->cfl_intra_predictors[kTransformSize32x8] = CflIntraPredictor32xN_NEON<8>; + dsp->cfl_intra_predictors[kTransformSize32x16] = + CflIntraPredictor32xN_NEON<16>; + dsp->cfl_intra_predictors[kTransformSize32x32] = + CflIntraPredictor32xN_NEON<32>; + // Max Cfl predictor size is 32x32. +} + +} // namespace +} // namespace low_bitdepth + +//------------------------------------------------------------------------------ +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +//------------------------------------------------------------------------------ +// CflSubsampler +#ifndef __aarch64__ +uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) { + return vcombine_u16(vpadd_u16(vget_low_u16(a), vget_high_u16(a)), + vpadd_u16(vget_low_u16(b), vget_high_u16(b))); +} +#endif + +// This duplicates the last two 16-bit values in |row|. +inline uint16x8_t LastRowSamples(const uint16x8_t row) { + const uint32x2_t a = vget_high_u32(vreinterpretq_u32_u16(row)); + const uint32x4_t b = vdupq_lane_u32(a, 1); + return vreinterpretq_u16_u32(b); +} + +// This duplicates the last unsigned 16-bit value in |row|. +inline uint16x8_t LastRowResult(const uint16x8_t row) { + const uint16x4_t a = vget_high_u16(row); + const uint16x8_t b = vdupq_lane_u16(a, 0x3); + return b; +} + +// This duplicates the last signed 16-bit value in |row|. +inline int16x8_t LastRowResult(const int16x8_t row) { + const int16x4_t a = vget_high_s16(row); + const int16x8_t b = vdupq_lane_s16(a, 0x3); + return b; +} + +// Takes in two sums of input row pairs, and completes the computation for two +// output rows. +inline uint16x8_t StoreLumaResults4_420(const uint16x8_t vertical_sum0, + const uint16x8_t vertical_sum1, + int16_t* luma_ptr) { + const uint16x8_t result = vpaddq_u16(vertical_sum0, vertical_sum1); + const uint16x8_t result_shifted = vshlq_n_u16(result, 1); + vst1_s16(luma_ptr, vreinterpret_s16_u16(vget_low_u16(result_shifted))); + vst1_s16(luma_ptr + kCflLumaBufferStride, + vreinterpret_s16_u16(vget_high_u16(result_shifted))); + return result_shifted; +} + +// Takes two halves of a vertically added pair of rows and completes the +// computation for one output row. 
+inline uint16x8_t StoreLumaResults8_420(const uint16x8_t vertical_sum0, + const uint16x8_t vertical_sum1, + int16_t* luma_ptr) { + const uint16x8_t result = vpaddq_u16(vertical_sum0, vertical_sum1); + const uint16x8_t result_shifted = vshlq_n_u16(result, 1); + vst1q_s16(luma_ptr, vreinterpretq_s16_u16(result_shifted)); + return result_shifted; +} + +template <int block_height_log2, bool is_inside> +void CflSubsampler444_4xH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_height, const void* LIBGAV1_RESTRICT const source, + ptrdiff_t stride) { + static_assert(block_height_log2 <= 4, ""); + const int block_height = 1 << block_height_log2; + const int visible_height = max_luma_height; + const auto* src = static_cast<const uint16_t*>(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + int16_t* luma_ptr = luma[0]; + uint16x4_t sum = vdup_n_u16(0); + uint16x4_t samples[2]; + int y = visible_height; + + do { + samples[0] = vld1_u16(src); + samples[1] = vld1_u16(src + src_stride); + src += src_stride << 1; + sum = vadd_u16(sum, samples[0]); + sum = vadd_u16(sum, samples[1]); + y -= 2; + } while (y != 0); + + if (!is_inside) { + y = visible_height; + samples[1] = vshl_n_u16(samples[1], 1); + do { + sum = vadd_u16(sum, samples[1]); + y += 2; + } while (y < block_height); + } + + // Here the left shift by 3 (to increase precision) is nullified in right + // shift ((log2 of width 4) + 1). + const uint32_t average_sum = + RightShiftWithRounding(SumVector(vpaddl_u16(sum)), block_height_log2 - 1); + const int16x4_t averages = vdup_n_s16(static_cast<int16_t>(average_sum)); + + const auto* ssrc = static_cast<const int16_t*>(source); + int16x4_t ssample; + luma_ptr = luma[0]; + y = visible_height; + do { + ssample = vld1_s16(ssrc); + ssample = vshl_n_s16(ssample, 3); + vst1_s16(luma_ptr, vsub_s16(ssample, averages)); + ssrc += src_stride; + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + // Replicate last line + do { + vst1_s16(luma_ptr, vsub_s16(ssample, averages)); + luma_ptr += kCflLumaBufferStride; + } while (++y < block_height); + } +} + +template <int block_height_log2> +void CflSubsampler444_4xH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { + static_cast<void>(max_luma_width); + static_cast<void>(max_luma_height); + static_assert(block_height_log2 <= 4, ""); + assert(max_luma_width >= 4); + assert(max_luma_height >= 4); + const int block_height = 1 << block_height_log2; + + if (block_height <= max_luma_height) { + CflSubsampler444_4xH_NEON<block_height_log2, true>(luma, max_luma_height, + source, stride); + } else { + CflSubsampler444_4xH_NEON<block_height_log2, false>(luma, max_luma_height, + source, stride); + } +} + +template <int block_height_log2, bool is_inside> +void CflSubsampler444_8xH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_height, const void* LIBGAV1_RESTRICT const source, + ptrdiff_t stride) { + const int block_height = 1 << block_height_log2; + const int visible_height = max_luma_height; + const auto* src = static_cast<const uint16_t*>(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + int16_t* luma_ptr = luma[0]; + uint32x4_t sum = vdupq_n_u32(0); + uint16x8_t samples; + int y = visible_height; + + do { + samples = vld1q_u16(src); + src += src_stride; + sum = vpadalq_u16(sum, samples); + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + do { + sum = vpadalq_u16(sum, samples); + } while (++y < block_height); + } + + // Here the left shift by 3 (to increase precision) is nullified in right + // shift (log2 of width 8).
+ const uint32_t average_sum = + RightShiftWithRounding(SumVector(sum), block_height_log2); + const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum)); + + const auto* ssrc = static_cast<const int16_t*>(source); + int16x8_t ssample; + luma_ptr = luma[0]; + y = visible_height; + do { + ssample = vld1q_s16(ssrc); + ssample = vshlq_n_s16(ssample, 3); + vst1q_s16(luma_ptr, vsubq_s16(ssample, averages)); + ssrc += src_stride; + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + // Replicate last line + do { + vst1q_s16(luma_ptr, vsubq_s16(ssample, averages)); + luma_ptr += kCflLumaBufferStride; + } while (++y < block_height); + } +} + +template <int block_height_log2> +void CflSubsampler444_8xH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { + static_cast<void>(max_luma_width); + static_cast<void>(max_luma_height); + static_assert(block_height_log2 <= 5, ""); + assert(max_luma_width >= 4); + assert(max_luma_height >= 4); + const int block_height = 1 << block_height_log2; + const int block_width = 8; + + const int horz_inside = block_width <= max_luma_width; + const int vert_inside = block_height <= max_luma_height; + if (horz_inside && vert_inside) { + CflSubsampler444_8xH_NEON<block_height_log2, true>(luma, max_luma_height, + source, stride); + } else { + CflSubsampler444_8xH_NEON<block_height_log2, false>(luma, max_luma_height, + source, stride); + } +} + +template <int block_width_log2, int block_height_log2, bool is_inside> +void CflSubsampler444_WxH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { + const int block_height = 1 << block_height_log2; + const int visible_height = max_luma_height; + const int block_width = 1 << block_width_log2; + const auto* src = static_cast<const uint16_t*>(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + int16_t* luma_ptr = luma[0]; + uint32x4_t sum = vdupq_n_u32(0); + uint16x8_t samples[4]; + int y = visible_height; + + do { + samples[0] = vld1q_u16(src); + samples[1] = + (max_luma_width >= 16) ? vld1q_u16(src + 8) : LastRowResult(samples[0]); + uint16x8_t inner_sum = vaddq_u16(samples[0], samples[1]); + if (block_width == 32) { + samples[2] = (max_luma_width >= 24) ? vld1q_u16(src + 16) + : LastRowResult(samples[1]); + samples[3] = (max_luma_width == 32) ? vld1q_u16(src + 24) + : LastRowResult(samples[2]); + inner_sum = vaddq_u16(samples[2], inner_sum); + inner_sum = vaddq_u16(samples[3], inner_sum); + } + sum = vpadalq_u16(sum, inner_sum); + src += src_stride; + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + uint16x8_t inner_sum = vaddq_u16(samples[0], samples[1]); + if (block_width == 32) { + inner_sum = vaddq_u16(samples[2], inner_sum); + inner_sum = vaddq_u16(samples[3], inner_sum); + } + do { + sum = vpadalq_u16(sum, inner_sum); + } while (++y < block_height); + } + + // Here the left shift by 3 (to increase precision) is subtracted from the + // right shift factor (block_width_log2 + block_height_log2 - 3).
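+ // For a 16x16 block that is a shift of 4 + 4 - 3 = 5: the raw sum of 256 + // samples divided by 32 yields the average already multiplied by 8.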
+ + const uint32_t average_sum = RightShiftWithRounding( + SumVector(sum), block_width_log2 + block_height_log2 - 3); + const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum)); + + const auto* ssrc = static_cast<const int16_t*>(source); + int16x8_t ssamples_ext = vdupq_n_s16(0); + int16x8_t ssamples[4]; + luma_ptr = luma[0]; + y = visible_height; + do { + int idx = 0; + for (int x = 0; x < block_width; x += 8) { + if (max_luma_width > x) { + ssamples[idx] = vld1q_s16(&ssrc[x]); + ssamples[idx] = vshlq_n_s16(ssamples[idx], 3); + ssamples_ext = ssamples[idx]; + } else { + ssamples[idx] = LastRowResult(ssamples_ext); + } + vst1q_s16(&luma_ptr[x], vsubq_s16(ssamples[idx++], averages)); + } + ssrc += src_stride; + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + // Replicate last line + do { + int idx = 0; + for (int x = 0; x < block_width; x += 8) { + vst1q_s16(&luma_ptr[x], vsubq_s16(ssamples[idx++], averages)); + } + luma_ptr += kCflLumaBufferStride; + } while (++y < block_height); + } +} + +template <int block_width_log2, int block_height_log2> +void CflSubsampler444_WxH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { + static_assert(block_width_log2 == 4 || block_width_log2 == 5, + "This function will only work for block_width 16 and 32."); + static_assert(block_height_log2 <= 5, ""); + assert(max_luma_width >= 4); + assert(max_luma_height >= 4); + + const int block_height = 1 << block_height_log2; + const int vert_inside = block_height <= max_luma_height; + if (vert_inside) { + CflSubsampler444_WxH_NEON<block_width_log2, block_height_log2, true>( + luma, max_luma_width, max_luma_height, source, stride); + } else { + CflSubsampler444_WxH_NEON<block_width_log2, block_height_log2, false>( + luma, max_luma_width, max_luma_height, source, stride); + } +} + +template <int block_height_log2> +void CflSubsampler420_4xH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int /*max_luma_width*/, const int max_luma_height, + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { + const int block_height = 1 << block_height_log2; + const auto* src = static_cast<const uint16_t*>(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + int16_t* luma_ptr = luma[0]; + const int luma_height = std::min(block_height, max_luma_height >> 1); + int y = luma_height; + + uint32x4_t final_sum = vdupq_n_u32(0); + do { + const uint16x8_t samples_row0 = vld1q_u16(src); + src += src_stride; + const uint16x8_t samples_row1 = vld1q_u16(src); + src += src_stride; + const uint16x8_t luma_sum01 = vaddq_u16(samples_row0, samples_row1); + + const uint16x8_t samples_row2 = vld1q_u16(src); + src += src_stride; + const uint16x8_t samples_row3 = vld1q_u16(src); + src += src_stride; + const uint16x8_t luma_sum23 = vaddq_u16(samples_row2, samples_row3); + uint16x8_t sum = StoreLumaResults4_420(luma_sum01, luma_sum23, luma_ptr); + luma_ptr += kCflLumaBufferStride << 1; + + const uint16x8_t samples_row4 = vld1q_u16(src); + src += src_stride; + const uint16x8_t samples_row5 = vld1q_u16(src); + src += src_stride; + const uint16x8_t luma_sum45 = vaddq_u16(samples_row4, samples_row5); + + const uint16x8_t samples_row6 = vld1q_u16(src); + src += src_stride; + const uint16x8_t samples_row7 = vld1q_u16(src); + src += src_stride; + const uint16x8_t luma_sum67 = vaddq_u16(samples_row6, samples_row7); + sum = + vaddq_u16(sum, StoreLumaResults4_420(luma_sum45, luma_sum67, luma_ptr)); + luma_ptr += kCflLumaBufferStride << 1; + + final_sum = vpadalq_u16(final_sum, sum); + y -= 4; + } while (y != 0);
+ + const uint16x4_t final_fill = + vreinterpret_u16_s16(vld1_s16(luma_ptr - kCflLumaBufferStride)); + const uint32x4_t final_fill_to_sum = vmovl_u16(final_fill); + for (y = luma_height; y < block_height; ++y) { + vst1_s16(luma_ptr, vreinterpret_s16_u16(final_fill)); + luma_ptr += kCflLumaBufferStride; + final_sum = vaddq_u32(final_sum, final_fill_to_sum); + } + const uint32_t average_sum = RightShiftWithRounding( + SumVector(final_sum), block_height_log2 + 2 /*log2 of width 4*/); + const int16x4_t averages = vdup_n_s16(static_cast<int16_t>(average_sum)); + luma_ptr = luma[0]; + y = block_height; + do { + const int16x4_t samples = vld1_s16(luma_ptr); + vst1_s16(luma_ptr, vsub_s16(samples, averages)); + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); +} + +template <int block_height_log2, int max_luma_width> +inline void CflSubsampler420Impl_8xH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_height, const void* LIBGAV1_RESTRICT const source, + ptrdiff_t stride) { + const int block_height = 1 << block_height_log2; + const auto* src = static_cast<const uint16_t*>(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + int16_t* luma_ptr = luma[0]; + const int luma_height = std::min(block_height, max_luma_height >> 1); + int y = luma_height; + + uint32x4_t final_sum = vdupq_n_u32(0); + do { + const uint16x8_t samples_row00 = vld1q_u16(src); + const uint16x8_t samples_row01 = (max_luma_width == 16) + ? vld1q_u16(src + 8) + : LastRowSamples(samples_row00); + src += src_stride; + const uint16x8_t samples_row10 = vld1q_u16(src); + const uint16x8_t samples_row11 = (max_luma_width == 16) + ? vld1q_u16(src + 8) + : LastRowSamples(samples_row10); + src += src_stride; + const uint16x8_t luma_sum00 = vaddq_u16(samples_row00, samples_row10); + const uint16x8_t luma_sum01 = vaddq_u16(samples_row01, samples_row11); + uint16x8_t sum = StoreLumaResults8_420(luma_sum00, luma_sum01, luma_ptr); + luma_ptr += kCflLumaBufferStride; + + const uint16x8_t samples_row20 = vld1q_u16(src); + const uint16x8_t samples_row21 = (max_luma_width == 16) + ? vld1q_u16(src + 8) + : LastRowSamples(samples_row20); + src += src_stride; + const uint16x8_t samples_row30 = vld1q_u16(src); + const uint16x8_t samples_row31 = (max_luma_width == 16) + ? vld1q_u16(src + 8) + : LastRowSamples(samples_row30); + src += src_stride; + const uint16x8_t luma_sum10 = vaddq_u16(samples_row20, samples_row30); + const uint16x8_t luma_sum11 = vaddq_u16(samples_row21, samples_row31); + sum = + vaddq_u16(sum, StoreLumaResults8_420(luma_sum10, luma_sum11, luma_ptr)); + luma_ptr += kCflLumaBufferStride; + + const uint16x8_t samples_row40 = vld1q_u16(src); + const uint16x8_t samples_row41 = (max_luma_width == 16) + ? vld1q_u16(src + 8) + : LastRowSamples(samples_row40); + src += src_stride; + const uint16x8_t samples_row50 = vld1q_u16(src); + const uint16x8_t samples_row51 = (max_luma_width == 16) + ? vld1q_u16(src + 8) + : LastRowSamples(samples_row50); + src += src_stride; + const uint16x8_t luma_sum20 = vaddq_u16(samples_row40, samples_row50); + const uint16x8_t luma_sum21 = vaddq_u16(samples_row41, samples_row51); + sum = + vaddq_u16(sum, StoreLumaResults8_420(luma_sum20, luma_sum21, luma_ptr)); + luma_ptr += kCflLumaBufferStride; + + const uint16x8_t samples_row60 = vld1q_u16(src); + const uint16x8_t samples_row61 = (max_luma_width == 16) + ? vld1q_u16(src + 8) + : LastRowSamples(samples_row60); + src += src_stride; + const uint16x8_t samples_row70 = vld1q_u16(src); + const uint16x8_t samples_row71 = (max_luma_width == 16) + ? vld1q_u16(src + 8) + : LastRowSamples(samples_row70);
+ src += src_stride; + const uint16x8_t luma_sum30 = vaddq_u16(samples_row60, samples_row70); + const uint16x8_t luma_sum31 = vaddq_u16(samples_row61, samples_row71); + sum = + vaddq_u16(sum, StoreLumaResults8_420(luma_sum30, luma_sum31, luma_ptr)); + luma_ptr += kCflLumaBufferStride; + + final_sum = vpadalq_u16(final_sum, sum); + y -= 4; + } while (y != 0); + + // Duplicate the final row downward to the end after max_luma_height. + const uint16x8_t final_fill = + vreinterpretq_u16_s16(vld1q_s16(luma_ptr - kCflLumaBufferStride)); + const uint32x4_t final_fill_to_sum = + vaddl_u16(vget_low_u16(final_fill), vget_high_u16(final_fill)); + + for (y = luma_height; y < block_height; ++y) { + vst1q_s16(luma_ptr, vreinterpretq_s16_u16(final_fill)); + luma_ptr += kCflLumaBufferStride; + final_sum = vaddq_u32(final_sum, final_fill_to_sum); + } + + const uint32_t average_sum = RightShiftWithRounding( + SumVector(final_sum), block_height_log2 + 3 /*log2 of width 8*/); + const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum)); + luma_ptr = luma[0]; + y = block_height; + do { + const int16x8_t samples = vld1q_s16(luma_ptr); + vst1q_s16(luma_ptr, vsubq_s16(samples, averages)); + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); +} + +template <int block_height_log2> +void CflSubsampler420_8xH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { + if (max_luma_width == 8) { + CflSubsampler420Impl_8xH_NEON<block_height_log2, 8>(luma, max_luma_height, + source, stride); + } else { + CflSubsampler420Impl_8xH_NEON<block_height_log2, 16>(luma, max_luma_height, + source, stride); + } +} + +template <int block_width_log2, int block_height_log2, int max_luma_width> +inline void CflSubsampler420Impl_WxH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_height, const void* LIBGAV1_RESTRICT const source, + ptrdiff_t stride) { + const auto* src = static_cast<const uint16_t*>(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + const int block_height = 1 << block_height_log2; + const int luma_height = std::min(block_height, max_luma_height >> 1); + int16_t* luma_ptr = luma[0]; + // Begin first y section, covering width up to 32. + int y = luma_height; + + uint16x8_t final_fill0, final_fill1; + uint32x4_t final_sum = vdupq_n_u32(0); + do { + const uint16_t* src_next = src + src_stride; + const uint16x8_t samples_row00 = vld1q_u16(src); + const uint16x8_t samples_row01 = (max_luma_width >= 16) + ? vld1q_u16(src + 8) + : LastRowSamples(samples_row00); + const uint16x8_t samples_row02 = (max_luma_width >= 24) + ? vld1q_u16(src + 16) + : LastRowSamples(samples_row01); + const uint16x8_t samples_row03 = (max_luma_width == 32) + ? vld1q_u16(src + 24) + : LastRowSamples(samples_row02); + const uint16x8_t samples_row10 = vld1q_u16(src_next); + const uint16x8_t samples_row11 = (max_luma_width >= 16) + ? vld1q_u16(src_next + 8) + : LastRowSamples(samples_row10); + const uint16x8_t samples_row12 = (max_luma_width >= 24) + ? vld1q_u16(src_next + 16) + : LastRowSamples(samples_row11); + const uint16x8_t samples_row13 = (max_luma_width == 32) + ?
vld1q_u16(src_next + 24) + : LastRowSamples(samples_row12); + const uint16x8_t luma_sum0 = vaddq_u16(samples_row00, samples_row10); + const uint16x8_t luma_sum1 = vaddq_u16(samples_row01, samples_row11); + const uint16x8_t luma_sum2 = vaddq_u16(samples_row02, samples_row12); + const uint16x8_t luma_sum3 = vaddq_u16(samples_row03, samples_row13); + final_fill0 = StoreLumaResults8_420(luma_sum0, luma_sum1, luma_ptr); + final_fill1 = StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8); + const uint16x8_t sum = vaddq_u16(final_fill0, final_fill1); + + final_sum = vpadalq_u16(final_sum, sum); + + // Because max_luma_width is at most 32, any values beyond x=16 will + // necessarily be duplicated. + if (block_width_log2 == 5) { + const uint16x8_t wide_fill = LastRowResult(final_fill1); + final_sum = vpadalq_u16(final_sum, vshlq_n_u16(wide_fill, 1)); + } + src += src_stride << 1; + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); + + // Begin second y section. + y = luma_height; + if (y < block_height) { + uint32x4_t wide_fill; + if (block_width_log2 == 5) { + // There are 16 16-bit fill values per row, shifting by 2 accounts for + // the widening to 32-bit. (a << 2) = (a + a) << 1. + wide_fill = vshll_n_u16(vget_low_u16(LastRowResult(final_fill1)), 2); + } + const uint16x8_t final_inner_sum = vaddq_u16(final_fill0, final_fill1); + const uint32x4_t final_fill_to_sum = vaddl_u16( + vget_low_u16(final_inner_sum), vget_high_u16(final_inner_sum)); + + do { + vst1q_s16(luma_ptr, vreinterpretq_s16_u16(final_fill0)); + vst1q_s16(luma_ptr + 8, vreinterpretq_s16_u16(final_fill1)); + if (block_width_log2 == 5) { + final_sum = vaddq_u32(final_sum, wide_fill); + } + luma_ptr += kCflLumaBufferStride; + final_sum = vaddq_u32(final_sum, final_fill_to_sum); + } while (++y < block_height); + } // End second y section. 
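+ + // The values stored by StoreLumaResults8_420() are (2x2 luma sums) << 1, + // i.e. 8x the luma average, so dividing by the block area below keeps that + // extra precision in |average_sum|.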
+ const uint32_t average_sum = RightShiftWithRounding( + SumVector(final_sum), block_width_log2 + block_height_log2); + const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum)); + + luma_ptr = luma[0]; + y = block_height; + do { + const int16x8_t samples0 = vld1q_s16(luma_ptr); + vst1q_s16(luma_ptr, vsubq_s16(samples0, averages)); + const int16x8_t samples1 = vld1q_s16(luma_ptr + 8); + const int16x8_t final_row_result = vsubq_s16(samples1, averages); + vst1q_s16(luma_ptr + 8, final_row_result); + + if (block_width_log2 == 5) { + const int16x8_t wide_fill = LastRowResult(final_row_result); + vst1q_s16(luma_ptr + 16, wide_fill); + vst1q_s16(luma_ptr + 24, wide_fill); + } + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); +} + +//------------------------------------------------------------------------------ +// Choose subsampler based on max_luma_width +template <int block_width_log2, int block_height_log2> +void CflSubsampler420_WxH_NEON( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { + switch (max_luma_width) { + case 8: + CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 8>( + luma, max_luma_height, source, stride); + return; + case 16: + CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 16>( + luma, max_luma_height, source, stride); + return; + case 24: + CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 24>( + luma, max_luma_height, source, stride); + return; + default: + assert(max_luma_width == 32); + CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 32>( + luma, max_luma_height, source, stride); + return; + } +} + +//------------------------------------------------------------------------------ +// CflIntraPredictor + +// |luma| can be within +/-(((1 << bitdepth) - 1) << 3), inclusive. +// |alpha| can be -16 to 16 (inclusive). +// Clip |dc + ((alpha * luma) >> 6)| to [0, (1 << bitdepth) - 1]. +inline uint16x8_t Combine8(const int16x8_t luma, const int16x8_t alpha_abs, + const int16x8_t alpha_signed, const int16x8_t dc, + const uint16x8_t max_value) { + const int16x8_t luma_abs = vabsq_s16(luma); + const int16x8_t luma_alpha_sign = + vshrq_n_s16(veorq_s16(luma, alpha_signed), 15); + // (alpha * luma) >> 6 + const int16x8_t la_abs = vqrdmulhq_s16(luma_abs, alpha_abs); + // Convert back to signed values. + const int16x8_t la = + vsubq_s16(veorq_s16(la_abs, luma_alpha_sign), luma_alpha_sign); + const int16x8_t result = vaddq_s16(la, dc); + const int16x8_t zero = vdupq_n_s16(0); + // Clip.
+ return vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(result, zero)), max_value); +} + +template <int block_height, int bitdepth = 10> +inline void CflIntraPredictor4xN_NEON( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int alpha) { + auto* dst = static_cast<uint16_t*>(dest); + const ptrdiff_t dst_stride = stride >> 1; + const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1); + const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9); + const int16x8_t alpha_abs = vabsq_s16(alpha_signed); + const int16x8_t dc = vdupq_n_s16(dst[0]); + for (int y = 0; y < block_height; y += 2) { + const int16x4_t luma_row0 = vld1_s16(luma[y]); + const int16x4_t luma_row1 = vld1_s16(luma[y + 1]); + const int16x8_t combined_luma = vcombine_s16(luma_row0, luma_row1); + const uint16x8_t sum = + Combine8(combined_luma, alpha_abs, alpha_signed, dc, max_value); + vst1_u16(dst, vget_low_u16(sum)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(sum)); + dst += dst_stride; + } +} + +template <int block_height, int bitdepth = 10> +inline void CflIntraPredictor8xN_NEON( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int alpha) { + auto* dst = static_cast<uint16_t*>(dest); + const ptrdiff_t dst_stride = stride >> 1; + const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1); + const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9); + const int16x8_t alpha_abs = vabsq_s16(alpha_signed); + const int16x8_t dc = vdupq_n_s16(dst[0]); + for (int y = 0; y < block_height; ++y) { + const int16x8_t luma_row = vld1q_s16(luma[y]); + const uint16x8_t sum = + Combine8(luma_row, alpha_abs, alpha_signed, dc, max_value); + vst1q_u16(dst, sum); + dst += dst_stride; + } +} + +template <int block_height, int bitdepth = 10> +inline void CflIntraPredictor16xN_NEON( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int alpha) { + auto* dst = static_cast<uint16_t*>(dest); + const ptrdiff_t dst_stride = stride >> 1; + const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1); + const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9); + const int16x8_t alpha_abs = vabsq_s16(alpha_signed); + const int16x8_t dc = vdupq_n_s16(dst[0]); + for (int y = 0; y < block_height; ++y) { + const int16x8_t luma_row_0 = vld1q_s16(luma[y]); + const int16x8_t luma_row_1 = vld1q_s16(luma[y] + 8); + const uint16x8_t sum_0 = + Combine8(luma_row_0, alpha_abs, alpha_signed, dc, max_value); + const uint16x8_t sum_1 = + Combine8(luma_row_1, alpha_abs, alpha_signed, dc, max_value); + vst1q_u16(dst, sum_0); + vst1q_u16(dst + 8, sum_1); + dst += dst_stride; + } +} + +template <int block_height, int bitdepth = 10> +inline void CflIntraPredictor32xN_NEON( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int alpha) { + auto* dst = static_cast<uint16_t*>(dest); + const ptrdiff_t dst_stride = stride >> 1; + const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1); + const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9); + const int16x8_t alpha_abs = vabsq_s16(alpha_signed); + const int16x8_t dc = vdupq_n_s16(dst[0]); + for (int y = 0; y < block_height; ++y) { + const int16x8_t luma_row_0 = vld1q_s16(luma[y]); + const int16x8_t luma_row_1 = vld1q_s16(luma[y] + 8); + const int16x8_t luma_row_2 = vld1q_s16(luma[y] + 16); + const int16x8_t luma_row_3 = vld1q_s16(luma[y] + 24); + const uint16x8_t sum_0 = + Combine8(luma_row_0, alpha_abs, alpha_signed, dc, max_value); + const uint16x8_t sum_1 =
Combine8(luma_row_1, alpha_abs, alpha_signed, dc, max_value); + const uint16x8_t sum_2 = + Combine8(luma_row_2, alpha_abs, alpha_signed, dc, max_value); + const uint16x8_t sum_3 = + Combine8(luma_row_3, alpha_abs, alpha_signed, dc, max_value); + vst1q_u16(dst, sum_0); + vst1q_u16(dst + 8, sum_1); + vst1q_u16(dst + 16, sum_2); + vst1q_u16(dst + 24, sum_3); + dst += dst_stride; + } +} + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] = + CflSubsampler420_4xH_NEON<2>; + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] = + CflSubsampler420_4xH_NEON<3>; + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] = + CflSubsampler420_4xH_NEON<4>; + + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] = + CflSubsampler420_8xH_NEON<2>; + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] = + CflSubsampler420_8xH_NEON<3>; + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] = + CflSubsampler420_8xH_NEON<4>; + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] = + CflSubsampler420_8xH_NEON<5>; + + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] = + CflSubsampler420_WxH_NEON<4, 2>; + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] = + CflSubsampler420_WxH_NEON<4, 3>; + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] = + CflSubsampler420_WxH_NEON<4, 4>; + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] = + CflSubsampler420_WxH_NEON<4, 5>; + + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] = + CflSubsampler420_WxH_NEON<5, 3>; + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] = + CflSubsampler420_WxH_NEON<5, 4>; + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] = + CflSubsampler420_WxH_NEON<5, 5>; + + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] = + CflSubsampler444_4xH_NEON<2>; + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] = + CflSubsampler444_4xH_NEON<3>; + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] = + CflSubsampler444_4xH_NEON<4>; + + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] = + CflSubsampler444_8xH_NEON<2>; + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] = + CflSubsampler444_8xH_NEON<3>; + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] = + CflSubsampler444_8xH_NEON<4>; + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] = + CflSubsampler444_8xH_NEON<5>; + + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] = + CflSubsampler444_WxH_NEON<4, 2>; + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] = + CflSubsampler444_WxH_NEON<4, 3>; + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] = + CflSubsampler444_WxH_NEON<4, 4>; + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] = + CflSubsampler444_WxH_NEON<4, 5>; + + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] = + CflSubsampler444_WxH_NEON<5, 3>; + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] = + CflSubsampler444_WxH_NEON<5, 4>; + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] = + CflSubsampler444_WxH_NEON<5, 5>; + + dsp->cfl_intra_predictors[kTransformSize4x4] = CflIntraPredictor4xN_NEON<4>; + dsp->cfl_intra_predictors[kTransformSize4x8] = CflIntraPredictor4xN_NEON<8>; + dsp->cfl_intra_predictors[kTransformSize4x16] = CflIntraPredictor4xN_NEON<16>; + + 
dsp->cfl_intra_predictors[kTransformSize8x4] = CflIntraPredictor8xN_NEON<4>; + dsp->cfl_intra_predictors[kTransformSize8x8] = CflIntraPredictor8xN_NEON<8>; + dsp->cfl_intra_predictors[kTransformSize8x16] = CflIntraPredictor8xN_NEON<16>; + dsp->cfl_intra_predictors[kTransformSize8x32] = CflIntraPredictor8xN_NEON<32>; + + dsp->cfl_intra_predictors[kTransformSize16x4] = CflIntraPredictor16xN_NEON<4>; + dsp->cfl_intra_predictors[kTransformSize16x8] = CflIntraPredictor16xN_NEON<8>; + dsp->cfl_intra_predictors[kTransformSize16x16] = + CflIntraPredictor16xN_NEON<16>; + dsp->cfl_intra_predictors[kTransformSize16x32] = + CflIntraPredictor16xN_NEON<32>; + dsp->cfl_intra_predictors[kTransformSize32x8] = CflIntraPredictor32xN_NEON<8>; + dsp->cfl_intra_predictors[kTransformSize32x16] = + CflIntraPredictor32xN_NEON<16>; + dsp->cfl_intra_predictors[kTransformSize32x32] = + CflIntraPredictor32xN_NEON<32>; + // Max Cfl predictor size is 32x32. +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void IntraPredCflInit_NEON() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_ENABLE_NEON +namespace libgav1 { +namespace dsp { + +void IntraPredCflInit_NEON() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_ENABLE_NEON diff --git a/src/dsp/arm/intrapred_cfl_neon.h b/src/dsp/arm/intrapred_cfl_neon.h new file mode 100644 index 0000000..b4f983a --- /dev/null +++ b/src/dsp/arm/intrapred_cfl_neon.h @@ -0,0 +1,179 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_CFL_NEON_H_ +#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_CFL_NEON_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::cfl_intra_predictors and Dsp::cfl_subsamplers, see the +// defines below for specifics. These functions are not thread-safe. 
+void IntraPredCflInit_NEON(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_ENABLE_NEON +// 4x4 +#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_NEON + +// 4x8 +#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_NEON + +// 4x16 +#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_NEON + +// 8x4 +#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_NEON + +// 8x8 +#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_NEON + +// 8x16 +#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_NEON + +// 8x32 +#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_NEON + +// 16x4 +#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_NEON + +// 16x8 +#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_NEON + +// 16x16 +#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_NEON + +// 16x32 +#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_NEON + +// 32x8 +#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_NEON + +// 32x16 +#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_NEON + +// 32x32 +#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_NEON + +// 
----------------------------------------------------------------------------- +// 10bpp + +// 4x4 +#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_NEON + +// 4x8 +#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_NEON + +// 4x16 +#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_NEON + +// 8x4 +#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_NEON + +// 8x8 +#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_NEON + +// 8x16 +#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_NEON + +// 8x32 +#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_NEON + +// 16x4 +#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_NEON + +// 16x8 +#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_NEON + +// 16x16 +#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_NEON + +// 16x32 +#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_NEON + +// 32x8 +#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_NEON + +// 32x16 +#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_NEON + +// 32x32 +#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_NEON + +#endif // 
+
+#endif  // LIBGAV1_SRC_DSP_ARM_INTRAPRED_CFL_NEON_H_
diff --git a/src/dsp/arm/intrapred_directional_neon.cc b/src/dsp/arm/intrapred_directional_neon.cc
new file mode 100644
index 0000000..3cad4a6
--- /dev/null
+++ b/src/dsp/arm/intrapred_directional_neon.cc
@@ -0,0 +1,2177 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_directional.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Blend two values based on weights that sum to 32.
+inline uint8x8_t WeightedBlend(const uint8x8_t a, const uint8x8_t b,
+                               const uint8x8_t a_weight,
+                               const uint8x8_t b_weight) {
+  const uint16x8_t a_product = vmull_u8(a, a_weight);
+  const uint16x8_t sum = vmlal_u8(a_product, b, b_weight);
+
+  return vrshrn_n_u16(sum, 5 /*log2(32)*/);
+}
+
+// For vertical operations the weights are one constant value.
+inline uint8x8_t WeightedBlend(const uint8x8_t a, const uint8x8_t b,
+                               const uint8_t weight) {
+  return WeightedBlend(a, b, vdup_n_u8(32 - weight), vdup_n_u8(weight));
+}
+
+// Fill |left| and |right| with the appropriate values for a given |base_step|.
+inline void LoadStepwise(const uint8_t* LIBGAV1_RESTRICT const source,
+                         const uint8x8_t left_step, const uint8x8_t right_step,
+                         uint8x8_t* left, uint8x8_t* right) {
+  const uint8x16_t mixed = vld1q_u8(source);
+  *left = VQTbl1U8(mixed, left_step);
+  *right = VQTbl1U8(mixed, right_step);
+}
+
+// Handle signed step arguments by ignoring the sign. Negative values are
+// considered out of range and overwritten later.
+inline void LoadStepwise(const uint8_t* LIBGAV1_RESTRICT const source,
+                         const int8x8_t left_step, const int8x8_t right_step,
+                         uint8x8_t* left, uint8x8_t* right) {
+  LoadStepwise(source, vreinterpret_u8_s8(left_step),
+               vreinterpret_u8_s8(right_step), left, right);
+}
+
+// Process 4 or 8 |width| by any |height|.
+template <int width>
+inline void DirectionalZone1_WxH(uint8_t* LIBGAV1_RESTRICT dst,
+                                 const ptrdiff_t stride, const int height,
+                                 const uint8_t* LIBGAV1_RESTRICT const top,
+                                 const int xstep, const bool upsampled) {
+  assert(width == 4 || width == 8);
+
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int scale_bits = 6 - upsample_shift;
+
+  const int max_base_x = (width + height - 1) << upsample_shift;
+  const int8x8_t max_base = vdup_n_s8(max_base_x);
+  const uint8x8_t top_max_base = vdup_n_u8(top[max_base_x]);
+
+  const int8x8_t all = vcreate_s8(0x0706050403020100);
+  const int8x8_t even = vcreate_s8(0x0e0c0a0806040200);
+  const int8x8_t base_step = upsampled ? even : all;
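+  // A worked example of the Q6 fixed-point walk used below (illustrative
+  // values): with xstep == 80 and upsampled == false, the first row has
+  // top_x == 80, so top_base_x == 80 >> 6 == 1 and
+  // shift == (80 & 0x3F) >> 1 == 8. The row is then
+  // WeightedBlend(top[base], top[base + 1], 8), i.e.
+  // (top[base] * 24 + top[base + 1] * 8 + 16) >> 5 for each lane.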
+  const int8x8_t right_step = vadd_s8(base_step, vdup_n_s8(1));
+
+  int top_x = xstep;
+  int y = 0;
+  do {
+    const int top_base_x = top_x >> scale_bits;
+
+    if (top_base_x >= max_base_x) {
+      for (int i = y; i < height; ++i) {
+        memset(dst, top[max_base_x], width);
+        dst += stride;
+      }
+      return;
+    }
+
+    const uint8_t shift = ((top_x << upsample_shift) & 0x3F) >> 1;
+
+    // Zone2 uses negative values for xstep. Use signed values to compare
+    // |top_base_x| to |max_base_x|.
+    const int8x8_t base_v = vadd_s8(vdup_n_s8(top_base_x), base_step);
+
+    const uint8x8_t max_base_mask = vclt_s8(base_v, max_base);
+
+    // 4 wide subsamples the output. 8 wide subsamples the input.
+    if (width == 4) {
+      const uint8x8_t left_values = vld1_u8(top + top_base_x);
+      const uint8x8_t right_values = RightShiftVector<8>(left_values);
+      const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
+
+      // If |upsampled| is true then extract every other value for output.
+      const uint8x8_t value_stepped =
+          vtbl1_u8(value, vreinterpret_u8_s8(base_step));
+      const uint8x8_t masked_value =
+          vbsl_u8(max_base_mask, value_stepped, top_max_base);
+
+      StoreLo4(dst, masked_value);
+    } else /* width == 8 */ {
+      uint8x8_t left_values, right_values;
+      // WeightedBlend() steps up to Q registers. Downsample the input to
+      // avoid doing extra calculations.
+      LoadStepwise(top + top_base_x, base_step, right_step, &left_values,
+                   &right_values);
+
+      const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
+      const uint8x8_t masked_value =
+          vbsl_u8(max_base_mask, value, top_max_base);
+
+      vst1_u8(dst, masked_value);
+    }
+    dst += stride;
+    top_x += xstep;
+  } while (++y < height);
+}
+
+// Process a multiple of 8 |width| by any |height|. Processes horizontally
+// before vertically in the hopes of being a little more cache friendly.
+inline void DirectionalZone1_WxH(uint8_t* LIBGAV1_RESTRICT dst,
+                                 const ptrdiff_t stride, const int width,
+                                 const int height,
+                                 const uint8_t* LIBGAV1_RESTRICT const top,
+                                 const int xstep, const bool upsampled) {
+  assert(width % 8 == 0);
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int scale_bits = 6 - upsample_shift;
+
+  const int max_base_x = (width + height - 1) << upsample_shift;
+  const int8x8_t max_base = vdup_n_s8(max_base_x);
+  const uint8x8_t top_max_base = vdup_n_u8(top[max_base_x]);
+
+  const int8x8_t all = vcreate_s8(0x0706050403020100);
+  const int8x8_t even = vcreate_s8(0x0e0c0a0806040200);
+  const int8x8_t base_step = upsampled ? even : all;
+  const int8x8_t right_step = vadd_s8(base_step, vdup_n_s8(1));
+  const int8x8_t block_step = vdup_n_s8(8 << upsample_shift);
+
+  int top_x = xstep;
+  int y = 0;
+  do {
+    const int top_base_x = top_x >> scale_bits;
+
+    if (top_base_x >= max_base_x) {
+      for (int i = y; i < height; ++i) {
+        memset(dst, top[max_base_x], width);
+        dst += stride;
+      }
+      return;
+    }
+
+    const uint8_t shift = ((top_x << upsample_shift) & 0x3F) >> 1;
+
+    // Zone2 uses negative values for xstep. Use signed values to compare
+    // |top_base_x| to |max_base_x|.
+    int8x8_t base_v = vadd_s8(vdup_n_s8(top_base_x), base_step);
+
+    int x = 0;
+    do {
+      const uint8x8_t max_base_mask = vclt_s8(base_v, max_base);
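+
+      // Lanes where |base_v| >= |max_base_x| would read past the prepared
+      // edge; the vbsl_u8 below substitutes the replicated corner pixel
+      // top[max_base_x] for those lanes.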
+      // Extract the input values based on |upsampled| here to avoid doing
+      // twice as many calculations.
+      uint8x8_t left_values, right_values;
+      LoadStepwise(top + top_base_x + x, base_step, right_step, &left_values,
+                   &right_values);
+
+      const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
+      const uint8x8_t masked_value =
+          vbsl_u8(max_base_mask, value, top_max_base);
+
+      vst1_u8(dst + x, masked_value);
+
+      base_v = vadd_s8(base_v, block_step);
+      x += 8;
+    } while (x < width);
+    top_x += xstep;
+    dst += stride;
+  } while (++y < height);
+}
+
+void DirectionalIntraPredictorZone1_NEON(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row, const int width,
+    const int height, const int xstep, const bool upsampled_top) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  assert(xstep > 0);
+
+  const int upsample_shift = static_cast<int>(upsampled_top);
+
+  const uint8x8_t all = vcreate_u8(0x0706050403020100);
+
+  if (xstep == 64) {
+    assert(!upsampled_top);
+    const uint8_t* top_ptr = top + 1;
+    int y = 0;
+    do {
+      memcpy(dst, top_ptr, width);
+      memcpy(dst + stride, top_ptr + 1, width);
+      memcpy(dst + 2 * stride, top_ptr + 2, width);
+      memcpy(dst + 3 * stride, top_ptr + 3, width);
+      dst += 4 * stride;
+      top_ptr += 4;
+      y += 4;
+    } while (y < height);
+  } else if (width == 4) {
+    DirectionalZone1_WxH<4>(dst, stride, height, top, xstep, upsampled_top);
+  } else if (xstep > 51) {
+    // 7.11.2.10. Intra edge upsample selection process
+    //   if ( d <= 0 || d >= 40 ) useUpsample = 0
+    // For |upsampled_top| the delta is from vertical so |prediction_angle -
+    // 90|. In |kDirectionalIntraPredictorDerivative[]| angles less than 51
+    // will meet this criterion. The |xstep| value for angle 51 happens to be
+    // 51 as well. Shallower angles have greater xstep values.
+    assert(!upsampled_top);
+    const int max_base_x = ((width + height) - 1);
+    const uint8x8_t max_base = vdup_n_u8(max_base_x);
+    const uint8x8_t top_max_base = vdup_n_u8(top[max_base_x]);
+    const uint8x8_t block_step = vdup_n_u8(8);
+
+    int top_x = xstep;
+    int y = 0;
+    do {
+      const int top_base_x = top_x >> 6;
+      const uint8_t shift = ((top_x << upsample_shift) & 0x3F) >> 1;
+      uint8x8_t base_v = vadd_u8(vdup_n_u8(top_base_x), all);
+      int x = 0;
+      // Only calculate a block of 8 when at least one of the output values is
+      // within range. Otherwise it can read off the end of |top|.
+      const int must_calculate_width =
+          std::min(width, max_base_x - top_base_x + 7) & ~7;
+      for (; x < must_calculate_width; x += 8) {
+        const uint8x8_t max_base_mask = vclt_u8(base_v, max_base);
+
+        // Since these |xstep| values can not be upsampled the load is
+        // simplified.
+        const uint8x8_t left_values = vld1_u8(top + top_base_x + x);
+        const uint8x8_t right_values = vld1_u8(top + top_base_x + x + 1);
+        const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
+        const uint8x8_t masked_value =
+            vbsl_u8(max_base_mask, value, top_max_base);
+
+        vst1_u8(dst + x, masked_value);
+        base_v = vadd_u8(base_v, block_step);
+      }
+      memset(dst + x, top[max_base_x], width - x);
+      dst += stride;
+      top_x += xstep;
+    } while (++y < height);
+  } else {
+    DirectionalZone1_WxH(dst, stride, width, height, top, xstep,
+                         upsampled_top);
+  }
+}
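+
+// A scalar sketch of what the Zone 1 NEON paths above compute, for the
+// non-upsampled case (illustrative only):
+//   for (int y = 0; y < height; ++y) {
+//     const int top_x = (y + 1) * xstep;
+//     const int shift = (top_x & 0x3F) >> 1;
+//     for (int x = 0; x < width; ++x) {
+//       const int base = (top_x >> 6) + x;
+//       dst[y * stride + x] =
+//           (base < max_base_x)
+//               ? (top[base] * (32 - shift) + top[base + 1] * shift + 16) >> 5
+//               : top[max_base_x];
+//     }
+//   }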
+
+// Process 4 or 8 |width| by 4 or 8 |height|.
+template <int width>
+inline void DirectionalZone3_WxH(
+    uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height,
+    const uint8_t* LIBGAV1_RESTRICT const left_column, const int base_left_y,
+    const int ystep, const int upsample_shift) {
+  assert(width == 4 || width == 8);
+  assert(height == 4 || height == 8);
+  const int scale_bits = 6 - upsample_shift;
+
+  // Zone3 never runs out of left_column values.
+  assert((width + height - 1) << upsample_shift >  // max_base_y
+         ((ystep * width) >> scale_bits) +
+             (/* base_step */ 1 << upsample_shift) *
+                 (height - 1));  // left_base_y
+
+  // Limited improvement for 8x8. ~20% faster for 64x64.
+  const uint8x8_t all = vcreate_u8(0x0706050403020100);
+  const uint8x8_t even = vcreate_u8(0x0e0c0a0806040200);
+  const uint8x8_t base_step = upsample_shift ? even : all;
+  const uint8x8_t right_step = vadd_u8(base_step, vdup_n_u8(1));
+
+  uint8_t* dst = dest;
+  uint8x8_t left_v[8], right_v[8], value_v[8];
+  const uint8_t* const left = left_column;
+
+  const int index_0 = base_left_y;
+  LoadStepwise(left + (index_0 >> scale_bits), base_step, right_step,
+               &left_v[0], &right_v[0]);
+  value_v[0] = WeightedBlend(left_v[0], right_v[0],
+                             ((index_0 << upsample_shift) & 0x3F) >> 1);
+
+  const int index_1 = base_left_y + ystep;
+  LoadStepwise(left + (index_1 >> scale_bits), base_step, right_step,
+               &left_v[1], &right_v[1]);
+  value_v[1] = WeightedBlend(left_v[1], right_v[1],
+                             ((index_1 << upsample_shift) & 0x3F) >> 1);
+
+  const int index_2 = base_left_y + ystep * 2;
+  LoadStepwise(left + (index_2 >> scale_bits), base_step, right_step,
+               &left_v[2], &right_v[2]);
+  value_v[2] = WeightedBlend(left_v[2], right_v[2],
+                             ((index_2 << upsample_shift) & 0x3F) >> 1);
+
+  const int index_3 = base_left_y + ystep * 3;
+  LoadStepwise(left + (index_3 >> scale_bits), base_step, right_step,
+               &left_v[3], &right_v[3]);
+  value_v[3] = WeightedBlend(left_v[3], right_v[3],
+                             ((index_3 << upsample_shift) & 0x3F) >> 1);
+
+  const int index_4 = base_left_y + ystep * 4;
+  LoadStepwise(left + (index_4 >> scale_bits), base_step, right_step,
+               &left_v[4], &right_v[4]);
+  value_v[4] = WeightedBlend(left_v[4], right_v[4],
+                             ((index_4 << upsample_shift) & 0x3F) >> 1);
+
+  const int index_5 = base_left_y + ystep * 5;
+  LoadStepwise(left + (index_5 >> scale_bits), base_step, right_step,
+               &left_v[5], &right_v[5]);
+  value_v[5] = WeightedBlend(left_v[5], right_v[5],
+                             ((index_5 << upsample_shift) & 0x3F) >> 1);
+
+  const int index_6 = base_left_y + ystep * 6;
+  LoadStepwise(left + (index_6 >> scale_bits), base_step, right_step,
+               &left_v[6], &right_v[6]);
+  value_v[6] = WeightedBlend(left_v[6], right_v[6],
+                             ((index_6 << upsample_shift) & 0x3F) >> 1);
+
+  const int index_7 = base_left_y + ystep * 7;
+  LoadStepwise(left + (index_7 >> scale_bits), base_step, right_step,
+               &left_v[7], &right_v[7]);
+  value_v[7] = WeightedBlend(left_v[7], right_v[7],
+                             ((index_7 << upsample_shift) & 0x3F) >> 1);
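+
+  // value_v[c] now holds column c of the block: lane r was computed from the
+  // left edge for row r. The vtrn/vuzp stages below transpose these eight
+  // column vectors into row vectors so they can be stored contiguously.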
+  // 8x8 transpose.
+  const uint8x16x2_t b0 = vtrnq_u8(vcombine_u8(value_v[0], value_v[4]),
+                                   vcombine_u8(value_v[1], value_v[5]));
+  const uint8x16x2_t b1 = vtrnq_u8(vcombine_u8(value_v[2], value_v[6]),
+                                   vcombine_u8(value_v[3], value_v[7]));
+
+  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
+                                    vreinterpretq_u16_u8(b1.val[0]));
+  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
+                                    vreinterpretq_u16_u8(b1.val[1]));
+
+  const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
+                                    vreinterpretq_u32_u16(c1.val[0]));
+  const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
+                                    vreinterpretq_u32_u16(c1.val[1]));
+
+  if (width == 4) {
+    StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[0])));
+    dst += stride;
+    StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[0])));
+    dst += stride;
+    StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[0])));
+    dst += stride;
+    StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[0])));
+    if (height == 4) return;
+    dst += stride;
+    StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[1])));
+    dst += stride;
+    StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[1])));
+    dst += stride;
+    StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[1])));
+    dst += stride;
+    StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[1])));
+  } else {
+    vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[0])));
+    dst += stride;
+    vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[0])));
+    dst += stride;
+    vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[0])));
+    dst += stride;
+    vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[0])));
+    if (height == 4) return;
+    dst += stride;
+    vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[1])));
+    dst += stride;
+    vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[1])));
+    dst += stride;
+    vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[1])));
+    dst += stride;
+    vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[1])));
+  }
+}
+
+// Because the source values "move backwards" as the row index increases, the
+// indices derived from ystep are generally negative. This is accommodated by
+// making sure the relative indices are within [-15, 0] when the function is
+// called, and sliding them into the inclusive range [0, 15], relative to a
+// lower base address.
+constexpr int kPositiveIndexOffset = 15;
+
+// Process 4 or 8 |width| by any |height|.
+template <int width>
+inline void DirectionalZone2FromLeftCol_WxH(
+    uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const int height,
+    const uint8_t* LIBGAV1_RESTRICT const left_column, const int16x8_t left_y,
+    const int upsample_shift) {
+  assert(width == 4 || width == 8);
+
+  // The shift argument must be a constant.
+  int16x8_t offset_y, shift_upsampled = left_y;
+  if (upsample_shift) {
+    offset_y = vshrq_n_s16(left_y, 5);
+    shift_upsampled = vshlq_n_s16(shift_upsampled, 1);
+  } else {
+    offset_y = vshrq_n_s16(left_y, 6);
+  }
+
+  // Select values to the left of the starting point.
+  // The 15th element (and 16th) will be all the way at the end, to the right.
+  // With a negative ystep everything else will be "left" of them.
+  // This supports cumulative steps up to 15. We could support up to 16 by
+  // doing separate loads for |left_values| and |right_values|. vtbl supports
+  // 2 Q registers as input which would allow for cumulative offsets of 32.
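+  // For example (illustrative values): with ystep == 128 and no upsampling,
+  // offset_y is {0, -2, -4, ...}; adding kPositiveIndexOffset gives table
+  // indices {15, 13, 11, ...} into the 16-byte window that begins
+  // kPositiveIndexOffset pixels below |left_column|.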
+  const int16x8_t sampler =
+      vaddq_s16(offset_y, vdupq_n_s16(kPositiveIndexOffset));
+  const uint8x8_t left_values = vqmovun_s16(sampler);
+  const uint8x8_t right_values = vadd_u8(left_values, vdup_n_u8(1));
+
+  const int16x8_t shift_masked = vandq_s16(shift_upsampled, vdupq_n_s16(0x3f));
+  const uint8x8_t shift_mul = vreinterpret_u8_s8(vshrn_n_s16(shift_masked, 1));
+  const uint8x8_t inv_shift_mul = vsub_u8(vdup_n_u8(32), shift_mul);
+
+  int y = 0;
+  do {
+    uint8x8_t src_left, src_right;
+    LoadStepwise(left_column - kPositiveIndexOffset + (y << upsample_shift),
+                 left_values, right_values, &src_left, &src_right);
+    const uint8x8_t val =
+        WeightedBlend(src_left, src_right, inv_shift_mul, shift_mul);
+
+    if (width == 4) {
+      StoreLo4(dst, val);
+    } else {
+      vst1_u8(dst, val);
+    }
+    dst += stride;
+  } while (++y < height);
+}
+
+// Process 4 or 8 |width| by any |height|.
+template <int width>
+inline void DirectionalZone1Blend_WxH(
+    uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height,
+    const uint8_t* LIBGAV1_RESTRICT const top_row, int zone_bounds, int top_x,
+    const int xstep, const int upsample_shift) {
+  assert(width == 4 || width == 8);
+
+  const int scale_bits_x = 6 - upsample_shift;
+
+  const uint8x8_t all = vcreate_u8(0x0706050403020100);
+  const uint8x8_t even = vcreate_u8(0x0e0c0a0806040200);
+  const uint8x8_t base_step = upsample_shift ? even : all;
+  const uint8x8_t right_step = vadd_u8(base_step, vdup_n_u8(1));
+
+  int y = 0;
+  do {
+    const uint8_t* const src = top_row + (top_x >> scale_bits_x);
+    uint8x8_t left, right;
+    LoadStepwise(src, base_step, right_step, &left, &right);
+
+    const uint8_t shift = ((top_x << upsample_shift) & 0x3f) >> 1;
+    const uint8x8_t val = WeightedBlend(left, right, shift);
+
+    uint8x8_t dst_blend = vld1_u8(dest);
+    // |zone_bounds| values can be negative.
+    uint8x8_t blend =
+        vcge_s8(vreinterpret_s8_u8(all), vdup_n_s8((zone_bounds >> 6)));
+    uint8x8_t output = vbsl_u8(blend, val, dst_blend);
+
+    if (width == 4) {
+      StoreLo4(dest, output);
+    } else {
+      vst1_u8(dest, output);
+    }
+    dest += stride;
+    zone_bounds += xstep;
+    top_x -= xstep;
+  } while (++y < height);
+}
+
+// The height at which a load of 16 bytes will not contain enough source
+// pixels from |left_column| to supply an accurate row when computing 8 pixels
+// at a time. The values are found by inspection. By coincidence, all angles
+// that satisfy (ystep >> 6) == 2 map to the same value, so it is enough to
+// look up by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15.
+constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = {
+    1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40};
+
+// 7.11.2.4 (8) 90 < angle < 180
+// The strategy for these functions (4xH and 8+xH) is to know how many blocks
+// can be processed with just pixels from |top_ptr|, then handle mixed blocks,
+// then handle only blocks that take from |left_ptr|. Additionally, a fast
+// index-shuffle approach is used for pred values from |left_column| in
+// sections that permit it.
+inline void DirectionalZone2_4xH(
+    uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride,
+    const uint8_t* LIBGAV1_RESTRICT const top_row,
+    const uint8_t* LIBGAV1_RESTRICT const left_column, const int height,
+    const int xstep, const int ystep, const bool upsampled_top,
+    const bool upsampled_left) {
+  const int upsample_left_shift = static_cast<int>(upsampled_left);
+  const int upsample_top_shift = static_cast<int>(upsampled_top);
+
+  // Helper vector.
+  const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7};
+
+  // Loop incrementers for moving by block (4xN). Vertical still steps by 8.
+  // If it's only 4, it will be finished in the first iteration.
+  const ptrdiff_t stride8 = stride << 3;
+  const int xstep8 = xstep << 3;
+
+  const int min_height = (height == 4) ? 4 : 8;
+
+  // All columns from |min_top_only_x| to the right will only need |top_row|
+  // to compute and can therefore call the Zone1 functions. This assumes
+  // |xstep| is at least 3.
+  assert(xstep >= 3);
+  const int min_top_only_x = std::min((height * xstep) >> 6, /* width */ 4);
+
+  // For steep angles, the source pixels from |left_column| may not fit in a
+  // 16-byte load for shuffling.
+  // TODO(petersonab): Find a more precise formula for this subject to x.
+  // TODO(johannkoenig): Revisit this for |width| == 4.
+  const int max_shuffle_height =
+      std::min(kDirectionalZone2ShuffleInvalidHeight[ystep >> 6], height);
+
+  // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 - 1.
+  int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
+
+  const int left_base_increment = ystep >> 6;
+  const int ystep_remainder = ystep & 0x3F;
+
+  // If the 64 scaling is regarded as a decimal point, the first value of the
+  // left_y vector omits the portion which is covered under the left_column
+  // offset. The following values need the full ystep as a relative offset.
+  const int16x8_t remainder = vdupq_n_s16(-ystep_remainder);
+  const int16x8_t left_y = vmlaq_n_s16(remainder, zero_to_seven, -ystep);
+
+  // This loop treats each set of 4 columns in 3 stages with y-value
+  // boundaries. The first stage, before the first y-loop, covers blocks that
+  // are only computed from the top row. The second stage, comprising two
+  // y-loops, covers blocks that have a mixture of values computed from top or
+  // left. The final stage covers blocks that are only computed from the left.
+  if (min_top_only_x > 0) {
+    // Round down to the nearest multiple of 8.
+    // TODO(johannkoenig): This never hits for Wx4 blocks but maybe it should.
+    const int max_top_only_y = std::min((1 << 6) / xstep, height) & ~7;
+    DirectionalZone1_WxH<4>(dst, stride, max_top_only_y, top_row, -xstep,
+                            upsampled_top);
+
+    if (max_top_only_y == height) return;
+
+    int y = max_top_only_y;
+    dst += stride * y;
+    const int xstep_y = xstep * y;
+
+    // All rows from |min_left_only_y| down for this set of columns only need
+    // |left_column| to compute.
+    const int min_left_only_y = std::min((4 << 6) / xstep, height);
+    // At high angles such that min_left_only_y < 8, ystep is low and xstep is
+    // high. This means that max_shuffle_height is unbounded and xstep_bounds
+    // will overflow in 16 bits. This is prevented by stopping the first
+    // blending loop at min_left_only_y for such cases, which means we skip
+    // over the second blending loop as well.
+    const int left_shuffle_stop_y =
+        std::min(max_shuffle_height, min_left_only_y);
+    int xstep_bounds = xstep_bounds_base + xstep_y;
+    int top_x = -xstep - xstep_y;
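+
+    // For example (illustrative values): with xstep == 8 and height == 16,
+    // max_top_only_y == min(64 / 8, 16) & ~7 == 8 and min_left_only_y ==
+    // min(256 / 8, 16) == 16, so rows 0-7 come purely from |top_row| and
+    // rows 8-15 blend contributions from both edges.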
+
+    // +8 increment is OK because if height is 4 this only goes once.
+    for (; y < left_shuffle_stop_y;
+         y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+      DirectionalZone2FromLeftCol_WxH<4>(
+          dst, stride, min_height,
+          left_column + ((y - left_base_increment) << upsample_left_shift),
+          left_y, upsample_left_shift);
+
+      DirectionalZone1Blend_WxH<4>(dst, stride, min_height, top_row,
+                                   xstep_bounds, top_x, xstep,
+                                   upsample_top_shift);
+    }
+
+    // Pick up from the last y-value, using the slower but secure method for
+    // left prediction.
+    const int16_t base_left_y = vgetq_lane_s16(left_y, 0);
+    for (; y < min_left_only_y;
+         y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+      DirectionalZone3_WxH<4>(
+          dst, stride, min_height,
+          left_column + ((y - left_base_increment) << upsample_left_shift),
+          base_left_y, -ystep, upsample_left_shift);
+
+      DirectionalZone1Blend_WxH<4>(dst, stride, min_height, top_row,
+                                   xstep_bounds, top_x, xstep,
+                                   upsample_top_shift);
+    }
+    // Loop over y for left_only rows.
+    for (; y < height; y += 8, dst += stride8) {
+      DirectionalZone3_WxH<4>(
+          dst, stride, min_height,
+          left_column + ((y - left_base_increment) << upsample_left_shift),
+          base_left_y, -ystep, upsample_left_shift);
+    }
+  } else {
+    DirectionalZone1_WxH<4>(dst, stride, height, top_row, -xstep,
+                            upsampled_top);
+  }
+}
+
+// Process a multiple of 8 |width|.
+inline void DirectionalZone2_8(
+    uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
+    const uint8_t* LIBGAV1_RESTRICT const top_row,
+    const uint8_t* LIBGAV1_RESTRICT const left_column, const int width,
+    const int height, const int xstep, const int ystep,
+    const bool upsampled_top, const bool upsampled_left) {
+  const int upsample_left_shift = static_cast<int>(upsampled_left);
+  const int upsample_top_shift = static_cast<int>(upsampled_top);
+
+  // Helper vector.
+  const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7};
+
+  // Loop incrementers for moving by block (8x8). This function handles blocks
+  // with height 4 as well. They are calculated in one pass so these variables
+  // do not get used.
+  const ptrdiff_t stride8 = stride << 3;
+  const int xstep8 = xstep << 3;
+  const int ystep8 = ystep << 3;
+
+  // Process Wx4 blocks.
+  const int min_height = (height == 4) ? 4 : 8;
+
+  // All columns from |min_top_only_x| to the right will only need |top_row|
+  // to compute and can therefore call the Zone1 functions. This assumes
+  // |xstep| is at least 3.
+  assert(xstep >= 3);
+  const int min_top_only_x = std::min((height * xstep) >> 6, width);
+
+  // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 - 1.
+  int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
+
+  const int left_base_increment = ystep >> 6;
+  const int ystep_remainder = ystep & 0x3F;
+
+  const int left_base_increment8 = ystep8 >> 6;
+  const int ystep_remainder8 = ystep8 & 0x3F;
+  const int16x8_t increment_left8 = vdupq_n_s16(ystep_remainder8);
+
+  // If the 64 scaling is regarded as a decimal point, the first value of the
+  // left_y vector omits the portion which is covered under the left_column
+  // offset. Following values need the full ystep as a relative offset.
+  const int16x8_t remainder = vdupq_n_s16(-ystep_remainder);
+  int16x8_t left_y = vmlaq_n_s16(remainder, zero_to_seven, -ystep);
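+
+  // For example (illustrative values): with ystep == 90, ystep_remainder ==
+  // 26 and left_y starts as {-26, -116, -206, ...}; lane i holds the Q6
+  // (1/64 pel) offset into |left_column| for column i of the next 8-wide
+  // block, relative to the current row.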
+
+  // This loop treats each set of 8 columns in 3 stages with y-value
+  // boundaries. The first stage, before the first y-loop, covers blocks that
+  // are only computed from the top row. The second stage, comprising two
+  // y-loops, covers blocks that have a mixture of values computed from top or
+  // left. The final stage covers blocks that are only computed from the left.
+  int x = 0;
+  // For steep angles, the source pixels from |left_column| may not fit in a
+  // 16-byte load for shuffling. |d| represents the number of pixels that can
+  // fit in one contiguous vector when stepping by |ystep|. For a given x
+  // position, the left column values can be obtained by VTBL as long as the
+  // values at row[x + d] and beyond come from the top row. However, this does
+  // not guarantee that the vector will also contain all of the values needed
+  // from top row.
+  const int d = 16 / ((ystep >> 6) + 1);
+  for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8,
+           xstep_bounds_base -= (8 << 6),
+           left_y = vsubq_s16(left_y, increment_left8),
+           left_offset -= left_base_increment8) {
+    uint8_t* dst_x = dst + x;
+    const int max_shuffle_height =
+        std::min(((x + d) << 6) / xstep, height) & ~7;
+    // Round down to the nearest multiple of 8.
+    const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
+    DirectionalZone1_WxH<8>(dst_x, stride, max_top_only_y,
+                            top_row + (x << upsample_top_shift), -xstep,
+                            upsampled_top);
+
+    if (max_top_only_y == height) continue;
+
+    int y = max_top_only_y;
+    dst_x += stride * y;
+    const int xstep_y = xstep * y;
+
+    // All rows from |min_left_only_y| down for this set of columns only need
+    // |left_column| to compute.
+    const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height);
+    // At high angles such that min_left_only_y < 8, ystep is low and xstep is
+    // high. This means that max_shuffle_height is unbounded and xstep_bounds
+    // will overflow in 16 bits. This is prevented by stopping the first
+    // blending loop at min_left_only_y for such cases, which means we skip
+    // over the second blending loop as well.
+    const int left_shuffle_stop_y =
+        std::min(max_shuffle_height, min_left_only_y);
+    int xstep_bounds = xstep_bounds_base + xstep_y;
+    int top_x = -xstep - xstep_y;
+
+    for (; y < left_shuffle_stop_y;
+         y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+      DirectionalZone2FromLeftCol_WxH<8>(
+          dst_x, stride, min_height,
+          left_column + ((left_offset + y) << upsample_left_shift), left_y,
+          upsample_left_shift);
+
+      DirectionalZone1Blend_WxH<8>(
+          dst_x, stride, min_height, top_row + (x << upsample_top_shift),
+          xstep_bounds, top_x, xstep, upsample_top_shift);
+    }
+
+    // Pick up from the last y-value, using the slower but secure method for
+    // left prediction.
+    const int16_t base_left_y = vgetq_lane_s16(left_y, 0);
+    for (; y < min_left_only_y;
+         y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+      DirectionalZone3_WxH<8>(
+          dst_x, stride, min_height,
+          left_column + ((left_offset + y) << upsample_left_shift),
+          base_left_y, -ystep, upsample_left_shift);
+
+      DirectionalZone1Blend_WxH<8>(
+          dst_x, stride, min_height, top_row + (x << upsample_top_shift),
+          xstep_bounds, top_x, xstep, upsample_top_shift);
+    }
+    // Loop over y for left_only rows.
+    for (; y < height; y += 8, dst_x += stride8) {
+      DirectionalZone3_WxH<8>(
+          dst_x, stride, min_height,
+          left_column + ((left_offset + y) << upsample_left_shift),
+          base_left_y, -ystep, upsample_left_shift);
+    }
+  }
+  // TODO(johannkoenig): May be able to remove this branch.
+  if (x < width) {
+    DirectionalZone1_WxH(dst + x, stride, width - x, height,
+                         top_row + (x << upsample_top_shift), -xstep,
+                         upsampled_top);
+  }
+}
+
+void DirectionalIntraPredictorZone2_NEON(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column, const int width,
+    const int height, const int xstep, const int ystep,
+    const bool upsampled_top, const bool upsampled_left) {
+  // Increasing the negative buffer for this function allows more rows to be
+  // processed at a time without branching in an inner loop to check the base.
+  uint8_t top_buffer[288];
+  uint8_t left_buffer[288];
+#if LIBGAV1_MSAN
+  memset(top_buffer, 0, sizeof(top_buffer));
+  memset(left_buffer, 0, sizeof(left_buffer));
+#endif  // LIBGAV1_MSAN
+
+  memcpy(top_buffer + 128, static_cast<const uint8_t*>(top_row) - 16, 160);
+  memcpy(left_buffer + 128, static_cast<const uint8_t*>(left_column) - 16,
+         160);
+  const uint8_t* top_ptr = top_buffer + 144;
+  const uint8_t* left_ptr = left_buffer + 144;
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  if (width == 4) {
+    DirectionalZone2_4xH(dst, stride, top_ptr, left_ptr, height, xstep, ystep,
+                         upsampled_top, upsampled_left);
+  } else {
+    DirectionalZone2_8(dst, stride, top_ptr, left_ptr, width, height, xstep,
+                       ystep, upsampled_top, upsampled_left);
+  }
+}
+
+void DirectionalIntraPredictorZone3_NEON(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const left_column, const int width,
+    const int height, const int ystep, const bool upsampled_left) {
+  const auto* const left = static_cast<const uint8_t*>(left_column);
+
+  assert(ystep > 0);
+
+  const int upsample_shift = static_cast<int>(upsampled_left);
+  const int scale_bits = 6 - upsample_shift;
+  const int base_step = 1 << upsample_shift;
+
+  if (width == 4 || height == 4) {
+    // This block can handle all sizes but the specializations for other sizes
+    // are faster.
+    const uint8x8_t all = vcreate_u8(0x0706050403020100);
+    const uint8x8_t even = vcreate_u8(0x0e0c0a0806040200);
+    const uint8x8_t base_step_v = upsampled_left ? even : all;
+    const uint8x8_t right_step = vadd_u8(base_step_v, vdup_n_u8(1));
+
+    int y = 0;
+    do {
+      int x = 0;
+      do {
+        auto* dst = static_cast<uint8_t*>(dest);
+        dst += y * stride + x;
+        uint8x8_t left_v[4], right_v[4], value_v[4];
+        const int ystep_base = ystep * x;
+        const int offset = y * base_step;
+
+        const int index_0 = ystep_base + ystep * 1;
+        LoadStepwise(left + offset + (index_0 >> scale_bits), base_step_v,
+                     right_step, &left_v[0], &right_v[0]);
+        value_v[0] = WeightedBlend(left_v[0], right_v[0],
+                                   ((index_0 << upsample_shift) & 0x3F) >> 1);
+
+        const int index_1 = ystep_base + ystep * 2;
+        LoadStepwise(left + offset + (index_1 >> scale_bits), base_step_v,
+                     right_step, &left_v[1], &right_v[1]);
+        value_v[1] = WeightedBlend(left_v[1], right_v[1],
+                                   ((index_1 << upsample_shift) & 0x3F) >> 1);
+
+        const int index_2 = ystep_base + ystep * 3;
+        LoadStepwise(left + offset + (index_2 >> scale_bits), base_step_v,
+                     right_step, &left_v[2], &right_v[2]);
+        value_v[2] = WeightedBlend(left_v[2], right_v[2],
+                                   ((index_2 << upsample_shift) & 0x3F) >> 1);
+
+        const int index_3 = ystep_base + ystep * 4;
+        LoadStepwise(left + offset + (index_3 >> scale_bits), base_step_v,
+                     right_step, &left_v[3], &right_v[3]);
+        value_v[3] = WeightedBlend(left_v[3], right_v[3],
+                                   ((index_3 << upsample_shift) & 0x3F) >> 1);
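+
+        // value_v[0..3] hold columns x..x+3 of the block, 8 rows deep. The
+        // 8x4 transpose below turns them into four row vectors whose low
+        // halves are rows 0-3 and whose high halves are rows 4-7.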
+        // 8x4 transpose.
+        const uint8x8x2_t b0 = vtrn_u8(value_v[0], value_v[1]);
+        const uint8x8x2_t b1 = vtrn_u8(value_v[2], value_v[3]);
+
+        const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u8(b0.val[0]),
+                                         vreinterpret_u16_u8(b1.val[0]));
+        const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u8(b0.val[1]),
+                                         vreinterpret_u16_u8(b1.val[1]));
+
+        StoreLo4(dst, vreinterpret_u8_u16(c0.val[0]));
+        dst += stride;
+        StoreLo4(dst, vreinterpret_u8_u16(c1.val[0]));
+        dst += stride;
+        StoreLo4(dst, vreinterpret_u8_u16(c0.val[1]));
+        dst += stride;
+        StoreLo4(dst, vreinterpret_u8_u16(c1.val[1]));
+
+        if (height > 4) {
+          dst += stride;
+          StoreHi4(dst, vreinterpret_u8_u16(c0.val[0]));
+          dst += stride;
+          StoreHi4(dst, vreinterpret_u8_u16(c1.val[0]));
+          dst += stride;
+          StoreHi4(dst, vreinterpret_u8_u16(c0.val[1]));
+          dst += stride;
+          StoreHi4(dst, vreinterpret_u8_u16(c1.val[1]));
+        }
+        x += 4;
+      } while (x < width);
+      y += 8;
+    } while (y < height);
+  } else {  // 8x8 at a time.
+    // Limited improvement for 8x8. ~20% faster for 64x64.
+    int y = 0;
+    do {
+      int x = 0;
+      do {
+        auto* dst = static_cast<uint8_t*>(dest);
+        dst += y * stride + x;
+        const int ystep_base = ystep * (x + 1);
+
+        DirectionalZone3_WxH<8>(dst, stride, 8, left + (y << upsample_shift),
+                                ystep_base, ystep, upsample_shift);
+        x += 8;
+      } while (x < width);
+      y += 8;
+    } while (y < height);
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  dsp->directional_intra_predictor_zone1 = DirectionalIntraPredictorZone1_NEON;
+  dsp->directional_intra_predictor_zone2 = DirectionalIntraPredictorZone2_NEON;
+  dsp->directional_intra_predictor_zone3 = DirectionalIntraPredictorZone3_NEON;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+// Blend two values based on weights that sum to 32.
+inline uint16x4_t WeightedBlend(const uint16x4_t a, const uint16x4_t b,
+                                const int a_weight, const int b_weight) {
+  const uint16x4_t a_product = vmul_n_u16(a, a_weight);
+  const uint16x4_t sum = vmla_n_u16(a_product, b, b_weight);
+
+  return vrshr_n_u16(sum, 5 /*log2(32)*/);
+}
+
+// Blend two values based on weights that sum to 32.
+inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b,
+                                const uint16_t a_weight,
+                                const uint16_t b_weight) {
+  const uint16x8_t a_product = vmulq_n_u16(a, a_weight);
+  const uint16x8_t sum = vmlaq_n_u16(a_product, b, b_weight);
+
+  return vrshrq_n_u16(sum, 5 /*log2(32)*/);
+}
+
+// Each element of |dest| contains values associated with one weight value.
+inline void LoadEdgeVals(uint16x4x2_t* dest,
+                         const uint16_t* LIBGAV1_RESTRICT const source,
+                         const bool upsampled) {
+  if (upsampled) {
+    *dest = vld2_u16(source);
+  } else {
+    dest->val[0] = vld1_u16(source);
+    dest->val[1] = vld1_u16(source + 1);
+  }
+}
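+
+// For example, with source = {a, b, c, d, ...}: when |upsampled| is true the
+// vld2 de-interleave gives val[0] = {a, c, ...} and val[1] = {b, d, ...};
+// otherwise the overlapping loads give val[0] = {a, b, ...} and
+// val[1] = {b, c, ...}. val[1] always holds the right-hand blend partner.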
+// Each element of |dest| contains values associated with one weight value.
+inline void LoadEdgeVals(uint16x8x2_t* dest,
+                         const uint16_t* LIBGAV1_RESTRICT const source,
+                         const bool upsampled) {
+  if (upsampled) {
+    *dest = vld2q_u16(source);
+  } else {
+    dest->val[0] = vld1q_u16(source);
+    dest->val[1] = vld1q_u16(source + 1);
+  }
+}
+
+template <bool upsampled>
+inline void DirectionalZone1_4xH(uint16_t* LIBGAV1_RESTRICT dst,
+                                 const ptrdiff_t stride, const int height,
+                                 const uint16_t* LIBGAV1_RESTRICT const top,
+                                 const int xstep) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int index_scale_bits = 6 - upsample_shift;
+
+  const int max_base_x = (4 + height - 1) << upsample_shift;
+  const int16x4_t max_base = vdup_n_s16(max_base_x);
+  const uint16x4_t final_top_val = vdup_n_u16(top[max_base_x]);
+  const int16x4_t index_offset = {0, 1, 2, 3};
+
+  // All rows from |min_corner_only_y| down will simply use Memset.
+  // |max_base_x| is always greater than |height|, so clipping the denominator
+  // to 1 is enough to make the logic work.
+  const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+  const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+  int top_x = xstep;
+  int y = 0;
+  for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+    const int top_base_x = top_x >> index_scale_bits;
+
+    // To accommodate reuse of this function in Zone2, permit negative values
+    // for |xstep|.
+    const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const uint16_t shift_1 = 32 - shift_0;
+
+    // Use signed values to compare |top_base_x| to |max_base_x|.
+    const int16x4_t base_x = vadd_s16(vdup_n_s16(top_base_x), index_offset);
+    const uint16x4_t max_base_mask = vclt_s16(base_x, max_base);
+
+    uint16x4x2_t sampled_top_row;
+    LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
+    const uint16x4_t combined = WeightedBlend(
+        sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
+
+    // If |upsampled| is true then extract every other value for output.
+    const uint16x4_t masked_result =
+        vbsl_u16(max_base_mask, combined, final_top_val);
+
+    vst1_u16(dst, masked_result);
+  }
+  for (; y < height; ++y) {
+    Memset(dst, top[max_base_x], 4 /* width */);
+    dst += stride;
+  }
+}
+
+// Process a multiple of 8 |width| by any |height|. Processes horizontally
+// before vertically in the hopes of being a little more cache friendly.
+template <bool upsampled>
+inline void DirectionalZone1_WxH(uint16_t* LIBGAV1_RESTRICT dst,
+                                 const ptrdiff_t stride, const int width,
+                                 const int height,
+                                 const uint16_t* LIBGAV1_RESTRICT const top,
+                                 const int xstep) {
+  assert(width % 8 == 0);
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int index_scale_bits = 6 - upsample_shift;
+
+  const int max_base_index = (width + height - 1) << upsample_shift;
+  const int16x8_t max_base_x = vdupq_n_s16(max_base_index);
+  const uint16x8_t final_top_val = vdupq_n_u16(top[max_base_index]);
+  const int16x8_t index_offset = {0, 1, 2, 3, 4, 5, 6, 7};
+
+  const int base_step = 1 << upsample_shift;
+  const int base_step8 = base_step << 3;
+  const int16x8_t block_step = vdupq_n_s16(base_step8);
+
+  // All rows from |min_corner_only_y| down will simply use Memset.
+  // |max_base_x| is always greater than |height|, so clipping the denominator
+  // to 1 is enough to make the logic work.
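+  // For example (illustrative values): width == 8, height == 8 and upsampled
+  // == false give max_base_index == 15; with xstep == 64, xstep_units == 1
+  // and min_corner_only_y == min(15, 8) == 8, so no rows fall back to Memset.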
+  const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+  const int min_corner_only_y = std::min(max_base_index / xstep_units, height);
+
+  int top_x = xstep;
+  int y = 0;
+  for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+    int top_base_x = top_x >> index_scale_bits;
+
+    // To accommodate reuse of this function in Zone2, permit negative values
+    // for |xstep|.
+    const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const uint16_t shift_1 = 32 - shift_0;
+
+    // Use signed values to compare |top_base_x| to |max_base_x|.
+    int16x8_t base_x = vaddq_s16(vdupq_n_s16(top_base_x), index_offset);
+
+    int x = 0;
+    do {
+      const uint16x8_t max_base_mask = vcltq_s16(base_x, max_base_x);
+
+      uint16x8x2_t sampled_top_row;
+      LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
+      const uint16x8_t combined = WeightedBlend(
+          sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
+
+      const uint16x8_t masked_result =
+          vbslq_u16(max_base_mask, combined, final_top_val);
+      vst1q_u16(dst + x, masked_result);
+
+      base_x = vaddq_s16(base_x, block_step);
+      top_base_x += base_step8;
+      x += 8;
+    } while (x < width);
+  }
+  for (int i = y; i < height; ++i) {
+    Memset(dst, top[max_base_index], width);
+    dst += stride;
+  }
+}
+
+// Process a multiple of 8 |width| by any |height|. Processes horizontally
+// before vertically in the hopes of being a little more cache friendly.
+inline void DirectionalZone1_Large(uint16_t* LIBGAV1_RESTRICT dst,
+                                   const ptrdiff_t stride, const int width,
+                                   const int height,
+                                   const uint16_t* LIBGAV1_RESTRICT const top,
+                                   const int xstep, const bool upsampled) {
+  assert(width % 8 == 0);
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int index_scale_bits = 6 - upsample_shift;
+
+  const int max_base_index = (width + height - 1) << upsample_shift;
+  const int16x8_t max_base_x = vdupq_n_s16(max_base_index);
+  const uint16x8_t final_top_val = vdupq_n_u16(top[max_base_index]);
+  const int16x8_t index_offset = {0, 1, 2, 3, 4, 5, 6, 7};
+
+  const int base_step = 1 << upsample_shift;
+  const int base_step8 = base_step << 3;
+  const int16x8_t block_step = vdupq_n_s16(base_step8);
+
+  // All rows from |min_corner_only_y| down will simply use Memset.
+  // |max_base_x| is always greater than |height|, so clipping the denominator
+  // to 1 is enough to make the logic work.
+  const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+  const int min_corner_only_y = std::min(max_base_index / xstep_units, height);
+
+  // Rows up to this y-value can be computed without checking for bounds.
+  const int max_no_corner_y = std::min(
+      ((max_base_index - (base_step * width)) << index_scale_bits) / xstep,
+      height);
+  // No need to check for exceeding |max_base_x| in the first loop.
+  int y = 0;
+  int top_x = xstep;
+  for (; y < max_no_corner_y; ++y, dst += stride, top_x += xstep) {
+    int top_base_x = top_x >> index_scale_bits;
+    // To accommodate reuse of this function in Zone2, permit negative values
+    // for |xstep|.
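+    // For example (illustrative values): top_x == 70 without upsampling
+    // gives shift_0 == (70 & 0x3F) >> 1 == 3 and shift_1 == 29, i.e. weights
+    // of 29/32 for top[1] and 3/32 for top[2].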
+    const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const uint16_t shift_1 = 32 - shift_0;
+
+    int x = 0;
+    do {
+      uint16x8x2_t sampled_top_row;
+      LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
+      const uint16x8_t combined = WeightedBlend(
+          sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
+
+      vst1q_u16(dst + x, combined);
+
+      top_base_x += base_step8;
+      x += 8;
+    } while (x < width);
+  }
+
+  for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+    int top_base_x = top_x >> index_scale_bits;
+
+    // To accommodate reuse of this function in Zone2, permit negative values
+    // for |xstep|.
+    const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const uint16_t shift_1 = 32 - shift_0;
+
+    // Use signed values to compare |top_base_x| to |max_base_x|.
+    int16x8_t base_x = vaddq_s16(vdupq_n_s16(top_base_x), index_offset);
+
+    int x = 0;
+    const int min_corner_only_x =
+        std::min(width, ((max_base_index - top_base_x) >> upsample_shift) + 7) &
+        ~7;
+    for (; x < min_corner_only_x; x += 8, top_base_x += base_step8,
+         base_x = vaddq_s16(base_x, block_step)) {
+      const uint16x8_t max_base_mask = vcltq_s16(base_x, max_base_x);
+
+      uint16x8x2_t sampled_top_row;
+      LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
+      const uint16x8_t combined = WeightedBlend(
+          sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
+
+      const uint16x8_t masked_result =
+          vbslq_u16(max_base_mask, combined, final_top_val);
+      vst1q_u16(dst + x, masked_result);
+    }
+    // Corner-only section of the row.
+    Memset(dst + x, top[max_base_index], width - x);
+  }
+  for (; y < height; ++y) {
+    Memset(dst, top[max_base_index], width);
+    dst += stride;
+  }
+}
+
+void DirectionalIntraPredictorZone1_NEON(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row, const int width,
+    const int height, const int xstep, const bool upsampled_top) {
+  const auto* const top = static_cast<const uint16_t*>(top_row);
+  auto* dst = static_cast<uint16_t*>(dest);
+  stride /= sizeof(top[0]);
+
+  assert(xstep > 0);
+
+  if (xstep == 64) {
+    assert(!upsampled_top);
+    const uint16_t* top_ptr = top + 1;
+    const int width_bytes = width * sizeof(top[0]);
+    int y = height;
+    do {
+      memcpy(dst, top_ptr, width_bytes);
+      memcpy(dst + stride, top_ptr + 1, width_bytes);
+      memcpy(dst + 2 * stride, top_ptr + 2, width_bytes);
+      memcpy(dst + 3 * stride, top_ptr + 3, width_bytes);
+      dst += 4 * stride;
+      top_ptr += 4;
+      y -= 4;
+    } while (y != 0);
+  } else {
+    if (width == 4) {
+      if (upsampled_top) {
+        DirectionalZone1_4xH<true>(dst, stride, height, top, xstep);
+      } else {
+        DirectionalZone1_4xH<false>(dst, stride, height, top, xstep);
+      }
+    } else if (width >= 32) {
+      if (upsampled_top) {
+        DirectionalZone1_Large(dst, stride, width, height, top, xstep, true);
+      } else {
+        DirectionalZone1_Large(dst, stride, width, height, top, xstep, false);
+      }
+    } else if (upsampled_top) {
+      DirectionalZone1_WxH<true>(dst, stride, width, height, top, xstep);
+    } else {
+      DirectionalZone1_WxH<false>(dst, stride, width, height, top, xstep);
+    }
+  }
+}
+
+// -----------------------------------------------------------------------------
+// Zone 3
+// This can be considered "the transpose of Zone 1." In Zone 1, the fractional
+// step applies when moving vertically in the destination block, connected to
+// the change in |y|, whereas in this mode, the step applies when moving
+// horizontally, connected to the change in |x|. This makes vectorization very
+// complicated in row-order, because a given vector may need source pixels
+// that span 16 or 32 pixels in steep angles, requiring multiple expensive
+// table lookups and checked loads. Rather than work in row order, it is
+// simpler to compute |dest| in column order, and then store the transposed
+// results.
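+
+// In scalar terms (illustrative sketch, non-upsampled case): Zone 1 computes
+//   dst[y][x] from top[((y + 1) * xstep >> 6) + x]
+// while Zone 3 computes
+//   dst[y][x] from left[((x + 1) * ystep >> 6) + y],
+// so walking columns here touches memory the same way walking rows does in
+// Zone 1.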
+
+// Compute 4x4 sub-blocks.
+// Example of computed sub-blocks of a 4x8 block before and after transpose:
+// 00 10 20 30             00 01 02 03
+// 01 11 21 31             10 11 12 13
+// 02 12 22 32             20 21 22 23
+// 03 13 23 33             30 31 32 33
+// -----------     -->     -----------
+// 40 50 60 70             40 41 42 43
+// 41 51 61 71             50 51 52 53
+// 42 52 62 72             60 61 62 63
+// 43 53 63 73             70 71 72 73
+template <bool upsampled>
+inline void DirectionalZone3_4x4(uint8_t* LIBGAV1_RESTRICT dst,
+                                 const ptrdiff_t stride,
+                                 const uint16_t* LIBGAV1_RESTRICT const left,
+                                 const int ystep, const int base_left_y = 0) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int index_scale_bits = 6 - upsample_shift;
+
+  // Compute one column at a time, then transpose for storage.
+  uint16x4_t result[4];
+
+  int left_y = base_left_y + ystep;
+  int left_offset = left_y >> index_scale_bits;
+  int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  int shift_1 = 32 - shift_0;
+  uint16x4x2_t sampled_left_col;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  left_y += ystep;
+  left_offset = left_y >> index_scale_bits;
+  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  left_y += ystep;
+  left_offset = left_y >> index_scale_bits;
+  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  left_y += ystep;
+  left_offset = left_y >> index_scale_bits;
+  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  Transpose4x4(result);
+  Store4(dst, result[0]);
+  dst += stride;
+  Store4(dst, result[1]);
+  dst += stride;
+  Store4(dst, result[2]);
+  dst += stride;
+  Store4(dst, result[3]);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_4xH(uint8_t* LIBGAV1_RESTRICT dest,
+                                 const ptrdiff_t stride, const int height,
+                                 const uint16_t* LIBGAV1_RESTRICT const left,
+                                 const int ystep) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  int y = 0;
+  do {
+    DirectionalZone3_4x4<upsampled>(dest, stride,
+                                    left + (y << upsample_shift), ystep);
+    dest += 4 * stride;
+    y += 4;
+  } while (y < height);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_Wx4(uint8_t* LIBGAV1_RESTRICT dest,
+                                 const ptrdiff_t stride, const int width,
+                                 const uint16_t* LIBGAV1_RESTRICT const left,
+                                 const int ystep) {
+  int x = 0;
+  int base_left_y = 0;
+  do {
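+    // |dest| is a byte pointer into 16-bit pixels, so each 4-pixel step
+    // advances 8 bytes; hence the dest + 2 * x addressing below.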
+    // TODO(petersonab): Establish 8x4 transpose to reserve this function for
+    // 8x4 and 16x4.
+    DirectionalZone3_4x4<upsampled>(dest + 2 * x, stride, left, ystep,
+                                    base_left_y);
+    base_left_y += 4 * ystep;
+    x += 4;
+  } while (x < width);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_8x8(uint8_t* LIBGAV1_RESTRICT dest,
+                                 const ptrdiff_t stride,
+                                 const uint16_t* LIBGAV1_RESTRICT const left,
+                                 const int ystep, const int base_left_y = 0) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int index_scale_bits = 6 - upsample_shift;
+
+  // Compute one column at a time, then transpose for storage.
+  uint16x8_t result[8];
+
+  int left_y = base_left_y + ystep;
+  uint16x8x2_t sampled_left_col;
+  int left_offset = left_y >> index_scale_bits;
+  int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  int shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+  left_y += ystep;
+  left_offset = left_y >> index_scale_bits;
+  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  left_y += ystep;
+  left_offset = left_y >> index_scale_bits;
+  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  left_y += ystep;
+  left_offset = left_y >> index_scale_bits;
+  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  left_y += ystep;
+  left_offset = left_y >> index_scale_bits;
+  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[4] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  left_y += ystep;
+  left_offset = left_y >> index_scale_bits;
+  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[5] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  left_y += ystep;
+  left_offset = left_y >> index_scale_bits;
+  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[6] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  left_y += ystep;
+  left_offset = left_y >> index_scale_bits;
+  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[7] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  Transpose8x8(result);
+  Store8(dest, result[0]);
+  dest += stride;
+  Store8(dest, result[1]);
+  dest += stride;
+  Store8(dest, result[2]);
+  dest += stride;
+  Store8(dest, result[3]);
+  dest += stride;
+  Store8(dest, result[4]);
+  dest += stride;
+  Store8(dest, result[5]);
+  dest += stride;
+  Store8(dest, result[6]);
+  dest += stride;
+  Store8(dest, result[7]);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_WxH(uint8_t* LIBGAV1_RESTRICT dest,
+                                 const ptrdiff_t stride, const int width,
+                                 const int height,
+                                 const uint16_t* LIBGAV1_RESTRICT const left,
+                                 const int ystep) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  // Zone3 never runs out of left_column values.
+  assert((width + height - 1) << upsample_shift >  // max_base_y
+         ((ystep * width) >> (6 - upsample_shift)) +
+             (/* base_step */ 1 << upsample_shift) *
+                 (height - 1));  // left_base_y
+  int y = 0;
+  do {
+    int x = 0;
+    uint8_t* dst_x = dest + y * stride;
+    do {
+      const int base_left_y = ystep * x;
+      DirectionalZone3_8x8<upsampled>(
+          dst_x, stride, left + (y << upsample_shift), ystep, base_left_y);
+      dst_x += 8 * sizeof(uint16_t);
+      x += 8;
+    } while (x < width);
+    y += 8;
+  } while (y < height);
+}
+
+void DirectionalIntraPredictorZone3_NEON(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const left_column, const int width,
+    const int height, const int ystep, const bool upsampled_left) {
+  const auto* const left = static_cast<const uint16_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  if (ystep == 64) {
+    assert(!upsampled_left);
+    const int width_bytes = width * sizeof(left[0]);
+    int y = height;
+    const uint16_t* left_ptr = left + 1;
+    do {
+      memcpy(dst, left_ptr, width_bytes);
+      memcpy(dst + stride, left_ptr + 1, width_bytes);
+      memcpy(dst + 2 * stride, left_ptr + 2, width_bytes);
+      memcpy(dst + 3 * stride, left_ptr + 3, width_bytes);
+      dst += 4 * stride;
+      left_ptr += 4;
+      y -= 4;
+    } while (y != 0);
+    return;
+  }
+  if (width == 4) {
+    if (upsampled_left) {
+      DirectionalZone3_4xH<true>(dst, stride, height, left, ystep);
+    } else {
+      DirectionalZone3_4xH<false>(dst, stride, height, left, ystep);
+    }
+  } else if (height == 4) {
+    if (upsampled_left) {
+      DirectionalZone3_Wx4<true>(dst, stride, width, left, ystep);
+    } else {
+      DirectionalZone3_Wx4<false>(dst, stride, width, left, ystep);
+    }
+  } else {
+    if (upsampled_left) {
+      // |upsampled_left| can only be true if |width| + |height| <= 16,
+      // therefore this is 8x8.
+      DirectionalZone3_8x8<true>(dst, stride, left, ystep);
+    } else {
+      DirectionalZone3_WxH<false>(dst, stride, width, height, left, ystep);
+    }
+  }
+}
+
+// -----------------------------------------------------------------------------
+// Zone2
+// This function deals with cases not found in zone 1 or zone 3. The extreme
+// angles are 93, which makes for sharp ascents along |left_column| with each
+// successive dest row element until reaching |top_row|, and 177, with a
+// shallow ascent up |left_column| until reaching large jumps along |top_row|.
+// In the extremely steep cases, source vectors can only be loaded one lane at
+// a time.
+
+// Fill |left| and |right| with the appropriate values for a given |base_step|.
+inline void LoadStepwise(const void* LIBGAV1_RESTRICT const source,
+                         const uint8x8_t left_step, const uint8x8_t right_step,
+                         uint16x4_t* left, uint16x4_t* right) {
+  const uint8x16x2_t mixed = {
+      vld1q_u8(static_cast<const uint8_t*>(source)),
+      vld1q_u8(static_cast<const uint8_t*>(source) + 16)};
+  *left = vreinterpret_u16_u8(VQTbl2U8(mixed, left_step));
+  *right = vreinterpret_u16_u8(VQTbl2U8(mixed, right_step));
+}
+
+inline void LoadStepwise(const void* LIBGAV1_RESTRICT const source,
+                         const uint8x8_t left_step_0,
+                         const uint8x8_t right_step_0,
+                         const uint8x8_t left_step_1,
+                         const uint8x8_t right_step_1, uint16x8_t* left,
+                         uint16x8_t* right) {
+  const uint8x16x2_t mixed = {
+      vld1q_u8(static_cast<const uint8_t*>(source)),
+      vld1q_u8(static_cast<const uint8_t*>(source) + 16)};
+  const uint16x4_t left_low = vreinterpret_u16_u8(VQTbl2U8(mixed, left_step_0));
+  const uint16x4_t left_high =
+      vreinterpret_u16_u8(VQTbl2U8(mixed, left_step_1));
+  *left = vcombine_u16(left_low, left_high);
+  const uint16x4_t right_low =
+      vreinterpret_u16_u8(VQTbl2U8(mixed, right_step_0));
+  const uint16x4_t right_high =
+      vreinterpret_u16_u8(VQTbl2U8(mixed, right_step_1));
+  *right = vcombine_u16(right_low, right_high);
+}
+
+// Blend two values based on weight pairs that each sum to 32.
+inline uint16x4_t WeightedBlend(const uint16x4_t a, const uint16x4_t b,
+                                const uint16x4_t a_weight,
+                                const uint16x4_t b_weight) {
+  const uint16x4_t a_product = vmul_u16(a, a_weight);
+  const uint16x4_t sum = vmla_u16(a_product, b, b_weight);
+
+  return vrshr_n_u16(sum, 5 /*log2(32)*/);
+}
+
+// Blend two values based on weight pairs that each sum to 32.
+inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b,
+                                const uint16x8_t a_weight,
+                                const uint16x8_t b_weight) {
+  const uint16x8_t a_product = vmulq_u16(a, a_weight);
+  const uint16x8_t sum = vmlaq_u16(a_product, b, b_weight);
+
+  return vrshrq_n_u16(sum, 5 /*log2(32)*/);
+}
+
+// Because the source values "move backwards" as the row index increases, the
+// indices derived from ystep are generally negative in localized functions.
+// This is accommodated by making sure the relative indices are within [-15,
+// 0] when the function is called, and sliding them into the inclusive range
+// [0, 15], relative to a lower base address. 15 is the Pixel offset, so 30 is
+// the byte offset for table lookups.
+
+constexpr int kPositiveIndexOffsetPixels = 15;
+constexpr int kPositiveIndexOffsetBytes = 30;
+
+inline void DirectionalZone2FromLeftCol_4xH(
+    uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const int height,
+    const uint16_t* LIBGAV1_RESTRICT const left_column, const int16x4_t left_y,
+    const bool upsampled) {
+  const int upsample_shift = static_cast<int>(upsampled);
+
+  const int index_scale_bits = 6;
+  // The values in |offset_y| are negative, except for the first element,
+  // which is zero.
+  int16x4_t offset_y;
+  int16x4_t shift_upsampled = left_y;
+  // The shift argument must be a constant, otherwise use upsample_shift
+  // directly.
+  if (upsampled) {
+    offset_y = vshr_n_s16(left_y, index_scale_bits - 1 /*upsample_shift*/);
+    shift_upsampled = vshl_n_s16(shift_upsampled, 1);
+  } else {
+    offset_y = vshr_n_s16(left_y, index_scale_bits);
+  }
+  offset_y = vshl_n_s16(offset_y, 1);
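+
+  // The doubling above converts |offset_y| from pixel offsets to byte
+  // offsets, since each uint16_t pixel occupies two bytes in the vtbl
+  // source registers.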
We could support up to 16 by + // doing separate loads for |left_values| and |right_values|. vtbl + // supports 2 Q registers as input which would allow for cumulative + // offsets of 32. + // |sampler_0| indexes the first byte of each 16-bit value. + const int16x4_t sampler_0 = + vadd_s16(offset_y, vdup_n_s16(kPositiveIndexOffsetBytes)); + // |sampler_1| indexes the second byte of each 16-bit value. + const int16x4_t sampler_1 = vadd_s16(sampler_0, vdup_n_s16(1)); + const int16x4x2_t sampler = vzip_s16(sampler_0, sampler_1); + const uint8x8_t left_indices = + vqmovun_s16(vcombine_s16(sampler.val[0], sampler.val[1])); + const uint8x8_t right_indices = + vadd_u8(left_indices, vdup_n_u8(sizeof(uint16_t))); + + const int16x4_t shift_masked = vand_s16(shift_upsampled, vdup_n_s16(0x3f)); + const uint16x4_t shift_0 = vreinterpret_u16_s16(vshr_n_s16(shift_masked, 1)); + const uint16x4_t shift_1 = vsub_u16(vdup_n_u16(32), shift_0); + + int y = 0; + do { + uint16x4_t src_left, src_right; + LoadStepwise( + left_column - kPositiveIndexOffsetPixels + (y << upsample_shift), + left_indices, right_indices, &src_left, &src_right); + const uint16x4_t val = WeightedBlend(src_left, src_right, shift_1, shift_0); + + Store4(dst, val); + dst += stride; + } while (++y < height); +} + +inline void DirectionalZone2FromLeftCol_8xH( + uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const int height, + const uint16_t* LIBGAV1_RESTRICT const left_column, const int16x8_t left_y, + const bool upsampled) { + const int upsample_shift = static_cast(upsampled); + + const int index_scale_bits = 6; + // The values in |offset_y| are negative, except for the first element, which + // is zero. + int16x8_t offset_y = left_y; + int16x8_t shift_upsampled = left_y; + // The shift argument must be a constant, otherwise use upsample_shift + // directly. + if (upsampled) { + offset_y = vshrq_n_s16(left_y, index_scale_bits - 1); + shift_upsampled = vshlq_n_s16(shift_upsampled, 1); + } else { + offset_y = vshrq_n_s16(left_y, index_scale_bits); + } + offset_y = vshlq_n_s16(offset_y, 1); + + // Select values to the left of the starting point. + // The 15th element (and 16th) will be all the way at the end, to the right. + // With a negative ystep everything else will be "left" of them. + // This supports cumulative steps up to 15. We could support up to 16 by doing + // separate loads for |left_values| and |right_values|. vtbl supports 2 Q + // registers as input which would allow for cumulative offsets of 32. + // |sampler_0| indexes the first byte of each 16-bit value. + const int16x8_t sampler_0 = + vaddq_s16(offset_y, vdupq_n_s16(kPositiveIndexOffsetBytes)); + // |sampler_1| indexes the second byte of each 16-bit value. 
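+  // (Illustrative walkthrough, not in the original patch: with ystep == 192
+  // and no upsampling, |left_y| holds {0, -192, -384, ...}, so |offset_y|
+  // becomes byte offsets {0, -6, -12, ...} and |sampler_0| holds
+  // {30, 24, 18, 12, 6, 0, -6, -12}; vqmovun_s16 saturates the negative lanes
+  // to 0, which is one reason steep angles need the non-shuffle fallback.
+  // The blend weights derived below then compute, per lane,
+  // (a * w_a + b * w_b + 16) >> 5 with w_a + w_b == 32.)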
+  const int16x8_t sampler_1 = vaddq_s16(sampler_0, vdupq_n_s16(1));
+  const int16x8x2_t sampler = vzipq_s16(sampler_0, sampler_1);
+  const uint8x8_t left_values_0 = vqmovun_s16(sampler.val[0]);
+  const uint8x8_t left_values_1 = vqmovun_s16(sampler.val[1]);
+  const uint8x8_t right_values_0 =
+      vadd_u8(left_values_0, vdup_n_u8(sizeof(uint16_t)));
+  const uint8x8_t right_values_1 =
+      vadd_u8(left_values_1, vdup_n_u8(sizeof(uint16_t)));
+
+  const int16x8_t shift_masked = vandq_s16(shift_upsampled, vdupq_n_s16(0x3f));
+  const uint16x8_t shift_0 =
+      vreinterpretq_u16_s16(vshrq_n_s16(shift_masked, 1));
+  const uint16x8_t shift_1 = vsubq_u16(vdupq_n_u16(32), shift_0);
+
+  int y = 0;
+  do {
+    uint16x8_t src_left, src_right;
+    LoadStepwise(
+        left_column - kPositiveIndexOffsetPixels + (y << upsample_shift),
+        left_values_0, right_values_0, left_values_1, right_values_1,
+        &src_left, &src_right);
+    const uint16x8_t val = WeightedBlend(src_left, src_right, shift_1, shift_0);
+
+    Store8(dst, val);
+    dst += stride;
+  } while (++y < height);
+}
+
+template <bool upsampled>
+inline void DirectionalZone1Blend_4xH(
+    uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height,
+    const uint16_t* LIBGAV1_RESTRICT const top_row, int zone_bounds, int top_x,
+    const int xstep) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int scale_bits_x = 6 - upsample_shift;
+
+  // Representing positions along the row, which |zone_bounds| will target for
+  // the blending boundary.
+  const int16x4_t indices = {0, 1, 2, 3};
+
+  uint16x4x2_t top_vals;
+  int y = height;
+  do {
+    const uint16_t* const src = top_row + (top_x >> scale_bits_x);
+    LoadEdgeVals(&top_vals, src, upsampled);
+
+    const uint16_t shift_0 = ((top_x << upsample_shift) & 0x3f) >> 1;
+    const uint16_t shift_1 = 32 - shift_0;
+
+    const uint16x4_t val =
+        WeightedBlend(top_vals.val[0], top_vals.val[1], shift_1, shift_0);
+
+    const uint16x4_t dst_blend = Load4U16(dest);
+    // |zone_bounds| values can be negative.
+    const uint16x4_t blend = vcge_s16(indices, vdup_n_s16(zone_bounds >> 6));
+    const uint16x4_t output = vbsl_u16(blend, val, dst_blend);
+
+    Store4(dest, output);
+    dest += stride;
+    zone_bounds += xstep;
+    top_x -= xstep;
+  } while (--y != 0);
+}
+
+template <bool upsampled>
+inline void DirectionalZone1Blend_8xH(
+    uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height,
+    const uint16_t* LIBGAV1_RESTRICT const top_row, int zone_bounds, int top_x,
+    const int xstep) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int scale_bits_x = 6 - upsample_shift;
+
+  // Representing positions along the row, which |zone_bounds| will target for
+  // the blending boundary.
+  const int16x8_t indices = {0, 1, 2, 3, 4, 5, 6, 7};
+
+  uint16x8x2_t top_vals;
+  int y = height;
+  do {
+    const uint16_t* const src = top_row + (top_x >> scale_bits_x);
+    LoadEdgeVals(&top_vals, src, upsampled);
+
+    const uint16_t shift_0 = ((top_x << upsample_shift) & 0x3f) >> 1;
+    const uint16_t shift_1 = 32 - shift_0;
+
+    const uint16x8_t val =
+        WeightedBlend(top_vals.val[0], top_vals.val[1], shift_1, shift_0);
+
+    const uint16x8_t dst_blend = Load8U16(dest);
+    // |zone_bounds| values can be negative.
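+    // (Illustrative, not in the original patch: if zone_bounds >> 6 == 3,
+    // lanes {3..7} of |indices| satisfy the compare below and take the Zone1
+    // value |val|, while lanes {0..2} keep the Zone2 value already in dest.)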
+    const uint16x8_t blend = vcgeq_s16(indices, vdupq_n_s16(zone_bounds >> 6));
+    const uint16x8_t output = vbslq_u16(blend, val, dst_blend);
+
+    Store8(dest, output);
+    dest += stride;
+    zone_bounds += xstep;
+    top_x -= xstep;
+  } while (--y != 0);
+}
+
+// The height at which a load of 16 bytes will not contain enough source pixels
+// from |left_column| to supply an accurate row when computing 8 pixels at a
+// time. The values are found by inspection. By coincidence, all angles that
+// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up
+// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15. Indices
+// that do not correspond to angle derivatives are left at zero.
+// Notably, in cases with upsampling, the shuffle-invalid height is always
+// greater than the prediction height (which is 8 at maximum).
+constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = {
+    1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40};
+
+// 7.11.2.4 (8) 90 < angle < 180
+// The strategy for these functions (4xH and 8+xH) is to know how many blocks
+// can be processed with just pixels from |top_ptr|, then handle mixed blocks,
+// then handle only blocks that take from |left_ptr|. Additionally, a fast
+// index-shuffle approach is used for pred values from |left_column| in
+// sections that permit it.
+template <bool upsampled_top, bool upsampled_left>
+inline void DirectionalZone2_4xH(
+    uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride,
+    const uint16_t* LIBGAV1_RESTRICT const top_row,
+    const uint16_t* LIBGAV1_RESTRICT const left_column, const int height,
+    const int xstep, const int ystep) {
+  const int upsample_left_shift = static_cast<int>(upsampled_left);
+
+  // Helper vector for index computation.
+  const int16x4_t zero_to_three = {0, 1, 2, 3};
+
+  // Loop increments for moving by block (4xN). Vertical still steps by 8. If
+  // it's only 4, it will be finished in the first iteration.
+  const ptrdiff_t stride8 = stride << 3;
+  const int xstep8 = xstep << 3;
+
+  const int min_height = (height == 4) ? 4 : 8;
+
+  // All columns from |min_top_only_x| to the right will only need |top_row|
+  // to compute and can therefore call the Zone1 functions. This assumes
+  // |xstep| is at least 3.
+  assert(xstep >= 3);
+
+  // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 - 1.
+  int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
+
+  const int left_base_increment = ystep >> 6;
+  const int ystep_remainder = ystep & 0x3F;
+
+  // If the 64 scaling is regarded as a decimal point, the first value of the
+  // left_y vector omits the portion which is covered under the left_column
+  // offset. The following values need the full ystep as a relative offset.
+  const int16x4_t left_y =
+      vmla_n_s16(vdup_n_s16(-ystep_remainder), zero_to_three, -ystep);
+
+  // This loop treats the 4 columns in 3 stages with y-value boundaries.
+  // The first stage, before the first y-loop, covers blocks that are only
+  // computed from the top row. The second stage, comprising two y-loops,
+  // covers blocks that have a mixture of values computed from top or left.
+  // The final stage covers blocks that are only computed from the left.
+  // Round down to the nearest multiple of 8.
+  // TODO(petersonab): Check if rounding to the nearest 4 is okay.
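+  // (Worked example, not in the original patch: for height >= 8, xstep == 8
+  // yields min(64 / 8, height) & ~7 == 8, so the first 8 rows come from
+  // |top_row| alone; xstep == 32 yields 2 & ~7 == 0 and the top-only stage is
+  // skipped entirely.)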
+  const int max_top_only_y = std::min((1 << 6) / xstep, height) & ~7;
+  DirectionalZone1_4xH<upsampled_top>(reinterpret_cast<uint16_t*>(dst),
+                                      stride >> 1, max_top_only_y, top_row,
+                                      -xstep);
+
+  if (max_top_only_y == height) return;
+
+  int y = max_top_only_y;
+  dst += stride * y;
+  const int xstep_y = xstep * y;
+
+  // All rows from |min_left_only_y| down for this set of columns only need
+  // |left_column| to compute.
+  const int min_left_only_y = std::min((4 /*width*/ << 6) / xstep, height);
+  int xstep_bounds = xstep_bounds_base + xstep_y;
+  int top_x = -xstep - xstep_y;
+
+  // +8 increment is OK because if height is 4 this only runs once.
+  for (; y < min_left_only_y;
+       y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+    DirectionalZone2FromLeftCol_4xH(
+        dst, stride, min_height,
+        left_column + ((y - left_base_increment) << upsample_left_shift),
+        left_y, upsampled_left);
+
+    DirectionalZone1Blend_4xH<upsampled_top>(dst, stride, min_height, top_row,
+                                             xstep_bounds, top_x, xstep);
+  }
+
+  // Loop over y for left-only rows.
+  for (; y < height; y += 8, dst += stride8) {
+    // Angle expected by Zone3 is flipped about the 180 degree vector, which
+    // is the x-axis.
+    DirectionalZone3_4xH<upsampled_left>(
+        dst, stride, min_height, left_column + (y << upsample_left_shift),
+        -ystep);
+  }
+}
+
+// Process 8x4 and 16x4 blocks. This avoids a lot of overhead and simplifies
+// address safety.
+template <bool upsampled_top, bool upsampled_left>
+inline void DirectionalZone2_Wx4(
+    uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
+    const uint16_t* LIBGAV1_RESTRICT const top_row,
+    const uint16_t* LIBGAV1_RESTRICT const left_column, const int width,
+    const int xstep, const int ystep) {
+  const int upsample_top_shift = static_cast<int>(upsampled_top);
+  // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 - 1.
+  int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
+
+  const int min_top_only_x = std::min((4 * xstep) >> 6, width);
+  int x = 0;
+  for (; x < min_top_only_x; x += 4, xstep_bounds_base -= (4 << 6)) {
+    uint8_t* dst_x = dst + x * sizeof(uint16_t);
+
+    // Round down to the nearest multiple of 4.
+    const int max_top_only_y = (((x + 1) << 6) / xstep) & ~3;
+    if (max_top_only_y != 0) {
+      DirectionalZone1_4xH<upsampled_top>(
+          reinterpret_cast<uint16_t*>(dst_x), stride >> 1, 4,
+          top_row + (x << upsample_top_shift), -xstep);
+      continue;
+    }
+
+    DirectionalZone3_4x4<upsampled_left>(dst_x, stride, left_column, -ystep,
+                                         -ystep * x);
+
+    const int min_left_only_y = ((x + 4) << 6) / xstep;
+    if (min_left_only_y != 0) {
+      const int top_x = -xstep;
+      DirectionalZone1Blend_4xH<upsampled_top>(
+          dst_x, stride, 4, top_row + (x << upsample_top_shift),
+          xstep_bounds_base, top_x, xstep);
+    }
+  }
+  // Reached |min_top_only_x|.
+  for (; x < width; x += 4) {
+    DirectionalZone1_4xH<upsampled_top>(
+        reinterpret_cast<uint16_t*>(dst) + x, stride >> 1, 4,
+        top_row + (x << upsample_top_shift), -xstep);
+  }
+}
+
+// Process a multiple of 8 |width|.
+template <bool upsampled_top, bool upsampled_left>
+inline void DirectionalZone2_8(
+    uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
+    const uint16_t* LIBGAV1_RESTRICT const top_row,
+    const uint16_t* LIBGAV1_RESTRICT const left_column, const int width,
+    const int height, const int xstep, const int ystep) {
+  if (height == 4) {
+    DirectionalZone2_Wx4<upsampled_top, upsampled_left>(
+        dst, stride, top_row, left_column, width, xstep, ystep);
+    return;
+  }
+  const int upsample_left_shift = static_cast<int>(upsampled_left);
+  const int upsample_top_shift = static_cast<int>(upsampled_top);
+
+  // Helper vector.
+  const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7};
+
+  // Loop increments for moving by block (8x8). This function handles blocks
+  // with height 4 as well. They are calculated in one pass so these variables
+  // do not get used.
+  const ptrdiff_t stride8 = stride << 3;
+  const int xstep8 = xstep << 3;
+  const int ystep8 = ystep << 3;
+
+  // All columns from |min_top_only_x| to the right will only need |top_row|
+  // to compute and can therefore call the Zone1 functions. This assumes
+  // |xstep| is at least 3.
+  assert(xstep >= 3);
+  const int min_top_only_x = std::min((height * xstep) >> 6, width);
+
+  // For steep angles, the source pixels from |left_column| may not fit in a
+  // 16-byte load for shuffling.
+  // TODO(petersonab): Find a more precise formula for this subject to x.
+  const int max_shuffle_height =
+      std::min(kDirectionalZone2ShuffleInvalidHeight[ystep >> 6], height);
+
+  // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 - 1.
+  int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
+
+  const int left_base_increment = ystep >> 6;
+  const int ystep_remainder = ystep & 0x3F;
+
+  const int left_base_increment8 = ystep8 >> 6;
+  const int ystep_remainder8 = ystep8 & 0x3F;
+  const int16x8_t increment_left8 = vdupq_n_s16(ystep_remainder8);
+
+  // If the 64 scaling is regarded as a decimal point, the first value of the
+  // left_y vector omits the portion which is covered under the left_column
+  // offset. Following values need the full ystep as a relative offset.
+  int16x8_t left_y =
+      vmlaq_n_s16(vdupq_n_s16(-ystep_remainder), zero_to_seven, -ystep);
+
+  // This loop treats each set of 4 columns in 3 stages with y-value
+  // boundaries. The first stage, before the first y-loop, covers blocks that
+  // are only computed from the top row. The second stage, comprising two
+  // y-loops, covers blocks that have a mixture of values computed from top or
+  // left. The final stage covers blocks that are only computed from the left.
+  int x = 0;
+  for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8,
+       xstep_bounds_base -= (8 << 6),
+       left_y = vsubq_s16(left_y, increment_left8),
+       left_offset -= left_base_increment8) {
+    uint8_t* dst_x = dst + x * sizeof(uint16_t);
+
+    // Round down to the nearest multiple of 8.
+    const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
+    DirectionalZone1_WxH<upsampled_top>(
+        reinterpret_cast<uint16_t*>(dst_x), stride >> 1, 8, max_top_only_y,
+        top_row + (x << upsample_top_shift), -xstep);
+
+    if (max_top_only_y == height) continue;
+
+    int y = max_top_only_y;
+    dst_x += stride * y;
+    const int xstep_y = xstep * y;
+
+    // All rows from |min_left_only_y| down for this set of columns only need
+    // |left_column| to compute.
+    const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height);
+    // At high angles such that min_left_only_y < 8, ystep is low and xstep is
+    // high. This means that max_shuffle_height is unbounded and xstep_bounds
+    // will overflow in 16 bits. This is prevented by stopping the first
+    // blending loop at min_left_only_y for such cases, which means we skip
+    // over the second blending loop as well.
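+    // (Illustrative, not in the original patch: for a shallow left angle
+    // with ystep >> 6 == 0, the table above yields 1024, so
+    // max_shuffle_height == height and only min_left_only_y stops the first
+    // loop; for a steeper angle with ystep >> 6 == 2 the table yields 16, so
+    // blocks taller than 16 fall through to the secure Zone3 path below.)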
+    const int left_shuffle_stop_y =
+        std::min(max_shuffle_height, min_left_only_y);
+    int xstep_bounds = xstep_bounds_base + xstep_y;
+    int top_x = -xstep - xstep_y;
+
+    for (; y < left_shuffle_stop_y;
+         y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+      DirectionalZone2FromLeftCol_8xH(
+          dst_x, stride, 8,
+          left_column + ((left_offset + y) << upsample_left_shift), left_y,
+          upsample_left_shift);
+
+      DirectionalZone1Blend_8xH<upsampled_top>(
+          dst_x, stride, 8, top_row + (x << upsample_top_shift), xstep_bounds,
+          top_x, xstep);
+    }
+
+    // Pick up from the last y-value, using the slower but secure method for
+    // left prediction.
+    for (; y < min_left_only_y;
+         y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+      DirectionalZone3_8x8<upsampled_left>(
+          dst_x, stride, left_column + (y << upsample_left_shift), -ystep,
+          -ystep * x);
+
+      DirectionalZone1Blend_8xH<upsampled_top>(
+          dst_x, stride, 8, top_row + (x << upsample_top_shift), xstep_bounds,
+          top_x, xstep);
+    }
+    // Loop over y for left_only rows.
+    for (; y < height; y += 8, dst_x += stride8) {
+      DirectionalZone3_8x8<upsampled_left>(
+          dst_x, stride, left_column + (y << upsample_left_shift), -ystep,
+          -ystep * x);
+    }
+  }
+  // Reached |min_top_only_x|.
+  if (x < width) {
+    DirectionalZone1_WxH<upsampled_top>(
+        reinterpret_cast<uint16_t*>(dst) + x, stride >> 1, width - x, height,
+        top_row + (x << upsample_top_shift), -xstep);
+  }
+}
+
+// At this angle, neither edge is upsampled.
+// |min_width| is either 4 or 8.
+template <int min_width>
+void DirectionalAngle135(uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride,
+                         const uint16_t* LIBGAV1_RESTRICT const top,
+                         const uint16_t* LIBGAV1_RESTRICT const left,
+                         const int width, const int height) {
+  // y = 0 is more trivial than the other rows.
+  memcpy(dst, top - 1, width * sizeof(top[0]));
+  dst += stride;
+
+  // If |height| > |width|, then there is a point at which top_row is no
+  // longer used in each row.
+  const int min_left_only_y = std::min(width, height);
+
+  int y = 1;
+  do {
+    // Example: If y is 4 (min_width), the dest row starts with left[3],
+    // left[2], left[1], left[0], because the angle points up. Therefore, load
+    // starts at left[0] and is then reversed. If y is 2, the load starts at
+    // left[-2], and is reversed to store left[1], left[0], with negative
+    // values overwritten from |top_row|.
+    const uint16_t* const load_left = left + y - min_width;
+    auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+
+    // Some values will be overwritten when |y| is not a multiple of
+    // |min_width|.
+    if (min_width == 4) {
+      const uint16x4_t left_toward_corner = vrev64_u16(vld1_u16(load_left));
+      vst1_u16(dst16, left_toward_corner);
+    } else {
+      int x = 0;
+      do {
+        const uint16x8_t left_toward_corner =
+            vrev64q_u16(vld1q_u16(load_left - x));
+        vst1_u16(dst16 + x, vget_high_u16(left_toward_corner));
+        vst1_u16(dst16 + x + 4, vget_low_u16(left_toward_corner));
+        x += 8;
+      } while (x < y);
+    }
+    // Entering |top|.
+    memcpy(dst16 + y, top - 1, (width - y) * sizeof(top[0]));
+    dst += stride;
+  } while (++y < min_left_only_y);
+
+  // Left only.
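+  // (Illustrative, not in the original patch: from here every remaining row
+  // reads purely from |left|. For example, row y == 5 with min_width == 4
+  // stores the reversed window {left[4], left[3], left[2], left[1]} and then
+  // moves down one row.)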
+  for (; y < height; ++y, dst += stride) {
+    auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+    const uint16_t* const load_left = left + y - min_width;
+
+    int x = 0;
+    if (min_width == 4) {
+      const uint16x4_t left_toward_corner = vrev64_u16(vld1_u16(load_left - x));
+      vst1_u16(dst16 + x, left_toward_corner);
+    } else {
+      do {
+        const uint16x8_t left_toward_corner =
+            vrev64q_u16(vld1q_u16(load_left - x));
+        vst1_u16(dst16 + x, vget_high_u16(left_toward_corner));
+        vst1_u16(dst16 + x + 4, vget_low_u16(left_toward_corner));
+        x += 8;
+      } while (x < width);
+    }
+  }
+}
+
+void DirectionalIntraPredictorZone2_NEON(
+    void* LIBGAV1_RESTRICT dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column, const int width,
+    const int height, const int xstep, const int ystep,
+    const bool upsampled_top, const bool upsampled_left) {
+  // Increasing the negative buffer for this function allows more rows to be
+  // processed at a time without branching in an inner loop to check the base.
+  uint16_t top_buffer[288];
+  uint16_t left_buffer[288];
+#if LIBGAV1_MSAN
+  memset(top_buffer, 0, sizeof(top_buffer));
+  memset(left_buffer, 0, sizeof(left_buffer));
+#endif  // LIBGAV1_MSAN
+  memcpy(top_buffer + 128, static_cast<const uint16_t*>(top_row) - 16, 160);
+  memcpy(left_buffer + 128, static_cast<const uint16_t*>(left_column) - 16,
+         160);
+  const uint16_t* top_ptr = top_buffer + 144;
+  const uint16_t* left_ptr = left_buffer + 144;
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  if (width == 4) {
+    if (xstep == 64) {
+      assert(ystep == 64);
+      DirectionalAngle135<4>(dst, stride, top_ptr, left_ptr, width, height);
+      return;
+    }
+    if (upsampled_top) {
+      if (upsampled_left) {
+        DirectionalZone2_4xH<true, true>(dst, stride, top_ptr, left_ptr,
+                                         height, xstep, ystep);
+      } else {
+        DirectionalZone2_4xH<true, false>(dst, stride, top_ptr, left_ptr,
+                                          height, xstep, ystep);
+      }
+    } else if (upsampled_left) {
+      DirectionalZone2_4xH<false, true>(dst, stride, top_ptr, left_ptr,
+                                        height, xstep, ystep);
+    } else {
+      DirectionalZone2_4xH<false, false>(dst, stride, top_ptr, left_ptr,
+                                         height, xstep, ystep);
+    }
+    return;
+  }
+
+  if (xstep == 64) {
+    assert(ystep == 64);
+    DirectionalAngle135<8>(dst, stride, top_ptr, left_ptr, width, height);
+    return;
+  }
+  if (upsampled_top) {
+    if (upsampled_left) {
+      DirectionalZone2_8<true, true>(dst, stride, top_ptr, left_ptr, width,
+                                     height, xstep, ystep);
+    } else {
+      DirectionalZone2_8<true, false>(dst, stride, top_ptr, left_ptr, width,
+                                      height, xstep, ystep);
+    }
+  } else if (upsampled_left) {
+    DirectionalZone2_8<false, true>(dst, stride, top_ptr, left_ptr, width,
+                                    height, xstep, ystep);
+  } else {
+    DirectionalZone2_8<false, false>(dst, stride, top_ptr, left_ptr, width,
+                                     height, xstep, ystep);
+  }
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  dsp->directional_intra_predictor_zone1 = DirectionalIntraPredictorZone1_NEON;
+  dsp->directional_intra_predictor_zone2 = DirectionalIntraPredictorZone2_NEON;
+  dsp->directional_intra_predictor_zone3 = DirectionalIntraPredictorZone3_NEON;
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredDirectionalInit_NEON() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredDirectionalInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git
a/src/dsp/arm/intrapred_directional_neon.h b/src/dsp/arm/intrapred_directional_neon.h new file mode 100644 index 0000000..310d90b --- /dev/null +++ b/src/dsp/arm/intrapred_directional_neon.h @@ -0,0 +1,60 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_DIRECTIONAL_NEON_H_ +#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_DIRECTIONAL_NEON_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::directional_intra_predictor_zone*, see the defines below for +// specifics. These functions are not thread-safe. +void IntraPredDirectionalInit_NEON(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_ENABLE_NEON +#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 +#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_NEON +#endif + +#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 +#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_NEON +#endif + +#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 +#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_NEON +#endif + +#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1 +#define LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_NEON +#endif + +#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone2 +#define LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_NEON +#endif + +#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3 +#define LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_NEON +#endif + +#endif // LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_DIRECTIONAL_NEON_H_ diff --git a/src/dsp/arm/intrapred_filter_neon.cc b/src/dsp/arm/intrapred_filter_neon.cc new file mode 100644 index 0000000..70bd62b --- /dev/null +++ b/src/dsp/arm/intrapred_filter_neon.cc @@ -0,0 +1,306 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/intrapred_filter.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_ENABLE_NEON + +#include + +#include +#include +#include + +#include "src/dsp/arm/common_neon.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" + +namespace libgav1 { +namespace dsp { + +namespace low_bitdepth { +namespace { + +// Transpose kFilterIntraTaps and convert the first row to unsigned values. 
+// +// With the previous orientation we were able to multiply all the input values +// by a single tap. This required that all the input values be in one vector +// which requires expensive set up operations (shifts, vext, vtbl). All the +// elements of the result needed to be summed (easy on A64 - vaddvq_s16) but +// then the shifting, rounding, and clamping was done in GP registers. +// +// Switching to unsigned values allows multiplying the 8 bit inputs directly. +// When one value was negative we needed to vmovl_u8 first so that the results +// maintained the proper sign. +// +// We take this into account when summing the values by subtracting the product +// of the first row. +alignas(8) constexpr uint8_t kTransposedTaps[kNumFilterIntraPredictors][7][8] = + {{{6, 5, 3, 3, 4, 3, 3, 3}, // Original values are negative. + {10, 2, 1, 1, 6, 2, 2, 1}, + {0, 10, 1, 1, 0, 6, 2, 2}, + {0, 0, 10, 2, 0, 0, 6, 2}, + {0, 0, 0, 10, 0, 0, 0, 6}, + {12, 9, 7, 5, 2, 2, 2, 3}, + {0, 0, 0, 0, 12, 9, 7, 5}}, + {{10, 6, 4, 2, 10, 6, 4, 2}, // Original values are negative. + {16, 0, 0, 0, 16, 0, 0, 0}, + {0, 16, 0, 0, 0, 16, 0, 0}, + {0, 0, 16, 0, 0, 0, 16, 0}, + {0, 0, 0, 16, 0, 0, 0, 16}, + {10, 6, 4, 2, 0, 0, 0, 0}, + {0, 0, 0, 0, 10, 6, 4, 2}}, + {{8, 8, 8, 8, 4, 4, 4, 4}, // Original values are negative. + {8, 0, 0, 0, 4, 0, 0, 0}, + {0, 8, 0, 0, 0, 4, 0, 0}, + {0, 0, 8, 0, 0, 0, 4, 0}, + {0, 0, 0, 8, 0, 0, 0, 4}, + {16, 16, 16, 16, 0, 0, 0, 0}, + {0, 0, 0, 0, 16, 16, 16, 16}}, + {{2, 1, 1, 0, 1, 1, 1, 1}, // Original values are negative. + {8, 3, 2, 1, 4, 3, 2, 2}, + {0, 8, 3, 2, 0, 4, 3, 2}, + {0, 0, 8, 3, 0, 0, 4, 3}, + {0, 0, 0, 8, 0, 0, 0, 4}, + {10, 6, 4, 2, 3, 4, 4, 3}, + {0, 0, 0, 0, 10, 6, 4, 3}}, + {{12, 10, 9, 8, 10, 9, 8, 7}, // Original values are negative. 
+ {14, 0, 0, 0, 12, 1, 0, 0}, + {0, 14, 0, 0, 0, 12, 0, 0}, + {0, 0, 14, 0, 0, 0, 12, 1}, + {0, 0, 0, 14, 0, 0, 0, 12}, + {14, 12, 11, 10, 0, 0, 1, 1}, + {0, 0, 0, 0, 14, 12, 11, 9}}}; + +void FilterIntraPredictor_NEON(void* LIBGAV1_RESTRICT const dest, + ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column, + FilterIntraPredictor pred, int width, + int height) { + const auto* const top = static_cast(top_row); + const auto* const left = static_cast(left_column); + + assert(width <= 32 && height <= 32); + + auto* dst = static_cast(dest); + + uint8x8_t transposed_taps[7]; + for (int i = 0; i < 7; ++i) { + transposed_taps[i] = vld1_u8(kTransposedTaps[pred][i]); + } + + uint8_t relative_top_left = top[-1]; + const uint8_t* relative_top = top; + uint8_t relative_left[2] = {left[0], left[1]}; + + int y = 0; + do { + uint8_t* row_dst = dst; + int x = 0; + do { + uint16x8_t sum = vdupq_n_u16(0); + const uint16x8_t subtrahend = + vmull_u8(transposed_taps[0], vdup_n_u8(relative_top_left)); + for (int i = 1; i < 5; ++i) { + sum = vmlal_u8(sum, transposed_taps[i], vdup_n_u8(relative_top[i - 1])); + } + for (int i = 5; i < 7; ++i) { + sum = + vmlal_u8(sum, transposed_taps[i], vdup_n_u8(relative_left[i - 5])); + } + + const int16x8_t sum_signed = + vreinterpretq_s16_u16(vsubq_u16(sum, subtrahend)); + const int16x8_t sum_shifted = vrshrq_n_s16(sum_signed, 4); + + uint8x8_t sum_saturated = vqmovun_s16(sum_shifted); + + StoreLo4(row_dst, sum_saturated); + StoreHi4(row_dst + stride, sum_saturated); + + // Progress across + relative_top_left = relative_top[3]; + relative_top += 4; + relative_left[0] = row_dst[3]; + relative_left[1] = row_dst[3 + stride]; + row_dst += 4; + x += 4; + } while (x < width); + + // Progress down. 
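+      // (Explanatory note, not in the original patch: the unsigned taps above
+      // are accumulated with vmlal_u8 and the first row's product is
+      // subtracted afterwards, reproducing the negative taps. The 7-tap
+      // window now slides down two rows; the rows just written become the
+      // next patch's |relative_top| values, so the 4x2 patches must be
+      // computed in order.)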
+ relative_top_left = left[y + 1]; + relative_top = dst + stride; + relative_left[0] = left[y + 2]; + relative_left[1] = left[y + 3]; + + dst += 2 * stride; + y += 2; + } while (y < height); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + dsp->filter_intra_predictor = FilterIntraPredictor_NEON; +} + +} // namespace +} // namespace low_bitdepth + +//------------------------------------------------------------------------------ +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +alignas(kMaxAlignment) constexpr int16_t + kTransposedTaps[kNumFilterIntraPredictors][7][8] = { + {{-6, -5, -3, -3, -4, -3, -3, -3}, + {10, 2, 1, 1, 6, 2, 2, 1}, + {0, 10, 1, 1, 0, 6, 2, 2}, + {0, 0, 10, 2, 0, 0, 6, 2}, + {0, 0, 0, 10, 0, 0, 0, 6}, + {12, 9, 7, 5, 2, 2, 2, 3}, + {0, 0, 0, 0, 12, 9, 7, 5}}, + {{-10, -6, -4, -2, -10, -6, -4, -2}, + {16, 0, 0, 0, 16, 0, 0, 0}, + {0, 16, 0, 0, 0, 16, 0, 0}, + {0, 0, 16, 0, 0, 0, 16, 0}, + {0, 0, 0, 16, 0, 0, 0, 16}, + {10, 6, 4, 2, 0, 0, 0, 0}, + {0, 0, 0, 0, 10, 6, 4, 2}}, + {{-8, -8, -8, -8, -4, -4, -4, -4}, + {8, 0, 0, 0, 4, 0, 0, 0}, + {0, 8, 0, 0, 0, 4, 0, 0}, + {0, 0, 8, 0, 0, 0, 4, 0}, + {0, 0, 0, 8, 0, 0, 0, 4}, + {16, 16, 16, 16, 0, 0, 0, 0}, + {0, 0, 0, 0, 16, 16, 16, 16}}, + {{-2, -1, -1, -0, -1, -1, -1, -1}, + {8, 3, 2, 1, 4, 3, 2, 2}, + {0, 8, 3, 2, 0, 4, 3, 2}, + {0, 0, 8, 3, 0, 0, 4, 3}, + {0, 0, 0, 8, 0, 0, 0, 4}, + {10, 6, 4, 2, 3, 4, 4, 3}, + {0, 0, 0, 0, 10, 6, 4, 3}}, + {{-12, -10, -9, -8, -10, -9, -8, -7}, + {14, 0, 0, 0, 12, 1, 0, 0}, + {0, 14, 0, 0, 0, 12, 0, 0}, + {0, 0, 14, 0, 0, 0, 12, 1}, + {0, 0, 0, 14, 0, 0, 0, 12}, + {14, 12, 11, 10, 0, 0, 1, 1}, + {0, 0, 0, 0, 14, 12, 11, 9}}}; + +void FilterIntraPredictor_NEON(void* LIBGAV1_RESTRICT const dest, + ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column, + FilterIntraPredictor pred, int width, + int height) { + const auto* const top = static_cast(top_row); + const auto* const left = static_cast(left_column); + + assert(width <= 32 && height <= 32); + + auto* dst = static_cast(dest); + + stride >>= 1; + + int16x8_t transposed_taps[7]; + for (int i = 0; i < 7; ++i) { + transposed_taps[i] = vld1q_s16(kTransposedTaps[pred][i]); + } + + uint16_t relative_top_left = top[-1]; + const uint16_t* relative_top = top; + uint16_t relative_left[2] = {left[0], left[1]}; + + int y = 0; + do { + uint16_t* row_dst = dst; + int x = 0; + do { + int16x8_t sum = + vmulq_s16(transposed_taps[0], + vreinterpretq_s16_u16(vdupq_n_u16(relative_top_left))); + for (int i = 1; i < 5; ++i) { + sum = + vmlaq_s16(sum, transposed_taps[i], + vreinterpretq_s16_u16(vdupq_n_u16(relative_top[i - 1]))); + } + for (int i = 5; i < 7; ++i) { + sum = + vmlaq_s16(sum, transposed_taps[i], + vreinterpretq_s16_u16(vdupq_n_u16(relative_left[i - 5]))); + } + + const int16x8_t sum_shifted = vrshrq_n_s16(sum, 4); + const uint16x8_t sum_saturated = vminq_u16( + vreinterpretq_u16_s16(vmaxq_s16(sum_shifted, vdupq_n_s16(0))), + vdupq_n_u16((1 << kBitdepth10) - 1)); + + vst1_u16(row_dst, vget_low_u16(sum_saturated)); + vst1_u16(row_dst + stride, vget_high_u16(sum_saturated)); + + // Progress across + relative_top_left = relative_top[3]; + relative_top += 4; + relative_left[0] = row_dst[3]; + relative_left[1] = row_dst[3 + stride]; + row_dst += 4; + x += 4; + } while (x < width); + + // Progress down. 
+ relative_top_left = left[y + 1]; + relative_top = dst + stride; + relative_left[0] = left[y + 2]; + relative_left[1] = left[y + 3]; + + dst += 2 * stride; + y += 2; + } while (y < height); +} + +void Init10bpp() { + Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + dsp->filter_intra_predictor = FilterIntraPredictor_NEON; +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void IntraPredFilterInit_NEON() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_ENABLE_NEON +namespace libgav1 { +namespace dsp { + +void IntraPredFilterInit_NEON() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_ENABLE_NEON diff --git a/src/dsp/arm/intrapred_filter_neon.h b/src/dsp/arm/intrapred_filter_neon.h new file mode 100644 index 0000000..d005f4c --- /dev/null +++ b/src/dsp/arm/intrapred_filter_neon.h @@ -0,0 +1,39 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_FILTER_NEON_H_ +#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_FILTER_NEON_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::filter_intra_predictor, see the defines below for specifics. +// These functions are not thread-safe. +void IntraPredFilterInit_NEON(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_ENABLE_NEON +#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp10bpp_FilterIntraPredictor LIBGAV1_CPU_NEON +#endif // LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_FILTER_NEON_H_ diff --git a/src/dsp/arm/intrapred_neon.cc b/src/dsp/arm/intrapred_neon.cc new file mode 100644 index 0000000..cd47a22 --- /dev/null +++ b/src/dsp/arm/intrapred_neon.cc @@ -0,0 +1,1626 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "src/dsp/intrapred.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_ENABLE_NEON + +#include + +#include +#include +#include + +#include "src/dsp/arm/common_neon.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" + +namespace libgav1 { +namespace dsp { +namespace { + +//------------------------------------------------------------------------------ +// DcPredFuncs_NEON + +using DcSumFunc = uint32x2_t (*)(const void* ref_0, const int ref_0_size_log2, + const bool use_ref_1, const void* ref_1, + const int ref_1_size_log2); +using DcStoreFunc = void (*)(void* dest, ptrdiff_t stride, const uint32x2_t dc); + +// DC intra-predictors for square blocks. +template +struct DcPredFuncs_NEON { + DcPredFuncs_NEON() = delete; + + static void DcTop(void* dest, ptrdiff_t stride, const void* top_row, + const void* left_column); + static void DcLeft(void* dest, ptrdiff_t stride, const void* top_row, + const void* left_column); + static void Dc(void* dest, ptrdiff_t stride, const void* top_row, + const void* left_column); +}; + +template +void DcPredFuncs_NEON:: + DcTop(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* /*left_column*/) { + const uint32x2_t sum = sumfn(top_row, block_width_log2, false, nullptr, 0); + const uint32x2_t dc = vrshr_n_u32(sum, block_width_log2); + storefn(dest, stride, dc); +} + +template +void DcPredFuncs_NEON:: + DcLeft(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* /*top_row*/, + const void* LIBGAV1_RESTRICT const left_column) { + const uint32x2_t sum = + sumfn(left_column, block_height_log2, false, nullptr, 0); + const uint32x2_t dc = vrshr_n_u32(sum, block_height_log2); + storefn(dest, stride, dc); +} + +template +void DcPredFuncs_NEON::Dc( + void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const uint32x2_t sum = + sumfn(top_row, block_width_log2, true, left_column, block_height_log2); + if (block_width_log2 == block_height_log2) { + const uint32x2_t dc = vrshr_n_u32(sum, block_width_log2 + 1); + storefn(dest, stride, dc); + } else { + // TODO(johannkoenig): Compare this to mul/shift in vectors. + const int divisor = (1 << block_width_log2) + (1 << block_height_log2); + uint32_t dc = vget_lane_u32(sum, 0); + dc += divisor >> 1; + dc /= divisor; + storefn(dest, stride, vdup_n_u32(dc)); + } +} + +// Sum all the elements in the vector into the low 32 bits. +inline uint32x2_t Sum(const uint16x4_t val) { + const uint32x2_t sum = vpaddl_u16(val); + return vpadd_u32(sum, sum); +} + +// Sum all the elements in the vector into the low 32 bits. +inline uint32x2_t Sum(const uint16x8_t val) { + const uint32x4_t sum_0 = vpaddlq_u16(val); + const uint64x2_t sum_1 = vpaddlq_u32(sum_0); + return vadd_u32(vget_low_u32(vreinterpretq_u32_u64(sum_1)), + vget_high_u32(vreinterpretq_u32_u64(sum_1))); +} + +} // namespace + +//------------------------------------------------------------------------------ +namespace low_bitdepth { +namespace { + +// Add and expand the elements in the |val_[01]| to uint16_t but do not sum the +// entire vector. 
+inline uint16x8_t Add(const uint8x16_t val_0, const uint8x16_t val_1) { + const uint16x8_t sum_0 = vpaddlq_u8(val_0); + const uint16x8_t sum_1 = vpaddlq_u8(val_1); + return vaddq_u16(sum_0, sum_1); +} + +// Add and expand the elements in the |val_[0123]| to uint16_t but do not sum +// the entire vector. +inline uint16x8_t Add(const uint8x16_t val_0, const uint8x16_t val_1, + const uint8x16_t val_2, const uint8x16_t val_3) { + const uint16x8_t sum_0 = Add(val_0, val_1); + const uint16x8_t sum_1 = Add(val_2, val_3); + return vaddq_u16(sum_0, sum_1); +} + +// Load and combine 32 uint8_t values. +inline uint16x8_t LoadAndAdd32(const uint8_t* buf) { + const uint8x16_t val_0 = vld1q_u8(buf); + const uint8x16_t val_1 = vld1q_u8(buf + 16); + return Add(val_0, val_1); +} + +// Load and combine 64 uint8_t values. +inline uint16x8_t LoadAndAdd64(const uint8_t* buf) { + const uint8x16_t val_0 = vld1q_u8(buf); + const uint8x16_t val_1 = vld1q_u8(buf + 16); + const uint8x16_t val_2 = vld1q_u8(buf + 32); + const uint8x16_t val_3 = vld1q_u8(buf + 48); + return Add(val_0, val_1, val_2, val_3); +} + +// |ref_[01]| each point to 1 << |ref[01]_size_log2| packed uint8_t values. +// If |use_ref_1| is false then only sum |ref_0|. +// For |ref[01]_size_log2| == 4 this relies on |ref_[01]| being aligned to +// uint32_t. +inline uint32x2_t DcSum_NEON(const void* LIBGAV1_RESTRICT ref_0, + const int ref_0_size_log2, const bool use_ref_1, + const void* LIBGAV1_RESTRICT ref_1, + const int ref_1_size_log2) { + const auto* const ref_0_u8 = static_cast(ref_0); + const auto* const ref_1_u8 = static_cast(ref_1); + if (ref_0_size_log2 == 2) { + uint8x8_t val = Load4(ref_0_u8); + if (use_ref_1) { + switch (ref_1_size_log2) { + case 2: { // 4x4 + val = Load4<1>(ref_1_u8, val); + return Sum(vpaddl_u8(val)); + } + case 3: { // 4x8 + const uint8x8_t val_1 = vld1_u8(ref_1_u8); + const uint16x4_t sum_0 = vpaddl_u8(val); + const uint16x4_t sum_1 = vpaddl_u8(val_1); + return Sum(vadd_u16(sum_0, sum_1)); + } + case 4: { // 4x16 + const uint8x16_t val_1 = vld1q_u8(ref_1_u8); + return Sum(vaddw_u8(vpaddlq_u8(val_1), val)); + } + } + } + // 4x1 + const uint16x4_t sum = vpaddl_u8(val); + return vpaddl_u16(sum); + } + if (ref_0_size_log2 == 3) { + const uint8x8_t val_0 = vld1_u8(ref_0_u8); + if (use_ref_1) { + switch (ref_1_size_log2) { + case 2: { // 8x4 + const uint8x8_t val_1 = Load4(ref_1_u8); + const uint16x4_t sum_0 = vpaddl_u8(val_0); + const uint16x4_t sum_1 = vpaddl_u8(val_1); + return Sum(vadd_u16(sum_0, sum_1)); + } + case 3: { // 8x8 + const uint8x8_t val_1 = vld1_u8(ref_1_u8); + const uint16x4_t sum_0 = vpaddl_u8(val_0); + const uint16x4_t sum_1 = vpaddl_u8(val_1); + return Sum(vadd_u16(sum_0, sum_1)); + } + case 4: { // 8x16 + const uint8x16_t val_1 = vld1q_u8(ref_1_u8); + return Sum(vaddw_u8(vpaddlq_u8(val_1), val_0)); + } + case 5: { // 8x32 + return Sum(vaddw_u8(LoadAndAdd32(ref_1_u8), val_0)); + } + } + } + // 8x1 + return Sum(vpaddl_u8(val_0)); + } + if (ref_0_size_log2 == 4) { + const uint8x16_t val_0 = vld1q_u8(ref_0_u8); + if (use_ref_1) { + switch (ref_1_size_log2) { + case 2: { // 16x4 + const uint8x8_t val_1 = Load4(ref_1_u8); + return Sum(vaddw_u8(vpaddlq_u8(val_0), val_1)); + } + case 3: { // 16x8 + const uint8x8_t val_1 = vld1_u8(ref_1_u8); + return Sum(vaddw_u8(vpaddlq_u8(val_0), val_1)); + } + case 4: { // 16x16 + const uint8x16_t val_1 = vld1q_u8(ref_1_u8); + return Sum(Add(val_0, val_1)); + } + case 5: { // 16x32 + const uint16x8_t sum_0 = vpaddlq_u8(val_0); + const uint16x8_t sum_1 = 
LoadAndAdd32(ref_1_u8); + return Sum(vaddq_u16(sum_0, sum_1)); + } + case 6: { // 16x64 + const uint16x8_t sum_0 = vpaddlq_u8(val_0); + const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u8); + return Sum(vaddq_u16(sum_0, sum_1)); + } + } + } + // 16x1 + return Sum(vpaddlq_u8(val_0)); + } + if (ref_0_size_log2 == 5) { + const uint16x8_t sum_0 = LoadAndAdd32(ref_0_u8); + if (use_ref_1) { + switch (ref_1_size_log2) { + case 3: { // 32x8 + const uint8x8_t val_1 = vld1_u8(ref_1_u8); + return Sum(vaddw_u8(sum_0, val_1)); + } + case 4: { // 32x16 + const uint8x16_t val_1 = vld1q_u8(ref_1_u8); + const uint16x8_t sum_1 = vpaddlq_u8(val_1); + return Sum(vaddq_u16(sum_0, sum_1)); + } + case 5: { // 32x32 + const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u8); + return Sum(vaddq_u16(sum_0, sum_1)); + } + case 6: { // 32x64 + const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u8); + return Sum(vaddq_u16(sum_0, sum_1)); + } + } + } + // 32x1 + return Sum(sum_0); + } + + assert(ref_0_size_log2 == 6); + const uint16x8_t sum_0 = LoadAndAdd64(ref_0_u8); + if (use_ref_1) { + switch (ref_1_size_log2) { + case 4: { // 64x16 + const uint8x16_t val_1 = vld1q_u8(ref_1_u8); + const uint16x8_t sum_1 = vpaddlq_u8(val_1); + return Sum(vaddq_u16(sum_0, sum_1)); + } + case 5: { // 64x32 + const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u8); + return Sum(vaddq_u16(sum_0, sum_1)); + } + case 6: { // 64x64 + const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u8); + return Sum(vaddq_u16(sum_0, sum_1)); + } + } + } + // 64x1 + return Sum(sum_0); +} + +template +inline void DcStore_NEON(void* const dest, ptrdiff_t stride, + const uint32x2_t dc) { + const uint8x16_t dc_dup = vdupq_lane_u8(vreinterpret_u8_u32(dc), 0); + auto* dst = static_cast(dest); + if (width == 4) { + int i = height - 1; + do { + StoreLo4(dst, vget_low_u8(dc_dup)); + dst += stride; + } while (--i != 0); + StoreLo4(dst, vget_low_u8(dc_dup)); + } else if (width == 8) { + int i = height - 1; + do { + vst1_u8(dst, vget_low_u8(dc_dup)); + dst += stride; + } while (--i != 0); + vst1_u8(dst, vget_low_u8(dc_dup)); + } else if (width == 16) { + int i = height - 1; + do { + vst1q_u8(dst, dc_dup); + dst += stride; + } while (--i != 0); + vst1q_u8(dst, dc_dup); + } else if (width == 32) { + int i = height - 1; + do { + vst1q_u8(dst, dc_dup); + vst1q_u8(dst + 16, dc_dup); + dst += stride; + } while (--i != 0); + vst1q_u8(dst, dc_dup); + vst1q_u8(dst + 16, dc_dup); + } else { + assert(width == 64); + int i = height - 1; + do { + vst1q_u8(dst, dc_dup); + vst1q_u8(dst + 16, dc_dup); + vst1q_u8(dst + 32, dc_dup); + vst1q_u8(dst + 48, dc_dup); + dst += stride; + } while (--i != 0); + vst1q_u8(dst, dc_dup); + vst1q_u8(dst + 16, dc_dup); + vst1q_u8(dst + 32, dc_dup); + vst1q_u8(dst + 48, dc_dup); + } +} + +template +inline void Paeth4Or8xN_NEON(void* LIBGAV1_RESTRICT const dest, + ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + auto* dest_u8 = static_cast(dest); + const auto* const top_row_u8 = static_cast(top_row); + const auto* const left_col_u8 = static_cast(left_column); + + const uint8x8_t top_left = vdup_n_u8(top_row_u8[-1]); + const uint16x8_t top_left_x2 = vdupq_n_u16(top_row_u8[-1] + top_row_u8[-1]); + uint8x8_t top; + if (width == 4) { + top = Load4(top_row_u8); + } else { // width == 8 + top = vld1_u8(top_row_u8); + } + + for (int y = 0; y < height; ++y) { + const uint8x8_t left = vdup_n_u8(left_col_u8[y]); + + const uint8x8_t left_dist = vabd_u8(top, top_left); + const uint8x8_t top_dist = vabd_u8(left, top_left); + const 
uint16x8_t top_left_dist = + vabdq_u16(vaddl_u8(top, left), top_left_x2); + + const uint8x8_t left_le_top = vcle_u8(left_dist, top_dist); + const uint8x8_t left_le_top_left = + vmovn_u16(vcleq_u16(vmovl_u8(left_dist), top_left_dist)); + const uint8x8_t top_le_top_left = + vmovn_u16(vcleq_u16(vmovl_u8(top_dist), top_left_dist)); + + // if (left_dist <= top_dist && left_dist <= top_left_dist) + const uint8x8_t left_mask = vand_u8(left_le_top, left_le_top_left); + // dest[x] = left_column[y]; + // Fill all the unused spaces with 'top'. They will be overwritten when + // the positions for top_left are known. + uint8x8_t result = vbsl_u8(left_mask, left, top); + // else if (top_dist <= top_left_dist) + // dest[x] = top_row[x]; + // Add these values to the mask. They were already set. + const uint8x8_t left_or_top_mask = vorr_u8(left_mask, top_le_top_left); + // else + // dest[x] = top_left; + result = vbsl_u8(left_or_top_mask, result, top_left); + + if (width == 4) { + StoreLo4(dest_u8, result); + } else { // width == 8 + vst1_u8(dest_u8, result); + } + dest_u8 += stride; + } +} + +// Calculate X distance <= TopLeft distance and pack the resulting mask into +// uint8x8_t. +inline uint8x16_t XLeTopLeft(const uint8x16_t x_dist, + const uint16x8_t top_left_dist_low, + const uint16x8_t top_left_dist_high) { + // TODO(johannkoenig): cle() should work with vmovn(top_left_dist) instead of + // using movl(x_dist). + const uint8x8_t x_le_top_left_low = + vmovn_u16(vcleq_u16(vmovl_u8(vget_low_u8(x_dist)), top_left_dist_low)); + const uint8x8_t x_le_top_left_high = + vmovn_u16(vcleq_u16(vmovl_u8(vget_high_u8(x_dist)), top_left_dist_high)); + return vcombine_u8(x_le_top_left_low, x_le_top_left_high); +} + +// Select the closest values and collect them. +inline uint8x16_t SelectPaeth(const uint8x16_t top, const uint8x16_t left, + const uint8x16_t top_left, + const uint8x16_t left_le_top, + const uint8x16_t left_le_top_left, + const uint8x16_t top_le_top_left) { + // if (left_dist <= top_dist && left_dist <= top_left_dist) + const uint8x16_t left_mask = vandq_u8(left_le_top, left_le_top_left); + // dest[x] = left_column[y]; + // Fill all the unused spaces with 'top'. They will be overwritten when + // the positions for top_left are known. + uint8x16_t result = vbslq_u8(left_mask, left, top); + // else if (top_dist <= top_left_dist) + // dest[x] = top_row[x]; + // Add these values to the mask. They were already set. + const uint8x16_t left_or_top_mask = vorrq_u8(left_mask, top_le_top_left); + // else + // dest[x] = top_left; + return vbslq_u8(left_or_top_mask, result, top_left); +} + +// Generate numbered and high/low versions of top_left_dist. +#define TOP_LEFT_DIST(num) \ + const uint16x8_t top_left_##num##_dist_low = vabdq_u16( \ + vaddl_u8(vget_low_u8(top[num]), vget_low_u8(left)), top_left_x2); \ + const uint16x8_t top_left_##num##_dist_high = vabdq_u16( \ + vaddl_u8(vget_high_u8(top[num]), vget_low_u8(left)), top_left_x2) + +// Generate numbered versions of XLeTopLeft with x = left. +#define LEFT_LE_TOP_LEFT(num) \ + const uint8x16_t left_le_top_left_##num = \ + XLeTopLeft(left_##num##_dist, top_left_##num##_dist_low, \ + top_left_##num##_dist_high) + +// Generate numbered versions of XLeTopLeft with x = top. 
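+// (Worked Paeth example, not in the original patch: with top == 10,
+// left == 20, top_left == 12, the distances are |10 - 12| == 2 for the left
+// predictor, |20 - 12| == 8 for the top predictor, and |10 + 20 - 24| == 6
+// for top_left, so the left pixel (20) is selected.)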
+#define TOP_LE_TOP_LEFT(num) \ + const uint8x16_t top_le_top_left_##num = XLeTopLeft( \ + top_dist, top_left_##num##_dist_low, top_left_##num##_dist_high) + +template +inline void Paeth16PlusxN_NEON(void* LIBGAV1_RESTRICT const dest, + ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + auto* dest_u8 = static_cast(dest); + const auto* const top_row_u8 = static_cast(top_row); + const auto* const left_col_u8 = static_cast(left_column); + + const uint8x16_t top_left = vdupq_n_u8(top_row_u8[-1]); + const uint16x8_t top_left_x2 = vdupq_n_u16(top_row_u8[-1] + top_row_u8[-1]); + uint8x16_t top[4]; + top[0] = vld1q_u8(top_row_u8); + if (width > 16) { + top[1] = vld1q_u8(top_row_u8 + 16); + if (width == 64) { + top[2] = vld1q_u8(top_row_u8 + 32); + top[3] = vld1q_u8(top_row_u8 + 48); + } + } + + for (int y = 0; y < height; ++y) { + const uint8x16_t left = vdupq_n_u8(left_col_u8[y]); + + const uint8x16_t top_dist = vabdq_u8(left, top_left); + + const uint8x16_t left_0_dist = vabdq_u8(top[0], top_left); + TOP_LEFT_DIST(0); + const uint8x16_t left_0_le_top = vcleq_u8(left_0_dist, top_dist); + LEFT_LE_TOP_LEFT(0); + TOP_LE_TOP_LEFT(0); + + const uint8x16_t result_0 = + SelectPaeth(top[0], left, top_left, left_0_le_top, left_le_top_left_0, + top_le_top_left_0); + vst1q_u8(dest_u8, result_0); + + if (width > 16) { + const uint8x16_t left_1_dist = vabdq_u8(top[1], top_left); + TOP_LEFT_DIST(1); + const uint8x16_t left_1_le_top = vcleq_u8(left_1_dist, top_dist); + LEFT_LE_TOP_LEFT(1); + TOP_LE_TOP_LEFT(1); + + const uint8x16_t result_1 = + SelectPaeth(top[1], left, top_left, left_1_le_top, left_le_top_left_1, + top_le_top_left_1); + vst1q_u8(dest_u8 + 16, result_1); + + if (width == 64) { + const uint8x16_t left_2_dist = vabdq_u8(top[2], top_left); + TOP_LEFT_DIST(2); + const uint8x16_t left_2_le_top = vcleq_u8(left_2_dist, top_dist); + LEFT_LE_TOP_LEFT(2); + TOP_LE_TOP_LEFT(2); + + const uint8x16_t result_2 = + SelectPaeth(top[2], left, top_left, left_2_le_top, + left_le_top_left_2, top_le_top_left_2); + vst1q_u8(dest_u8 + 32, result_2); + + const uint8x16_t left_3_dist = vabdq_u8(top[3], top_left); + TOP_LEFT_DIST(3); + const uint8x16_t left_3_le_top = vcleq_u8(left_3_dist, top_dist); + LEFT_LE_TOP_LEFT(3); + TOP_LE_TOP_LEFT(3); + + const uint8x16_t result_3 = + SelectPaeth(top[3], left, top_left, left_3_le_top, + left_le_top_left_3, top_le_top_left_3); + vst1q_u8(dest_u8 + 48, result_3); + } + } + + dest_u8 += stride; + } +} + +struct DcDefs { + DcDefs() = delete; + + using _4x4 = DcPredFuncs_NEON<2, 2, DcSum_NEON, DcStore_NEON<4, 4>>; + using _4x8 = DcPredFuncs_NEON<2, 3, DcSum_NEON, DcStore_NEON<4, 8>>; + using _4x16 = DcPredFuncs_NEON<2, 4, DcSum_NEON, DcStore_NEON<4, 16>>; + using _8x4 = DcPredFuncs_NEON<3, 2, DcSum_NEON, DcStore_NEON<8, 4>>; + using _8x8 = DcPredFuncs_NEON<3, 3, DcSum_NEON, DcStore_NEON<8, 8>>; + using _8x16 = DcPredFuncs_NEON<3, 4, DcSum_NEON, DcStore_NEON<8, 16>>; + using _8x32 = DcPredFuncs_NEON<3, 5, DcSum_NEON, DcStore_NEON<8, 32>>; + using _16x4 = DcPredFuncs_NEON<4, 2, DcSum_NEON, DcStore_NEON<16, 4>>; + using _16x8 = DcPredFuncs_NEON<4, 3, DcSum_NEON, DcStore_NEON<16, 8>>; + using _16x16 = DcPredFuncs_NEON<4, 4, DcSum_NEON, DcStore_NEON<16, 16>>; + using _16x32 = DcPredFuncs_NEON<4, 5, DcSum_NEON, DcStore_NEON<16, 32>>; + using _16x64 = DcPredFuncs_NEON<4, 6, DcSum_NEON, DcStore_NEON<16, 64>>; + using _32x8 = DcPredFuncs_NEON<5, 3, DcSum_NEON, DcStore_NEON<32, 8>>; + using _32x16 = DcPredFuncs_NEON<5, 4, 
DcSum_NEON, DcStore_NEON<32, 16>>; + using _32x32 = DcPredFuncs_NEON<5, 5, DcSum_NEON, DcStore_NEON<32, 32>>; + using _32x64 = DcPredFuncs_NEON<5, 6, DcSum_NEON, DcStore_NEON<32, 64>>; + using _64x16 = DcPredFuncs_NEON<6, 4, DcSum_NEON, DcStore_NEON<64, 16>>; + using _64x32 = DcPredFuncs_NEON<6, 5, DcSum_NEON, DcStore_NEON<64, 32>>; + using _64x64 = DcPredFuncs_NEON<6, 6, DcSum_NEON, DcStore_NEON<64, 64>>; +}; + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + // 4x4 + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] = + DcDefs::_4x4::DcTop; + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] = + DcDefs::_4x4::DcLeft; + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] = + DcDefs::_4x4::Dc; + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] = + Paeth4Or8xN_NEON<4, 4>; + + // 4x8 + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] = + DcDefs::_4x8::DcTop; + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] = + DcDefs::_4x8::DcLeft; + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] = + DcDefs::_4x8::Dc; + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] = + Paeth4Or8xN_NEON<4, 8>; + + // 4x16 + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] = + DcDefs::_4x16::DcTop; + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] = + DcDefs::_4x16::DcLeft; + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] = + DcDefs::_4x16::Dc; + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] = + Paeth4Or8xN_NEON<4, 16>; + + // 8x4 + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] = + DcDefs::_8x4::DcTop; + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] = + DcDefs::_8x4::DcLeft; + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] = + DcDefs::_8x4::Dc; + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] = + Paeth4Or8xN_NEON<8, 4>; + + // 8x8 + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] = + DcDefs::_8x8::DcTop; + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] = + DcDefs::_8x8::DcLeft; + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] = + DcDefs::_8x8::Dc; + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] = + Paeth4Or8xN_NEON<8, 8>; + + // 8x16 + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] = + DcDefs::_8x16::DcTop; + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] = + DcDefs::_8x16::DcLeft; + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] = + DcDefs::_8x16::Dc; + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] = + Paeth4Or8xN_NEON<8, 16>; + + // 8x32 + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] = + DcDefs::_8x32::DcTop; + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] = + DcDefs::_8x32::DcLeft; + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] = + DcDefs::_8x32::Dc; + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] = + Paeth4Or8xN_NEON<8, 32>; + + // 16x4 + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] = + DcDefs::_16x4::DcTop; + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] = + DcDefs::_16x4::DcLeft; + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] = + DcDefs::_16x4::Dc; + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] = + Paeth16PlusxN_NEON<16, 4>; + + // 16x8 + 
dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] = + DcDefs::_16x8::DcTop; + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] = + DcDefs::_16x8::DcLeft; + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] = + DcDefs::_16x8::Dc; + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] = + Paeth16PlusxN_NEON<16, 8>; + + // 16x16 + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] = + DcDefs::_16x16::DcTop; + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] = + DcDefs::_16x16::DcLeft; + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] = + DcDefs::_16x16::Dc; + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] = + Paeth16PlusxN_NEON<16, 16>; + + // 16x32 + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] = + DcDefs::_16x32::DcTop; + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] = + DcDefs::_16x32::DcLeft; + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] = + DcDefs::_16x32::Dc; + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] = + Paeth16PlusxN_NEON<16, 32>; + + // 16x64 + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] = + DcDefs::_16x64::DcTop; + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] = + DcDefs::_16x64::DcLeft; + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] = + DcDefs::_16x64::Dc; + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] = + Paeth16PlusxN_NEON<16, 64>; + + // 32x8 + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] = + DcDefs::_32x8::DcTop; + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] = + DcDefs::_32x8::DcLeft; + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] = + DcDefs::_32x8::Dc; + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] = + Paeth16PlusxN_NEON<32, 8>; + + // 32x16 + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] = + DcDefs::_32x16::DcTop; + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] = + DcDefs::_32x16::DcLeft; + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] = + DcDefs::_32x16::Dc; + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] = + Paeth16PlusxN_NEON<32, 16>; + + // 32x32 + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] = + DcDefs::_32x32::DcTop; + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] = + DcDefs::_32x32::DcLeft; + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] = + DcDefs::_32x32::Dc; + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] = + Paeth16PlusxN_NEON<32, 32>; + + // 32x64 + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] = + DcDefs::_32x64::DcTop; + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] = + DcDefs::_32x64::DcLeft; + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] = + DcDefs::_32x64::Dc; + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] = + Paeth16PlusxN_NEON<32, 64>; + + // 64x16 + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] = + DcDefs::_64x16::DcTop; + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] = + DcDefs::_64x16::DcLeft; + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] = + DcDefs::_64x16::Dc; + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] = + Paeth16PlusxN_NEON<64, 16>; + + // 64x32 + 
dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] = + DcDefs::_64x32::DcTop; + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] = + DcDefs::_64x32::DcLeft; + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] = + DcDefs::_64x32::Dc; + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] = + Paeth16PlusxN_NEON<64, 32>; + + // 64x64 + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] = + DcDefs::_64x64::DcTop; + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] = + DcDefs::_64x64::DcLeft; + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] = + DcDefs::_64x64::Dc; + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] = + Paeth16PlusxN_NEON<64, 64>; +} + +} // namespace +} // namespace low_bitdepth + +//------------------------------------------------------------------------------ +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +// Add the elements in the given vectors together but do not sum the entire +// vector. +inline uint16x8_t Add(const uint16x8_t val_0, const uint16x8_t val_1, + const uint16x8_t val_2, const uint16x8_t val_3) { + const uint16x8_t sum_0 = vaddq_u16(val_0, val_1); + const uint16x8_t sum_1 = vaddq_u16(val_2, val_3); + return vaddq_u16(sum_0, sum_1); +} + +// Load and combine 16 uint16_t values. +inline uint16x8_t LoadAndAdd16(const uint16_t* buf) { + const uint16x8_t val_0 = vld1q_u16(buf); + const uint16x8_t val_1 = vld1q_u16(buf + 8); + return vaddq_u16(val_0, val_1); +} + +// Load and combine 32 uint16_t values. +inline uint16x8_t LoadAndAdd32(const uint16_t* buf) { + const uint16x8_t val_0 = vld1q_u16(buf); + const uint16x8_t val_1 = vld1q_u16(buf + 8); + const uint16x8_t val_2 = vld1q_u16(buf + 16); + const uint16x8_t val_3 = vld1q_u16(buf + 24); + return Add(val_0, val_1, val_2, val_3); +} + +// Load and combine 64 uint16_t values. +inline uint16x8_t LoadAndAdd64(const uint16_t* buf) { + const uint16x8_t val_0 = vld1q_u16(buf); + const uint16x8_t val_1 = vld1q_u16(buf + 8); + const uint16x8_t val_2 = vld1q_u16(buf + 16); + const uint16x8_t val_3 = vld1q_u16(buf + 24); + const uint16x8_t val_4 = vld1q_u16(buf + 32); + const uint16x8_t val_5 = vld1q_u16(buf + 40); + const uint16x8_t val_6 = vld1q_u16(buf + 48); + const uint16x8_t val_7 = vld1q_u16(buf + 56); + const uint16x8_t sum_0 = Add(val_0, val_1, val_2, val_3); + const uint16x8_t sum_1 = Add(val_4, val_5, val_6, val_7); + return vaddq_u16(sum_0, sum_1); +} + +// |ref_[01]| each point to 1 << |ref[01]_size_log2| packed uint16_t values. +// If |use_ref_1| is false then only sum |ref_0|. 
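+// For example, the 4x16 Dc predictor sums ref_0 = top (ref_0_size_log2 == 2)
+// together with ref_1 = left (ref_1_size_log2 == 4); the DcTop and DcLeft
+// variants pass a single reference with |use_ref_1| == false.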
+inline uint32x2_t DcSum_NEON(const void* LIBGAV1_RESTRICT ref_0, + const int ref_0_size_log2, const bool use_ref_1, + const void* LIBGAV1_RESTRICT ref_1, + const int ref_1_size_log2) { + const auto* ref_0_u16 = static_cast<const uint16_t*>(ref_0); + const auto* ref_1_u16 = static_cast<const uint16_t*>(ref_1); + if (ref_0_size_log2 == 2) { + const uint16x4_t val_0 = vld1_u16(ref_0_u16); + if (use_ref_1) { + switch (ref_1_size_log2) { + case 2: { // 4x4 + const uint16x4_t val_1 = vld1_u16(ref_1_u16); + return Sum(vadd_u16(val_0, val_1)); + } + case 3: { // 4x8 + const uint16x8_t val_1 = vld1q_u16(ref_1_u16); + const uint16x8_t sum_0 = vcombine_u16(vdup_n_u16(0), val_0); + return Sum(vaddq_u16(sum_0, val_1)); + } + case 4: { // 4x16 + const uint16x8_t sum_0 = vcombine_u16(vdup_n_u16(0), val_0); + const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16); + return Sum(vaddq_u16(sum_0, sum_1)); + } + } + } + // 4x1 + return Sum(val_0); + } + if (ref_0_size_log2 == 3) { + const uint16x8_t val_0 = vld1q_u16(ref_0_u16); + if (use_ref_1) { + switch (ref_1_size_log2) { + case 2: { // 8x4 + const uint16x4_t val_1 = vld1_u16(ref_1_u16); + const uint16x8_t sum_1 = vcombine_u16(vdup_n_u16(0), val_1); + return Sum(vaddq_u16(val_0, sum_1)); + } + case 3: { // 8x8 + const uint16x8_t val_1 = vld1q_u16(ref_1_u16); + return Sum(vaddq_u16(val_0, val_1)); + } + case 4: { // 8x16 + const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16); + return Sum(vaddq_u16(val_0, sum_1)); + } + case 5: { // 8x32 + const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16); + return Sum(vaddq_u16(val_0, sum_1)); + } + } + } + // 8x1 + return Sum(val_0); + } + if (ref_0_size_log2 == 4) { + const uint16x8_t sum_0 = LoadAndAdd16(ref_0_u16); + if (use_ref_1) { + switch (ref_1_size_log2) { + case 2: { // 16x4 + const uint16x4_t val_1 = vld1_u16(ref_1_u16); + const uint16x8_t sum_1 = vcombine_u16(vdup_n_u16(0), val_1); + return Sum(vaddq_u16(sum_0, sum_1)); + } + case 3: { // 16x8 + const uint16x8_t val_1 = vld1q_u16(ref_1_u16); + return Sum(vaddq_u16(sum_0, val_1)); + } + case 4: { // 16x16 + const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16); + return Sum(vaddq_u16(sum_0, sum_1)); + } + case 5: { // 16x32 + const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16); + return Sum(vaddq_u16(sum_0, sum_1)); + } + case 6: { // 16x64 + const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u16); + return Sum(vaddq_u16(sum_0, sum_1)); + } + } + } + // 16x1 + return Sum(sum_0); + } + if (ref_0_size_log2 == 5) { + const uint16x8_t sum_0 = LoadAndAdd32(ref_0_u16); + if (use_ref_1) { + switch (ref_1_size_log2) { + case 3: { // 32x8 + const uint16x8_t val_1 = vld1q_u16(ref_1_u16); + return Sum(vaddq_u16(sum_0, val_1)); + } + case 4: { // 32x16 + const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16); + return Sum(vaddq_u16(sum_0, sum_1)); + } + case 5: { // 32x32 + const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16); + return Sum(vaddq_u16(sum_0, sum_1)); + } + case 6: { // 32x64 + const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u16); + return Sum(vaddq_u16(sum_0, sum_1)); + } + } + } + // 32x1 + return Sum(sum_0); + } + + assert(ref_0_size_log2 == 6); + const uint16x8_t sum_0 = LoadAndAdd64(ref_0_u16); + if (use_ref_1) { + switch (ref_1_size_log2) { + case 4: { // 64x16 + const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16); + return Sum(vaddq_u16(sum_0, sum_1)); + } + case 5: { // 64x32 + const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16); + return Sum(vaddq_u16(sum_0, sum_1)); + } + case 6: { // 64x64 + const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u16); + return Sum(vaddq_u16(sum_0, sum_1)); + } + } + } + // 64x1 + return Sum(sum_0); +}
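+
+// Illustrative scalar equivalent of DcSum_NEON() above, kept only as a
+// reference sketch (nothing below dispatches to it): each reference
+// contributes 1 << size_log2 uint16_t pixels to the running sum.
+inline uint32_t DcSumScalar(const uint16_t* LIBGAV1_RESTRICT ref,
+                            const int size_log2) {
+  uint32_t sum = 0;
+  for (int i = 0; i < (1 << size_log2); ++i) sum += ref[i];
+  return sum;
+}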
+ +template <int width, int height> +inline void DcStore_NEON(void* const dest, ptrdiff_t stride, + const uint32x2_t dc) { + auto* dest_u16 = static_cast<uint16_t*>(dest); + ptrdiff_t stride_u16 = stride >> 1; + const uint16x8_t dc_dup = vdupq_lane_u16(vreinterpret_u16_u32(dc), 0); + if (width == 4) { + int i = height - 1; + do { + vst1_u16(dest_u16, vget_low_u16(dc_dup)); + dest_u16 += stride_u16; + } while (--i != 0); + vst1_u16(dest_u16, vget_low_u16(dc_dup)); + } else if (width == 8) { + int i = height - 1; + do { + vst1q_u16(dest_u16, dc_dup); + dest_u16 += stride_u16; + } while (--i != 0); + vst1q_u16(dest_u16, dc_dup); + } else if (width == 16) { + int i = height - 1; + do { + vst1q_u16(dest_u16, dc_dup); + vst1q_u16(dest_u16 + 8, dc_dup); + dest_u16 += stride_u16; + } while (--i != 0); + vst1q_u16(dest_u16, dc_dup); + vst1q_u16(dest_u16 + 8, dc_dup); + } else if (width == 32) { + int i = height - 1; + do { + vst1q_u16(dest_u16, dc_dup); + vst1q_u16(dest_u16 + 8, dc_dup); + vst1q_u16(dest_u16 + 16, dc_dup); + vst1q_u16(dest_u16 + 24, dc_dup); + dest_u16 += stride_u16; + } while (--i != 0); + vst1q_u16(dest_u16, dc_dup); + vst1q_u16(dest_u16 + 8, dc_dup); + vst1q_u16(dest_u16 + 16, dc_dup); + vst1q_u16(dest_u16 + 24, dc_dup); + } else { + assert(width == 64); + int i = height - 1; + do { + vst1q_u16(dest_u16, dc_dup); + vst1q_u16(dest_u16 + 8, dc_dup); + vst1q_u16(dest_u16 + 16, dc_dup); + vst1q_u16(dest_u16 + 24, dc_dup); + vst1q_u16(dest_u16 + 32, dc_dup); + vst1q_u16(dest_u16 + 40, dc_dup); + vst1q_u16(dest_u16 + 48, dc_dup); + vst1q_u16(dest_u16 + 56, dc_dup); + dest_u16 += stride_u16; + } while (--i != 0); + vst1q_u16(dest_u16, dc_dup); + vst1q_u16(dest_u16 + 8, dc_dup); + vst1q_u16(dest_u16 + 16, dc_dup); + vst1q_u16(dest_u16 + 24, dc_dup); + vst1q_u16(dest_u16 + 32, dc_dup); + vst1q_u16(dest_u16 + 40, dc_dup); + vst1q_u16(dest_u16 + 48, dc_dup); + vst1q_u16(dest_u16 + 56, dc_dup); + } +} + +struct DcDefs { + DcDefs() = delete; + + using _4x4 = DcPredFuncs_NEON<2, 2, DcSum_NEON, DcStore_NEON<4, 4>>; + using _4x8 = DcPredFuncs_NEON<2, 3, DcSum_NEON, DcStore_NEON<4, 8>>; + using _4x16 = DcPredFuncs_NEON<2, 4, DcSum_NEON, DcStore_NEON<4, 16>>; + using _8x4 = DcPredFuncs_NEON<3, 2, DcSum_NEON, DcStore_NEON<8, 4>>; + using _8x8 = DcPredFuncs_NEON<3, 3, DcSum_NEON, DcStore_NEON<8, 8>>; + using _8x16 = DcPredFuncs_NEON<3, 4, DcSum_NEON, DcStore_NEON<8, 16>>; + using _8x32 = DcPredFuncs_NEON<3, 5, DcSum_NEON, DcStore_NEON<8, 32>>; + using _16x4 = DcPredFuncs_NEON<4, 2, DcSum_NEON, DcStore_NEON<16, 4>>; + using _16x8 = DcPredFuncs_NEON<4, 3, DcSum_NEON, DcStore_NEON<16, 8>>; + using _16x16 = DcPredFuncs_NEON<4, 4, DcSum_NEON, DcStore_NEON<16, 16>>; + using _16x32 = DcPredFuncs_NEON<4, 5, DcSum_NEON, DcStore_NEON<16, 32>>; + using _16x64 = DcPredFuncs_NEON<4, 6, DcSum_NEON, DcStore_NEON<16, 64>>; + using _32x8 = DcPredFuncs_NEON<5, 3, DcSum_NEON, DcStore_NEON<32, 8>>; + using _32x16 = DcPredFuncs_NEON<5, 4, DcSum_NEON, DcStore_NEON<32, 16>>; + using _32x32 = DcPredFuncs_NEON<5, 5, DcSum_NEON, DcStore_NEON<32, 32>>; + using _32x64 = DcPredFuncs_NEON<5, 6, DcSum_NEON, DcStore_NEON<32, 64>>; + using _64x16 = DcPredFuncs_NEON<6, 4, DcSum_NEON, DcStore_NEON<64, 16>>; + using _64x32 = DcPredFuncs_NEON<6, 5, DcSum_NEON, DcStore_NEON<64, 32>>; + using _64x64 = DcPredFuncs_NEON<6, 6, DcSum_NEON, DcStore_NEON<64, 64>>; +}; + +// IntraPredFuncs_NEON::Horizontal -- duplicate left column across all rows + +template <int block_height> +void Horizontal4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* /*top_row*/, + const void* 
LIBGAV1_RESTRICT const left_column) { + const auto* const left = static_cast<const uint16_t*>(left_column); + auto* dst = static_cast<uint8_t*>(dest); + int y = 0; + do { + auto* dst16 = reinterpret_cast<uint16_t*>(dst); + const uint16x4_t row = vld1_dup_u16(left + y); + vst1_u16(dst16, row); + dst += stride; + } while (++y < block_height); +} + +template <int block_height> +void Horizontal8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* /*top_row*/, + const void* LIBGAV1_RESTRICT const left_column) { + const auto* const left = static_cast<const uint16_t*>(left_column); + auto* dst = static_cast<uint8_t*>(dest); + int y = 0; + do { + auto* dst16 = reinterpret_cast<uint16_t*>(dst); + const uint16x8_t row = vld1q_dup_u16(left + y); + vst1q_u16(dst16, row); + dst += stride; + } while (++y < block_height); +} + +template <int block_height> +void Horizontal16xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* /*top_row*/, + const void* LIBGAV1_RESTRICT const left_column) { + const auto* const left = static_cast<const uint16_t*>(left_column); + auto* dst = static_cast<uint8_t*>(dest); + int y = 0; + do { + const uint16x8_t row0 = vld1q_dup_u16(left + y); + const uint16x8_t row1 = vld1q_dup_u16(left + y + 1); + auto* dst16 = reinterpret_cast<uint16_t*>(dst); + vst1q_u16(dst16, row0); + vst1q_u16(dst16 + 8, row0); + dst += stride; + dst16 = reinterpret_cast<uint16_t*>(dst); + vst1q_u16(dst16, row1); + vst1q_u16(dst16 + 8, row1); + dst += stride; + y += 2; + } while (y < block_height); +} + +template <int block_height> +void Horizontal32xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* /*top_row*/, + const void* LIBGAV1_RESTRICT const left_column) { + const auto* const left = static_cast<const uint16_t*>(left_column); + auto* dst = static_cast<uint8_t*>(dest); + int y = 0; + do { + const uint16x8_t row0 = vld1q_dup_u16(left + y); + const uint16x8_t row1 = vld1q_dup_u16(left + y + 1); + auto* dst16 = reinterpret_cast<uint16_t*>(dst); + vst1q_u16(dst16, row0); + vst1q_u16(dst16 + 8, row0); + vst1q_u16(dst16 + 16, row0); + vst1q_u16(dst16 + 24, row0); + dst += stride; + dst16 = reinterpret_cast<uint16_t*>(dst); + vst1q_u16(dst16, row1); + vst1q_u16(dst16 + 8, row1); + vst1q_u16(dst16 + 16, row1); + vst1q_u16(dst16 + 24, row1); + dst += stride; + y += 2; + } while (y < block_height); +} + +// IntraPredFuncs_NEON::Vertical -- copy top row to all rows + +template <int block_height> +void Vertical4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* const /*left_column*/) { + const auto* const top = static_cast<const uint8_t*>(top_row); + auto* dst = static_cast<uint8_t*>(dest); + const uint8x8_t row = vld1_u8(top); + int y = block_height; + do { + vst1_u8(dst, row); + dst += stride; + } while (--y != 0); +} + +template <int block_height> +void Vertical8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* const /*left_column*/) { + const auto* const top = static_cast<const uint8_t*>(top_row); + auto* dst = static_cast<uint8_t*>(dest); + const uint8x16_t row = vld1q_u8(top); + int y = block_height; + do { + vst1q_u8(dst, row); + dst += stride; + } while (--y != 0); +} + +template <int block_height> +void Vertical16xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* const /*left_column*/) { + const auto* const top = static_cast<const uint8_t*>(top_row); + auto* dst = static_cast<uint8_t*>(dest); + const uint8x16_t row0 = vld1q_u8(top); + const uint8x16_t row1 = vld1q_u8(top + 16); + int y = block_height; + do { + vst1q_u8(dst, row0); + vst1q_u8(dst + 16, row1); + dst += stride; + vst1q_u8(dst, row0); + vst1q_u8(dst + 16, row1); + dst += stride; + y -= 2; + } while (y != 0); +} + +template <int block_height> 
+void Vertical32xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* const /*left_column*/) { + const auto* const top = static_cast<const uint8_t*>(top_row); + auto* dst = static_cast<uint8_t*>(dest); + const uint8x16_t row0 = vld1q_u8(top); + const uint8x16_t row1 = vld1q_u8(top + 16); + const uint8x16_t row2 = vld1q_u8(top + 32); + const uint8x16_t row3 = vld1q_u8(top + 48); + int y = block_height; + do { + vst1q_u8(dst, row0); + vst1q_u8(dst + 16, row1); + vst1q_u8(dst + 32, row2); + vst1q_u8(dst + 48, row3); + dst += stride; + vst1q_u8(dst, row0); + vst1q_u8(dst + 16, row1); + vst1q_u8(dst + 32, row2); + vst1q_u8(dst + 48, row3); + dst += stride; + y -= 2; + } while (y != 0); +} + +template <int block_height> +void Vertical64xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* const /*left_column*/) { + const auto* const top = static_cast<const uint8_t*>(top_row); + auto* dst = static_cast<uint8_t*>(dest); + const uint8x16_t row0 = vld1q_u8(top); + const uint8x16_t row1 = vld1q_u8(top + 16); + const uint8x16_t row2 = vld1q_u8(top + 32); + const uint8x16_t row3 = vld1q_u8(top + 48); + const uint8x16_t row4 = vld1q_u8(top + 64); + const uint8x16_t row5 = vld1q_u8(top + 80); + const uint8x16_t row6 = vld1q_u8(top + 96); + const uint8x16_t row7 = vld1q_u8(top + 112); + int y = block_height; + do { + vst1q_u8(dst, row0); + vst1q_u8(dst + 16, row1); + vst1q_u8(dst + 32, row2); + vst1q_u8(dst + 48, row3); + vst1q_u8(dst + 64, row4); + vst1q_u8(dst + 80, row5); + vst1q_u8(dst + 96, row6); + vst1q_u8(dst + 112, row7); + dst += stride; + vst1q_u8(dst, row0); + vst1q_u8(dst + 16, row1); + vst1q_u8(dst + 32, row2); + vst1q_u8(dst + 48, row3); + vst1q_u8(dst + 64, row4); + vst1q_u8(dst + 80, row5); + vst1q_u8(dst + 96, row6); + vst1q_u8(dst + 112, row7); + dst += stride; + y -= 2; + } while (y != 0); +} + +template <int height> +inline void Paeth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_ptr, + const void* LIBGAV1_RESTRICT const left_ptr) { + auto* dst = static_cast<uint8_t*>(dest); + const auto* const top_row = static_cast<const uint16_t*>(top_ptr); + const auto* const left_col = static_cast<const uint16_t*>(left_ptr); + + const uint16x4_t top_left = vdup_n_u16(top_row[-1]); + const uint16x4_t top_left_x2 = vshl_n_u16(top_left, 1); + const uint16x4_t top = vld1_u16(top_row); + + for (int y = 0; y < height; ++y) { + auto* dst16 = reinterpret_cast<uint16_t*>(dst); + const uint16x4_t left = vdup_n_u16(left_col[y]); + + const uint16x4_t left_dist = vabd_u16(top, top_left); + const uint16x4_t top_dist = vabd_u16(left, top_left); + const uint16x4_t top_left_dist = vabd_u16(vadd_u16(top, left), top_left_x2); + + const uint16x4_t left_le_top = vcle_u16(left_dist, top_dist); + const uint16x4_t left_le_top_left = vcle_u16(left_dist, top_left_dist); + const uint16x4_t top_le_top_left = vcle_u16(top_dist, top_left_dist); + + // if (left_dist <= top_dist && left_dist <= top_left_dist) + const uint16x4_t left_mask = vand_u16(left_le_top, left_le_top_left); + // dest[x] = left_column[y]; + // Fill all the unused spaces with 'top'. They will be overwritten when + // the positions for top_left are known. + uint16x4_t result = vbsl_u16(left_mask, left, top); + // else if (top_dist <= top_left_dist) + // dest[x] = top_row[x]; + // Add these values to the mask. They were already set. 
+ const uint16x4_t left_or_top_mask = vorr_u16(left_mask, top_le_top_left); + // else + // dest[x] = top_left; + result = vbsl_u16(left_or_top_mask, result, top_left); + + vst1_u16(dst16, result); + dst += stride; + } +} + +template <int height> +inline void Paeth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_ptr, + const void* LIBGAV1_RESTRICT const left_ptr) { + auto* dst = static_cast<uint8_t*>(dest); + const auto* const top_row = static_cast<const uint16_t*>(top_ptr); + const auto* const left_col = static_cast<const uint16_t*>(left_ptr); + + const uint16x8_t top_left = vdupq_n_u16(top_row[-1]); + const uint16x8_t top_left_x2 = vshlq_n_u16(top_left, 1); + const uint16x8_t top = vld1q_u16(top_row); + + for (int y = 0; y < height; ++y) { + auto* dst16 = reinterpret_cast<uint16_t*>(dst); + const uint16x8_t left = vdupq_n_u16(left_col[y]); + + const uint16x8_t left_dist = vabdq_u16(top, top_left); + const uint16x8_t top_dist = vabdq_u16(left, top_left); + const uint16x8_t top_left_dist = + vabdq_u16(vaddq_u16(top, left), top_left_x2); + + const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist); + const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist); + const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist); + + // if (left_dist <= top_dist && left_dist <= top_left_dist) + const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left); + // dest[x] = left_column[y]; + // Fill all the unused spaces with 'top'. They will be overwritten when + // the positions for top_left are known. + uint16x8_t result = vbslq_u16(left_mask, left, top); + // else if (top_dist <= top_left_dist) + // dest[x] = top_row[x]; + // Add these values to the mask. They were already set. + const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left); + // else + // dest[x] = top_left; + result = vbslq_u16(left_or_top_mask, result, top_left); + + vst1q_u16(dst16, result); + dst += stride; + } +} + +// For 16xH and above. +template <int width, int height> +inline void PaethWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_ptr, + const void* LIBGAV1_RESTRICT const left_ptr) { + auto* dst = static_cast<uint8_t*>(dest); + const auto* const top_row = static_cast<const uint16_t*>(top_ptr); + const auto* const left_col = static_cast<const uint16_t*>(left_ptr); + + const uint16x8_t top_left = vdupq_n_u16(top_row[-1]); + const uint16x8_t top_left_x2 = vshlq_n_u16(top_left, 1); + + uint16x8_t top[width >> 3]; + for (int i = 0; i < width >> 3; ++i) { + top[i] = vld1q_u16(top_row + (i << 3)); + } + + for (int y = 0; y < height; ++y) { + auto* dst_x = reinterpret_cast<uint16_t*>(dst); + const uint16x8_t left = vdupq_n_u16(left_col[y]); + const uint16x8_t top_dist = vabdq_u16(left, top_left); + + for (int i = 0; i < (width >> 3); ++i) { + const uint16x8_t left_dist = vabdq_u16(top[i], top_left); + const uint16x8_t top_left_dist = + vabdq_u16(vaddq_u16(top[i], left), top_left_x2); + + const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist); + const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist); + const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist); + + // if (left_dist <= top_dist && left_dist <= top_left_dist) + const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left); + // dest[x] = left_column[y]; + // Fill all the unused spaces with 'top'. They will be overwritten when + // the positions for top_left are known. 
+ uint16x8_t result = vbslq_u16(left_mask, left, top[i]); + // else if (top_dist <= top_left_dist) + // dest[x] = top_row[x]; + // Add these values to the mask. They were already set. + const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left); + // else + // dest[x] = top_left; + result = vbslq_u16(left_or_top_mask, result, top_left); + + vst1q_u16(dst_x, result); + dst_x += 8; + } + dst += stride; + } +} + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] = + DcDefs::_4x4::DcTop; + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] = + DcDefs::_4x4::DcLeft; + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] = + DcDefs::_4x4::Dc; + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorVertical] = + Vertical4xH_NEON<4>; + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] = + Paeth4xH_NEON<4>; + + // 4x8 + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] = + DcDefs::_4x8::DcTop; + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] = + DcDefs::_4x8::DcLeft; + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] = + DcDefs::_4x8::Dc; + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] = + Horizontal4xH_NEON<8>; + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorVertical] = + Vertical4xH_NEON<8>; + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] = + Paeth4xH_NEON<8>; + + // 4x16 + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] = + DcDefs::_4x16::DcTop; + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] = + DcDefs::_4x16::DcLeft; + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] = + DcDefs::_4x16::Dc; + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] = + Horizontal4xH_NEON<16>; + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorVertical] = + Vertical4xH_NEON<16>; + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] = + Paeth4xH_NEON<16>; + + // 8x4 + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] = + DcDefs::_8x4::DcTop; + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] = + DcDefs::_8x4::DcLeft; + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] = + DcDefs::_8x4::Dc; + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorVertical] = + Vertical8xH_NEON<4>; + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] = + Paeth8xH_NEON<4>; + + // 8x8 + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] = + DcDefs::_8x8::DcTop; + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] = + DcDefs::_8x8::DcLeft; + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] = + DcDefs::_8x8::Dc; + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] = + Horizontal8xH_NEON<8>; + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorVertical] = + Vertical8xH_NEON<8>; + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] = + Paeth8xH_NEON<8>; + + // 8x16 + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] = + DcDefs::_8x16::DcTop; + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] = + DcDefs::_8x16::DcLeft; + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] = + DcDefs::_8x16::Dc; + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorVertical] = + Vertical8xH_NEON<16>; + 
dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] = + Paeth8xH_NEON<16>; + + // 8x32 + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] = + DcDefs::_8x32::DcTop; + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] = + DcDefs::_8x32::DcLeft; + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] = + DcDefs::_8x32::Dc; + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] = + Horizontal8xH_NEON<32>; + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorVertical] = + Vertical8xH_NEON<32>; + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] = + Paeth8xH_NEON<32>; + + // 16x4 + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] = + DcDefs::_16x4::DcTop; + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] = + DcDefs::_16x4::DcLeft; + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] = + DcDefs::_16x4::Dc; + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorVertical] = + Vertical16xH_NEON<4>; + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] = + PaethWxH_NEON<16, 4>; + + // 16x8 + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] = + DcDefs::_16x8::DcTop; + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] = + DcDefs::_16x8::DcLeft; + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] = + DcDefs::_16x8::Dc; + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] = + Horizontal16xH_NEON<8>; + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorVertical] = + Vertical16xH_NEON<8>; + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] = + PaethWxH_NEON<16, 8>; + + // 16x16 + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] = + DcDefs::_16x16::DcTop; + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] = + DcDefs::_16x16::DcLeft; + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] = + DcDefs::_16x16::Dc; + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorVertical] = + Vertical16xH_NEON<16>; + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] = + PaethWxH_NEON<16, 16>; + + // 16x32 + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] = + DcDefs::_16x32::DcTop; + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] = + DcDefs::_16x32::DcLeft; + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] = + DcDefs::_16x32::Dc; + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorVertical] = + Vertical16xH_NEON<32>; + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] = + PaethWxH_NEON<16, 32>; + + // 16x64 + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] = + DcDefs::_16x64::DcTop; + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] = + DcDefs::_16x64::DcLeft; + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] = + DcDefs::_16x64::Dc; + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorVertical] = + Vertical16xH_NEON<64>; + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] = + PaethWxH_NEON<16, 64>; + + // 32x8 + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] = + DcDefs::_32x8::DcTop; + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] = + DcDefs::_32x8::DcLeft; + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] = + DcDefs::_32x8::Dc; + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorVertical] = + Vertical32xH_NEON<8>; + 
dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] = + PaethWxH_NEON<32, 8>; + + // 32x16 + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] = + DcDefs::_32x16::DcTop; + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] = + DcDefs::_32x16::DcLeft; + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] = + DcDefs::_32x16::Dc; + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorVertical] = + Vertical32xH_NEON<16>; + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] = + PaethWxH_NEON<32, 16>; + + // 32x32 + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] = + DcDefs::_32x32::DcTop; + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] = + DcDefs::_32x32::DcLeft; + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] = + DcDefs::_32x32::Dc; + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorVertical] = + Vertical32xH_NEON<32>; + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] = + PaethWxH_NEON<32, 32>; + + // 32x64 + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] = + DcDefs::_32x64::DcTop; + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] = + DcDefs::_32x64::DcLeft; + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] = + DcDefs::_32x64::Dc; + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] = + Horizontal32xH_NEON<64>; + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorVertical] = + Vertical32xH_NEON<64>; + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] = + PaethWxH_NEON<32, 64>; + + // 64x16 + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] = + DcDefs::_64x16::DcTop; + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] = + DcDefs::_64x16::DcLeft; + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] = + DcDefs::_64x16::Dc; + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorVertical] = + Vertical64xH_NEON<16>; + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] = + PaethWxH_NEON<64, 16>; + + // 64x32 + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] = + DcDefs::_64x32::DcTop; + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] = + DcDefs::_64x32::DcLeft; + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] = + DcDefs::_64x32::Dc; + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorVertical] = + Vertical64xH_NEON<32>; + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] = + PaethWxH_NEON<64, 32>; + + // 64x64 + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] = + DcDefs::_64x64::DcTop; + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] = + DcDefs::_64x64::DcLeft; + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] = + DcDefs::_64x64::Dc; + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorVertical] = + Vertical64xH_NEON<64>; + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] = + PaethWxH_NEON<64, 64>; +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void IntraPredInit_NEON() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_ENABLE_NEON +namespace libgav1 { +namespace dsp { + +void IntraPredInit_NEON() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_ENABLE_NEON diff --git 
a/src/dsp/arm/intrapred_neon.h b/src/dsp/arm/intrapred_neon.h new file mode 100644 index 0000000..5a56924 --- /dev/null +++ b/src/dsp/arm/intrapred_neon.h @@ -0,0 +1,323 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_NEON_H_ +#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_NEON_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::intra_predictors. +// See the defines below for specifics. These functions are not thread-safe. +void IntraPredInit_NEON(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_ENABLE_NEON +// 4x4 +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorPaeth LIBGAV1_CPU_NEON + +// 4x8 +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorPaeth LIBGAV1_CPU_NEON + +// 4x16 +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorPaeth LIBGAV1_CPU_NEON + +// 8x4 +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorPaeth LIBGAV1_CPU_NEON + +// 8x8 +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorPaeth LIBGAV1_CPU_NEON + +// 8x16 +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorPaeth LIBGAV1_CPU_NEON + +// 8x32 +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorPaeth LIBGAV1_CPU_NEON + +// 16x4 +#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcTop 
LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorPaeth LIBGAV1_CPU_NEON + +// 16x8 +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorPaeth LIBGAV1_CPU_NEON + +// 16x16 +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorPaeth LIBGAV1_CPU_NEON + +// 16x32 +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorPaeth LIBGAV1_CPU_NEON + +// 16x64 +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorPaeth LIBGAV1_CPU_NEON + +// 32x8 +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorPaeth LIBGAV1_CPU_NEON + +// 32x16 +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorPaeth LIBGAV1_CPU_NEON + +// 32x32 +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorPaeth LIBGAV1_CPU_NEON + +// 32x64 +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorPaeth LIBGAV1_CPU_NEON + +// 64x16 +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorPaeth LIBGAV1_CPU_NEON + +// 64x32 +#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDc LIBGAV1_CPU_NEON +#define 
LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorPaeth LIBGAV1_CPU_NEON + +// 64x64 +#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorPaeth LIBGAV1_CPU_NEON + +// 10 bit +// 4x4 +#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorPaeth LIBGAV1_CPU_NEON + +// 4x8 +#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorHorizontal \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorPaeth LIBGAV1_CPU_NEON + +// 4x16 +#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorHorizontal \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorPaeth LIBGAV1_CPU_NEON + +// 8x4 +#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorPaeth LIBGAV1_CPU_NEON + +// 8x8 +#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorHorizontal \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorPaeth LIBGAV1_CPU_NEON + +// 8x16 +#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorPaeth LIBGAV1_CPU_NEON + +// 8x32 +#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorHorizontal \ + LIBGAV1_CPU_NEON +#define 
LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorPaeth LIBGAV1_CPU_NEON + +// 16x4 +#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorPaeth LIBGAV1_CPU_NEON + +// 16x8 +#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorHorizontal \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorPaeth LIBGAV1_CPU_NEON + +// 16x16 +#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcLeft \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorPaeth LIBGAV1_CPU_NEON + +// 16x32 +#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcLeft \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorPaeth LIBGAV1_CPU_NEON + +// 16x64 +#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcLeft \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorPaeth LIBGAV1_CPU_NEON + +// 32x8 +#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorPaeth LIBGAV1_CPU_NEON + +// 32x16 +#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcLeft \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorPaeth LIBGAV1_CPU_NEON + +// 32x32 +#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcLeft \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDc LIBGAV1_CPU_NEON +#define 
LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorPaeth LIBGAV1_CPU_NEON + +// 32x64 +#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcLeft \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorHorizontal \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorPaeth LIBGAV1_CPU_NEON + +// 64x16 +#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcLeft \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorPaeth LIBGAV1_CPU_NEON + +// 64x32 +#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcLeft \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorPaeth LIBGAV1_CPU_NEON + +// 64x64 +#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcLeft \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorPaeth LIBGAV1_CPU_NEON +#endif // LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_NEON_H_ diff --git a/src/dsp/arm/intrapred_smooth_neon.cc b/src/dsp/arm/intrapred_smooth_neon.cc new file mode 100644 index 0000000..bcda131 --- /dev/null +++ b/src/dsp/arm/intrapred_smooth_neon.cc @@ -0,0 +1,1166 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/intrapred_smooth.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_ENABLE_NEON + +#include <arm_neon.h> + +#include <cassert> +#include <cstddef> +#include <cstdint> + +#include "src/dsp/arm/common_neon.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" + +namespace libgav1 { +namespace dsp { + +namespace low_bitdepth { +namespace { + +// Note these constants are duplicated from intrapred.cc to allow the compiler +// to have visibility of the values. This helps reduce loads and aids in the +// creation of the inverse weights. 
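+// A smooth prediction blends four weighted terms per pixel and rounds by
+// kSmoothWeightScale + 1 bits (see CalculatePred() below), i.e.
+//   pred(x, y) = (w[y] * top[x] + (256 - w[y]) * bottom_left +
+//                 w[x] * left[y] + (256 - w[x]) * top_right + 256) >> 9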
+constexpr uint8_t kSmoothWeights[] = { +#include "src/dsp/smooth_weights.inc" +}; + +inline uint16x4_t CalculatePred(const uint16x4_t weighted_top, + const uint16x4_t weighted_left, + const uint16x4_t weighted_bl, + const uint16x4_t weighted_tr) { + const uint32x4_t pred_0 = vaddl_u16(weighted_top, weighted_left); + const uint32x4_t pred_1 = vaddl_u16(weighted_bl, weighted_tr); + const uint32x4_t pred_2 = vaddq_u32(pred_0, pred_1); + return vrshrn_n_u32(pred_2, kSmoothWeightScale + 1); +} + +template <int height> +inline void Smooth4xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + constexpr int width = 4; + const auto* const top = static_cast<const uint8_t*>(top_row); + const auto* const left = static_cast<const uint8_t*>(left_column); + const uint8_t top_right = top[width - 1]; + const uint8_t bottom_left = left[height - 1]; + const uint8_t* const weights_y = kSmoothWeights + height - 4; + auto* dst = static_cast<uint8_t*>(dest); + + const uint8x8_t top_v = Load4(top); + const uint8x8_t top_right_v = vdup_n_u8(top_right); + const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); + const uint8x8_t weights_x_v = Load4(kSmoothWeights + width - 4); + // 256 - weights = vneg_s8(weights) + const uint8x8_t scaled_weights_x = + vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_x_v))); + + for (int y = 0; y < height; ++y) { + const uint8x8_t left_v = vdup_n_u8(left[y]); + const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); + const uint8x8_t scaled_weights_y = + vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_y_v))); + const uint16x4_t weighted_bl = + vget_low_u16(vmull_u8(scaled_weights_y, bottom_left_v)); + + const uint16x4_t weighted_top = vget_low_u16(vmull_u8(weights_y_v, top_v)); + const uint16x4_t weighted_left = + vget_low_u16(vmull_u8(weights_x_v, left_v)); + const uint16x4_t weighted_tr = + vget_low_u16(vmull_u8(scaled_weights_x, top_right_v)); + const uint16x4_t result = + CalculatePred(weighted_top, weighted_left, weighted_bl, weighted_tr); + + StoreLo4(dst, vmovn_u16(vcombine_u16(result, result))); + dst += stride; + } +} + +inline uint8x8_t CalculatePred(const uint16x8_t weighted_top, + const uint16x8_t weighted_left, + const uint16x8_t weighted_bl, + const uint16x8_t weighted_tr) { + // Maximum value: 0xFF00 + const uint16x8_t pred_0 = vaddq_u16(weighted_top, weighted_bl); + // Maximum value: 0xFF00 + const uint16x8_t pred_1 = vaddq_u16(weighted_left, weighted_tr); + const uint16x8_t pred_2 = vhaddq_u16(pred_0, pred_1); + return vrshrn_n_u16(pred_2, kSmoothWeightScale); +} + +template <int height> +inline void Smooth8xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + constexpr int width = 8; + const auto* const top = static_cast<const uint8_t*>(top_row); + const auto* const left = static_cast<const uint8_t*>(left_column); + const uint8_t top_right = top[width - 1]; + const uint8_t bottom_left = left[height - 1]; + const uint8_t* const weights_y = kSmoothWeights + height - 4; + auto* dst = static_cast<uint8_t*>(dest); + + const uint8x8_t top_v = vld1_u8(top); + const uint8x8_t top_right_v = vdup_n_u8(top_right); + const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); + const uint8x8_t weights_x_v = vld1_u8(kSmoothWeights + width - 4); + // 256 - weights = vneg_s8(weights) + const uint8x8_t scaled_weights_x = + vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_x_v))); + + for (int y = 0; y < height; ++y) { + const uint8x8_t left_v = vdup_n_u8(left[y]);
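+    // (256 - w) is congruent to vneg_s8(w) modulo 256, so the direct
+    // subtraction below matches the vneg_s8() form used for the x weights.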
+ const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); + const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]); + const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v); + + const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v); + const uint16x8_t weighted_left = vmull_u8(weights_x_v, left_v); + const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); + const uint8x8_t result = + CalculatePred(weighted_top, weighted_left, weighted_bl, weighted_tr); + + vst1_u8(dst, result); + dst += stride; + } +} + +inline uint8x16_t CalculateWeightsAndPred( + const uint8x16_t top, const uint8x8_t left, const uint8x8_t top_right, + const uint8x8_t weights_y, const uint8x16_t weights_x, + const uint8x16_t scaled_weights_x, const uint16x8_t weighted_bl) { + const uint16x8_t weighted_top_low = vmull_u8(weights_y, vget_low_u8(top)); + const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left); + const uint16x8_t weighted_tr_low = + vmull_u8(vget_low_u8(scaled_weights_x), top_right); + const uint8x8_t result_low = CalculatePred( + weighted_top_low, weighted_left_low, weighted_bl, weighted_tr_low); + + const uint16x8_t weighted_top_high = vmull_u8(weights_y, vget_high_u8(top)); + const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left); + const uint16x8_t weighted_tr_high = + vmull_u8(vget_high_u8(scaled_weights_x), top_right); + const uint8x8_t result_high = CalculatePred( + weighted_top_high, weighted_left_high, weighted_bl, weighted_tr_high); + + return vcombine_u8(result_low, result_high); +} + +template <int width, int height> +inline void Smooth16PlusxN_NEON( + void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const auto* const top = static_cast<const uint8_t*>(top_row); + const auto* const left = static_cast<const uint8_t*>(left_column); + const uint8_t top_right = top[width - 1]; + const uint8_t bottom_left = left[height - 1]; + const uint8_t* const weights_y = kSmoothWeights + height - 4; + auto* dst = static_cast<uint8_t*>(dest); + + uint8x16_t top_v[4]; + top_v[0] = vld1q_u8(top); + if (width > 16) { + top_v[1] = vld1q_u8(top + 16); + if (width == 64) { + top_v[2] = vld1q_u8(top + 32); + top_v[3] = vld1q_u8(top + 48); + } + } + + const uint8x8_t top_right_v = vdup_n_u8(top_right); + const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); + + // TODO(johannkoenig): Consider re-reading top_v and weights_x_v in the loop. + // This currently has a performance slope similar to Paeth so it does not + // appear to be register bound for arm64. 
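+  // The x weights are loaded once per block: kSmoothWeights is indexed at
+  // width - 4, and widths above 16 read additional 16-byte chunks.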
+ uint8x16_t weights_x_v[4]; + weights_x_v[0] = vld1q_u8(kSmoothWeights + width - 4); + if (width > 16) { + weights_x_v[1] = vld1q_u8(kSmoothWeights + width + 16 - 4); + if (width == 64) { + weights_x_v[2] = vld1q_u8(kSmoothWeights + width + 32 - 4); + weights_x_v[3] = vld1q_u8(kSmoothWeights + width + 48 - 4); + } + } + + uint8x16_t scaled_weights_x[4]; + scaled_weights_x[0] = + vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[0]))); + if (width > 16) { + scaled_weights_x[1] = + vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[1]))); + if (width == 64) { + scaled_weights_x[2] = + vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[2]))); + scaled_weights_x[3] = + vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[3]))); + } + } + + for (int y = 0; y < height; ++y) { + const uint8x8_t left_v = vdup_n_u8(left[y]); + const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); + const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]); + const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v); + + vst1q_u8(dst, CalculateWeightsAndPred(top_v[0], left_v, top_right_v, + weights_y_v, weights_x_v[0], + scaled_weights_x[0], weighted_bl)); + + if (width > 16) { + vst1q_u8(dst + 16, CalculateWeightsAndPred( + top_v[1], left_v, top_right_v, weights_y_v, + weights_x_v[1], scaled_weights_x[1], weighted_bl)); + if (width == 64) { + vst1q_u8(dst + 32, + CalculateWeightsAndPred(top_v[2], left_v, top_right_v, + weights_y_v, weights_x_v[2], + scaled_weights_x[2], weighted_bl)); + vst1q_u8(dst + 48, + CalculateWeightsAndPred(top_v[3], left_v, top_right_v, + weights_y_v, weights_x_v[3], + scaled_weights_x[3], weighted_bl)); + } + } + + dst += stride; + } +} + +template <int width, int height> +inline void SmoothVertical4Or8xN_NEON( + void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const auto* const top = static_cast<const uint8_t*>(top_row); + const auto* const left = static_cast<const uint8_t*>(left_column); + const uint8_t bottom_left = left[height - 1]; + const uint8_t* const weights_y = kSmoothWeights + height - 4; + auto* dst = static_cast<uint8_t*>(dest); + + uint8x8_t top_v; + if (width == 4) { + top_v = Load4(top); + } else { // width == 8 + top_v = vld1_u8(top); + } + + const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); + + for (int y = 0; y < height; ++y) { + const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); + const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]); + + const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v); + const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v); + const uint16x8_t pred = vaddq_u16(weighted_top, weighted_bl); + const uint8x8_t pred_scaled = vrshrn_n_u16(pred, kSmoothWeightScale); + + if (width == 4) { + StoreLo4(dst, pred_scaled); + } else { // width == 8 + vst1_u8(dst, pred_scaled); + } + dst += stride; + } +} + +inline uint8x16_t CalculateVerticalWeightsAndPred( + const uint8x16_t top, const uint8x8_t weights_y, + const uint16x8_t weighted_bl) { + const uint16x8_t weighted_top_low = vmull_u8(weights_y, vget_low_u8(top)); + const uint16x8_t weighted_top_high = vmull_u8(weights_y, vget_high_u8(top)); + const uint16x8_t pred_low = vaddq_u16(weighted_top_low, weighted_bl); + const uint16x8_t pred_high = vaddq_u16(weighted_top_high, weighted_bl); + const uint8x8_t pred_scaled_low = vrshrn_n_u16(pred_low, kSmoothWeightScale); + const uint8x8_t pred_scaled_high = + vrshrn_n_u16(pred_high, kSmoothWeightScale); + return
vcombine_u8(pred_scaled_low, pred_scaled_high); +} + +template +inline void SmoothVertical16PlusxN_NEON( + void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const auto* const top = static_cast(top_row); + const auto* const left = static_cast(left_column); + const uint8_t bottom_left = left[height - 1]; + const uint8_t* const weights_y = kSmoothWeights + height - 4; + auto* dst = static_cast(dest); + + uint8x16_t top_v[4]; + top_v[0] = vld1q_u8(top); + if (width > 16) { + top_v[1] = vld1q_u8(top + 16); + if (width == 64) { + top_v[2] = vld1q_u8(top + 32); + top_v[3] = vld1q_u8(top + 48); + } + } + + const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); + + for (int y = 0; y < height; ++y) { + const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); + const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]); + const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v); + + const uint8x16_t pred_0 = + CalculateVerticalWeightsAndPred(top_v[0], weights_y_v, weighted_bl); + vst1q_u8(dst, pred_0); + + if (width > 16) { + const uint8x16_t pred_1 = + CalculateVerticalWeightsAndPred(top_v[1], weights_y_v, weighted_bl); + vst1q_u8(dst + 16, pred_1); + + if (width == 64) { + const uint8x16_t pred_2 = + CalculateVerticalWeightsAndPred(top_v[2], weights_y_v, weighted_bl); + vst1q_u8(dst + 32, pred_2); + + const uint8x16_t pred_3 = + CalculateVerticalWeightsAndPred(top_v[3], weights_y_v, weighted_bl); + vst1q_u8(dst + 48, pred_3); + } + } + + dst += stride; + } +} + +template +inline void SmoothHorizontal4Or8xN_NEON( + void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const auto* const top = static_cast(top_row); + const auto* const left = static_cast(left_column); + const uint8_t top_right = top[width - 1]; + auto* dst = static_cast(dest); + + const uint8x8_t top_right_v = vdup_n_u8(top_right); + // Over-reads for 4xN but still within the array. 
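+  // (The table is indexed at kSmoothWeights + width - 4, so for width == 4
+  // the 8-byte load also pulls in the first four width-8 weights; only the
+  // low four lanes are used. The "256 - weights" negation just below is
+  // exact in 8-bit arithmetic because 256 - w == -w (mod 2^8).)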
+  const uint8x8_t weights_x = vld1_u8(kSmoothWeights + width - 4);
+  // 256 - weights = vneg_s8(weights)
+  const uint8x8_t scaled_weights_x =
+      vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_x)));
+
+  for (int y = 0; y < height; ++y) {
+    const uint8x8_t left_v = vdup_n_u8(left[y]);
+
+    const uint16x8_t weighted_left = vmull_u8(weights_x, left_v);
+    const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
+    const uint16x8_t pred = vaddq_u16(weighted_left, weighted_tr);
+    const uint8x8_t pred_scaled = vrshrn_n_u16(pred, kSmoothWeightScale);
+
+    if (width == 4) {
+      StoreLo4(dst, pred_scaled);
+    } else {  // width == 8
+      vst1_u8(dst, pred_scaled);
+    }
+    dst += stride;
+  }
+}
+
+inline uint8x16_t CalculateHorizontalWeightsAndPred(
+    const uint8x8_t left, const uint8x8_t top_right,
+    const uint8x16_t weights_x, const uint8x16_t scaled_weights_x) {
+  const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left);
+  const uint16x8_t weighted_tr_low =
+      vmull_u8(vget_low_u8(scaled_weights_x), top_right);
+  const uint16x8_t pred_low = vaddq_u16(weighted_left_low, weighted_tr_low);
+  const uint8x8_t pred_scaled_low = vrshrn_n_u16(pred_low, kSmoothWeightScale);
+
+  const uint16x8_t weighted_left_high =
+      vmull_u8(vget_high_u8(weights_x), left);
+  const uint16x8_t weighted_tr_high =
+      vmull_u8(vget_high_u8(scaled_weights_x), top_right);
+  const uint16x8_t pred_high = vaddq_u16(weighted_left_high, weighted_tr_high);
+  const uint8x8_t pred_scaled_high =
+      vrshrn_n_u16(pred_high, kSmoothWeightScale);
+
+  return vcombine_u8(pred_scaled_low, pred_scaled_high);
+}
+
+template <int width, int height>
+inline void SmoothHorizontal16PlusxN_NEON(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const auto* const left = static_cast<const uint8_t*>(left_column);
+  const uint8_t top_right = top[width - 1];
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  const uint8x8_t top_right_v = vdup_n_u8(top_right);
+
+  uint8x16_t weights_x[4];
+  weights_x[0] = vld1q_u8(kSmoothWeights + width - 4);
+  if (width > 16) {
+    weights_x[1] = vld1q_u8(kSmoothWeights + width + 16 - 4);
+    if (width == 64) {
+      weights_x[2] = vld1q_u8(kSmoothWeights + width + 32 - 4);
+      weights_x[3] = vld1q_u8(kSmoothWeights + width + 48 - 4);
+    }
+  }
+
+  uint8x16_t scaled_weights_x[4];
+  scaled_weights_x[0] =
+      vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[0])));
+  if (width > 16) {
+    scaled_weights_x[1] =
+        vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[1])));
+    if (width == 64) {
+      scaled_weights_x[2] =
+          vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[2])));
+      scaled_weights_x[3] =
+          vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[3])));
+    }
+  }
+
+  for (int y = 0; y < height; ++y) {
+    const uint8x8_t left_v = vdup_n_u8(left[y]);
+
+    const uint8x16_t pred_0 = CalculateHorizontalWeightsAndPred(
+        left_v, top_right_v, weights_x[0], scaled_weights_x[0]);
+    vst1q_u8(dst, pred_0);
+
+    if (width > 16) {
+      const uint8x16_t pred_1 = CalculateHorizontalWeightsAndPred(
+          left_v, top_right_v, weights_x[1], scaled_weights_x[1]);
+      vst1q_u8(dst + 16, pred_1);
+
+      if (width == 64) {
+        const uint8x16_t pred_2 = CalculateHorizontalWeightsAndPred(
+            left_v, top_right_v, weights_x[2], scaled_weights_x[2]);
+        vst1q_u8(dst + 32, pred_2);
+
+        const uint8x16_t pred_3 = CalculateHorizontalWeightsAndPred(
+            left_v, top_right_v, weights_x[3], scaled_weights_x[3]);
+        vst1q_u8(dst + 48, pred_3);
+      }
+    }
+    dst += stride;
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  // 4x4
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
+      Smooth4xN_NEON<4>;
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
+      SmoothVertical4Or8xN_NEON<4, 4>;
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal4Or8xN_NEON<4, 4>;
+
+  // 4x8
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
+      Smooth4xN_NEON<8>;
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
+      SmoothVertical4Or8xN_NEON<4, 8>;
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal4Or8xN_NEON<4, 8>;
+
+  // 4x16
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
+      Smooth4xN_NEON<16>;
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
+      SmoothVertical4Or8xN_NEON<4, 16>;
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal4Or8xN_NEON<4, 16>;
+
+  // 8x4
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
+      Smooth8xN_NEON<4>;
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
+      SmoothVertical4Or8xN_NEON<8, 4>;
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal4Or8xN_NEON<8, 4>;
+
+  // 8x8
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
+      Smooth8xN_NEON<8>;
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
+      SmoothVertical4Or8xN_NEON<8, 8>;
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal4Or8xN_NEON<8, 8>;
+
+  // 8x16
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
+      Smooth8xN_NEON<16>;
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
+      SmoothVertical4Or8xN_NEON<8, 16>;
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal4Or8xN_NEON<8, 16>;
+
+  // 8x32
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
+      Smooth8xN_NEON<32>;
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
+      SmoothVertical4Or8xN_NEON<8, 32>;
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal4Or8xN_NEON<8, 32>;
+
+  // 16x4
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
+      Smooth16PlusxN_NEON<16, 4>;
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
+      SmoothVertical16PlusxN_NEON<16, 4>;
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal16PlusxN_NEON<16, 4>;
+
+  // 16x8
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
+      Smooth16PlusxN_NEON<16, 8>;
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
+      SmoothVertical16PlusxN_NEON<16, 8>;
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal16PlusxN_NEON<16, 8>;
+
+  // 16x16
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
+      Smooth16PlusxN_NEON<16, 16>;
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
+      SmoothVertical16PlusxN_NEON<16, 16>;
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal16PlusxN_NEON<16, 16>;
+
+  // 16x32
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
+      Smooth16PlusxN_NEON<16, 32>;
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
+      SmoothVertical16PlusxN_NEON<16, 32>;
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal16PlusxN_NEON<16, 32>;
+
+  // 16x64
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
+      Smooth16PlusxN_NEON<16, 64>;
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
+      SmoothVertical16PlusxN_NEON<16, 64>;
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal16PlusxN_NEON<16, 64>;
+
+  // 32x8
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
+      Smooth16PlusxN_NEON<32, 8>;
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
+      SmoothVertical16PlusxN_NEON<32, 8>;
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal16PlusxN_NEON<32, 8>;
+
+  // 32x16
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
+      Smooth16PlusxN_NEON<32, 16>;
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
+      SmoothVertical16PlusxN_NEON<32, 16>;
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal16PlusxN_NEON<32, 16>;
+
+  // 32x32
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
+      Smooth16PlusxN_NEON<32, 32>;
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
+      SmoothVertical16PlusxN_NEON<32, 32>;
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal16PlusxN_NEON<32, 32>;
+
+  // 32x64
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
+      Smooth16PlusxN_NEON<32, 64>;
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
+      SmoothVertical16PlusxN_NEON<32, 64>;
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal16PlusxN_NEON<32, 64>;
+
+  // 64x16
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
+      Smooth16PlusxN_NEON<64, 16>;
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
+      SmoothVertical16PlusxN_NEON<64, 16>;
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal16PlusxN_NEON<64, 16>;
+
+  // 64x32
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
+      Smooth16PlusxN_NEON<64, 32>;
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
+      SmoothVertical16PlusxN_NEON<64, 32>;
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal16PlusxN_NEON<64, 32>;
+
+  // 64x64
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
+      Smooth16PlusxN_NEON<64, 64>;
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
+      SmoothVertical16PlusxN_NEON<64, 64>;
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal16PlusxN_NEON<64, 64>;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+// Note these constants are duplicated from intrapred.cc to allow the compiler
+// to have visibility of the values. This helps reduce loads and in the
+// creation of the inverse weights.
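+// With uint16_t weights the complement 256 - w is formed with an explicit
+// vsub_u16 against vdup_n_u16(256); the bytewise vneg shortcut used in the
+// 8bpp code above relies on mod-2^8 arithmetic and no longer applies here.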
+constexpr uint16_t kSmoothWeights[] = {
+#include "src/dsp/smooth_weights.inc"
+};
+
+template <int height>
+inline void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                           const void* LIBGAV1_RESTRICT const top_row,
+                           const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint16_t*>(top_row);
+  const auto* const left = static_cast<const uint16_t*>(left_column);
+  const uint16_t top_right = top[3];
+  const uint16_t bottom_left = left[height - 1];
+  const uint16_t* const weights_y = kSmoothWeights + height - 4;
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  const uint16x4_t top_v = vld1_u16(top);
+  const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
+  const uint16x4_t weights_x_v = vld1_u16(kSmoothWeights);
+  const uint16x4_t scaled_weights_x = vsub_u16(vdup_n_u16(256), weights_x_v);
+
+  // Weighted top right doesn't change with each row.
+  const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right);
+
+  for (int y = 0; y < height; ++y) {
+    // Each variable in the running summation is named for the last item to be
+    // accumulated.
+    const uint32x4_t weighted_top =
+        vmlal_n_u16(weighted_tr, top_v, weights_y[y]);
+    const uint32x4_t weighted_left =
+        vmlal_n_u16(weighted_top, weights_x_v, left[y]);
+    const uint32x4_t weighted_bl =
+        vmlal_n_u16(weighted_left, bottom_left_v, 256 - weights_y[y]);
+
+    const uint16x4_t pred = vrshrn_n_u32(weighted_bl, kSmoothWeightScale + 1);
+    vst1_u16(reinterpret_cast<uint16_t*>(dst), pred);
+    dst += stride;
+  }
+}
+
+// Common code between 8xH and [16|32|64]xH.
+inline void CalculatePred8(uint16_t* LIBGAV1_RESTRICT dst,
+                           const uint32x4_t& weighted_corners_low,
+                           const uint32x4_t& weighted_corners_high,
+                           const uint16x4x2_t& top_vals,
+                           const uint16x4x2_t& weights_x, const uint16_t left_y,
+                           const uint16_t weight_y) {
+  // Each variable in the running summation is named for the last item to be
+  // accumulated.
+  const uint32x4_t weighted_top_low =
+      vmlal_n_u16(weighted_corners_low, top_vals.val[0], weight_y);
+  const uint32x4_t weighted_edges_low =
+      vmlal_n_u16(weighted_top_low, weights_x.val[0], left_y);
+
+  const uint16x4_t pred_low =
+      vrshrn_n_u32(weighted_edges_low, kSmoothWeightScale + 1);
+  vst1_u16(dst, pred_low);
+
+  const uint32x4_t weighted_top_high =
+      vmlal_n_u16(weighted_corners_high, top_vals.val[1], weight_y);
+  const uint32x4_t weighted_edges_high =
+      vmlal_n_u16(weighted_top_high, weights_x.val[1], left_y);
+
+  const uint16x4_t pred_high =
+      vrshrn_n_u32(weighted_edges_high, kSmoothWeightScale + 1);
+  vst1_u16(dst + 4, pred_high);
+}
+
+template <int height>
+inline void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                           const void* LIBGAV1_RESTRICT const top_row,
+                           const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint16_t*>(top_row);
+  const auto* const left = static_cast<const uint16_t*>(left_column);
+  const uint16_t top_right = top[7];
+  const uint16_t bottom_left = left[height - 1];
+  const uint16_t* const weights_y = kSmoothWeights + height - 4;
+
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  const uint16x4x2_t top_vals = {vld1_u16(top), vld1_u16(top + 4)};
+  const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
+  const uint16x4x2_t weights_x = {vld1_u16(kSmoothWeights + 4),
+                                  vld1_u16(kSmoothWeights + 8)};
+  // Weighted top right doesn't change with each row.
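+  // Only the bottom-left term varies with |y|, so it is the one corner
+  // product recomputed inside the row loop below.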
+  const uint32x4_t weighted_tr_low =
+      vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[0]), top_right);
+  const uint32x4_t weighted_tr_high =
+      vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[1]), top_right);
+
+  for (int y = 0; y < height; ++y) {
+    // |weighted_bl| is invariant across the row.
+    const uint32x4_t weighted_bl =
+        vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
+    const uint32x4_t weighted_corners_low =
+        vaddq_u32(weighted_bl, weighted_tr_low);
+    const uint32x4_t weighted_corners_high =
+        vaddq_u32(weighted_bl, weighted_tr_high);
+    CalculatePred8(reinterpret_cast<uint16_t*>(dst), weighted_corners_low,
+                   weighted_corners_high, top_vals, weights_x, left[y],
+                   weights_y[y]);
+    dst += stride;
+  }
+}
+
+// For width 16 and above.
+template <int width, int height>
+inline void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                           const void* LIBGAV1_RESTRICT const top_row,
+                           const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint16_t*>(top_row);
+  const auto* const left = static_cast<const uint16_t*>(left_column);
+  const uint16_t top_right = top[width - 1];
+  const uint16_t bottom_left = left[height - 1];
+  const uint16_t* const weights_y = kSmoothWeights + height - 4;
+
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  const uint16x4_t weight_scaling = vdup_n_u16(256);
+  // Precompute weighted values that don't vary with |y|.
+  uint32x4_t weighted_tr_low[width >> 3];
+  uint32x4_t weighted_tr_high[width >> 3];
+  for (int i = 0; i < width >> 3; ++i) {
+    const int x = i << 3;
+    const uint16x4_t weights_x_low = vld1_u16(kSmoothWeights + width - 4 + x);
+    weighted_tr_low[i] =
+        vmull_n_u16(vsub_u16(weight_scaling, weights_x_low), top_right);
+    const uint16x4_t weights_x_high = vld1_u16(kSmoothWeights + width + x);
+    weighted_tr_high[i] =
+        vmull_n_u16(vsub_u16(weight_scaling, weights_x_high), top_right);
+  }
+
+  const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
+  for (int y = 0; y < height; ++y) {
+    // |weighted_bl| is invariant across the row.
+    const uint32x4_t weighted_bl =
+        vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
+    auto* dst_x = reinterpret_cast<uint16_t*>(dst);
+    for (int i = 0; i < width >> 3; ++i) {
+      const int x = i << 3;
+      const uint16x4x2_t top_vals = {vld1_u16(top + x), vld1_u16(top + x + 4)};
+      const uint32x4_t weighted_corners_low =
+          vaddq_u32(weighted_bl, weighted_tr_low[i]);
+      const uint32x4_t weighted_corners_high =
+          vaddq_u32(weighted_bl, weighted_tr_high[i]);
+      // Accumulate weighted edge values and store.
+      const uint16x4x2_t weights_x = {vld1_u16(kSmoothWeights + width - 4 + x),
+                                      vld1_u16(kSmoothWeights + width + x)};
+      CalculatePred8(dst_x, weighted_corners_low, weighted_corners_high,
+                     top_vals, weights_x, left[y], weights_y[y]);
+      dst_x += 8;
+    }
+    dst += stride;
+  }
+}
+
+template <int height>
+inline void SmoothVertical4xH_NEON(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint16_t*>(top_row);
+  const auto* const left = static_cast<const uint16_t*>(left_column);
+  const uint16_t bottom_left = left[height - 1];
+  const uint16_t* const weights_y = kSmoothWeights + height - 4;
+
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  const uint16x4_t top_v = vld1_u16(top);
+  const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
+
+  for (int y = 0; y < height; ++y) {
+    auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+    const uint32x4_t weighted_bl =
+        vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
+    const uint32x4_t weighted_top =
+        vmlal_n_u16(weighted_bl, top_v, weights_y[y]);
+    vst1_u16(dst16, vrshrn_n_u32(weighted_top, kSmoothWeightScale));
+
+    dst += stride;
+  }
+}
+
+template <int height>
+inline void SmoothVertical8xH_NEON(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint16_t*>(top_row);
+  const auto* const left = static_cast<const uint16_t*>(left_column);
+  const uint16_t bottom_left = left[height - 1];
+  const uint16_t* const weights_y = kSmoothWeights + height - 4;
+
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  const uint16x4_t top_low = vld1_u16(top);
+  const uint16x4_t top_high = vld1_u16(top + 4);
+  const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
+
+  for (int y = 0; y < height; ++y) {
+    auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+    // |weighted_bl| is invariant across the row.
+    const uint32x4_t weighted_bl =
+        vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
+
+    const uint32x4_t weighted_top_low =
+        vmlal_n_u16(weighted_bl, top_low, weights_y[y]);
+    vst1_u16(dst16, vrshrn_n_u32(weighted_top_low, kSmoothWeightScale));
+
+    const uint32x4_t weighted_top_high =
+        vmlal_n_u16(weighted_bl, top_high, weights_y[y]);
+    vst1_u16(dst16 + 4, vrshrn_n_u32(weighted_top_high, kSmoothWeightScale));
+    dst += stride;
+  }
+}
+
+// For width 16 and above.
+template <int width, int height>
+inline void SmoothVerticalWxH_NEON(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint16_t*>(top_row);
+  const auto* const left = static_cast<const uint16_t*>(left_column);
+  const uint16_t bottom_left = left[height - 1];
+  const uint16_t* const weights_y = kSmoothWeights + height - 4;
+
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  uint16x4x2_t top_vals[width >> 3];
+  for (int i = 0; i < width >> 3; ++i) {
+    const int x = i << 3;
+    top_vals[i] = {vld1_u16(top + x), vld1_u16(top + x + 4)};
+  }
+
+  const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
+  for (int y = 0; y < height; ++y) {
+    // |weighted_bl| is invariant across the row.
+    const uint32x4_t weighted_bl =
+        vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
+
+    auto* dst_x = reinterpret_cast<uint16_t*>(dst);
+    for (int i = 0; i < width >> 3; ++i) {
+      const uint32x4_t weighted_top_low =
+          vmlal_n_u16(weighted_bl, top_vals[i].val[0], weights_y[y]);
+      vst1_u16(dst_x, vrshrn_n_u32(weighted_top_low, kSmoothWeightScale));
+
+      const uint32x4_t weighted_top_high =
+          vmlal_n_u16(weighted_bl, top_vals[i].val[1], weights_y[y]);
+      vst1_u16(dst_x + 4, vrshrn_n_u32(weighted_top_high, kSmoothWeightScale));
+      dst_x += 8;
+    }
+    dst += stride;
+  }
+}
+
+template <int height>
+inline void SmoothHorizontal4xH_NEON(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint16_t*>(top_row);
+  const auto* const left = static_cast<const uint16_t*>(left_column);
+  const uint16_t top_right = top[3];
+
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  const uint16x4_t weights_x = vld1_u16(kSmoothWeights);
+  const uint16x4_t scaled_weights_x = vsub_u16(vdup_n_u16(256), weights_x);
+
+  const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right);
+  for (int y = 0; y < height; ++y) {
+    auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+    const uint32x4_t weighted_left =
+        vmlal_n_u16(weighted_tr, weights_x, left[y]);
+    vst1_u16(dst16, vrshrn_n_u32(weighted_left, kSmoothWeightScale));
+    dst += stride;
+  }
+}
+
+template <int height>
+inline void SmoothHorizontal8xH_NEON(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint16_t*>(top_row);
+  const auto* const left = static_cast<const uint16_t*>(left_column);
+  const uint16_t top_right = top[7];
+
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  const uint16x4x2_t weights_x = {vld1_u16(kSmoothWeights + 4),
+                                  vld1_u16(kSmoothWeights + 8)};
+
+  const uint32x4_t weighted_tr_low =
+      vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[0]), top_right);
+  const uint32x4_t weighted_tr_high =
+      vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[1]), top_right);
+
+  for (int y = 0; y < height; ++y) {
+    auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+    const uint16_t left_y = left[y];
+    const uint32x4_t weighted_left_low =
+        vmlal_n_u16(weighted_tr_low, weights_x.val[0], left_y);
+    vst1_u16(dst16, vrshrn_n_u32(weighted_left_low, kSmoothWeightScale));
+
+    const uint32x4_t weighted_left_high =
+        vmlal_n_u16(weighted_tr_high, weights_x.val[1], left_y);
+    vst1_u16(dst16 + 4, vrshrn_n_u32(weighted_left_high, kSmoothWeightScale));
+    dst += stride;
+  }
+}
+
+// For width 16 and above.
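+// These WxH variants precompute the (256 - w) * top_right products in 8-lane
+// chunks (width >> 3 of them), leaving the row loop with just the low/high
+// multiply-accumulates against left[y] and the final stores.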
+template <int width, int height>
+inline void SmoothHorizontalWxH_NEON(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint16_t*>(top_row);
+  const auto* const left = static_cast<const uint16_t*>(left_column);
+  const uint16_t top_right = top[width - 1];
+
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  const uint16x4_t weight_scaling = vdup_n_u16(256);
+
+  uint16x4_t weights_x_low[width >> 3];
+  uint16x4_t weights_x_high[width >> 3];
+  uint32x4_t weighted_tr_low[width >> 3];
+  uint32x4_t weighted_tr_high[width >> 3];
+  for (int i = 0; i < width >> 3; ++i) {
+    const int x = i << 3;
+    weights_x_low[i] = vld1_u16(kSmoothWeights + width - 4 + x);
+    weighted_tr_low[i] =
+        vmull_n_u16(vsub_u16(weight_scaling, weights_x_low[i]), top_right);
+    weights_x_high[i] = vld1_u16(kSmoothWeights + width + x);
+    weighted_tr_high[i] =
+        vmull_n_u16(vsub_u16(weight_scaling, weights_x_high[i]), top_right);
+  }
+
+  for (int y = 0; y < height; ++y) {
+    auto* dst_x = reinterpret_cast<uint16_t*>(dst);
+    const uint16_t left_y = left[y];
+    for (int i = 0; i < width >> 3; ++i) {
+      const uint32x4_t weighted_left_low =
+          vmlal_n_u16(weighted_tr_low[i], weights_x_low[i], left_y);
+      vst1_u16(dst_x, vrshrn_n_u32(weighted_left_low, kSmoothWeightScale));
+
+      const uint32x4_t weighted_left_high =
+          vmlal_n_u16(weighted_tr_high[i], weights_x_high[i], left_y);
+      vst1_u16(dst_x + 4,
+               vrshrn_n_u32(weighted_left_high, kSmoothWeightScale));
+      dst_x += 8;
+    }
+    dst += stride;
+  }
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  // 4x4
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
+      Smooth4xH_NEON<4>;
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
+      SmoothVertical4xH_NEON<4>;
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal4xH_NEON<4>;
+
+  // 4x8
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
+      Smooth4xH_NEON<8>;
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
+      SmoothVertical4xH_NEON<8>;
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal4xH_NEON<8>;
+
+  // 4x16
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
+      Smooth4xH_NEON<16>;
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
+      SmoothVertical4xH_NEON<16>;
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal4xH_NEON<16>;
+
+  // 8x4
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
+      Smooth8xH_NEON<4>;
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
+      SmoothVertical8xH_NEON<4>;
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal8xH_NEON<4>;
+
+  // 8x8
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
+      Smooth8xH_NEON<8>;
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
+      SmoothVertical8xH_NEON<8>;
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal8xH_NEON<8>;
+
+  // 8x16
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
+      Smooth8xH_NEON<16>;
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
+      SmoothVertical8xH_NEON<16>;
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal8xH_NEON<16>;
+
+  // 8x32
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
+      Smooth8xH_NEON<32>;
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
+      SmoothVertical8xH_NEON<32>;
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal8xH_NEON<32>;
+
+  // 16x4
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
+      SmoothWxH_NEON<16, 4>;
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
+      SmoothVerticalWxH_NEON<16, 4>;
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontalWxH_NEON<16, 4>;
+
+  // 16x8
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
+      SmoothWxH_NEON<16, 8>;
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
+      SmoothVerticalWxH_NEON<16, 8>;
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontalWxH_NEON<16, 8>;
+
+  // 16x16
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
+      SmoothWxH_NEON<16, 16>;
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
+      SmoothVerticalWxH_NEON<16, 16>;
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontalWxH_NEON<16, 16>;
+
+  // 16x32
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
+      SmoothWxH_NEON<16, 32>;
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
+      SmoothVerticalWxH_NEON<16, 32>;
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontalWxH_NEON<16, 32>;
+
+  // 16x64
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
+      SmoothWxH_NEON<16, 64>;
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
+      SmoothVerticalWxH_NEON<16, 64>;
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontalWxH_NEON<16, 64>;
+
+  // 32x8
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
+      SmoothWxH_NEON<32, 8>;
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
+      SmoothVerticalWxH_NEON<32, 8>;
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontalWxH_NEON<32, 8>;
+
+  // 32x16
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
+      SmoothWxH_NEON<32, 16>;
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
+      SmoothVerticalWxH_NEON<32, 16>;
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontalWxH_NEON<32, 16>;
+
+  // 32x32
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
+      SmoothWxH_NEON<32, 32>;
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
+      SmoothVerticalWxH_NEON<32, 32>;
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontalWxH_NEON<32, 32>;
+
+  // 32x64
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
+      SmoothWxH_NEON<32, 64>;
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
+      SmoothVerticalWxH_NEON<32, 64>;
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontalWxH_NEON<32, 64>;
+
+  // 64x16
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
+      SmoothWxH_NEON<64, 16>;
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
+      SmoothVerticalWxH_NEON<64, 16>;
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontalWxH_NEON<64, 16>;
+
+  // 64x32
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
+      SmoothWxH_NEON<64, 32>;
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
+      SmoothVerticalWxH_NEON<64, 32>;
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontalWxH_NEON<64, 32>;
+
+  // 64x64
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
+      SmoothWxH_NEON<64, 64>;
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
+      SmoothVerticalWxH_NEON<64, 64>;
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontalWxH_NEON<64, 64>;
+}
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredSmoothInit_NEON() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredSmoothInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/intrapred_smooth_neon.h b/src/dsp/arm/intrapred_smooth_neon.h
new file mode 100644
index 0000000..28b5bd5
--- /dev/null
+++ b/src/dsp/arm/intrapred_smooth_neon.h
@@ -0,0 +1,274 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_SMOOTH_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_SMOOTH_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_predictors[][kIntraPredictorSmooth.*].
+// This function is not thread-safe.
+void IntraPredSmoothInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+// 10bpp
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmooth \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmooth \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmooth \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmooth \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmooth \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmooth \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmooth \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmooth \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmooth \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_INTRAPRED_SMOOTH_NEON_H_
diff --git a/src/dsp/arm/inverse_transform_10bit_neon.cc b/src/dsp/arm/inverse_transform_10bit_neon.cc
new file mode 100644
index 0000000..617accc
--- /dev/null
+++ b/src/dsp/arm/inverse_transform_10bit_neon.cc
@@ -0,0 +1,2785 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/inverse_transform.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Include the constants and utility functions inside the anonymous namespace.
+#include "src/dsp/inverse_transform.inc"
+
+//------------------------------------------------------------------------------
+
+LIBGAV1_ALWAYS_INLINE void Transpose4x4(const int32x4_t in[4],
+                                        int32x4_t out[4]) {
+  // in:
+  // 00 01 02 03
+  // 10 11 12 13
+  // 20 21 22 23
+  // 30 31 32 33
+
+  // 00 10 02 12  a.val[0]
+  // 01 11 03 13  a.val[1]
+  // 20 30 22 32  b.val[0]
+  // 21 31 23 33  b.val[1]
+  const int32x4x2_t a = vtrnq_s32(in[0], in[1]);
+  const int32x4x2_t b = vtrnq_s32(in[2], in[3]);
+  out[0] = vextq_s32(vextq_s32(a.val[0], a.val[0], 2), b.val[0], 2);
+  out[1] = vextq_s32(vextq_s32(a.val[1], a.val[1], 2), b.val[1], 2);
+  out[2] = vextq_s32(a.val[0], vextq_s32(b.val[0], b.val[0], 2), 2);
+  out[3] = vextq_s32(a.val[1], vextq_s32(b.val[1], b.val[1], 2), 2);
+  // out:
+  // 00 10 20 30
+  // 01 11 21 31
+  // 02 12 22 32
+  // 03 13 23 33
+}
+
+//------------------------------------------------------------------------------
+template <int store_count>
+LIBGAV1_ALWAYS_INLINE void StoreDst(int32_t* LIBGAV1_RESTRICT dst,
+                                    int32_t stride, int32_t idx,
+                                    const int32x4_t* const s) {
+  assert(store_count % 4 == 0);
+  for (int i = 0; i < store_count; i += 4) {
+    vst1q_s32(&dst[i * stride + idx], s[i]);
+    vst1q_s32(&dst[(i + 1) * stride + idx], s[i + 1]);
+    vst1q_s32(&dst[(i + 2) * stride + idx], s[i + 2]);
+    vst1q_s32(&dst[(i + 3) * stride + idx], s[i + 3]);
+  }
+}
+
+template <int load_count>
+LIBGAV1_ALWAYS_INLINE void LoadSrc(const int32_t* LIBGAV1_RESTRICT src,
+                                   int32_t stride, int32_t idx, int32x4_t* x) {
+  assert(load_count % 4 == 0);
+  for (int i = 0; i < load_count; i += 4) {
+    x[i] = vld1q_s32(&src[i * stride + idx]);
+    x[i + 1] = vld1q_s32(&src[(i + 1) * stride + idx]);
+    x[i + 2] = vld1q_s32(&src[(i + 2) * stride + idx]);
+    x[i + 3] = vld1q_s32(&src[(i + 3) * stride + idx]);
+  }
+}
+
+// Butterfly rotate 4 values.
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_4(int32x4_t* a, int32x4_t* b,
+                                               const int angle,
+                                               const bool flip) {
+  const int32_t cos128 = Cos128(angle);
+  const int32_t sin128 = Sin128(angle);
+  const int32x4_t acc_x = vmulq_n_s32(*a, cos128);
+  const int32x4_t acc_y = vmulq_n_s32(*a, sin128);
+  // The max range for the input is 18 bits. The cos128/sin128 is 13 bits,
+  // which leaves 1 bit for the add/subtract. For 10bpp, x/y will fit in a 32
+  // bit lane.
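+  // (An 18-bit value times a 13-bit coefficient yields at most a 31-bit
+  // product; the multiply-accumulate below adds one more such term, using
+  // the final headroom bit of the 32-bit lane.)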
+  const int32x4_t x0 = vmlsq_n_s32(acc_x, *b, sin128);
+  const int32x4_t y0 = vmlaq_n_s32(acc_y, *b, cos128);
+  const int32x4_t x = vrshrq_n_s32(x0, 12);
+  const int32x4_t y = vrshrq_n_s32(y0, 12);
+  if (flip) {
+    *a = y;
+    *b = x;
+  } else {
+    *a = x;
+    *b = y;
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_FirstIsZero(int32x4_t* a,
+                                                         int32x4_t* b,
+                                                         const int angle,
+                                                         const bool flip) {
+  const int32_t cos128 = Cos128(angle);
+  const int32_t sin128 = Sin128(angle);
+  assert(sin128 <= 0xfff);
+  const int32x4_t x0 = vmulq_n_s32(*b, -sin128);
+  const int32x4_t y0 = vmulq_n_s32(*b, cos128);
+  const int32x4_t x = vrshrq_n_s32(x0, 12);
+  const int32x4_t y = vrshrq_n_s32(y0, 12);
+  if (flip) {
+    *a = y;
+    *b = x;
+  } else {
+    *a = x;
+    *b = y;
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_SecondIsZero(int32x4_t* a,
+                                                          int32x4_t* b,
+                                                          const int angle,
+                                                          const bool flip) {
+  const int32_t cos128 = Cos128(angle);
+  const int32_t sin128 = Sin128(angle);
+  const int32x4_t x0 = vmulq_n_s32(*a, cos128);
+  const int32x4_t y0 = vmulq_n_s32(*a, sin128);
+  const int32x4_t x = vrshrq_n_s32(x0, 12);
+  const int32x4_t y = vrshrq_n_s32(y0, 12);
+  if (flip) {
+    *a = y;
+    *b = x;
+  } else {
+    *a = x;
+    *b = y;
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void HadamardRotation(int32x4_t* a, int32x4_t* b,
+                                            bool flip) {
+  int32x4_t x, y;
+  if (flip) {
+    y = vqaddq_s32(*b, *a);
+    x = vqsubq_s32(*b, *a);
+  } else {
+    x = vqaddq_s32(*a, *b);
+    y = vqsubq_s32(*a, *b);
+  }
+  *a = x;
+  *b = y;
+}
+
+LIBGAV1_ALWAYS_INLINE void HadamardRotation(int32x4_t* a, int32x4_t* b,
+                                            bool flip, const int32x4_t min,
+                                            const int32x4_t max) {
+  int32x4_t x, y;
+  if (flip) {
+    y = vqaddq_s32(*b, *a);
+    x = vqsubq_s32(*b, *a);
+  } else {
+    x = vqaddq_s32(*a, *b);
+    y = vqsubq_s32(*a, *b);
+  }
+  *a = vmaxq_s32(vminq_s32(x, max), min);
+  *b = vmaxq_s32(vminq_s32(y, max), min);
+}
+
+using ButterflyRotationFunc = void (*)(int32x4_t* a, int32x4_t* b, int angle,
+                                       bool flip);
+
+//------------------------------------------------------------------------------
+// Discrete Cosine Transforms (DCT).
+
+template <int width>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, int adjusted_tx_height,
+                                     bool should_round, int row_shift) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int32_t*>(dest);
+  const int32x4_t v_src = vdupq_n_s32(dst[0]);
+  const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+  const int32x4_t v_src_round =
+      vqrdmulhq_n_s32(v_src, kTransformRowMultiplier << (31 - 12));
+  const int32x4_t s0 = vbslq_s32(v_mask, v_src_round, v_src);
+  const int32_t cos128 = Cos128(32);
+  const int32x4_t xy = vqrdmulhq_n_s32(s0, cos128 << (31 - 12));
+  // vqrshlq_s32 will shift right if shift value is negative.
+  const int32x4_t xy_shifted = vqrshlq_s32(xy, vdupq_n_s32(-row_shift));
+  // Clamp result to signed 16 bits.
+  const int32x4_t result = vmovl_s16(vqmovn_s32(xy_shifted));
+  if (width == 4) {
+    vst1q_s32(dst, result);
+  } else {
+    for (int i = 0; i < width; i += 4) {
+      vst1q_s32(dst, result);
+      dst += 4;
+    }
+  }
+  return true;
+}
+
+template <int height>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, int adjusted_tx_height,
+                                           int width) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int32_t*>(dest);
+  const int32_t cos128 = Cos128(32);
+
+  // Calculate dc values for first row.
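+  // Note on the idiom used here and above: vqrdmulhq_n_s32 against a
+  // constant pre-shifted by (31 - 12) computes round(x * c / 2^12), folding
+  // the 12-bit fixed-point downshift into the Q31 rounding-doubling
+  // multiply-high.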
+ if (width == 4) { + const int32x4_t v_src = vld1q_s32(dst); + const int32x4_t xy = vqrdmulhq_n_s32(v_src, cos128 << (31 - 12)); + vst1q_s32(dst, xy); + } else { + int i = 0; + do { + const int32x4_t v_src = vld1q_s32(&dst[i]); + const int32x4_t xy = vqrdmulhq_n_s32(v_src, cos128 << (31 - 12)); + vst1q_s32(&dst[i], xy); + i += 4; + } while (i < width); + } + + // Copy first row to the rest of the block. + for (int y = 1; y < height; ++y) { + memcpy(&dst[y * width], dst, width * sizeof(dst[0])); + } + return true; +} + +template +LIBGAV1_ALWAYS_INLINE void Dct4Stages(int32x4_t* s, const int32x4_t min, + const int32x4_t max, + const bool is_last_stage) { + // stage 12. + if (is_fast_butterfly) { + ButterflyRotation_SecondIsZero(&s[0], &s[1], 32, true); + ButterflyRotation_SecondIsZero(&s[2], &s[3], 48, false); + } else { + butterfly_rotation(&s[0], &s[1], 32, true); + butterfly_rotation(&s[2], &s[3], 48, false); + } + + // stage 17. + if (is_last_stage) { + HadamardRotation(&s[0], &s[3], false); + HadamardRotation(&s[1], &s[2], false); + } else { + HadamardRotation(&s[0], &s[3], false, min, max); + HadamardRotation(&s[1], &s[2], false, min, max); + } +} + +template +LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool is_row, + int row_shift) { + auto* const dst = static_cast(dest); + // When |is_row| is true, set range to the row range, otherwise, set to the + // column range. + const int32_t range = is_row ? kBitdepth10 + 7 : 15; + const int32x4_t min = vdupq_n_s32(-(1 << range)); + const int32x4_t max = vdupq_n_s32((1 << range) - 1); + int32x4_t s[4], x[4]; + + LoadSrc<4>(dst, step, 0, x); + if (is_row) { + Transpose4x4(x, x); + } + + // stage 1. + // kBitReverseLookup 0, 2, 1, 3 + s[0] = x[0]; + s[1] = x[2]; + s[2] = x[1]; + s[3] = x[3]; + + Dct4Stages(s, min, max, /*is_last_stage=*/true); + + if (is_row) { + const int32x4_t v_row_shift = vdupq_n_s32(-row_shift); + for (auto& i : s) { + i = vmovl_s16(vqmovn_s32(vqrshlq_s32(i, v_row_shift))); + } + Transpose4x4(s, s); + } + StoreDst<4>(dst, step, 0, s); +} + +template +LIBGAV1_ALWAYS_INLINE void Dct8Stages(int32x4_t* s, const int32x4_t min, + const int32x4_t max, + const bool is_last_stage) { + // stage 8. + if (is_fast_butterfly) { + ButterflyRotation_SecondIsZero(&s[4], &s[7], 56, false); + ButterflyRotation_FirstIsZero(&s[5], &s[6], 24, false); + } else { + butterfly_rotation(&s[4], &s[7], 56, false); + butterfly_rotation(&s[5], &s[6], 24, false); + } + + // stage 13. + HadamardRotation(&s[4], &s[5], false, min, max); + HadamardRotation(&s[6], &s[7], true, min, max); + + // stage 18. + butterfly_rotation(&s[6], &s[5], 32, true); + + // stage 22. + if (is_last_stage) { + HadamardRotation(&s[0], &s[7], false); + HadamardRotation(&s[1], &s[6], false); + HadamardRotation(&s[2], &s[5], false); + HadamardRotation(&s[3], &s[4], false); + } else { + HadamardRotation(&s[0], &s[7], false, min, max); + HadamardRotation(&s[1], &s[6], false, min, max); + HadamardRotation(&s[2], &s[5], false, min, max); + HadamardRotation(&s[3], &s[4], false, min, max); + } +} + +// Process dct8 rows or columns, depending on the |is_row| flag. +template +LIBGAV1_ALWAYS_INLINE void Dct8_NEON(void* dest, int32_t step, bool is_row, + int row_shift) { + auto* const dst = static_cast(dest); + const int32_t range = is_row ? 
kBitdepth10 + 7 : 15; + const int32x4_t min = vdupq_n_s32(-(1 << range)); + const int32x4_t max = vdupq_n_s32((1 << range) - 1); + int32x4_t s[8], x[8]; + + if (is_row) { + LoadSrc<4>(dst, step, 0, &x[0]); + LoadSrc<4>(dst, step, 4, &x[4]); + Transpose4x4(&x[0], &x[0]); + Transpose4x4(&x[4], &x[4]); + } else { + LoadSrc<8>(dst, step, 0, &x[0]); + } + + // stage 1. + // kBitReverseLookup 0, 4, 2, 6, 1, 5, 3, 7, + s[0] = x[0]; + s[1] = x[4]; + s[2] = x[2]; + s[3] = x[6]; + s[4] = x[1]; + s[5] = x[5]; + s[6] = x[3]; + s[7] = x[7]; + + Dct4Stages(s, min, max, /*is_last_stage=*/false); + Dct8Stages(s, min, max, /*is_last_stage=*/true); + + if (is_row) { + const int32x4_t v_row_shift = vdupq_n_s32(-row_shift); + for (auto& i : s) { + i = vmovl_s16(vqmovn_s32(vqrshlq_s32(i, v_row_shift))); + } + Transpose4x4(&s[0], &s[0]); + Transpose4x4(&s[4], &s[4]); + StoreDst<4>(dst, step, 0, &s[0]); + StoreDst<4>(dst, step, 4, &s[4]); + } else { + StoreDst<8>(dst, step, 0, &s[0]); + } +} + +template +LIBGAV1_ALWAYS_INLINE void Dct16Stages(int32x4_t* s, const int32x4_t min, + const int32x4_t max, + const bool is_last_stage) { + // stage 5. + if (is_fast_butterfly) { + ButterflyRotation_SecondIsZero(&s[8], &s[15], 60, false); + ButterflyRotation_FirstIsZero(&s[9], &s[14], 28, false); + ButterflyRotation_SecondIsZero(&s[10], &s[13], 44, false); + ButterflyRotation_FirstIsZero(&s[11], &s[12], 12, false); + } else { + butterfly_rotation(&s[8], &s[15], 60, false); + butterfly_rotation(&s[9], &s[14], 28, false); + butterfly_rotation(&s[10], &s[13], 44, false); + butterfly_rotation(&s[11], &s[12], 12, false); + } + + // stage 9. + HadamardRotation(&s[8], &s[9], false, min, max); + HadamardRotation(&s[10], &s[11], true, min, max); + HadamardRotation(&s[12], &s[13], false, min, max); + HadamardRotation(&s[14], &s[15], true, min, max); + + // stage 14. + butterfly_rotation(&s[14], &s[9], 48, true); + butterfly_rotation(&s[13], &s[10], 112, true); + + // stage 19. + HadamardRotation(&s[8], &s[11], false, min, max); + HadamardRotation(&s[9], &s[10], false, min, max); + HadamardRotation(&s[12], &s[15], true, min, max); + HadamardRotation(&s[13], &s[14], true, min, max); + + // stage 23. + butterfly_rotation(&s[13], &s[10], 32, true); + butterfly_rotation(&s[12], &s[11], 32, true); + + // stage 26. + if (is_last_stage) { + HadamardRotation(&s[0], &s[15], false); + HadamardRotation(&s[1], &s[14], false); + HadamardRotation(&s[2], &s[13], false); + HadamardRotation(&s[3], &s[12], false); + HadamardRotation(&s[4], &s[11], false); + HadamardRotation(&s[5], &s[10], false); + HadamardRotation(&s[6], &s[9], false); + HadamardRotation(&s[7], &s[8], false); + } else { + HadamardRotation(&s[0], &s[15], false, min, max); + HadamardRotation(&s[1], &s[14], false, min, max); + HadamardRotation(&s[2], &s[13], false, min, max); + HadamardRotation(&s[3], &s[12], false, min, max); + HadamardRotation(&s[4], &s[11], false, min, max); + HadamardRotation(&s[5], &s[10], false, min, max); + HadamardRotation(&s[6], &s[9], false, min, max); + HadamardRotation(&s[7], &s[8], false, min, max); + } +} + +// Process dct16 rows or columns, depending on the |is_row| flag. +template +LIBGAV1_ALWAYS_INLINE void Dct16_NEON(void* dest, int32_t step, bool is_row, + int row_shift) { + auto* const dst = static_cast(dest); + const int32_t range = is_row ? 
kBitdepth10 + 7 : 15; + const int32x4_t min = vdupq_n_s32(-(1 << range)); + const int32x4_t max = vdupq_n_s32((1 << range) - 1); + int32x4_t s[16], x[16]; + + if (is_row) { + for (int idx = 0; idx < 16; idx += 8) { + LoadSrc<4>(dst, step, idx, &x[idx]); + LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]); + Transpose4x4(&x[idx], &x[idx]); + Transpose4x4(&x[idx + 4], &x[idx + 4]); + } + } else { + LoadSrc<16>(dst, step, 0, &x[0]); + } + + // stage 1 + // kBitReverseLookup 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15, + s[0] = x[0]; + s[1] = x[8]; + s[2] = x[4]; + s[3] = x[12]; + s[4] = x[2]; + s[5] = x[10]; + s[6] = x[6]; + s[7] = x[14]; + s[8] = x[1]; + s[9] = x[9]; + s[10] = x[5]; + s[11] = x[13]; + s[12] = x[3]; + s[13] = x[11]; + s[14] = x[7]; + s[15] = x[15]; + + Dct4Stages(s, min, max, /*is_last_stage=*/false); + Dct8Stages(s, min, max, /*is_last_stage=*/false); + Dct16Stages(s, min, max, /*is_last_stage=*/true); + + if (is_row) { + const int32x4_t v_row_shift = vdupq_n_s32(-row_shift); + for (auto& i : s) { + i = vmovl_s16(vqmovn_s32(vqrshlq_s32(i, v_row_shift))); + } + for (int idx = 0; idx < 16; idx += 8) { + Transpose4x4(&s[idx], &s[idx]); + Transpose4x4(&s[idx + 4], &s[idx + 4]); + StoreDst<4>(dst, step, idx, &s[idx]); + StoreDst<4>(dst, step, idx + 4, &s[idx + 4]); + } + } else { + StoreDst<16>(dst, step, 0, &s[0]); + } +} + +template +LIBGAV1_ALWAYS_INLINE void Dct32Stages(int32x4_t* s, const int32x4_t min, + const int32x4_t max, + const bool is_last_stage) { + // stage 3 + if (is_fast_butterfly) { + ButterflyRotation_SecondIsZero(&s[16], &s[31], 62, false); + ButterflyRotation_FirstIsZero(&s[17], &s[30], 30, false); + ButterflyRotation_SecondIsZero(&s[18], &s[29], 46, false); + ButterflyRotation_FirstIsZero(&s[19], &s[28], 14, false); + ButterflyRotation_SecondIsZero(&s[20], &s[27], 54, false); + ButterflyRotation_FirstIsZero(&s[21], &s[26], 22, false); + ButterflyRotation_SecondIsZero(&s[22], &s[25], 38, false); + ButterflyRotation_FirstIsZero(&s[23], &s[24], 6, false); + } else { + butterfly_rotation(&s[16], &s[31], 62, false); + butterfly_rotation(&s[17], &s[30], 30, false); + butterfly_rotation(&s[18], &s[29], 46, false); + butterfly_rotation(&s[19], &s[28], 14, false); + butterfly_rotation(&s[20], &s[27], 54, false); + butterfly_rotation(&s[21], &s[26], 22, false); + butterfly_rotation(&s[22], &s[25], 38, false); + butterfly_rotation(&s[23], &s[24], 6, false); + } + + // stage 6. + HadamardRotation(&s[16], &s[17], false, min, max); + HadamardRotation(&s[18], &s[19], true, min, max); + HadamardRotation(&s[20], &s[21], false, min, max); + HadamardRotation(&s[22], &s[23], true, min, max); + HadamardRotation(&s[24], &s[25], false, min, max); + HadamardRotation(&s[26], &s[27], true, min, max); + HadamardRotation(&s[28], &s[29], false, min, max); + HadamardRotation(&s[30], &s[31], true, min, max); + + // stage 10. + butterfly_rotation(&s[30], &s[17], 24 + 32, true); + butterfly_rotation(&s[29], &s[18], 24 + 64 + 32, true); + butterfly_rotation(&s[26], &s[21], 24, true); + butterfly_rotation(&s[25], &s[22], 24 + 64, true); + + // stage 15. + HadamardRotation(&s[16], &s[19], false, min, max); + HadamardRotation(&s[17], &s[18], false, min, max); + HadamardRotation(&s[20], &s[23], true, min, max); + HadamardRotation(&s[21], &s[22], true, min, max); + HadamardRotation(&s[24], &s[27], false, min, max); + HadamardRotation(&s[25], &s[26], false, min, max); + HadamardRotation(&s[28], &s[31], true, min, max); + HadamardRotation(&s[29], &s[30], true, min, max); + + // stage 20. 
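+  // NOTE (illustrative sketch, compiled out): scalar form of the butterfly
+  // rotation that ButterflyRotation_4 / the |butterfly_rotation| template
+  // parameter vectorize in these stages. Cos128/Sin128 and
+  // RightShiftWithRounding are assumed to be the 12-bit fixed-point helpers
+  // used by the portable inverse transform code; if they differ, read this as
+  // pseudocode.
+#if 0
+#include <cstdint>
+void ButterflyRotationScalar(int32_t* a, int32_t* b, int angle, bool flip) {
+  const int32_t cos128 = Cos128(angle);  // cos(angle * pi / 128) * 2^12
+  const int32_t sin128 = Sin128(angle);  // sin(angle * pi / 128) * 2^12
+  const auto x = static_cast<int32_t>(RightShiftWithRounding(
+      *a * static_cast<int64_t>(cos128) - *b * static_cast<int64_t>(sin128),
+      12));
+  const auto y = static_cast<int32_t>(RightShiftWithRounding(
+      *a * static_cast<int64_t>(sin128) + *b * static_cast<int64_t>(cos128),
+      12));
+  *a = flip ? y : x;
+  *b = flip ? x : y;
+}
+#endif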
+ butterfly_rotation(&s[29], &s[18], 48, true); + butterfly_rotation(&s[28], &s[19], 48, true); + butterfly_rotation(&s[27], &s[20], 48 + 64, true); + butterfly_rotation(&s[26], &s[21], 48 + 64, true); + + // stage 24. + HadamardRotation(&s[16], &s[23], false, min, max); + HadamardRotation(&s[17], &s[22], false, min, max); + HadamardRotation(&s[18], &s[21], false, min, max); + HadamardRotation(&s[19], &s[20], false, min, max); + HadamardRotation(&s[24], &s[31], true, min, max); + HadamardRotation(&s[25], &s[30], true, min, max); + HadamardRotation(&s[26], &s[29], true, min, max); + HadamardRotation(&s[27], &s[28], true, min, max); + + // stage 27. + butterfly_rotation(&s[27], &s[20], 32, true); + butterfly_rotation(&s[26], &s[21], 32, true); + butterfly_rotation(&s[25], &s[22], 32, true); + butterfly_rotation(&s[24], &s[23], 32, true); + + // stage 29. + if (is_last_stage) { + HadamardRotation(&s[0], &s[31], false); + HadamardRotation(&s[1], &s[30], false); + HadamardRotation(&s[2], &s[29], false); + HadamardRotation(&s[3], &s[28], false); + HadamardRotation(&s[4], &s[27], false); + HadamardRotation(&s[5], &s[26], false); + HadamardRotation(&s[6], &s[25], false); + HadamardRotation(&s[7], &s[24], false); + HadamardRotation(&s[8], &s[23], false); + HadamardRotation(&s[9], &s[22], false); + HadamardRotation(&s[10], &s[21], false); + HadamardRotation(&s[11], &s[20], false); + HadamardRotation(&s[12], &s[19], false); + HadamardRotation(&s[13], &s[18], false); + HadamardRotation(&s[14], &s[17], false); + HadamardRotation(&s[15], &s[16], false); + } else { + HadamardRotation(&s[0], &s[31], false, min, max); + HadamardRotation(&s[1], &s[30], false, min, max); + HadamardRotation(&s[2], &s[29], false, min, max); + HadamardRotation(&s[3], &s[28], false, min, max); + HadamardRotation(&s[4], &s[27], false, min, max); + HadamardRotation(&s[5], &s[26], false, min, max); + HadamardRotation(&s[6], &s[25], false, min, max); + HadamardRotation(&s[7], &s[24], false, min, max); + HadamardRotation(&s[8], &s[23], false, min, max); + HadamardRotation(&s[9], &s[22], false, min, max); + HadamardRotation(&s[10], &s[21], false, min, max); + HadamardRotation(&s[11], &s[20], false, min, max); + HadamardRotation(&s[12], &s[19], false, min, max); + HadamardRotation(&s[13], &s[18], false, min, max); + HadamardRotation(&s[14], &s[17], false, min, max); + HadamardRotation(&s[15], &s[16], false, min, max); + } +} + +// Process dct32 rows or columns, depending on the |is_row| flag. +LIBGAV1_ALWAYS_INLINE void Dct32_NEON(void* dest, const int32_t step, + const bool is_row, int row_shift) { + auto* const dst = static_cast(dest); + const int32_t range = is_row ? 
kBitdepth10 + 7 : 15; + const int32x4_t min = vdupq_n_s32(-(1 << range)); + const int32x4_t max = vdupq_n_s32((1 << range) - 1); + int32x4_t s[32], x[32]; + + if (is_row) { + for (int idx = 0; idx < 32; idx += 8) { + LoadSrc<4>(dst, step, idx, &x[idx]); + LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]); + Transpose4x4(&x[idx], &x[idx]); + Transpose4x4(&x[idx + 4], &x[idx + 4]); + } + } else { + LoadSrc<32>(dst, step, 0, &x[0]); + } + + // stage 1 + // kBitReverseLookup + // 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30, + s[0] = x[0]; + s[1] = x[16]; + s[2] = x[8]; + s[3] = x[24]; + s[4] = x[4]; + s[5] = x[20]; + s[6] = x[12]; + s[7] = x[28]; + s[8] = x[2]; + s[9] = x[18]; + s[10] = x[10]; + s[11] = x[26]; + s[12] = x[6]; + s[13] = x[22]; + s[14] = x[14]; + s[15] = x[30]; + + // 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31, + s[16] = x[1]; + s[17] = x[17]; + s[18] = x[9]; + s[19] = x[25]; + s[20] = x[5]; + s[21] = x[21]; + s[22] = x[13]; + s[23] = x[29]; + s[24] = x[3]; + s[25] = x[19]; + s[26] = x[11]; + s[27] = x[27]; + s[28] = x[7]; + s[29] = x[23]; + s[30] = x[15]; + s[31] = x[31]; + + Dct4Stages(s, min, max, /*is_last_stage=*/false); + Dct8Stages(s, min, max, /*is_last_stage=*/false); + Dct16Stages(s, min, max, /*is_last_stage=*/false); + Dct32Stages(s, min, max, /*is_last_stage=*/true); + + if (is_row) { + const int32x4_t v_row_shift = vdupq_n_s32(-row_shift); + for (int idx = 0; idx < 32; idx += 8) { + int32x4_t output[8]; + Transpose4x4(&s[idx], &output[0]); + Transpose4x4(&s[idx + 4], &output[4]); + for (auto& o : output) { + o = vmovl_s16(vqmovn_s32(vqrshlq_s32(o, v_row_shift))); + } + StoreDst<4>(dst, step, idx, &output[0]); + StoreDst<4>(dst, step, idx + 4, &output[4]); + } + } else { + StoreDst<32>(dst, step, 0, &s[0]); + } +} + +void Dct64_NEON(void* dest, int32_t step, bool is_row, int row_shift) { + auto* const dst = static_cast(dest); + const int32_t range = is_row ? kBitdepth10 + 7 : 15; + const int32x4_t min = vdupq_n_s32(-(1 << range)); + const int32x4_t max = vdupq_n_s32((1 << range) - 1); + int32x4_t s[64], x[32]; + + if (is_row) { + // The last 32 values of every row are always zero if the |tx_width| is + // 64. + for (int idx = 0; idx < 32; idx += 8) { + LoadSrc<4>(dst, step, idx, &x[idx]); + LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]); + Transpose4x4(&x[idx], &x[idx]); + Transpose4x4(&x[idx + 4], &x[idx + 4]); + } + } else { + // The last 32 values of every column are always zero if the |tx_height| is + // 64. 
+ LoadSrc<32>(dst, step, 0, &x[0]); + } + + // stage 1 + // kBitReverseLookup + // 0, 32, 16, 48, 8, 40, 24, 56, 4, 36, 20, 52, 12, 44, 28, 60, + s[0] = x[0]; + s[2] = x[16]; + s[4] = x[8]; + s[6] = x[24]; + s[8] = x[4]; + s[10] = x[20]; + s[12] = x[12]; + s[14] = x[28]; + + // 2, 34, 18, 50, 10, 42, 26, 58, 6, 38, 22, 54, 14, 46, 30, 62, + s[16] = x[2]; + s[18] = x[18]; + s[20] = x[10]; + s[22] = x[26]; + s[24] = x[6]; + s[26] = x[22]; + s[28] = x[14]; + s[30] = x[30]; + + // 1, 33, 17, 49, 9, 41, 25, 57, 5, 37, 21, 53, 13, 45, 29, 61, + s[32] = x[1]; + s[34] = x[17]; + s[36] = x[9]; + s[38] = x[25]; + s[40] = x[5]; + s[42] = x[21]; + s[44] = x[13]; + s[46] = x[29]; + + // 3, 35, 19, 51, 11, 43, 27, 59, 7, 39, 23, 55, 15, 47, 31, 63 + s[48] = x[3]; + s[50] = x[19]; + s[52] = x[11]; + s[54] = x[27]; + s[56] = x[7]; + s[58] = x[23]; + s[60] = x[15]; + s[62] = x[31]; + + Dct4Stages( + s, min, max, /*is_last_stage=*/false); + Dct8Stages( + s, min, max, /*is_last_stage=*/false); + Dct16Stages( + s, min, max, /*is_last_stage=*/false); + Dct32Stages( + s, min, max, /*is_last_stage=*/false); + + //-- start dct 64 stages + // stage 2. + ButterflyRotation_SecondIsZero(&s[32], &s[63], 63 - 0, false); + ButterflyRotation_FirstIsZero(&s[33], &s[62], 63 - 32, false); + ButterflyRotation_SecondIsZero(&s[34], &s[61], 63 - 16, false); + ButterflyRotation_FirstIsZero(&s[35], &s[60], 63 - 48, false); + ButterflyRotation_SecondIsZero(&s[36], &s[59], 63 - 8, false); + ButterflyRotation_FirstIsZero(&s[37], &s[58], 63 - 40, false); + ButterflyRotation_SecondIsZero(&s[38], &s[57], 63 - 24, false); + ButterflyRotation_FirstIsZero(&s[39], &s[56], 63 - 56, false); + ButterflyRotation_SecondIsZero(&s[40], &s[55], 63 - 4, false); + ButterflyRotation_FirstIsZero(&s[41], &s[54], 63 - 36, false); + ButterflyRotation_SecondIsZero(&s[42], &s[53], 63 - 20, false); + ButterflyRotation_FirstIsZero(&s[43], &s[52], 63 - 52, false); + ButterflyRotation_SecondIsZero(&s[44], &s[51], 63 - 12, false); + ButterflyRotation_FirstIsZero(&s[45], &s[50], 63 - 44, false); + ButterflyRotation_SecondIsZero(&s[46], &s[49], 63 - 28, false); + ButterflyRotation_FirstIsZero(&s[47], &s[48], 63 - 60, false); + + // stage 4. + HadamardRotation(&s[32], &s[33], false, min, max); + HadamardRotation(&s[34], &s[35], true, min, max); + HadamardRotation(&s[36], &s[37], false, min, max); + HadamardRotation(&s[38], &s[39], true, min, max); + HadamardRotation(&s[40], &s[41], false, min, max); + HadamardRotation(&s[42], &s[43], true, min, max); + HadamardRotation(&s[44], &s[45], false, min, max); + HadamardRotation(&s[46], &s[47], true, min, max); + HadamardRotation(&s[48], &s[49], false, min, max); + HadamardRotation(&s[50], &s[51], true, min, max); + HadamardRotation(&s[52], &s[53], false, min, max); + HadamardRotation(&s[54], &s[55], true, min, max); + HadamardRotation(&s[56], &s[57], false, min, max); + HadamardRotation(&s[58], &s[59], true, min, max); + HadamardRotation(&s[60], &s[61], false, min, max); + HadamardRotation(&s[62], &s[63], true, min, max); + + // stage 7. + ButterflyRotation_4(&s[62], &s[33], 60 - 0, true); + ButterflyRotation_4(&s[61], &s[34], 60 - 0 + 64, true); + ButterflyRotation_4(&s[58], &s[37], 60 - 32, true); + ButterflyRotation_4(&s[57], &s[38], 60 - 32 + 64, true); + ButterflyRotation_4(&s[54], &s[41], 60 - 16, true); + ButterflyRotation_4(&s[53], &s[42], 60 - 16 + 64, true); + ButterflyRotation_4(&s[50], &s[45], 60 - 48, true); + ButterflyRotation_4(&s[49], &s[46], 60 - 48 + 64, true); + + // stage 11. 
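+  // NOTE (illustrative sketch, compiled out): the stage 1 shuffles above
+  // apply a bit-reversal permutation of the input indices; the
+  // kBitReverseLookup comments list its precomputed form. A scalar generator
+  // for a transform of size 1 << bits:
+#if 0
+#include <cstdint>
+int BitReversedIndex(int i, int bits) {
+  int r = 0;
+  for (int b = 0; b < bits; ++b) r |= ((i >> b) & 1) << (bits - 1 - b);
+  return r;  // e.g. bits == 6: 1 -> 32, 2 -> 16, 3 -> 48, matching the table.
+}
+#endif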
+ HadamardRotation(&s[32], &s[35], false, min, max); + HadamardRotation(&s[33], &s[34], false, min, max); + HadamardRotation(&s[36], &s[39], true, min, max); + HadamardRotation(&s[37], &s[38], true, min, max); + HadamardRotation(&s[40], &s[43], false, min, max); + HadamardRotation(&s[41], &s[42], false, min, max); + HadamardRotation(&s[44], &s[47], true, min, max); + HadamardRotation(&s[45], &s[46], true, min, max); + HadamardRotation(&s[48], &s[51], false, min, max); + HadamardRotation(&s[49], &s[50], false, min, max); + HadamardRotation(&s[52], &s[55], true, min, max); + HadamardRotation(&s[53], &s[54], true, min, max); + HadamardRotation(&s[56], &s[59], false, min, max); + HadamardRotation(&s[57], &s[58], false, min, max); + HadamardRotation(&s[60], &s[63], true, min, max); + HadamardRotation(&s[61], &s[62], true, min, max); + + // stage 16. + ButterflyRotation_4(&s[61], &s[34], 56, true); + ButterflyRotation_4(&s[60], &s[35], 56, true); + ButterflyRotation_4(&s[59], &s[36], 56 + 64, true); + ButterflyRotation_4(&s[58], &s[37], 56 + 64, true); + ButterflyRotation_4(&s[53], &s[42], 56 - 32, true); + ButterflyRotation_4(&s[52], &s[43], 56 - 32, true); + ButterflyRotation_4(&s[51], &s[44], 56 - 32 + 64, true); + ButterflyRotation_4(&s[50], &s[45], 56 - 32 + 64, true); + + // stage 21. + HadamardRotation(&s[32], &s[39], false, min, max); + HadamardRotation(&s[33], &s[38], false, min, max); + HadamardRotation(&s[34], &s[37], false, min, max); + HadamardRotation(&s[35], &s[36], false, min, max); + HadamardRotation(&s[40], &s[47], true, min, max); + HadamardRotation(&s[41], &s[46], true, min, max); + HadamardRotation(&s[42], &s[45], true, min, max); + HadamardRotation(&s[43], &s[44], true, min, max); + HadamardRotation(&s[48], &s[55], false, min, max); + HadamardRotation(&s[49], &s[54], false, min, max); + HadamardRotation(&s[50], &s[53], false, min, max); + HadamardRotation(&s[51], &s[52], false, min, max); + HadamardRotation(&s[56], &s[63], true, min, max); + HadamardRotation(&s[57], &s[62], true, min, max); + HadamardRotation(&s[58], &s[61], true, min, max); + HadamardRotation(&s[59], &s[60], true, min, max); + + // stage 25. + ButterflyRotation_4(&s[59], &s[36], 48, true); + ButterflyRotation_4(&s[58], &s[37], 48, true); + ButterflyRotation_4(&s[57], &s[38], 48, true); + ButterflyRotation_4(&s[56], &s[39], 48, true); + ButterflyRotation_4(&s[55], &s[40], 112, true); + ButterflyRotation_4(&s[54], &s[41], 112, true); + ButterflyRotation_4(&s[53], &s[42], 112, true); + ButterflyRotation_4(&s[52], &s[43], 112, true); + + // stage 28. + HadamardRotation(&s[32], &s[47], false, min, max); + HadamardRotation(&s[33], &s[46], false, min, max); + HadamardRotation(&s[34], &s[45], false, min, max); + HadamardRotation(&s[35], &s[44], false, min, max); + HadamardRotation(&s[36], &s[43], false, min, max); + HadamardRotation(&s[37], &s[42], false, min, max); + HadamardRotation(&s[38], &s[41], false, min, max); + HadamardRotation(&s[39], &s[40], false, min, max); + HadamardRotation(&s[48], &s[63], true, min, max); + HadamardRotation(&s[49], &s[62], true, min, max); + HadamardRotation(&s[50], &s[61], true, min, max); + HadamardRotation(&s[51], &s[60], true, min, max); + HadamardRotation(&s[52], &s[59], true, min, max); + HadamardRotation(&s[53], &s[58], true, min, max); + HadamardRotation(&s[54], &s[57], true, min, max); + HadamardRotation(&s[55], &s[56], true, min, max); + + // stage 30. 
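+  // NOTE (illustrative sketch, compiled out): scalar reading of the
+  // HadamardRotation() helper used throughout these stages: a +/- butterfly,
+  // optionally with the operands swapped, clamped to the |min|/|max| range
+  // the caller derives from the working bitdepth.
+#if 0
+#include <algorithm>
+#include <cstdint>
+void HadamardRotationScalar(int32_t* a, int32_t* b, bool flip, int32_t min,
+                            int32_t max) {
+  int32_t x, y;
+  if (flip) {
+    x = *b - *a;
+    y = *b + *a;
+  } else {
+    x = *a + *b;
+    y = *a - *b;
+  }
+  *a = std::clamp(x, min, max);
+  *b = std::clamp(y, min, max);
+}
+#endif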
+ ButterflyRotation_4(&s[55], &s[40], 32, true); + ButterflyRotation_4(&s[54], &s[41], 32, true); + ButterflyRotation_4(&s[53], &s[42], 32, true); + ButterflyRotation_4(&s[52], &s[43], 32, true); + ButterflyRotation_4(&s[51], &s[44], 32, true); + ButterflyRotation_4(&s[50], &s[45], 32, true); + ButterflyRotation_4(&s[49], &s[46], 32, true); + ButterflyRotation_4(&s[48], &s[47], 32, true); + + // stage 31. + for (int i = 0; i < 32; i += 4) { + HadamardRotation(&s[i], &s[63 - i], false, min, max); + HadamardRotation(&s[i + 1], &s[63 - i - 1], false, min, max); + HadamardRotation(&s[i + 2], &s[63 - i - 2], false, min, max); + HadamardRotation(&s[i + 3], &s[63 - i - 3], false, min, max); + } + //-- end dct 64 stages + if (is_row) { + const int32x4_t v_row_shift = vdupq_n_s32(-row_shift); + for (int idx = 0; idx < 64; idx += 8) { + int32x4_t output[8]; + Transpose4x4(&s[idx], &output[0]); + Transpose4x4(&s[idx + 4], &output[4]); + for (auto& o : output) { + o = vmovl_s16(vqmovn_s32(vqrshlq_s32(o, v_row_shift))); + } + StoreDst<4>(dst, step, idx, &output[0]); + StoreDst<4>(dst, step, idx + 4, &output[4]); + } + } else { + StoreDst<64>(dst, step, 0, &s[0]); + } +} + +//------------------------------------------------------------------------------ +// Asymmetric Discrete Sine Transforms (ADST). +LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step, bool is_row, + int row_shift) { + auto* const dst = static_cast(dest); + int32x4_t s[8]; + int32x4_t x[4]; + + LoadSrc<4>(dst, step, 0, x); + if (is_row) { + Transpose4x4(x, x); + } + + // stage 1. + s[5] = vmulq_n_s32(x[3], kAdst4Multiplier[1]); + s[6] = vmulq_n_s32(x[3], kAdst4Multiplier[3]); + + // stage 2. + const int32x4_t a7 = vsubq_s32(x[0], x[2]); + const int32x4_t b7 = vaddq_s32(a7, x[3]); + + // stage 3. + s[0] = vmulq_n_s32(x[0], kAdst4Multiplier[0]); + s[1] = vmulq_n_s32(x[0], kAdst4Multiplier[1]); + // s[0] = s[0] + s[3] + s[0] = vmlaq_n_s32(s[0], x[2], kAdst4Multiplier[3]); + // s[1] = s[1] - s[4] + s[1] = vmlsq_n_s32(s[1], x[2], kAdst4Multiplier[0]); + + s[3] = vmulq_n_s32(x[1], kAdst4Multiplier[2]); + s[2] = vmulq_n_s32(b7, kAdst4Multiplier[2]); + + // stage 4. + s[0] = vaddq_s32(s[0], s[5]); + s[1] = vsubq_s32(s[1], s[6]); + + // stages 5 and 6. + const int32x4_t x0 = vaddq_s32(s[0], s[3]); + const int32x4_t x1 = vaddq_s32(s[1], s[3]); + const int32x4_t x3_a = vaddq_s32(s[0], s[1]); + const int32x4_t x3 = vsubq_s32(x3_a, s[3]); + x[0] = vrshrq_n_s32(x0, 12); + x[1] = vrshrq_n_s32(x1, 12); + x[2] = vrshrq_n_s32(s[2], 12); + x[3] = vrshrq_n_s32(x3, 12); + + if (is_row) { + const int32x4_t v_row_shift = vdupq_n_s32(-row_shift); + x[0] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[0], v_row_shift))); + x[1] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[1], v_row_shift))); + x[2] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[2], v_row_shift))); + x[3] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[3], v_row_shift))); + Transpose4x4(x, x); + } + StoreDst<4>(dst, step, 0, x); +} + +alignas(16) constexpr int32_t kAdst4DcOnlyMultiplier[4] = {1321, 2482, 3344, + 2482}; + +LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, int adjusted_tx_height, + bool should_round, int row_shift) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast(dest); + int32x4_t s[2]; + + const int32x4_t v_src0 = vdupq_n_s32(dst[0]); + const uint32x4_t v_mask = vdupq_n_u32(should_round ? 
0xffffffff : 0); + const int32x4_t v_src0_round = + vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12)); + + const int32x4_t v_src = vbslq_s32(v_mask, v_src0_round, v_src0); + const int32x4_t kAdst4DcOnlyMultipliers = vld1q_s32(kAdst4DcOnlyMultiplier); + s[1] = vdupq_n_s32(0); + + // s0*k0 s0*k1 s0*k2 s0*k1 + s[0] = vmulq_s32(kAdst4DcOnlyMultipliers, v_src); + // 0 0 0 s0*k0 + s[1] = vextq_s32(s[1], s[0], 1); + + const int32x4_t x3 = vaddq_s32(s[0], s[1]); + const int32x4_t dst_0 = vrshrq_n_s32(x3, 12); + + // vqrshlq_s32 will shift right if shift value is negative. + vst1q_s32(dst, + vmovl_s16(vqmovn_s32(vqrshlq_s32(dst_0, vdupq_n_s32(-row_shift))))); + + return true; +} + +LIBGAV1_ALWAYS_INLINE bool Adst4DcOnlyColumn(void* dest, int adjusted_tx_height, + int width) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast(dest); + int32x4_t s[4]; + + int i = 0; + do { + const int32x4_t v_src = vld1q_s32(&dst[i]); + + s[0] = vmulq_n_s32(v_src, kAdst4Multiplier[0]); + s[1] = vmulq_n_s32(v_src, kAdst4Multiplier[1]); + s[2] = vmulq_n_s32(v_src, kAdst4Multiplier[2]); + + const int32x4_t x0 = s[0]; + const int32x4_t x1 = s[1]; + const int32x4_t x2 = s[2]; + const int32x4_t x3 = vaddq_s32(s[0], s[1]); + const int32x4_t dst_0 = vrshrq_n_s32(x0, 12); + const int32x4_t dst_1 = vrshrq_n_s32(x1, 12); + const int32x4_t dst_2 = vrshrq_n_s32(x2, 12); + const int32x4_t dst_3 = vrshrq_n_s32(x3, 12); + + vst1q_s32(&dst[i], dst_0); + vst1q_s32(&dst[i + width * 1], dst_1); + vst1q_s32(&dst[i + width * 2], dst_2); + vst1q_s32(&dst[i + width * 3], dst_3); + + i += 4; + } while (i < width); + + return true; +} + +template +LIBGAV1_ALWAYS_INLINE void Adst8_NEON(void* dest, int32_t step, bool is_row, + int row_shift) { + auto* const dst = static_cast(dest); + const int32_t range = is_row ? kBitdepth10 + 7 : 15; + const int32x4_t min = vdupq_n_s32(-(1 << range)); + const int32x4_t max = vdupq_n_s32((1 << range) - 1); + int32x4_t s[8], x[8]; + + if (is_row) { + LoadSrc<4>(dst, step, 0, &x[0]); + LoadSrc<4>(dst, step, 4, &x[4]); + Transpose4x4(&x[0], &x[0]); + Transpose4x4(&x[4], &x[4]); + } else { + LoadSrc<8>(dst, step, 0, &x[0]); + } + + // stage 1. + s[0] = x[7]; + s[1] = x[0]; + s[2] = x[5]; + s[3] = x[2]; + s[4] = x[3]; + s[5] = x[4]; + s[6] = x[1]; + s[7] = x[6]; + + // stage 2. + butterfly_rotation(&s[0], &s[1], 60 - 0, true); + butterfly_rotation(&s[2], &s[3], 60 - 16, true); + butterfly_rotation(&s[4], &s[5], 60 - 32, true); + butterfly_rotation(&s[6], &s[7], 60 - 48, true); + + // stage 3. + HadamardRotation(&s[0], &s[4], false, min, max); + HadamardRotation(&s[1], &s[5], false, min, max); + HadamardRotation(&s[2], &s[6], false, min, max); + HadamardRotation(&s[3], &s[7], false, min, max); + + // stage 4. + butterfly_rotation(&s[4], &s[5], 48 - 0, true); + butterfly_rotation(&s[7], &s[6], 48 - 32, true); + + // stage 5. + HadamardRotation(&s[0], &s[2], false, min, max); + HadamardRotation(&s[4], &s[6], false, min, max); + HadamardRotation(&s[1], &s[3], false, min, max); + HadamardRotation(&s[5], &s[7], false, min, max); + + // stage 6. + butterfly_rotation(&s[2], &s[3], 32, true); + butterfly_rotation(&s[6], &s[7], 32, true); + + // stage 7. 
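+  // NOTE (illustrative sketch, compiled out): the stage 7 writes below emit a
+  // signed permutation of the state. vqnegq_s32 is used instead of plain
+  // negation because it saturates INT32_MIN to INT32_MAX rather than
+  // overflowing. Scalar equivalent of one lane:
+#if 0
+#include <cstdint>
+#include <limits>
+int32_t SaturatingNegate(int32_t v) {
+  return (v == std::numeric_limits<int32_t>::min())
+             ? std::numeric_limits<int32_t>::max()
+             : -v;
+}
+#endif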
+ x[0] = s[0]; + x[1] = vqnegq_s32(s[4]); + x[2] = s[6]; + x[3] = vqnegq_s32(s[2]); + x[4] = s[3]; + x[5] = vqnegq_s32(s[7]); + x[6] = s[5]; + x[7] = vqnegq_s32(s[1]); + + if (is_row) { + const int32x4_t v_row_shift = vdupq_n_s32(-row_shift); + for (auto& i : x) { + i = vmovl_s16(vqmovn_s32(vqrshlq_s32(i, v_row_shift))); + } + Transpose4x4(&x[0], &x[0]); + Transpose4x4(&x[4], &x[4]); + StoreDst<4>(dst, step, 0, &x[0]); + StoreDst<4>(dst, step, 4, &x[4]); + } else { + StoreDst<8>(dst, step, 0, &x[0]); + } +} + +LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, int adjusted_tx_height, + bool should_round, int row_shift) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast(dest); + int32x4_t s[8]; + + const int32x4_t v_src = vdupq_n_s32(dst[0]); + const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0); + const int32x4_t v_src_round = + vqrdmulhq_n_s32(v_src, kTransformRowMultiplier << (31 - 12)); + // stage 1. + s[1] = vbslq_s32(v_mask, v_src_round, v_src); + + // stage 2. + ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true); + + // stage 3. + s[4] = s[0]; + s[5] = s[1]; + + // stage 4. + ButterflyRotation_4(&s[4], &s[5], 48, true); + + // stage 5. + s[2] = s[0]; + s[3] = s[1]; + s[6] = s[4]; + s[7] = s[5]; + + // stage 6. + ButterflyRotation_4(&s[2], &s[3], 32, true); + ButterflyRotation_4(&s[6], &s[7], 32, true); + + // stage 7. + int32x4_t x[8]; + x[0] = s[0]; + x[1] = vqnegq_s32(s[4]); + x[2] = s[6]; + x[3] = vqnegq_s32(s[2]); + x[4] = s[3]; + x[5] = vqnegq_s32(s[7]); + x[6] = s[5]; + x[7] = vqnegq_s32(s[1]); + + for (int i = 0; i < 8; ++i) { + // vqrshlq_s32 will shift right if shift value is negative. + x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], vdupq_n_s32(-row_shift)))); + vst1q_lane_s32(&dst[i], x[i], 0); + } + + return true; +} + +LIBGAV1_ALWAYS_INLINE bool Adst8DcOnlyColumn(void* dest, int adjusted_tx_height, + int width) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast(dest); + int32x4_t s[8]; + + int i = 0; + do { + const int32x4_t v_src = vld1q_s32(dst); + // stage 1. + s[1] = v_src; + + // stage 2. + ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true); + + // stage 3. + s[4] = s[0]; + s[5] = s[1]; + + // stage 4. + ButterflyRotation_4(&s[4], &s[5], 48, true); + + // stage 5. + s[2] = s[0]; + s[3] = s[1]; + s[6] = s[4]; + s[7] = s[5]; + + // stage 6. + ButterflyRotation_4(&s[2], &s[3], 32, true); + ButterflyRotation_4(&s[6], &s[7], 32, true); + + // stage 7. + int32x4_t x[8]; + x[0] = s[0]; + x[1] = vqnegq_s32(s[4]); + x[2] = s[6]; + x[3] = vqnegq_s32(s[2]); + x[4] = s[3]; + x[5] = vqnegq_s32(s[7]); + x[6] = s[5]; + x[7] = vqnegq_s32(s[1]); + + for (int j = 0; j < 8; ++j) { + vst1q_s32(&dst[j * width], x[j]); + } + i += 4; + dst += 4; + } while (i < width); + + return true; +} + +template +LIBGAV1_ALWAYS_INLINE void Adst16_NEON(void* dest, int32_t step, bool is_row, + int row_shift) { + auto* const dst = static_cast(dest); + const int32_t range = is_row ? kBitdepth10 + 7 : 15; + const int32x4_t min = vdupq_n_s32(-(1 << range)); + const int32x4_t max = vdupq_n_s32((1 << range) - 1); + int32x4_t s[16], x[16]; + + if (is_row) { + for (int idx = 0; idx < 16; idx += 8) { + LoadSrc<4>(dst, step, idx, &x[idx]); + LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]); + Transpose4x4(&x[idx], &x[idx]); + Transpose4x4(&x[idx + 4], &x[idx + 4]); + } + } else { + LoadSrc<16>(dst, step, 0, &x[0]); + } + + // stage 1. 
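+  // NOTE (illustrative sketch, compiled out): scalar form of the branch-free
+  // rounding select used by the DcOnly helpers above. vqrdmulhq_n_s32 with a
+  // Q12 constant shifted into the top bit, i.e. k << (31 - 12), computes
+  // RightShiftWithRounding(x * k, 12), and vbslq_s32 then picks the rounded
+  // or raw value based on |should_round|.
+#if 0
+#include <cstdint>
+int32_t ConditionalRound(int32_t x, bool should_round, int32_t k /*Q12*/) {
+  const auto rounded = static_cast<int32_t>(
+      (static_cast<int64_t>(x) * k + (int64_t{1} << 11)) >> 12);
+  return should_round ? rounded : x;
+}
+#endif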
+ s[0] = x[15]; + s[1] = x[0]; + s[2] = x[13]; + s[3] = x[2]; + s[4] = x[11]; + s[5] = x[4]; + s[6] = x[9]; + s[7] = x[6]; + s[8] = x[7]; + s[9] = x[8]; + s[10] = x[5]; + s[11] = x[10]; + s[12] = x[3]; + s[13] = x[12]; + s[14] = x[1]; + s[15] = x[14]; + + // stage 2. + butterfly_rotation(&s[0], &s[1], 62 - 0, true); + butterfly_rotation(&s[2], &s[3], 62 - 8, true); + butterfly_rotation(&s[4], &s[5], 62 - 16, true); + butterfly_rotation(&s[6], &s[7], 62 - 24, true); + butterfly_rotation(&s[8], &s[9], 62 - 32, true); + butterfly_rotation(&s[10], &s[11], 62 - 40, true); + butterfly_rotation(&s[12], &s[13], 62 - 48, true); + butterfly_rotation(&s[14], &s[15], 62 - 56, true); + + // stage 3. + HadamardRotation(&s[0], &s[8], false, min, max); + HadamardRotation(&s[1], &s[9], false, min, max); + HadamardRotation(&s[2], &s[10], false, min, max); + HadamardRotation(&s[3], &s[11], false, min, max); + HadamardRotation(&s[4], &s[12], false, min, max); + HadamardRotation(&s[5], &s[13], false, min, max); + HadamardRotation(&s[6], &s[14], false, min, max); + HadamardRotation(&s[7], &s[15], false, min, max); + + // stage 4. + butterfly_rotation(&s[8], &s[9], 56 - 0, true); + butterfly_rotation(&s[13], &s[12], 8 + 0, true); + butterfly_rotation(&s[10], &s[11], 56 - 32, true); + butterfly_rotation(&s[15], &s[14], 8 + 32, true); + + // stage 5. + HadamardRotation(&s[0], &s[4], false, min, max); + HadamardRotation(&s[8], &s[12], false, min, max); + HadamardRotation(&s[1], &s[5], false, min, max); + HadamardRotation(&s[9], &s[13], false, min, max); + HadamardRotation(&s[2], &s[6], false, min, max); + HadamardRotation(&s[10], &s[14], false, min, max); + HadamardRotation(&s[3], &s[7], false, min, max); + HadamardRotation(&s[11], &s[15], false, min, max); + + // stage 6. + butterfly_rotation(&s[4], &s[5], 48 - 0, true); + butterfly_rotation(&s[12], &s[13], 48 - 0, true); + butterfly_rotation(&s[7], &s[6], 48 - 32, true); + butterfly_rotation(&s[15], &s[14], 48 - 32, true); + + // stage 7. + HadamardRotation(&s[0], &s[2], false, min, max); + HadamardRotation(&s[4], &s[6], false, min, max); + HadamardRotation(&s[8], &s[10], false, min, max); + HadamardRotation(&s[12], &s[14], false, min, max); + HadamardRotation(&s[1], &s[3], false, min, max); + HadamardRotation(&s[5], &s[7], false, min, max); + HadamardRotation(&s[9], &s[11], false, min, max); + HadamardRotation(&s[13], &s[15], false, min, max); + + // stage 8. + butterfly_rotation(&s[2], &s[3], 32, true); + butterfly_rotation(&s[6], &s[7], 32, true); + butterfly_rotation(&s[10], &s[11], 32, true); + butterfly_rotation(&s[14], &s[15], 32, true); + + // stage 9. + x[0] = s[0]; + x[1] = vqnegq_s32(s[8]); + x[2] = s[12]; + x[3] = vqnegq_s32(s[4]); + x[4] = s[6]; + x[5] = vqnegq_s32(s[14]); + x[6] = s[10]; + x[7] = vqnegq_s32(s[2]); + x[8] = s[3]; + x[9] = vqnegq_s32(s[11]); + x[10] = s[15]; + x[11] = vqnegq_s32(s[7]); + x[12] = s[5]; + x[13] = vqnegq_s32(s[13]); + x[14] = s[9]; + x[15] = vqnegq_s32(s[1]); + + if (is_row) { + const int32x4_t v_row_shift = vdupq_n_s32(-row_shift); + for (auto& i : x) { + i = vmovl_s16(vqmovn_s32(vqrshlq_s32(i, v_row_shift))); + } + for (int idx = 0; idx < 16; idx += 8) { + Transpose4x4(&x[idx], &x[idx]); + Transpose4x4(&x[idx + 4], &x[idx + 4]); + StoreDst<4>(dst, step, idx, &x[idx]); + StoreDst<4>(dst, step, idx + 4, &x[idx + 4]); + } + } else { + StoreDst<16>(dst, step, 0, &x[0]); + } +} + +LIBGAV1_ALWAYS_INLINE void Adst16DcOnlyInternal(int32x4_t* s, int32x4_t* x) { + // stage 2. 
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 62, true); + + // stage 3. + s[8] = s[0]; + s[9] = s[1]; + + // stage 4. + ButterflyRotation_4(&s[8], &s[9], 56, true); + + // stage 5. + s[4] = s[0]; + s[12] = s[8]; + s[5] = s[1]; + s[13] = s[9]; + + // stage 6. + ButterflyRotation_4(&s[4], &s[5], 48, true); + ButterflyRotation_4(&s[12], &s[13], 48, true); + + // stage 7. + s[2] = s[0]; + s[6] = s[4]; + s[10] = s[8]; + s[14] = s[12]; + s[3] = s[1]; + s[7] = s[5]; + s[11] = s[9]; + s[15] = s[13]; + + // stage 8. + ButterflyRotation_4(&s[2], &s[3], 32, true); + ButterflyRotation_4(&s[6], &s[7], 32, true); + ButterflyRotation_4(&s[10], &s[11], 32, true); + ButterflyRotation_4(&s[14], &s[15], 32, true); + + // stage 9. + x[0] = s[0]; + x[1] = vqnegq_s32(s[8]); + x[2] = s[12]; + x[3] = vqnegq_s32(s[4]); + x[4] = s[6]; + x[5] = vqnegq_s32(s[14]); + x[6] = s[10]; + x[7] = vqnegq_s32(s[2]); + x[8] = s[3]; + x[9] = vqnegq_s32(s[11]); + x[10] = s[15]; + x[11] = vqnegq_s32(s[7]); + x[12] = s[5]; + x[13] = vqnegq_s32(s[13]); + x[14] = s[9]; + x[15] = vqnegq_s32(s[1]); +} + +LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, int adjusted_tx_height, + bool should_round, int row_shift) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast(dest); + int32x4_t s[16]; + int32x4_t x[16]; + const int32x4_t v_src = vdupq_n_s32(dst[0]); + const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0); + const int32x4_t v_src_round = + vqrdmulhq_n_s32(v_src, kTransformRowMultiplier << (31 - 12)); + // stage 1. + s[1] = vbslq_s32(v_mask, v_src_round, v_src); + + Adst16DcOnlyInternal(s, x); + + for (int i = 0; i < 16; ++i) { + // vqrshlq_s32 will shift right if shift value is negative. + x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], vdupq_n_s32(-row_shift)))); + vst1q_lane_s32(&dst[i], x[i], 0); + } + + return true; +} + +LIBGAV1_ALWAYS_INLINE bool Adst16DcOnlyColumn(void* dest, + int adjusted_tx_height, + int width) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast(dest); + int i = 0; + do { + int32x4_t s[16]; + int32x4_t x[16]; + const int32x4_t v_src = vld1q_s32(dst); + // stage 1. + s[1] = v_src; + + Adst16DcOnlyInternal(s, x); + + for (int j = 0; j < 16; ++j) { + vst1q_s32(&dst[j * width], x[j]); + } + i += 4; + dst += 4; + } while (i < width); + + return true; +} + +//------------------------------------------------------------------------------ +// Identity Transforms. + +LIBGAV1_ALWAYS_INLINE void Identity4_NEON(void* dest, int32_t step, int shift) { + auto* const dst = static_cast(dest); + const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11); + const int32x4_t v_multiplier = vdupq_n_s32(kIdentity4Multiplier); + const int32x4_t v_shift = vdupq_n_s32(-(12 + shift)); + for (int i = 0; i < 4; ++i) { + const int32x4_t v_src = vld1q_s32(&dst[i * step]); + const int32x4_t v_src_mult_lo = + vmlaq_s32(v_dual_round, v_src, v_multiplier); + const int32x4_t shift_lo = vqshlq_s32(v_src_mult_lo, v_shift); + vst1q_s32(&dst[i * step], vmovl_s16(vqmovn_s32(shift_lo))); + } +} + +LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height, + bool should_round, int tx_height) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast(dest); + const int32x4_t v_src0 = vdupq_n_s32(dst[0]); + const uint32x4_t v_mask = vdupq_n_u32(should_round ? 
0xffffffff : 0); + const int32x4_t v_src_round = + vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12)); + const int32x4_t v_src = vbslq_s32(v_mask, v_src_round, v_src0); + const int shift = tx_height < 16 ? 0 : 1; + const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11); + const int32x4_t v_multiplier = vdupq_n_s32(kIdentity4Multiplier); + const int32x4_t v_shift = vdupq_n_s32(-(12 + shift)); + const int32x4_t v_src_mult_lo = vmlaq_s32(v_dual_round, v_src, v_multiplier); + const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, v_shift); + vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(dst_0)), 0); + return true; +} + +template +LIBGAV1_ALWAYS_INLINE void IdentityColumnStoreToFrame( + Array2DView frame, const int start_x, const int start_y, + const int tx_width, const int tx_height, + const int32_t* LIBGAV1_RESTRICT source) { + static_assert(identity_size == 4 || identity_size == 8 || + identity_size == 16 || identity_size == 32, + "Invalid identity_size."); + const int stride = frame.columns(); + uint16_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x; + const int32x4_t v_dual_round = vdupq_n_s32((1 + (1 << 4)) << 11); + const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1); + + if (identity_size < 32) { + if (tx_width == 4) { + int i = 0; + do { + int32x4x2_t v_src, v_dst_i, a, b; + v_src.val[0] = vld1q_s32(&source[i * 4]); + v_src.val[1] = vld1q_s32(&source[(i * 4) + 4]); + if (identity_size == 4) { + v_dst_i.val[0] = + vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity4Multiplier); + v_dst_i.val[1] = + vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity4Multiplier); + a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12); + a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12); + } else if (identity_size == 8) { + v_dst_i.val[0] = vaddq_s32(v_src.val[0], v_src.val[0]); + v_dst_i.val[1] = vaddq_s32(v_src.val[1], v_src.val[1]); + a.val[0] = vrshrq_n_s32(v_dst_i.val[0], 4); + a.val[1] = vrshrq_n_s32(v_dst_i.val[1], 4); + } else { // identity_size == 16 + v_dst_i.val[0] = + vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier); + v_dst_i.val[1] = + vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier); + a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12); + a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12); + } + uint16x4x2_t frame_data; + frame_data.val[0] = vld1_u16(dst); + frame_data.val[1] = vld1_u16(dst + stride); + b.val[0] = vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0])); + b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1])); + vst1_u16(dst, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth)); + vst1_u16(dst + stride, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth)); + dst += stride << 1; + i += 2; + } while (i < tx_height); + } else { + int i = 0; + do { + const int row = i * tx_width; + int j = 0; + do { + int32x4x2_t v_src, v_dst_i, a, b; + v_src.val[0] = vld1q_s32(&source[row + j]); + v_src.val[1] = vld1q_s32(&source[row + j + 4]); + if (identity_size == 4) { + v_dst_i.val[0] = + vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity4Multiplier); + v_dst_i.val[1] = + vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity4Multiplier); + a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12); + a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12); + } else if (identity_size == 8) { + v_dst_i.val[0] = vaddq_s32(v_src.val[0], v_src.val[0]); + v_dst_i.val[1] = vaddq_s32(v_src.val[1], v_src.val[1]); + a.val[0] = vrshrq_n_s32(v_dst_i.val[0], 4); + a.val[1] = vrshrq_n_s32(v_dst_i.val[1], 4); + } else { // identity_size == 16 + v_dst_i.val[0] = + 
vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier); + v_dst_i.val[1] = + vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier); + a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12); + a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12); + } + uint16x4x2_t frame_data; + frame_data.val[0] = vld1_u16(dst + j); + frame_data.val[1] = vld1_u16(dst + j + 4); + b.val[0] = + vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0])); + b.val[1] = + vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1])); + vst1_u16(dst + j, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth)); + vst1_u16(dst + j + 4, + vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth)); + j += 8; + } while (j < tx_width); + dst += stride; + } while (++i < tx_height); + } + } else { + int i = 0; + do { + const int row = i * tx_width; + int j = 0; + do { + const int32x4_t v_dst_i = vld1q_s32(&source[row + j]); + const uint16x4_t frame_data = vld1_u16(dst + j); + const int32x4_t a = vrshrq_n_s32(v_dst_i, 2); + const int32x4_t b = vaddw_s16(a, vreinterpret_s16_u16(frame_data)); + const uint16x4_t d = vmin_u16(vqmovun_s32(b), v_max_bitdepth); + vst1_u16(dst + j, d); + j += 4; + } while (j < tx_width); + dst += stride; + } while (++i < tx_height); + } +} + +LIBGAV1_ALWAYS_INLINE void Identity4RowColumnStoreToFrame( + Array2DView frame, const int start_x, const int start_y, + const int tx_width, const int tx_height, + const int32_t* LIBGAV1_RESTRICT source) { + const int stride = frame.columns(); + uint16_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x; + const int32x4_t v_round = vdupq_n_s32((1 + (0)) << 11); + const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1); + + if (tx_width == 4) { + int i = 0; + do { + const int32x4_t v_src = vld1q_s32(&source[i * 4]); + const int32x4_t v_dst_row = + vshrq_n_s32(vmlaq_n_s32(v_round, v_src, kIdentity4Multiplier), 12); + const int32x4_t v_dst_col = + vmlaq_n_s32(v_round, v_dst_row, kIdentity4Multiplier); + const uint16x4_t frame_data = vld1_u16(dst); + const int32x4_t a = vrshrq_n_s32(v_dst_col, 4 + 12); + const int32x4_t b = vaddw_s16(a, vreinterpret_s16_u16(frame_data)); + vst1_u16(dst, vmin_u16(vqmovun_s32(b), v_max_bitdepth)); + dst += stride; + } while (++i < tx_height); + } else { + int i = 0; + do { + const int row = i * tx_width; + int j = 0; + do { + int32x4x2_t v_src, v_src_round, v_dst_row, v_dst_col, a, b; + v_src.val[0] = vld1q_s32(&source[row + j]); + v_src.val[1] = vld1q_s32(&source[row + j + 4]); + v_src_round.val[0] = vshrq_n_s32( + vmlaq_n_s32(v_round, v_src.val[0], kTransformRowMultiplier), 12); + v_src_round.val[1] = vshrq_n_s32( + vmlaq_n_s32(v_round, v_src.val[1], kTransformRowMultiplier), 12); + v_dst_row.val[0] = vqaddq_s32(v_src_round.val[0], v_src_round.val[0]); + v_dst_row.val[1] = vqaddq_s32(v_src_round.val[1], v_src_round.val[1]); + v_dst_col.val[0] = + vmlaq_n_s32(v_round, v_dst_row.val[0], kIdentity4Multiplier); + v_dst_col.val[1] = + vmlaq_n_s32(v_round, v_dst_row.val[1], kIdentity4Multiplier); + uint16x4x2_t frame_data; + frame_data.val[0] = vld1_u16(dst + j); + frame_data.val[1] = vld1_u16(dst + j + 4); + a.val[0] = vrshrq_n_s32(v_dst_col.val[0], 4 + 12); + a.val[1] = vrshrq_n_s32(v_dst_col.val[1], 4 + 12); + b.val[0] = vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0])); + b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1])); + vst1_u16(dst + j, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth)); + vst1_u16(dst + j + 4, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth)); + j += 8; + } while (j < tx_width); + 
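+      // NOTE (illustrative sketch, compiled out): scalar form of one 1-D
+      // identity pass as applied above. Per the AV1 spec the identity
+      // transform scales by sqrt(2), 2, 2*sqrt(2) or 4 for sizes 4/8/16/32;
+      // the irrational factors use Q12 constants (kIdentity4Multiplier and
+      // kIdentity16Multiplier here), the others plain adds and shifts.
+#if 0
+#include <cstdint>
+int32_t IdentityPassQ12(int32_t x, int32_t k /* e.g. 5793 for size 4 */) {
+  return static_cast<int32_t>(
+      (static_cast<int64_t>(x) * k + (int64_t{1} << 11)) >> 12);
+}
+#endif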
+      dst += stride;
+    } while (++i < tx_height);
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8Row32_NEON(void* dest, int32_t step) {
+  auto* const dst = static_cast<int32_t*>(dest);
+
+  // When combining the identity8 multiplier with the row shift, the
+  // calculations for tx_height equal to 32 can be simplified from
+  // (((A * 2) + 2) >> 2) to ((A + 1) >> 1).
+  for (int i = 0; i < 4; ++i) {
+    const int32x4_t v_src_lo = vld1q_s32(&dst[i * step]);
+    const int32x4_t v_src_hi = vld1q_s32(&dst[(i * step) + 4]);
+    const int32x4_t a_lo = vrshrq_n_s32(v_src_lo, 1);
+    const int32x4_t a_hi = vrshrq_n_s32(v_src_hi, 1);
+    vst1q_s32(&dst[i * step], vmovl_s16(vqmovn_s32(a_lo)));
+    vst1q_s32(&dst[(i * step) + 4], vmovl_s16(vqmovn_s32(a_hi)));
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8Row4_NEON(void* dest, int32_t step) {
+  auto* const dst = static_cast<int32_t*>(dest);
+
+  for (int i = 0; i < 4; ++i) {
+    const int32x4_t v_src_lo = vld1q_s32(&dst[i * step]);
+    const int32x4_t v_src_hi = vld1q_s32(&dst[(i * step) + 4]);
+    const int32x4_t v_srcx2_lo = vqaddq_s32(v_src_lo, v_src_lo);
+    const int32x4_t v_srcx2_hi = vqaddq_s32(v_src_hi, v_src_hi);
+    vst1q_s32(&dst[i * step], vmovl_s16(vqmovn_s32(v_srcx2_lo)));
+    vst1q_s32(&dst[(i * step) + 4], vmovl_s16(vqmovn_s32(v_srcx2_hi)));
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height,
+                                           bool should_round, int row_shift) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int32_t*>(dest);
+  const int32x4_t v_src0 = vdupq_n_s32(dst[0]);
+  const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+  const int32x4_t v_src_round =
+      vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12));
+  const int32x4_t v_src = vbslq_s32(v_mask, v_src_round, v_src0);
+  const int32x4_t v_srcx2 = vaddq_s32(v_src, v_src);
+  const int32x4_t dst_0 = vqrshlq_s32(v_srcx2, vdupq_n_s32(-row_shift));
+  vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(dst_0)), 0);
+  return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity16Row_NEON(void* dest, int32_t step,
+                                              int shift) {
+  auto* const dst = static_cast<int32_t*>(dest);
+  const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+  const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
+
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      int32x4x2_t v_src;
+      v_src.val[0] = vld1q_s32(&dst[i * step + j * 8]);
+      v_src.val[1] = vld1q_s32(&dst[i * step + j * 8 + 4]);
+      const int32x4_t v_src_mult_lo =
+          vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier);
+      const int32x4_t v_src_mult_hi =
+          vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier);
+      const int32x4_t shift_lo = vqshlq_s32(v_src_mult_lo, v_shift);
+      const int32x4_t shift_hi = vqshlq_s32(v_src_mult_hi, v_shift);
+      vst1q_s32(&dst[i * step + j * 8], vmovl_s16(vqmovn_s32(shift_lo)));
+      vst1q_s32(&dst[i * step + j * 8 + 4], vmovl_s16(vqmovn_s32(shift_hi)));
+    }
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height,
+                                            bool should_round, int shift) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int32_t*>(dest);
+  const int32x4_t v_src0 = vdupq_n_s32(dst[0]);
+  const uint32x4_t v_mask = vdupq_n_u32(should_round ?
0xffffffff : 0); + const int32x4_t v_src_round = + vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12)); + const int32x4_t v_src = vbslq_s32(v_mask, v_src_round, v_src0); + const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11); + const int32x4_t v_src_mult_lo = + vmlaq_n_s32(v_dual_round, v_src, kIdentity16Multiplier); + const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, vdupq_n_s32(-(12 + shift))); + vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(dst_0)), 0); + return true; +} + +LIBGAV1_ALWAYS_INLINE void Identity32Row16_NEON(void* dest, + const int32_t step) { + auto* const dst = static_cast(dest); + + // When combining the identity32 multiplier with the row shift, the + // calculation for tx_height equal to 16 can be simplified from + // ((A * 4) + 1) >> 1) to (A * 2). + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 32; j += 4) { + const int32x4_t v_src = vld1q_s32(&dst[i * step + j]); + const int32x4_t v_dst_i = vqaddq_s32(v_src, v_src); + vst1q_s32(&dst[i * step + j], v_dst_i); + } + } +} + +LIBGAV1_ALWAYS_INLINE bool Identity32DcOnly(void* dest, + int adjusted_tx_height) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast(dest); + const int32x2_t v_src0 = vdup_n_s32(dst[0]); + const int32x2_t v_src = + vqrdmulh_n_s32(v_src0, kTransformRowMultiplier << (31 - 12)); + // When combining the identity32 multiplier with the row shift, the + // calculation for tx_height equal to 16 can be simplified from + // ((A * 4) + 1) >> 1) to (A * 2). + const int32x2_t v_dst_0 = vqadd_s32(v_src, v_src); + vst1_lane_s32(dst, v_dst_0, 0); + return true; +} + +//------------------------------------------------------------------------------ +// Walsh Hadamard Transform. + +// Process 4 wht4 rows and columns. +LIBGAV1_ALWAYS_INLINE void Wht4_NEON(uint16_t* LIBGAV1_RESTRICT dst, + const int dst_stride, + const void* LIBGAV1_RESTRICT source, + const int adjusted_tx_height) { + const auto* const src = static_cast(source); + int32x4_t s[4]; + + if (adjusted_tx_height == 1) { + // Special case: only src[0] is nonzero. + // src[0] 0 0 0 + // 0 0 0 0 + // 0 0 0 0 + // 0 0 0 0 + // + // After the row and column transforms are applied, we have: + // f h h h + // g i i i + // g i i i + // g i i i + // where f, g, h, i are computed as follows. + int32_t f = (src[0] >> 2) - (src[0] >> 3); + const int32_t g = f >> 1; + f = f - (f >> 1); + const int32_t h = (src[0] >> 3) - (src[0] >> 4); + const int32_t i = (src[0] >> 4); + s[0] = vdupq_n_s32(h); + s[0] = vsetq_lane_s32(f, s[0], 0); + s[1] = vdupq_n_s32(i); + s[1] = vsetq_lane_s32(g, s[1], 0); + s[2] = s[3] = s[1]; + } else { + // Load the 4x4 source in transposed form. + int32x4x4_t columns = vld4q_s32(src); + + // Shift right and permute the columns for the WHT. + s[0] = vshrq_n_s32(columns.val[0], 2); + s[2] = vshrq_n_s32(columns.val[1], 2); + s[3] = vshrq_n_s32(columns.val[2], 2); + s[1] = vshrq_n_s32(columns.val[3], 2); + + // Row transforms. + s[0] = vaddq_s32(s[0], s[2]); + s[3] = vsubq_s32(s[3], s[1]); + int32x4_t e = vhsubq_s32(s[0], s[3]); // e = (s[0] - s[3]) >> 1 + s[1] = vsubq_s32(e, s[1]); + s[2] = vsubq_s32(e, s[2]); + s[0] = vsubq_s32(s[0], s[1]); + s[3] = vaddq_s32(s[3], s[2]); + + int32x4_t x[4]; + Transpose4x4(s, x); + + s[0] = x[0]; + s[2] = x[1]; + s[3] = x[2]; + s[1] = x[3]; + + // Column transforms. 
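+    // NOTE (illustrative sketch, compiled out): scalar form of the 4-point
+    // Walsh-Hadamard lifting steps vectorized above for the rows and repeated
+    // below for the columns.
+#if 0
+#include <cstdint>
+void Wht4Lifting(int32_t s[4]) {
+  s[0] += s[2];
+  s[3] -= s[1];
+  const int32_t e = (s[0] - s[3]) >> 1;
+  s[1] = e - s[1];
+  s[2] = e - s[2];
+  s[0] -= s[1];
+  s[3] += s[2];
+}
+#endif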
+    s[0] = vaddq_s32(s[0], s[2]);
+    s[3] = vsubq_s32(s[3], s[1]);
+    e = vhsubq_s32(s[0], s[3]);  // e = (s[0] - s[3]) >> 1
+    s[1] = vsubq_s32(e, s[1]);
+    s[2] = vsubq_s32(e, s[2]);
+    s[0] = vsubq_s32(s[0], s[1]);
+    s[3] = vaddq_s32(s[3], s[2]);
+  }
+
+  // Store to frame.
+  const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+  for (int row = 0; row < 4; row += 1) {
+    const uint16x4_t frame_data = vld1_u16(dst);
+    const int32x4_t b = vaddw_s16(s[row], vreinterpret_s16_u16(frame_data));
+    vst1_u16(dst, vmin_u16(vqmovun_s32(b), v_max_bitdepth));
+    dst += dst_stride;
+  }
+}
+
+//------------------------------------------------------------------------------
+// row/column transform loops
+
+template <int tx_height>
+LIBGAV1_ALWAYS_INLINE void FlipColumns(int32_t* source, int tx_width) {
+  if (tx_width >= 16) {
+    int i = 0;
+    do {
+      // 00 01 02 03
+      const int32x4_t a = vld1q_s32(&source[i]);
+      const int32x4_t b = vld1q_s32(&source[i + 4]);
+      const int32x4_t c = vld1q_s32(&source[i + 8]);
+      const int32x4_t d = vld1q_s32(&source[i + 12]);
+      // 01 00 03 02
+      const int32x4_t a_rev = vrev64q_s32(a);
+      const int32x4_t b_rev = vrev64q_s32(b);
+      const int32x4_t c_rev = vrev64q_s32(c);
+      const int32x4_t d_rev = vrev64q_s32(d);
+      // 03 02 01 00
+      vst1q_s32(&source[i], vextq_s32(d_rev, d_rev, 2));
+      vst1q_s32(&source[i + 4], vextq_s32(c_rev, c_rev, 2));
+      vst1q_s32(&source[i + 8], vextq_s32(b_rev, b_rev, 2));
+      vst1q_s32(&source[i + 12], vextq_s32(a_rev, a_rev, 2));
+      i += 16;
+    } while (i < tx_width * tx_height);
+  } else if (tx_width == 8) {
+    for (int i = 0; i < 8 * tx_height; i += 8) {
+      // 00 01 02 03
+      const int32x4_t a = vld1q_s32(&source[i]);
+      const int32x4_t b = vld1q_s32(&source[i + 4]);
+      // 01 00 03 02
+      const int32x4_t a_rev = vrev64q_s32(a);
+      const int32x4_t b_rev = vrev64q_s32(b);
+      // 03 02 01 00
+      vst1q_s32(&source[i], vextq_s32(b_rev, b_rev, 2));
+      vst1q_s32(&source[i + 4], vextq_s32(a_rev, a_rev, 2));
+    }
+  } else {
+    // Process two rows per iteration.
+    for (int i = 0; i < 4 * tx_height; i += 8) {
+      // 00 01 02 03
+      const int32x4_t a = vld1q_s32(&source[i]);
+      const int32x4_t b = vld1q_s32(&source[i + 4]);
+      // 01 00 03 02
+      const int32x4_t a_rev = vrev64q_s32(a);
+      const int32x4_t b_rev = vrev64q_s32(b);
+      // 03 02 01 00
+      vst1q_s32(&source[i], vextq_s32(a_rev, a_rev, 2));
+      vst1q_s32(&source[i + 4], vextq_s32(b_rev, b_rev, 2));
+    }
+  }
+}
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void ApplyRounding(int32_t* source, int num_rows) {
+  // Process two rows per iteration.
+  int i = 0;
+  do {
+    const int32x4_t a_lo = vld1q_s32(&source[i]);
+    const int32x4_t a_hi = vld1q_s32(&source[i + 4]);
+    const int32x4_t b_lo =
+        vqrdmulhq_n_s32(a_lo, kTransformRowMultiplier << (31 - 12));
+    const int32x4_t b_hi =
+        vqrdmulhq_n_s32(a_hi, kTransformRowMultiplier << (31 - 12));
+    vst1q_s32(&source[i], b_lo);
+    vst1q_s32(&source[i + 4], b_hi);
+    i += 8;
+  } while (i < tx_width * num_rows);
+}
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void RowShift(int32_t* source, int num_rows,
+                                    int row_shift) {
+  // vqrshlq_s32 will shift right if shift value is negative.
+  row_shift = -row_shift;
+
+  // Process two rows per iteration.
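+  // NOTE (illustrative sketch, compiled out): NEON lacks a variable right
+  // shift, so these loops negate |row_shift| and rely on vqrshlq_s32
+  // shifting right, with rounding, for negative counts. Scalar equivalent
+  // for row_shift > 0:
+#if 0
+#include <cstdint>
+int32_t RoundedRightShift(int32_t x, int row_shift) {
+  return static_cast<int32_t>(
+      (static_cast<int64_t>(x) + (int64_t{1} << (row_shift - 1))) >>
+      row_shift);
+}
+#endif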
+  int i = 0;
+  do {
+    const int32x4_t residual0 = vld1q_s32(&source[i]);
+    const int32x4_t residual1 = vld1q_s32(&source[i + 4]);
+    vst1q_s32(&source[i], vqrshlq_s32(residual0, vdupq_n_s32(row_shift)));
+    vst1q_s32(&source[i + 4], vqrshlq_s32(residual1, vdupq_n_s32(row_shift)));
+    i += 8;
+  } while (i < tx_width * num_rows);
+}
+
+template <int tx_height, bool enable_flip_rows = false>
+LIBGAV1_ALWAYS_INLINE void StoreToFrameWithRound(
+    Array2DView<uint16_t> frame, const int start_x, const int start_y,
+    const int tx_width, const int32_t* LIBGAV1_RESTRICT source,
+    TransformType tx_type) {
+  const bool flip_rows =
+      enable_flip_rows ? kTransformFlipRowsMask.Contains(tx_type) : false;
+  const int stride = frame.columns();
+  uint16_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
+
+  if (tx_width == 4) {
+    for (int i = 0; i < tx_height; ++i) {
+      const int row = flip_rows ? (tx_height - i - 1) * 4 : i * 4;
+      const int32x4_t residual = vld1q_s32(&source[row]);
+      const uint16x4_t frame_data = vld1_u16(dst);
+      const int32x4_t a = vrshrq_n_s32(residual, 4);
+      const uint32x4_t b = vaddw_u16(vreinterpretq_u32_s32(a), frame_data);
+      const uint16x4_t d = vqmovun_s32(vreinterpretq_s32_u32(b));
+      vst1_u16(dst, vmin_u16(d, vdup_n_u16((1 << kBitdepth10) - 1)));
+      dst += stride;
+    }
+  } else {
+    for (int i = 0; i < tx_height; ++i) {
+      const int y = start_y + i;
+      const int row = flip_rows ? (tx_height - i - 1) * tx_width : i * tx_width;
+      int j = 0;
+      do {
+        const int x = start_x + j;
+        const int32x4_t residual = vld1q_s32(&source[row + j]);
+        const int32x4_t residual_hi = vld1q_s32(&source[row + j + 4]);
+        const uint16x8_t frame_data = vld1q_u16(frame[y] + x);
+        const int32x4_t a = vrshrq_n_s32(residual, 4);
+        const int32x4_t a_hi = vrshrq_n_s32(residual_hi, 4);
+        const uint32x4_t b =
+            vaddw_u16(vreinterpretq_u32_s32(a), vget_low_u16(frame_data));
+        const uint32x4_t b_hi =
+            vaddw_u16(vreinterpretq_u32_s32(a_hi), vget_high_u16(frame_data));
+        const uint16x4_t d = vqmovun_s32(vreinterpretq_s32_u32(b));
+        const uint16x4_t d_hi = vqmovun_s32(vreinterpretq_s32_u32(b_hi));
+        vst1q_u16(frame[y] + x,
+                  vminq_u16(vcombine_u16(d, d_hi),
+                            vdupq_n_u16((1 << kBitdepth10) - 1)));
+        j += 8;
+      } while (j < tx_width);
+    }
+  }
+}
+
+void Dct4TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
+                               int adjusted_tx_height, void* src_buffer,
+                               int /*start_x*/, int /*start_y*/,
+                               void* /*dst_frame*/) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = (tx_height == 8);
+  const int row_shift = static_cast<int>(tx_height == 16);
+
+  if (DctDcOnly<4>(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<4>(src, adjusted_tx_height);
+  }
+
+  // Process 4 1d dct4 rows in parallel per iteration.
+  int i = adjusted_tx_height;
+  auto* data = src;
+  do {
+    Dct4_NEON<ButterflyRotation_4>(data, /*step=*/4, /*is_row=*/true,
+                                   row_shift);
+    data += 16;
+    i -= 4;
+  } while (i != 0);
+}
+
+void Dct4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+                                  int adjusted_tx_height,
+                                  void* LIBGAV1_RESTRICT src_buffer,
+                                  int start_x, int start_y,
+                                  void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<4>(src, tx_width);
+  }
+
+  if (!DctDcOnlyColumn<4>(src, adjusted_tx_height, tx_width)) {
+    // Process 4 1d dct4 columns in parallel per iteration.
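+    // NOTE (illustrative sketch, compiled out): the residual buffer is
+    // row-major, tx_width coefficients wide. The row pass above advances 4
+    // rows per call (data += 16 for dct4) and strides within a row with
+    // step == 4; the column pass below advances 4 columns per call
+    // (data += 4) and strides down a column with step == tx_width.
+#if 0
+#include <cstdint>
+int32_t& CoeffAt(int32_t* buf, int tx_width, int row, int col) {
+  return buf[row * tx_width + col];
+}
+#endif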
+ int i = tx_width; + auto* data = src; + do { + Dct4_NEON(data, tx_width, /*transpose=*/false, + /*row_shift=*/0); + data += 4; + i -= 4; + } while (i != 0); + } + + auto& frame = *static_cast*>(dst_frame); + StoreToFrameWithRound<4>(frame, start_x, start_y, tx_width, src, tx_type); +} + +void Dct8TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int /*start_x*/, int /*start_y*/, + void* /*dst_frame*/) { + auto* src = static_cast(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (DctDcOnly<8>(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<8>(src, adjusted_tx_height); + } + + // Process 4 1d dct8 rows in parallel per iteration. + int i = adjusted_tx_height; + auto* data = src; + do { + Dct8_NEON(data, /*step=*/8, /*is_row=*/true, + row_shift); + data += 32; + i -= 4; + } while (i != 0); +} + +void Dct8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, + void* LIBGAV1_RESTRICT src_buffer, + int start_x, int start_y, + void* LIBGAV1_RESTRICT dst_frame) { + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<8>(src, tx_width); + } + + if (!DctDcOnlyColumn<8>(src, adjusted_tx_height, tx_width)) { + // Process 4 1d dct8 columns in parallel per iteration. + int i = tx_width; + auto* data = src; + do { + Dct8_NEON(data, tx_width, /*is_row=*/false, + /*row_shift=*/0); + data += 4; + i -= 4; + } while (i != 0); + } + auto& frame = *static_cast*>(dst_frame); + StoreToFrameWithRound<8>(frame, start_x, start_y, tx_width, src, tx_type); +} + +void Dct16TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, int adjusted_tx_height, + void* src_buffer, int /*start_x*/, + int /*start_y*/, void* /*dst_frame*/) { + auto* src = static_cast(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (DctDcOnly<16>(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<16>(src, adjusted_tx_height); + } + + assert(adjusted_tx_height % 4 == 0); + int i = adjusted_tx_height; + auto* data = src; + do { + // Process 4 1d dct16 rows in parallel per iteration. + Dct16_NEON(data, 16, /*is_row=*/true, row_shift); + data += 64; + i -= 4; + } while (i != 0); +} + +void Dct16TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, + void* LIBGAV1_RESTRICT src_buffer, + int start_x, int start_y, + void* LIBGAV1_RESTRICT dst_frame) { + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<16>(src, tx_width); + } + + if (!DctDcOnlyColumn<16>(src, adjusted_tx_height, tx_width)) { + // Process 4 1d dct16 columns in parallel per iteration. 
+ int i = tx_width; + auto* data = src; + do { + Dct16_NEON(data, tx_width, /*is_row=*/false, + /*row_shift=*/0); + data += 4; + i -= 4; + } while (i != 0); + } + auto& frame = *static_cast*>(dst_frame); + StoreToFrameWithRound<16>(frame, start_x, start_y, tx_width, src, tx_type); +} + +void Dct32TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, int adjusted_tx_height, + void* src_buffer, int /*start_x*/, + int /*start_y*/, void* /*dst_frame*/) { + auto* src = static_cast(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (DctDcOnly<32>(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<32>(src, adjusted_tx_height); + } + + assert(adjusted_tx_height % 4 == 0); + int i = adjusted_tx_height; + auto* data = src; + do { + // Process 4 1d dct32 rows in parallel per iteration. + Dct32_NEON(data, 32, /*is_row=*/true, row_shift); + data += 128; + i -= 4; + } while (i != 0); +} + +void Dct32TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, + void* LIBGAV1_RESTRICT src_buffer, + int start_x, int start_y, + void* LIBGAV1_RESTRICT dst_frame) { + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<32>(src, tx_width); + } + + if (!DctDcOnlyColumn<32>(src, adjusted_tx_height, tx_width)) { + // Process 4 1d dct32 columns in parallel per iteration. + int i = tx_width; + auto* data = src; + do { + Dct32_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0); + data += 4; + i -= 4; + } while (i != 0); + } + auto& frame = *static_cast*>(dst_frame); + StoreToFrameWithRound<32>(frame, start_x, start_y, tx_width, src, tx_type); +} + +void Dct64TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, int adjusted_tx_height, + void* src_buffer, int /*start_x*/, + int /*start_y*/, void* /*dst_frame*/) { + auto* src = static_cast(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (DctDcOnly<64>(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<64>(src, adjusted_tx_height); + } + + assert(adjusted_tx_height % 4 == 0); + int i = adjusted_tx_height; + auto* data = src; + do { + // Process 4 1d dct64 rows in parallel per iteration. + Dct64_NEON(data, 64, /*is_row=*/true, row_shift); + data += 128 * 2; + i -= 4; + } while (i != 0); +} + +void Dct64TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, + void* LIBGAV1_RESTRICT src_buffer, + int start_x, int start_y, + void* LIBGAV1_RESTRICT dst_frame) { + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<64>(src, tx_width); + } + + if (!DctDcOnlyColumn<64>(src, adjusted_tx_height, tx_width)) { + // Process 4 1d dct64 columns in parallel per iteration. 
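+    // NOTE (illustrative sketch, compiled out): why DctDcOnlyColumn() can
+    // skip the full transform: with only coefficient 0 nonzero, stage 1
+    // reduces to the angle-32 butterfly with a zero second input and the
+    // later Hadamard stages only replicate the result, so every output of
+    // the 1-D DCT shares one value. In Q12, cos(32 * pi / 128) * 2^12 = 2896:
+#if 0
+#include <cstdint>
+int32_t DctDcOnlyValue(int32_t dc) {
+  return static_cast<int32_t>(
+      (static_cast<int64_t>(dc) * 2896 + (int64_t{1} << 11)) >> 12);
+}
+#endif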
+    int i = tx_width;
+    auto* data = src;
+    do {
+      Dct64_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+      data += 4;
+      i -= 4;
+    } while (i != 0);
+  }
+  auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+  StoreToFrameWithRound<64>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Adst4TransformLoopRow_NEON(TransformType /*tx_type*/,
+                                TransformSize tx_size, int adjusted_tx_height,
+                                void* src_buffer, int /*start_x*/,
+                                int /*start_y*/, void* /*dst_frame*/) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_height = kTransformHeight[tx_size];
+  const int row_shift = static_cast<int>(tx_height == 16);
+  const bool should_round = (tx_height == 8);
+
+  if (Adst4DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<4>(src, adjusted_tx_height);
+  }
+
+  // Process 4 1d adst4 rows in parallel per iteration.
+  int i = adjusted_tx_height;
+  auto* data = src;
+  do {
+    Adst4_NEON(data, /*step=*/4, /*is_row=*/true, row_shift);
+    data += 16;
+    i -= 4;
+  } while (i != 0);
+}
+
+void Adst4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+                                   int adjusted_tx_height,
+                                   void* LIBGAV1_RESTRICT src_buffer,
+                                   int start_x, int start_y,
+                                   void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<4>(src, tx_width);
+  }
+
+  if (!Adst4DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+    // Process 4 1d adst4 columns in parallel per iteration.
+    int i = tx_width;
+    auto* data = src;
+    do {
+      Adst4_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+      data += 4;
+      i -= 4;
+    } while (i != 0);
+  }
+
+  auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+  StoreToFrameWithRound<4, /*enable_flip_rows=*/true>(frame, start_x, start_y,
+                                                      tx_width, src, tx_type);
+}
+
+void Adst8TransformLoopRow_NEON(TransformType /*tx_type*/,
+                                TransformSize tx_size, int adjusted_tx_height,
+                                void* src_buffer, int /*start_x*/,
+                                int /*start_y*/, void* /*dst_frame*/) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (Adst8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<8>(src, adjusted_tx_height);
+  }
+
+  // Process 4 1d adst8 rows in parallel per iteration.
+  assert(adjusted_tx_height % 4 == 0);
+  int i = adjusted_tx_height;
+  auto* data = src;
+  do {
+    Adst8_NEON<ButterflyRotation_4>(data, /*step=*/8,
+                                    /*transpose=*/true, row_shift);
+    data += 32;
+    i -= 4;
+  } while (i != 0);
+}
+
+void Adst8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+                                   int adjusted_tx_height,
+                                   void* LIBGAV1_RESTRICT src_buffer,
+                                   int start_x, int start_y,
+                                   void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<8>(src, tx_width);
+  }
+
+  if (!Adst8DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+    // Process 4 1d adst8 columns in parallel per iteration.
+    int i = tx_width;
+    auto* data = src;
+    do {
+      Adst8_NEON<ButterflyRotation_4>(data, tx_width, /*transpose=*/false,
+                                      /*row_shift=*/0);
+      data += 4;
+      i -= 4;
+    } while (i != 0);
+  }
+  auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+  StoreToFrameWithRound<8, /*enable_flip_rows=*/true>(frame, start_x, start_y,
+                                                      tx_width, src, tx_type);
+}
+
+void Adst16TransformLoopRow_NEON(TransformType /*tx_type*/,
+                                 TransformSize tx_size, int adjusted_tx_height,
+                                 void* src_buffer, int /*start_x*/,
+                                 int /*start_y*/, void* /*dst_frame*/) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (Adst16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<16>(src, adjusted_tx_height);
+  }
+
+  assert(adjusted_tx_height % 4 == 0);
+  int i = adjusted_tx_height;
+  do {
+    // Process 4 1d adst16 rows in parallel per iteration.
+    Adst16_NEON<ButterflyRotation_4>(src, 16, /*is_row=*/true, row_shift);
+    src += 64;
+    i -= 4;
+  } while (i != 0);
+}
+
+void Adst16TransformLoopColumn_NEON(TransformType tx_type,
+                                    TransformSize tx_size,
+                                    int adjusted_tx_height,
+                                    void* LIBGAV1_RESTRICT src_buffer,
+                                    int start_x, int start_y,
+                                    void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<16>(src, tx_width);
+  }
+
+  if (!Adst16DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+    int i = tx_width;
+    auto* data = src;
+    do {
+      // Process 4 1d adst16 columns in parallel per iteration.
+      Adst16_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false,
+                                       /*row_shift=*/0);
+      data += 4;
+      i -= 4;
+    } while (i != 0);
+  }
+  auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+  StoreToFrameWithRound<16, /*enable_flip_rows=*/true>(frame, start_x, start_y,
+                                                       tx_width, src, tx_type);
+}
+
+void Identity4TransformLoopRow_NEON(TransformType tx_type,
+                                    TransformSize tx_size,
+                                    int adjusted_tx_height, void* src_buffer,
+                                    int /*start_x*/, int /*start_y*/,
+                                    void* /*dst_frame*/) {
+  // Special case: Process row calculations during column transform call.
+  // Improves performance.
+  if (tx_type == kTransformTypeIdentityIdentity &&
+      tx_size == kTransformSize4x4) {
+    return;
+  }
+
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = (tx_height == 8);
+
+  if (Identity4DcOnly(src, adjusted_tx_height, should_round, tx_height)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<4>(src, adjusted_tx_height);
+  }
+
+  const int shift = tx_height > 8 ? 1 : 0;
+  int i = adjusted_tx_height;
+  do {
+    Identity4_NEON(src, /*step=*/4, shift);
+    src += 16;
+    i -= 4;
+  } while (i != 0);
+}
+
+void Identity4TransformLoopColumn_NEON(TransformType tx_type,
+                                       TransformSize tx_size,
+                                       int adjusted_tx_height,
+                                       void* LIBGAV1_RESTRICT src_buffer,
+                                       int start_x, int start_y,
+                                       void* LIBGAV1_RESTRICT dst_frame) {
+  auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  // Special case: Process row calculations during column transform call.
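+  // (Both 1-D passes of these small identity transforms reduce to a
+  // multiply/shift, so evaluating the row pass here lets the result be
+  // written to the frame in a single traversal of |src_buffer|.)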
+  if (tx_type == kTransformTypeIdentityIdentity &&
+      (tx_size == kTransformSize4x4 || tx_size == kTransformSize8x4)) {
+    Identity4RowColumnStoreToFrame(frame, start_x, start_y, tx_width,
+                                   adjusted_tx_height, src);
+    return;
+  }
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<4>(src, tx_width);
+  }
+
+  IdentityColumnStoreToFrame<4>(frame, start_x, start_y, tx_width,
+                                adjusted_tx_height, src);
+}
+
+void Identity8TransformLoopRow_NEON(TransformType tx_type,
+                                    TransformSize tx_size,
+                                    int adjusted_tx_height, void* src_buffer,
+                                    int /*start_x*/, int /*start_y*/,
+                                    void* /*dst_frame*/) {
+  // Special case: Process row calculations during column transform call.
+  // Improves performance.
+  if (tx_type == kTransformTypeIdentityIdentity &&
+      tx_size == kTransformSize8x4) {
+    return;
+  }
+
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (Identity8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+  if (should_round) {
+    ApplyRounding<8>(src, adjusted_tx_height);
+  }
+
+  // When combining the identity8 multiplier with the row shift, the
+  // calculations for tx_height == 8 and tx_height == 16 can be simplified
+  // from (((A * 2) + 1) >> 1) to A. For 10bpp, A must be clamped to a signed
+  // 16 bit value.
+  if ((tx_height & 0x18) != 0) {
+    for (int i = 0; i < tx_height; ++i) {
+      const int32x4_t v_src_lo = vld1q_s32(&src[i * 8]);
+      const int32x4_t v_src_hi = vld1q_s32(&src[(i * 8) + 4]);
+      vst1q_s32(&src[i * 8], vmovl_s16(vqmovn_s32(v_src_lo)));
+      vst1q_s32(&src[(i * 8) + 4], vmovl_s16(vqmovn_s32(v_src_hi)));
+    }
+    return;
+  }
+  if (tx_height == 32) {
+    int i = adjusted_tx_height;
+    do {
+      Identity8Row32_NEON(src, /*step=*/8);
+      src += 32;
+      i -= 4;
+    } while (i != 0);
+    return;
+  }
+
+  assert(tx_size == kTransformSize8x4);
+  int i = adjusted_tx_height;
+  do {
+    Identity8Row4_NEON(src, /*step=*/8);
+    src += 32;
+    i -= 4;
+  } while (i != 0);
+}
+
+void Identity8TransformLoopColumn_NEON(TransformType tx_type,
+                                       TransformSize tx_size,
+                                       int adjusted_tx_height,
+                                       void* LIBGAV1_RESTRICT src_buffer,
+                                       int start_x, int start_y,
+                                       void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<8>(src, tx_width);
+  }
+  auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+  IdentityColumnStoreToFrame<8>(frame, start_x, start_y, tx_width,
+                                adjusted_tx_height, src);
+}
+
+void Identity16TransformLoopRow_NEON(TransformType /*tx_type*/,
+                                     TransformSize tx_size,
+                                     int adjusted_tx_height, void* src_buffer,
+                                     int /*start_x*/, int /*start_y*/,
+                                     void* /*dst_frame*/) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (Identity16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<16>(src, adjusted_tx_height);
+  }
+  int i = adjusted_tx_height;
+  do {
+    Identity16Row_NEON(src, /*step=*/16, row_shift);
+    src += 64;
+    i -= 4;
+  } while (i != 0);
+}
+
+void Identity16TransformLoopColumn_NEON(TransformType tx_type,
+                                        TransformSize tx_size,
+                                        int adjusted_tx_height,
+                                        void* LIBGAV1_RESTRICT src_buffer,
+                                        int start_x, int start_y,
+                                        void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<16>(src, tx_width);
+  }
+  auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+  IdentityColumnStoreToFrame<16>(frame, start_x, start_y, tx_width,
+                                 adjusted_tx_height, src);
+}
+
+void Identity32TransformLoopRow_NEON(TransformType /*tx_type*/,
+                                     TransformSize tx_size,
+                                     int adjusted_tx_height, void* src_buffer,
+                                     int /*start_x*/, int /*start_y*/,
+                                     void* /*dst_frame*/) {
+  const int tx_height = kTransformHeight[tx_size];
+
+  // When combining the identity32 multiplier with the row shift, the
+  // calculations for tx_height == 8 and tx_height == 32 can be simplified
+  // from (((A * 4) + 2) >> 2) to A.
+  if ((tx_height & 0x28) != 0) {
+    return;
+  }
+
+  // Process kTransformSize32x16. The src is always rounded before the identity
+  // transform and shifted by 1 afterwards.
+  auto* src = static_cast<int32_t*>(src_buffer);
+  if (Identity32DcOnly(src, adjusted_tx_height)) {
+    return;
+  }
+
+  assert(tx_size == kTransformSize32x16);
+  ApplyRounding<32>(src, adjusted_tx_height);
+  int i = adjusted_tx_height;
+  do {
+    Identity32Row16_NEON(src, /*step=*/32);
+    src += 128;
+    i -= 4;
+  } while (i != 0);
+}
+
+void Identity32TransformLoopColumn_NEON(TransformType /*tx_type*/,
+                                        TransformSize tx_size,
+                                        int adjusted_tx_height,
+                                        void* LIBGAV1_RESTRICT src_buffer,
+                                        int start_x, int start_y,
+                                        void* LIBGAV1_RESTRICT dst_frame) {
+  auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  IdentityColumnStoreToFrame<32>(frame, start_x, start_y, tx_width,
+                                 adjusted_tx_height, src);
+}
+
+void Wht4TransformLoopRow_NEON(TransformType tx_type, TransformSize tx_size,
+                               int /*adjusted_tx_height*/, void* /*src_buffer*/,
+                               int /*start_x*/, int /*start_y*/,
+                               void* /*dst_frame*/) {
+  assert(tx_type == kTransformTypeDctDct);
+  assert(tx_size == kTransformSize4x4);
+  static_cast<void>(tx_type);
+  static_cast<void>(tx_size);
+  // Do both row and column transforms in the column-transform pass.
+}
+
+void Wht4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+                                  int adjusted_tx_height,
+                                  void* LIBGAV1_RESTRICT src_buffer,
+                                  int start_x, int start_y,
+                                  void* LIBGAV1_RESTRICT dst_frame) {
+  assert(tx_type == kTransformTypeDctDct);
+  assert(tx_size == kTransformSize4x4);
+  static_cast<void>(tx_type);
+  static_cast<void>(tx_size);
+
+  // Process 4 1d wht4 rows and columns in parallel.
+  const auto* src = static_cast<const int32_t*>(src_buffer);
+  auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+  uint16_t* dst = frame[start_y] + start_x;
+  const int dst_stride = frame.columns();
+  Wht4_NEON(dst, dst_stride, src, adjusted_tx_height);
+}
+
+//------------------------------------------------------------------------------
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  // Maximum transform size for Dct is 64.
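+  // The dispatch table below is indexed as
+  // inverse_transforms[1-D transform][1-D transform size][row/column pass],
+  // so each transform registers a separate row-pass and column-pass loop.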
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kRow] = + Dct4TransformLoopRow_NEON; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kColumn] = + Dct4TransformLoopColumn_NEON; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kRow] = + Dct8TransformLoopRow_NEON; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kColumn] = + Dct8TransformLoopColumn_NEON; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kRow] = + Dct16TransformLoopRow_NEON; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kColumn] = + Dct16TransformLoopColumn_NEON; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kRow] = + Dct32TransformLoopRow_NEON; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kColumn] = + Dct32TransformLoopColumn_NEON; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kRow] = + Dct64TransformLoopRow_NEON; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kColumn] = + Dct64TransformLoopColumn_NEON; + + // Maximum transform size for Adst is 16. + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kRow] = + Adst4TransformLoopRow_NEON; + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kColumn] = + Adst4TransformLoopColumn_NEON; + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kRow] = + Adst8TransformLoopRow_NEON; + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kColumn] = + Adst8TransformLoopColumn_NEON; + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kRow] = + Adst16TransformLoopRow_NEON; + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kColumn] = + Adst16TransformLoopColumn_NEON; + + // Maximum transform size for Identity transform is 32. + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kRow] = + Identity4TransformLoopRow_NEON; + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kColumn] = + Identity4TransformLoopColumn_NEON; + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kRow] = + Identity8TransformLoopRow_NEON; + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kColumn] = + Identity8TransformLoopColumn_NEON; + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kRow] = + Identity16TransformLoopRow_NEON; + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kColumn] = + Identity16TransformLoopColumn_NEON; + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kRow] = + Identity32TransformLoopRow_NEON; + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kColumn] = + Identity32TransformLoopColumn_NEON; + + // Maximum transform size for Wht is 4. 
+  dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kRow] =
+      Wht4TransformLoopRow_NEON;
+  dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kColumn] =
+      Wht4TransformLoopColumn_NEON;
+}
+
+}  // namespace
+
+void InverseTransformInit10bpp_NEON() { Init10bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+#else   // !LIBGAV1_ENABLE_NEON || LIBGAV1_MAX_BITDEPTH < 10
+namespace libgav1 {
+namespace dsp {
+
+void InverseTransformInit10bpp_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
diff --git a/src/dsp/arm/inverse_transform_neon.cc b/src/dsp/arm/inverse_transform_neon.cc
new file mode 100644
index 0000000..1c2e111
--- /dev/null
+++ b/src/dsp/arm/inverse_transform_neon.cc
@@ -0,0 +1,3211 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/inverse_transform.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Include the constants and utility functions inside the anonymous namespace.
+#include "src/dsp/inverse_transform.inc"
+
+//------------------------------------------------------------------------------
+
+// TODO(slavarnway): Move transpose functions to transpose_neon.h or
+// common_neon.h.
+
+LIBGAV1_ALWAYS_INLINE void Transpose4x4(const int16x8_t in[4],
+                                        int16x8_t out[4]) {
+  // Swap 16 bit elements.
Goes from: + // a0: 00 01 02 03 + // a1: 10 11 12 13 + // a2: 20 21 22 23 + // a3: 30 31 32 33 + // to: + // b0.val[0]: 00 10 02 12 + // b0.val[1]: 01 11 03 13 + // b1.val[0]: 20 30 22 32 + // b1.val[1]: 21 31 23 33 + const int16x4_t a0 = vget_low_s16(in[0]); + const int16x4_t a1 = vget_low_s16(in[1]); + const int16x4_t a2 = vget_low_s16(in[2]); + const int16x4_t a3 = vget_low_s16(in[3]); + + const int16x4x2_t b0 = vtrn_s16(a0, a1); + const int16x4x2_t b1 = vtrn_s16(a2, a3); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]), + vreinterpret_s32_s16(b1.val[0])); + const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]), + vreinterpret_s32_s16(b1.val[1])); + + const int16x4_t d0 = vreinterpret_s16_s32(c0.val[0]); + const int16x4_t d1 = vreinterpret_s16_s32(c1.val[0]); + const int16x4_t d2 = vreinterpret_s16_s32(c0.val[1]); + const int16x4_t d3 = vreinterpret_s16_s32(c1.val[1]); + + out[0] = vcombine_s16(d0, d0); + out[1] = vcombine_s16(d1, d1); + out[2] = vcombine_s16(d2, d2); + out[3] = vcombine_s16(d3, d3); +} + +// Note this is only used in the final stage of Dct32/64 and Adst16 as the in +// place version causes additional stack usage with clang. +LIBGAV1_ALWAYS_INLINE void Transpose8x8(const int16x8_t in[8], + int16x8_t out[8]) { + // Swap 16 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + // b2.val[0]: 40 50 42 52 44 54 46 56 + // b2.val[1]: 41 51 43 53 45 55 47 57 + // b3.val[0]: 60 70 62 72 64 74 66 76 + // b3.val[1]: 61 71 63 73 65 75 67 77 + + const int16x8x2_t b0 = vtrnq_s16(in[0], in[1]); + const int16x8x2_t b1 = vtrnq_s16(in[2], in[3]); + const int16x8x2_t b2 = vtrnq_s16(in[4], in[5]); + const int16x8x2_t b3 = vtrnq_s16(in[6], in[7]); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + // c2.val[0]: 40 50 60 70 44 54 64 74 + // c2.val[1]: 42 52 62 72 46 56 66 76 + // c3.val[0]: 41 51 61 71 45 55 65 75 + // c3.val[1]: 43 53 63 73 47 57 67 77 + + const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]), + vreinterpretq_s32_s16(b1.val[0])); + const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]), + vreinterpretq_s32_s16(b1.val[1])); + const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]), + vreinterpretq_s32_s16(b3.val[0])); + const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]), + vreinterpretq_s32_s16(b3.val[1])); + + // Swap 64 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 04 14 24 34 44 54 64 74 + // d1.val[0]: 01 11 21 31 41 51 61 71 + // d1.val[1]: 05 15 25 35 45 55 65 75 + // d2.val[0]: 02 12 22 32 42 52 62 72 + // d2.val[1]: 06 16 26 36 46 56 66 76 + // d3.val[0]: 03 13 23 33 43 53 63 73 + // d3.val[1]: 07 17 27 37 47 57 67 77 + const int16x8x2_t d0 = VtrnqS64(c0.val[0], c2.val[0]); + const int16x8x2_t d1 = 
VtrnqS64(c1.val[0], c3.val[0]); + const int16x8x2_t d2 = VtrnqS64(c0.val[1], c2.val[1]); + const int16x8x2_t d3 = VtrnqS64(c1.val[1], c3.val[1]); + + out[0] = d0.val[0]; + out[1] = d1.val[0]; + out[2] = d2.val[0]; + out[3] = d3.val[0]; + out[4] = d0.val[1]; + out[5] = d1.val[1]; + out[6] = d2.val[1]; + out[7] = d3.val[1]; +} + +LIBGAV1_ALWAYS_INLINE void Transpose4x8To8x4(const uint16x8_t in[8], + uint16x8_t out[4]) { + // Swap 16 bit elements. Goes from: + // a0: 00 01 02 03 + // a1: 10 11 12 13 + // a2: 20 21 22 23 + // a3: 30 31 32 33 + // a4: 40 41 42 43 + // a5: 50 51 52 53 + // a6: 60 61 62 63 + // a7: 70 71 72 73 + // to: + // b0.val[0]: 00 10 02 12 + // b0.val[1]: 01 11 03 13 + // b1.val[0]: 20 30 22 32 + // b1.val[1]: 21 31 23 33 + // b2.val[0]: 40 50 42 52 + // b2.val[1]: 41 51 43 53 + // b3.val[0]: 60 70 62 72 + // b3.val[1]: 61 71 63 73 + + uint16x4x2_t b0 = vtrn_u16(vget_low_u16(in[0]), vget_low_u16(in[1])); + uint16x4x2_t b1 = vtrn_u16(vget_low_u16(in[2]), vget_low_u16(in[3])); + uint16x4x2_t b2 = vtrn_u16(vget_low_u16(in[4]), vget_low_u16(in[5])); + uint16x4x2_t b3 = vtrn_u16(vget_low_u16(in[6]), vget_low_u16(in[7])); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 02 12 22 32 + // c1.val[0]: 01 11 21 31 + // c1.val[1]: 03 13 23 33 + // c2.val[0]: 40 50 60 70 + // c2.val[1]: 42 52 62 72 + // c3.val[0]: 41 51 61 71 + // c3.val[1]: 43 53 63 73 + + uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]), + vreinterpret_u32_u16(b1.val[0])); + uint32x2x2_t c1 = vtrn_u32(vreinterpret_u32_u16(b0.val[1]), + vreinterpret_u32_u16(b1.val[1])); + uint32x2x2_t c2 = vtrn_u32(vreinterpret_u32_u16(b2.val[0]), + vreinterpret_u32_u16(b3.val[0])); + uint32x2x2_t c3 = vtrn_u32(vreinterpret_u32_u16(b2.val[1]), + vreinterpret_u32_u16(b3.val[1])); + + // Swap 64 bit elements resulting in: + // o0: 00 10 20 30 40 50 60 70 + // o1: 01 11 21 31 41 51 61 71 + // o2: 02 12 22 32 42 52 62 72 + // o3: 03 13 23 33 43 53 63 73 + + out[0] = vcombine_u16(vreinterpret_u16_u32(c0.val[0]), + vreinterpret_u16_u32(c2.val[0])); + out[1] = vcombine_u16(vreinterpret_u16_u32(c1.val[0]), + vreinterpret_u16_u32(c3.val[0])); + out[2] = vcombine_u16(vreinterpret_u16_u32(c0.val[1]), + vreinterpret_u16_u32(c2.val[1])); + out[3] = vcombine_u16(vreinterpret_u16_u32(c1.val[1]), + vreinterpret_u16_u32(c3.val[1])); +} + +LIBGAV1_ALWAYS_INLINE void Transpose4x8To8x4(const int16x8_t in[8], + int16x8_t out[4]) { + Transpose4x8To8x4(reinterpret_cast(in), + reinterpret_cast(out)); +} + +LIBGAV1_ALWAYS_INLINE void Transpose8x4To4x8(const int16x8_t in[4], + int16x8_t out[8]) { + // Swap 16 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + const int16x8x2_t b0 = vtrnq_s16(in[0], in[1]); + const int16x8x2_t b1 = vtrnq_s16(in[2], in[3]); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]), + vreinterpretq_s32_s16(b1.val[0])); + const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]), + vreinterpretq_s32_s16(b1.val[1])); + + // The upper 8 bytes are don't cares. 
+  // out[0]: 00 10 20 30 04 14 24 34
+  // out[1]: 01 11 21 31 05 15 25 35
+  // out[2]: 02 12 22 32 06 16 26 36
+  // out[3]: 03 13 23 33 07 17 27 37
+  // out[4]: 04 14 24 34 04 14 24 34
+  // out[5]: 05 15 25 35 05 15 25 35
+  // out[6]: 06 16 26 36 06 16 26 36
+  // out[7]: 07 17 27 37 07 17 27 37
+  out[0] = vreinterpretq_s16_s32(c0.val[0]);
+  out[1] = vreinterpretq_s16_s32(c1.val[0]);
+  out[2] = vreinterpretq_s16_s32(c0.val[1]);
+  out[3] = vreinterpretq_s16_s32(c1.val[1]);
+  out[4] = vreinterpretq_s16_s32(
+      vcombine_s32(vget_high_s32(c0.val[0]), vget_high_s32(c0.val[0])));
+  out[5] = vreinterpretq_s16_s32(
+      vcombine_s32(vget_high_s32(c1.val[0]), vget_high_s32(c1.val[0])));
+  out[6] = vreinterpretq_s16_s32(
+      vcombine_s32(vget_high_s32(c0.val[1]), vget_high_s32(c0.val[1])));
+  out[7] = vreinterpretq_s16_s32(
+      vcombine_s32(vget_high_s32(c1.val[1]), vget_high_s32(c1.val[1])));
+}
+
+//------------------------------------------------------------------------------
+template <int store_width, int store_count>
+LIBGAV1_ALWAYS_INLINE void StoreDst(int16_t* LIBGAV1_RESTRICT dst,
+                                    int32_t stride, int32_t idx,
+                                    const int16x8_t* const s) {
+  assert(store_count % 4 == 0);
+  assert(store_width == 8 || store_width == 16);
+  // NOTE: It is expected that the compiler will unroll these loops.
+  if (store_width == 16) {
+    for (int i = 0; i < store_count; i += 4) {
+      vst1q_s16(&dst[i * stride + idx], (s[i]));
+      vst1q_s16(&dst[(i + 1) * stride + idx], (s[i + 1]));
+      vst1q_s16(&dst[(i + 2) * stride + idx], (s[i + 2]));
+      vst1q_s16(&dst[(i + 3) * stride + idx], (s[i + 3]));
+    }
+  } else {
+    // store_width == 8
+    for (int i = 0; i < store_count; i += 4) {
+      vst1_s16(&dst[i * stride + idx], vget_low_s16(s[i]));
+      vst1_s16(&dst[(i + 1) * stride + idx], vget_low_s16(s[i + 1]));
+      vst1_s16(&dst[(i + 2) * stride + idx], vget_low_s16(s[i + 2]));
+      vst1_s16(&dst[(i + 3) * stride + idx], vget_low_s16(s[i + 3]));
+    }
+  }
+}
+
+template <int load_width, int load_count>
+LIBGAV1_ALWAYS_INLINE void LoadSrc(const int16_t* LIBGAV1_RESTRICT src,
+                                   int32_t stride, int32_t idx, int16x8_t* x) {
+  assert(load_count % 4 == 0);
+  assert(load_width == 8 || load_width == 16);
+  // NOTE: It is expected that the compiler will unroll these loops.
+  if (load_width == 16) {
+    for (int i = 0; i < load_count; i += 4) {
+      x[i] = vld1q_s16(&src[i * stride + idx]);
+      x[i + 1] = vld1q_s16(&src[(i + 1) * stride + idx]);
+      x[i + 2] = vld1q_s16(&src[(i + 2) * stride + idx]);
+      x[i + 3] = vld1q_s16(&src[(i + 3) * stride + idx]);
+    }
+  } else {
+    // load_width == 8
+    const int64x2_t zero = vdupq_n_s64(0);
+    for (int i = 0; i < load_count; i += 4) {
+      // The src buffer is aligned to 32 bytes. Each load will always be 8
+      // byte aligned.
+      x[i] = vreinterpretq_s16_s64(vld1q_lane_s64(
+          reinterpret_cast<const int64_t*>(&src[i * stride + idx]), zero, 0));
+      x[i + 1] = vreinterpretq_s16_s64(vld1q_lane_s64(
+          reinterpret_cast<const int64_t*>(&src[(i + 1) * stride + idx]), zero,
+          0));
+      x[i + 2] = vreinterpretq_s16_s64(vld1q_lane_s64(
+          reinterpret_cast<const int64_t*>(&src[(i + 2) * stride + idx]), zero,
+          0));
+      x[i + 3] = vreinterpretq_s16_s64(vld1q_lane_s64(
+          reinterpret_cast<const int64_t*>(&src[(i + 3) * stride + idx]), zero,
+          0));
+    }
+  }
+}
+
+// Butterfly rotate 4 values.
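+// In Q12 fixed point, a butterfly rotation computes (with rounding)
+//   x = a * Cos128(angle) - b * Sin128(angle)
+//   y = a * Sin128(angle) + b * Cos128(angle)
+// writing x over |a| and y over |b|, or swapped when |flip| is true.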
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_4(int16x8_t* a, int16x8_t* b, + const int angle, + const bool flip) { + const int16_t cos128 = Cos128(angle); + const int16_t sin128 = Sin128(angle); + const int32x4_t acc_x = vmull_n_s16(vget_low_s16(*a), cos128); + const int32x4_t acc_y = vmull_n_s16(vget_low_s16(*a), sin128); + const int32x4_t x0 = vmlsl_n_s16(acc_x, vget_low_s16(*b), sin128); + const int32x4_t y0 = vmlal_n_s16(acc_y, vget_low_s16(*b), cos128); + const int16x4_t x1 = vqrshrn_n_s32(x0, 12); + const int16x4_t y1 = vqrshrn_n_s32(y0, 12); + const int16x8_t x = vcombine_s16(x1, x1); + const int16x8_t y = vcombine_s16(y1, y1); + if (flip) { + *a = y; + *b = x; + } else { + *a = x; + *b = y; + } +} + +// Butterfly rotate 8 values. +LIBGAV1_ALWAYS_INLINE void ButterflyRotation_8(int16x8_t* a, int16x8_t* b, + const int angle, + const bool flip) { + const int16_t cos128 = Cos128(angle); + const int16_t sin128 = Sin128(angle); + const int32x4_t acc_x = vmull_n_s16(vget_low_s16(*a), cos128); + const int32x4_t acc_y = vmull_n_s16(vget_low_s16(*a), sin128); + const int32x4_t x0 = vmlsl_n_s16(acc_x, vget_low_s16(*b), sin128); + const int32x4_t y0 = vmlal_n_s16(acc_y, vget_low_s16(*b), cos128); + const int16x4_t x1 = vqrshrn_n_s32(x0, 12); + const int16x4_t y1 = vqrshrn_n_s32(y0, 12); + + const int32x4_t acc_x_hi = vmull_n_s16(vget_high_s16(*a), cos128); + const int32x4_t acc_y_hi = vmull_n_s16(vget_high_s16(*a), sin128); + const int32x4_t x0_hi = vmlsl_n_s16(acc_x_hi, vget_high_s16(*b), sin128); + const int32x4_t y0_hi = vmlal_n_s16(acc_y_hi, vget_high_s16(*b), cos128); + const int16x4_t x1_hi = vqrshrn_n_s32(x0_hi, 12); + const int16x4_t y1_hi = vqrshrn_n_s32(y0_hi, 12); + + const int16x8_t x = vcombine_s16(x1, x1_hi); + const int16x8_t y = vcombine_s16(y1, y1_hi); + if (flip) { + *a = y; + *b = x; + } else { + *a = x; + *b = y; + } +} + +LIBGAV1_ALWAYS_INLINE void ButterflyRotation_FirstIsZero(int16x8_t* a, + int16x8_t* b, + const int angle, + const bool flip) { +#if defined(__ARM_FEATURE_QRDMX) && defined(__aarch64__) && \ + defined(__clang__) // ARM v8.1-A + // Clang optimizes vqrdmulhq_n_s16 and vqsubq_s16 (in HadamardRotation) into + // vqrdmlshq_s16 resulting in an "off by one" error. For now, do not use + // vqrdmulhq_n_s16(). + const int16_t cos128 = Cos128(angle); + const int16_t sin128 = Sin128(angle); + const int32x4_t x0 = vmull_n_s16(vget_low_s16(*b), -sin128); + const int32x4_t y0 = vmull_n_s16(vget_low_s16(*b), cos128); + const int16x4_t x1 = vqrshrn_n_s32(x0, 12); + const int16x4_t y1 = vqrshrn_n_s32(y0, 12); + + const int32x4_t x0_hi = vmull_n_s16(vget_high_s16(*b), -sin128); + const int32x4_t y0_hi = vmull_n_s16(vget_high_s16(*b), cos128); + const int16x4_t x1_hi = vqrshrn_n_s32(x0_hi, 12); + const int16x4_t y1_hi = vqrshrn_n_s32(y0_hi, 12); + + const int16x8_t x = vcombine_s16(x1, x1_hi); + const int16x8_t y = vcombine_s16(y1, y1_hi); + if (flip) { + *a = y; + *b = x; + } else { + *a = x; + *b = y; + } +#else + const int16_t cos128 = Cos128(angle); + const int16_t sin128 = Sin128(angle); + // For this function, the max value returned by Sin128() is 4091, which fits + // inside 12 bits. This leaves room for the sign bit and the 3 left shifted + // bits. 
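+  // Scalar sketch of the vqrdmulh path below (illustrative only):
+  //   x = (2 * b * (-sin128 << 3) + (1 << 15)) >> 16
+  //     = RightShiftWithRounding(b * -sin128, 12)
+  // i.e. pre-shifting the constant by 3 converts the instruction's fixed
+  // 16-bit doubling shift into the 12-bit shift the transform needs.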
+  assert(sin128 <= 0xfff);
+  const int16x8_t x = vqrdmulhq_n_s16(*b, -sin128 << 3);
+  const int16x8_t y = vqrdmulhq_n_s16(*b, cos128 << 3);
+  if (flip) {
+    *a = y;
+    *b = x;
+  } else {
+    *a = x;
+    *b = y;
+  }
+#endif
+}
+
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_SecondIsZero(int16x8_t* a,
+                                                          int16x8_t* b,
+                                                          const int angle,
+                                                          const bool flip) {
+#if defined(__ARM_FEATURE_QRDMX) && defined(__aarch64__) && \
+    defined(__clang__)  // ARM v8.1-A
+  // Clang optimizes vqrdmulhq_n_s16 and vqsubq_s16 (in HadamardRotation) into
+  // vqrdmlshq_s16 resulting in an "off by one" error. For now, do not use
+  // vqrdmulhq_n_s16().
+  const int16_t cos128 = Cos128(angle);
+  const int16_t sin128 = Sin128(angle);
+  const int32x4_t x0 = vmull_n_s16(vget_low_s16(*a), cos128);
+  const int32x4_t y0 = vmull_n_s16(vget_low_s16(*a), sin128);
+  const int16x4_t x1 = vqrshrn_n_s32(x0, 12);
+  const int16x4_t y1 = vqrshrn_n_s32(y0, 12);
+
+  const int32x4_t x0_hi = vmull_n_s16(vget_high_s16(*a), cos128);
+  const int32x4_t y0_hi = vmull_n_s16(vget_high_s16(*a), sin128);
+  const int16x4_t x1_hi = vqrshrn_n_s32(x0_hi, 12);
+  const int16x4_t y1_hi = vqrshrn_n_s32(y0_hi, 12);
+
+  const int16x8_t x = vcombine_s16(x1, x1_hi);
+  const int16x8_t y = vcombine_s16(y1, y1_hi);
+  if (flip) {
+    *a = y;
+    *b = x;
+  } else {
+    *a = x;
+    *b = y;
+  }
+#else
+  const int16_t cos128 = Cos128(angle);
+  const int16_t sin128 = Sin128(angle);
+  const int16x8_t x = vqrdmulhq_n_s16(*a, cos128 << 3);
+  const int16x8_t y = vqrdmulhq_n_s16(*a, sin128 << 3);
+  if (flip) {
+    *a = y;
+    *b = x;
+  } else {
+    *a = x;
+    *b = y;
+  }
+#endif
+}
+
+LIBGAV1_ALWAYS_INLINE void HadamardRotation(int16x8_t* a, int16x8_t* b,
+                                            bool flip) {
+  int16x8_t x, y;
+  if (flip) {
+    y = vqaddq_s16(*b, *a);
+    x = vqsubq_s16(*b, *a);
+  } else {
+    x = vqaddq_s16(*a, *b);
+    y = vqsubq_s16(*a, *b);
+  }
+  *a = x;
+  *b = y;
+}
+
+using ButterflyRotationFunc = void (*)(int16x8_t* a, int16x8_t* b, int angle,
+                                       bool flip);
+
+//------------------------------------------------------------------------------
+// Discrete Cosine Transforms (DCT).
+
+template <int width>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, int adjusted_tx_height,
+                                     bool should_round, int row_shift) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  const int16x8_t v_src = vdupq_n_s16(dst[0]);
+  const uint16x8_t v_mask = vdupq_n_u16(should_round ? 0xffff : 0);
+  const int16x8_t v_src_round =
+      vqrdmulhq_n_s16(v_src, kTransformRowMultiplier << 3);
+  const int16x8_t s0 = vbslq_s16(v_mask, v_src_round, v_src);
+  const int16_t cos128 = Cos128(32);
+  const int16x8_t xy = vqrdmulhq_n_s16(s0, cos128 << 3);
+  // vqrshlq_s16 will shift right if shift value is negative.
+  const int16x8_t xy_shifted = vqrshlq_s16(xy, vdupq_n_s16(-row_shift));
+
+  if (width == 4) {
+    vst1_s16(dst, vget_low_s16(xy_shifted));
+  } else {
+    for (int i = 0; i < width; i += 8) {
+      vst1q_s16(dst, xy_shifted);
+      dst += 8;
+    }
+  }
+  return true;
+}
+
+template <int height>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, int adjusted_tx_height,
+                                           int width) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  const int16_t cos128 = Cos128(32);
+
+  // Calculate dc values for first row.
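+  // (With only the DC coefficient present, each 1-D DCT pass reduces to
+  // scaling by Cos128(32), i.e. cos(pi/4) in Q12, so a DC-only 2-D block is
+  // effectively scaled by 1/2 across the two passes.)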
+  if (width == 4) {
+    const int16x4_t v_src = vld1_s16(dst);
+    const int16x4_t xy = vqrdmulh_n_s16(v_src, cos128 << 3);
+    vst1_s16(dst, xy);
+  } else {
+    int i = 0;
+    do {
+      const int16x8_t v_src = vld1q_s16(&dst[i]);
+      const int16x8_t xy = vqrdmulhq_n_s16(v_src, cos128 << 3);
+      vst1q_s16(&dst[i], xy);
+      i += 8;
+    } while (i < width);
+  }
+
+  // Copy first row to the rest of the block.
+  for (int y = 1; y < height; ++y) {
+    memcpy(&dst[y * width], dst, width * sizeof(dst[0]));
+  }
+  return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+          bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct4Stages(int16x8_t* s) {
+  // stage 12.
+  if (is_fast_butterfly) {
+    ButterflyRotation_SecondIsZero(&s[0], &s[1], 32, true);
+    ButterflyRotation_SecondIsZero(&s[2], &s[3], 48, false);
+  } else {
+    butterfly_rotation(&s[0], &s[1], 32, true);
+    butterfly_rotation(&s[2], &s[3], 48, false);
+  }
+
+  // stage 17.
+  HadamardRotation(&s[0], &s[3], false);
+  HadamardRotation(&s[1], &s[2], false);
+}
+
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool transpose) {
+  auto* const dst = static_cast<int16_t*>(dest);
+  int16x8_t s[4], x[4];
+
+  if (stage_is_rectangular) {
+    if (transpose) {
+      int16x8_t input[8];
+      LoadSrc<8, 8>(dst, step, 0, input);
+      Transpose4x8To8x4(input, x);
+    } else {
+      LoadSrc<16, 4>(dst, step, 0, x);
+    }
+  } else {
+    LoadSrc<8, 4>(dst, step, 0, x);
+    if (transpose) {
+      Transpose4x4(x, x);
+    }
+  }
+
+  // stage 1.
+  // kBitReverseLookup 0, 2, 1, 3
+  s[0] = x[0];
+  s[1] = x[2];
+  s[2] = x[1];
+  s[3] = x[3];
+
+  Dct4Stages<butterfly_rotation>(s);
+
+  if (stage_is_rectangular) {
+    if (transpose) {
+      int16x8_t output[8];
+      Transpose8x4To4x8(s, output);
+      StoreDst<8, 8>(dst, step, 0, output);
+    } else {
+      StoreDst<16, 4>(dst, step, 0, s);
+    }
+  } else {
+    if (transpose) {
+      Transpose4x4(s, s);
+    }
+    StoreDst<8, 4>(dst, step, 0, s);
+  }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+          bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct8Stages(int16x8_t* s) {
+  // stage 8.
+  if (is_fast_butterfly) {
+    ButterflyRotation_SecondIsZero(&s[4], &s[7], 56, false);
+    ButterflyRotation_FirstIsZero(&s[5], &s[6], 24, false);
+  } else {
+    butterfly_rotation(&s[4], &s[7], 56, false);
+    butterfly_rotation(&s[5], &s[6], 24, false);
+  }
+
+  // stage 13.
+  HadamardRotation(&s[4], &s[5], false);
+  HadamardRotation(&s[6], &s[7], true);
+
+  // stage 18.
+  butterfly_rotation(&s[6], &s[5], 32, true);
+
+  // stage 22.
+  HadamardRotation(&s[0], &s[7], false);
+  HadamardRotation(&s[1], &s[6], false);
+  HadamardRotation(&s[2], &s[5], false);
+  HadamardRotation(&s[3], &s[4], false);
+}
+
+// Process dct8 rows or columns, depending on the transpose flag.
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct8_NEON(void* dest, int32_t step, bool transpose) {
+  auto* const dst = static_cast<int16_t*>(dest);
+  int16x8_t s[8], x[8];
+
+  if (stage_is_rectangular) {
+    if (transpose) {
+      int16x8_t input[4];
+      LoadSrc<16, 4>(dst, step, 0, input);
+      Transpose8x4To4x8(input, x);
+    } else {
+      LoadSrc<8, 8>(dst, step, 0, x);
+    }
+  } else if (transpose) {
+    LoadSrc<16, 8>(dst, step, 0, x);
+    dsp::Transpose8x8(x);
+  } else {
+    LoadSrc<16, 8>(dst, step, 0, x);
+  }
+
+  // stage 1.
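+  // The first stage permutes the inputs into bit-reversed index order, e.g.
+  // for dct8, index 1 (001b) maps to 4 (100b) and index 6 (110b) to 3 (011b).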
+  // kBitReverseLookup 0, 4, 2, 6, 1, 5, 3, 7,
+  s[0] = x[0];
+  s[1] = x[4];
+  s[2] = x[2];
+  s[3] = x[6];
+  s[4] = x[1];
+  s[5] = x[5];
+  s[6] = x[3];
+  s[7] = x[7];
+
+  Dct4Stages<butterfly_rotation>(s);
+  Dct8Stages<butterfly_rotation>(s);
+
+  if (stage_is_rectangular) {
+    if (transpose) {
+      int16x8_t output[4];
+      Transpose4x8To8x4(s, output);
+      StoreDst<16, 4>(dst, step, 0, output);
+    } else {
+      StoreDst<8, 8>(dst, step, 0, s);
+    }
+  } else if (transpose) {
+    dsp::Transpose8x8(s);
+    StoreDst<16, 8>(dst, step, 0, s);
+  } else {
+    StoreDst<16, 8>(dst, step, 0, s);
+  }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+          bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct16Stages(int16x8_t* s) {
+  // stage 5.
+  if (is_fast_butterfly) {
+    ButterflyRotation_SecondIsZero(&s[8], &s[15], 60, false);
+    ButterflyRotation_FirstIsZero(&s[9], &s[14], 28, false);
+    ButterflyRotation_SecondIsZero(&s[10], &s[13], 44, false);
+    ButterflyRotation_FirstIsZero(&s[11], &s[12], 12, false);
+  } else {
+    butterfly_rotation(&s[8], &s[15], 60, false);
+    butterfly_rotation(&s[9], &s[14], 28, false);
+    butterfly_rotation(&s[10], &s[13], 44, false);
+    butterfly_rotation(&s[11], &s[12], 12, false);
+  }
+
+  // stage 9.
+  HadamardRotation(&s[8], &s[9], false);
+  HadamardRotation(&s[10], &s[11], true);
+  HadamardRotation(&s[12], &s[13], false);
+  HadamardRotation(&s[14], &s[15], true);
+
+  // stage 14.
+  butterfly_rotation(&s[14], &s[9], 48, true);
+  butterfly_rotation(&s[13], &s[10], 112, true);
+
+  // stage 19.
+  HadamardRotation(&s[8], &s[11], false);
+  HadamardRotation(&s[9], &s[10], false);
+  HadamardRotation(&s[12], &s[15], true);
+  HadamardRotation(&s[13], &s[14], true);
+
+  // stage 23.
+  butterfly_rotation(&s[13], &s[10], 32, true);
+  butterfly_rotation(&s[12], &s[11], 32, true);
+
+  // stage 26.
+  HadamardRotation(&s[0], &s[15], false);
+  HadamardRotation(&s[1], &s[14], false);
+  HadamardRotation(&s[2], &s[13], false);
+  HadamardRotation(&s[3], &s[12], false);
+  HadamardRotation(&s[4], &s[11], false);
+  HadamardRotation(&s[5], &s[10], false);
+  HadamardRotation(&s[6], &s[9], false);
+  HadamardRotation(&s[7], &s[8], false);
+}
+
+// Process dct16 rows or columns, depending on the transpose flag.
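+// Rows are loaded through a transpose so the 1-D transform always operates
+// along vector lanes; the matching inverse transpose is applied on store.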
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct16_NEON(void* dest, int32_t step, bool is_row,
+                                      int row_shift) {
+  auto* const dst = static_cast<int16_t*>(dest);
+  int16x8_t s[16], x[16];
+
+  if (stage_is_rectangular) {
+    if (is_row) {
+      int16x8_t input[4];
+      LoadSrc<16, 4>(dst, step, 0, input);
+      Transpose8x4To4x8(input, x);
+      LoadSrc<16, 4>(dst, step, 8, input);
+      Transpose8x4To4x8(input, &x[8]);
+    } else {
+      LoadSrc<8, 16>(dst, step, 0, x);
+    }
+  } else if (is_row) {
+    for (int idx = 0; idx < 16; idx += 8) {
+      LoadSrc<16, 8>(dst, step, idx, &x[idx]);
+      dsp::Transpose8x8(&x[idx]);
+    }
+  } else {
+    LoadSrc<16, 16>(dst, step, 0, x);
+  }
+
+  // stage 1
+  // kBitReverseLookup 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
+  s[0] = x[0];
+  s[1] = x[8];
+  s[2] = x[4];
+  s[3] = x[12];
+  s[4] = x[2];
+  s[5] = x[10];
+  s[6] = x[6];
+  s[7] = x[14];
+  s[8] = x[1];
+  s[9] = x[9];
+  s[10] = x[5];
+  s[11] = x[13];
+  s[12] = x[3];
+  s[13] = x[11];
+  s[14] = x[7];
+  s[15] = x[15];
+
+  Dct4Stages<butterfly_rotation>(s);
+  Dct8Stages<butterfly_rotation>(s);
+  Dct16Stages<butterfly_rotation>(s);
+
+  if (is_row) {
+    const int16x8_t v_row_shift = vdupq_n_s16(-row_shift);
+    for (auto& i : s) {
+      i = vqrshlq_s16(i, v_row_shift);
+    }
+  }
+
+  if (stage_is_rectangular) {
+    if (is_row) {
+      int16x8_t output[4];
+      Transpose4x8To8x4(s, output);
+      StoreDst<16, 4>(dst, step, 0, output);
+      Transpose4x8To8x4(&s[8], output);
+      StoreDst<16, 4>(dst, step, 8, output);
+    } else {
+      StoreDst<8, 16>(dst, step, 0, s);
+    }
+  } else if (is_row) {
+    for (int idx = 0; idx < 16; idx += 8) {
+      dsp::Transpose8x8(&s[idx]);
+      StoreDst<16, 8>(dst, step, idx, &s[idx]);
+    }
+  } else {
+    StoreDst<16, 16>(dst, step, 0, s);
+  }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+          bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct32Stages(int16x8_t* s) {
+  // stage 3
+  if (is_fast_butterfly) {
+    ButterflyRotation_SecondIsZero(&s[16], &s[31], 62, false);
+    ButterflyRotation_FirstIsZero(&s[17], &s[30], 30, false);
+    ButterflyRotation_SecondIsZero(&s[18], &s[29], 46, false);
+    ButterflyRotation_FirstIsZero(&s[19], &s[28], 14, false);
+    ButterflyRotation_SecondIsZero(&s[20], &s[27], 54, false);
+    ButterflyRotation_FirstIsZero(&s[21], &s[26], 22, false);
+    ButterflyRotation_SecondIsZero(&s[22], &s[25], 38, false);
+    ButterflyRotation_FirstIsZero(&s[23], &s[24], 6, false);
+  } else {
+    butterfly_rotation(&s[16], &s[31], 62, false);
+    butterfly_rotation(&s[17], &s[30], 30, false);
+    butterfly_rotation(&s[18], &s[29], 46, false);
+    butterfly_rotation(&s[19], &s[28], 14, false);
+    butterfly_rotation(&s[20], &s[27], 54, false);
+    butterfly_rotation(&s[21], &s[26], 22, false);
+    butterfly_rotation(&s[22], &s[25], 38, false);
+    butterfly_rotation(&s[23], &s[24], 6, false);
+  }
+  // stage 6.
+  HadamardRotation(&s[16], &s[17], false);
+  HadamardRotation(&s[18], &s[19], true);
+  HadamardRotation(&s[20], &s[21], false);
+  HadamardRotation(&s[22], &s[23], true);
+  HadamardRotation(&s[24], &s[25], false);
+  HadamardRotation(&s[26], &s[27], true);
+  HadamardRotation(&s[28], &s[29], false);
+  HadamardRotation(&s[30], &s[31], true);
+
+  // stage 10.
+  butterfly_rotation(&s[30], &s[17], 24 + 32, true);
+  butterfly_rotation(&s[29], &s[18], 24 + 64 + 32, true);
+  butterfly_rotation(&s[26], &s[21], 24, true);
+  butterfly_rotation(&s[25], &s[22], 24 + 64, true);
+
+  // stage 15.
+ HadamardRotation(&s[16], &s[19], false); + HadamardRotation(&s[17], &s[18], false); + HadamardRotation(&s[20], &s[23], true); + HadamardRotation(&s[21], &s[22], true); + HadamardRotation(&s[24], &s[27], false); + HadamardRotation(&s[25], &s[26], false); + HadamardRotation(&s[28], &s[31], true); + HadamardRotation(&s[29], &s[30], true); + + // stage 20. + butterfly_rotation(&s[29], &s[18], 48, true); + butterfly_rotation(&s[28], &s[19], 48, true); + butterfly_rotation(&s[27], &s[20], 48 + 64, true); + butterfly_rotation(&s[26], &s[21], 48 + 64, true); + + // stage 24. + HadamardRotation(&s[16], &s[23], false); + HadamardRotation(&s[17], &s[22], false); + HadamardRotation(&s[18], &s[21], false); + HadamardRotation(&s[19], &s[20], false); + HadamardRotation(&s[24], &s[31], true); + HadamardRotation(&s[25], &s[30], true); + HadamardRotation(&s[26], &s[29], true); + HadamardRotation(&s[27], &s[28], true); + + // stage 27. + butterfly_rotation(&s[27], &s[20], 32, true); + butterfly_rotation(&s[26], &s[21], 32, true); + butterfly_rotation(&s[25], &s[22], 32, true); + butterfly_rotation(&s[24], &s[23], 32, true); + + // stage 29. + HadamardRotation(&s[0], &s[31], false); + HadamardRotation(&s[1], &s[30], false); + HadamardRotation(&s[2], &s[29], false); + HadamardRotation(&s[3], &s[28], false); + HadamardRotation(&s[4], &s[27], false); + HadamardRotation(&s[5], &s[26], false); + HadamardRotation(&s[6], &s[25], false); + HadamardRotation(&s[7], &s[24], false); + HadamardRotation(&s[8], &s[23], false); + HadamardRotation(&s[9], &s[22], false); + HadamardRotation(&s[10], &s[21], false); + HadamardRotation(&s[11], &s[20], false); + HadamardRotation(&s[12], &s[19], false); + HadamardRotation(&s[13], &s[18], false); + HadamardRotation(&s[14], &s[17], false); + HadamardRotation(&s[15], &s[16], false); +} + +// Process dct32 rows or columns, depending on the transpose flag. +LIBGAV1_ALWAYS_INLINE void Dct32_NEON(void* dest, const int32_t step, + const bool is_row, int row_shift) { + auto* const dst = static_cast(dest); + int16x8_t s[32], x[32]; + + if (is_row) { + for (int idx = 0; idx < 32; idx += 8) { + LoadSrc<16, 8>(dst, step, idx, &x[idx]); + dsp::Transpose8x8(&x[idx]); + } + } else { + LoadSrc<16, 32>(dst, step, 0, x); + } + + // stage 1 + // kBitReverseLookup + // 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30, + s[0] = x[0]; + s[1] = x[16]; + s[2] = x[8]; + s[3] = x[24]; + s[4] = x[4]; + s[5] = x[20]; + s[6] = x[12]; + s[7] = x[28]; + s[8] = x[2]; + s[9] = x[18]; + s[10] = x[10]; + s[11] = x[26]; + s[12] = x[6]; + s[13] = x[22]; + s[14] = x[14]; + s[15] = x[30]; + + // 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31, + s[16] = x[1]; + s[17] = x[17]; + s[18] = x[9]; + s[19] = x[25]; + s[20] = x[5]; + s[21] = x[21]; + s[22] = x[13]; + s[23] = x[29]; + s[24] = x[3]; + s[25] = x[19]; + s[26] = x[11]; + s[27] = x[27]; + s[28] = x[7]; + s[29] = x[23]; + s[30] = x[15]; + s[31] = x[31]; + + Dct4Stages(s); + Dct8Stages(s); + Dct16Stages(s); + Dct32Stages(s); + + if (is_row) { + const int16x8_t v_row_shift = vdupq_n_s16(-row_shift); + for (int idx = 0; idx < 32; idx += 8) { + int16x8_t output[8]; + Transpose8x8(&s[idx], output); + for (auto& o : output) { + o = vqrshlq_s16(o, v_row_shift); + } + StoreDst<16, 8>(dst, step, idx, output); + } + } else { + StoreDst<16, 32>(dst, step, 0, s); + } +} + +// Allow the compiler to call this function instead of force inlining. Tests +// show the performance is slightly faster. 
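+// AV1 codes at most the first 32 coefficients in each 64-point dimension;
+// Dct64_NEON loads those 32 and treats the remaining inputs as zero.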
+void Dct64_NEON(void* dest, int32_t step, bool is_row, int row_shift) { + auto* const dst = static_cast(dest); + int16x8_t s[64], x[32]; + + if (is_row) { + // The last 32 values of every row are always zero if the |tx_width| is + // 64. + for (int idx = 0; idx < 32; idx += 8) { + LoadSrc<16, 8>(dst, step, idx, &x[idx]); + dsp::Transpose8x8(&x[idx]); + } + } else { + // The last 32 values of every column are always zero if the |tx_height| is + // 64. + LoadSrc<16, 32>(dst, step, 0, x); + } + + // stage 1 + // kBitReverseLookup + // 0, 32, 16, 48, 8, 40, 24, 56, 4, 36, 20, 52, 12, 44, 28, 60, + s[0] = x[0]; + s[2] = x[16]; + s[4] = x[8]; + s[6] = x[24]; + s[8] = x[4]; + s[10] = x[20]; + s[12] = x[12]; + s[14] = x[28]; + + // 2, 34, 18, 50, 10, 42, 26, 58, 6, 38, 22, 54, 14, 46, 30, 62, + s[16] = x[2]; + s[18] = x[18]; + s[20] = x[10]; + s[22] = x[26]; + s[24] = x[6]; + s[26] = x[22]; + s[28] = x[14]; + s[30] = x[30]; + + // 1, 33, 17, 49, 9, 41, 25, 57, 5, 37, 21, 53, 13, 45, 29, 61, + s[32] = x[1]; + s[34] = x[17]; + s[36] = x[9]; + s[38] = x[25]; + s[40] = x[5]; + s[42] = x[21]; + s[44] = x[13]; + s[46] = x[29]; + + // 3, 35, 19, 51, 11, 43, 27, 59, 7, 39, 23, 55, 15, 47, 31, 63 + s[48] = x[3]; + s[50] = x[19]; + s[52] = x[11]; + s[54] = x[27]; + s[56] = x[7]; + s[58] = x[23]; + s[60] = x[15]; + s[62] = x[31]; + + Dct4Stages(s); + Dct8Stages(s); + Dct16Stages(s); + Dct32Stages(s); + + //-- start dct 64 stages + // stage 2. + ButterflyRotation_SecondIsZero(&s[32], &s[63], 63 - 0, false); + ButterflyRotation_FirstIsZero(&s[33], &s[62], 63 - 32, false); + ButterflyRotation_SecondIsZero(&s[34], &s[61], 63 - 16, false); + ButterflyRotation_FirstIsZero(&s[35], &s[60], 63 - 48, false); + ButterflyRotation_SecondIsZero(&s[36], &s[59], 63 - 8, false); + ButterflyRotation_FirstIsZero(&s[37], &s[58], 63 - 40, false); + ButterflyRotation_SecondIsZero(&s[38], &s[57], 63 - 24, false); + ButterflyRotation_FirstIsZero(&s[39], &s[56], 63 - 56, false); + ButterflyRotation_SecondIsZero(&s[40], &s[55], 63 - 4, false); + ButterflyRotation_FirstIsZero(&s[41], &s[54], 63 - 36, false); + ButterflyRotation_SecondIsZero(&s[42], &s[53], 63 - 20, false); + ButterflyRotation_FirstIsZero(&s[43], &s[52], 63 - 52, false); + ButterflyRotation_SecondIsZero(&s[44], &s[51], 63 - 12, false); + ButterflyRotation_FirstIsZero(&s[45], &s[50], 63 - 44, false); + ButterflyRotation_SecondIsZero(&s[46], &s[49], 63 - 28, false); + ButterflyRotation_FirstIsZero(&s[47], &s[48], 63 - 60, false); + + // stage 4. + HadamardRotation(&s[32], &s[33], false); + HadamardRotation(&s[34], &s[35], true); + HadamardRotation(&s[36], &s[37], false); + HadamardRotation(&s[38], &s[39], true); + HadamardRotation(&s[40], &s[41], false); + HadamardRotation(&s[42], &s[43], true); + HadamardRotation(&s[44], &s[45], false); + HadamardRotation(&s[46], &s[47], true); + HadamardRotation(&s[48], &s[49], false); + HadamardRotation(&s[50], &s[51], true); + HadamardRotation(&s[52], &s[53], false); + HadamardRotation(&s[54], &s[55], true); + HadamardRotation(&s[56], &s[57], false); + HadamardRotation(&s[58], &s[59], true); + HadamardRotation(&s[60], &s[61], false); + HadamardRotation(&s[62], &s[63], true); + + // stage 7. 
+ ButterflyRotation_8(&s[62], &s[33], 60 - 0, true); + ButterflyRotation_8(&s[61], &s[34], 60 - 0 + 64, true); + ButterflyRotation_8(&s[58], &s[37], 60 - 32, true); + ButterflyRotation_8(&s[57], &s[38], 60 - 32 + 64, true); + ButterflyRotation_8(&s[54], &s[41], 60 - 16, true); + ButterflyRotation_8(&s[53], &s[42], 60 - 16 + 64, true); + ButterflyRotation_8(&s[50], &s[45], 60 - 48, true); + ButterflyRotation_8(&s[49], &s[46], 60 - 48 + 64, true); + + // stage 11. + HadamardRotation(&s[32], &s[35], false); + HadamardRotation(&s[33], &s[34], false); + HadamardRotation(&s[36], &s[39], true); + HadamardRotation(&s[37], &s[38], true); + HadamardRotation(&s[40], &s[43], false); + HadamardRotation(&s[41], &s[42], false); + HadamardRotation(&s[44], &s[47], true); + HadamardRotation(&s[45], &s[46], true); + HadamardRotation(&s[48], &s[51], false); + HadamardRotation(&s[49], &s[50], false); + HadamardRotation(&s[52], &s[55], true); + HadamardRotation(&s[53], &s[54], true); + HadamardRotation(&s[56], &s[59], false); + HadamardRotation(&s[57], &s[58], false); + HadamardRotation(&s[60], &s[63], true); + HadamardRotation(&s[61], &s[62], true); + + // stage 16. + ButterflyRotation_8(&s[61], &s[34], 56, true); + ButterflyRotation_8(&s[60], &s[35], 56, true); + ButterflyRotation_8(&s[59], &s[36], 56 + 64, true); + ButterflyRotation_8(&s[58], &s[37], 56 + 64, true); + ButterflyRotation_8(&s[53], &s[42], 56 - 32, true); + ButterflyRotation_8(&s[52], &s[43], 56 - 32, true); + ButterflyRotation_8(&s[51], &s[44], 56 - 32 + 64, true); + ButterflyRotation_8(&s[50], &s[45], 56 - 32 + 64, true); + + // stage 21. + HadamardRotation(&s[32], &s[39], false); + HadamardRotation(&s[33], &s[38], false); + HadamardRotation(&s[34], &s[37], false); + HadamardRotation(&s[35], &s[36], false); + HadamardRotation(&s[40], &s[47], true); + HadamardRotation(&s[41], &s[46], true); + HadamardRotation(&s[42], &s[45], true); + HadamardRotation(&s[43], &s[44], true); + HadamardRotation(&s[48], &s[55], false); + HadamardRotation(&s[49], &s[54], false); + HadamardRotation(&s[50], &s[53], false); + HadamardRotation(&s[51], &s[52], false); + HadamardRotation(&s[56], &s[63], true); + HadamardRotation(&s[57], &s[62], true); + HadamardRotation(&s[58], &s[61], true); + HadamardRotation(&s[59], &s[60], true); + + // stage 25. + ButterflyRotation_8(&s[59], &s[36], 48, true); + ButterflyRotation_8(&s[58], &s[37], 48, true); + ButterflyRotation_8(&s[57], &s[38], 48, true); + ButterflyRotation_8(&s[56], &s[39], 48, true); + ButterflyRotation_8(&s[55], &s[40], 112, true); + ButterflyRotation_8(&s[54], &s[41], 112, true); + ButterflyRotation_8(&s[53], &s[42], 112, true); + ButterflyRotation_8(&s[52], &s[43], 112, true); + + // stage 28. + HadamardRotation(&s[32], &s[47], false); + HadamardRotation(&s[33], &s[46], false); + HadamardRotation(&s[34], &s[45], false); + HadamardRotation(&s[35], &s[44], false); + HadamardRotation(&s[36], &s[43], false); + HadamardRotation(&s[37], &s[42], false); + HadamardRotation(&s[38], &s[41], false); + HadamardRotation(&s[39], &s[40], false); + HadamardRotation(&s[48], &s[63], true); + HadamardRotation(&s[49], &s[62], true); + HadamardRotation(&s[50], &s[61], true); + HadamardRotation(&s[51], &s[60], true); + HadamardRotation(&s[52], &s[59], true); + HadamardRotation(&s[53], &s[58], true); + HadamardRotation(&s[54], &s[57], true); + HadamardRotation(&s[55], &s[56], true); + + // stage 30. 
+ ButterflyRotation_8(&s[55], &s[40], 32, true); + ButterflyRotation_8(&s[54], &s[41], 32, true); + ButterflyRotation_8(&s[53], &s[42], 32, true); + ButterflyRotation_8(&s[52], &s[43], 32, true); + ButterflyRotation_8(&s[51], &s[44], 32, true); + ButterflyRotation_8(&s[50], &s[45], 32, true); + ButterflyRotation_8(&s[49], &s[46], 32, true); + ButterflyRotation_8(&s[48], &s[47], 32, true); + + // stage 31. + for (int i = 0; i < 32; i += 4) { + HadamardRotation(&s[i], &s[63 - i], false); + HadamardRotation(&s[i + 1], &s[63 - i - 1], false); + HadamardRotation(&s[i + 2], &s[63 - i - 2], false); + HadamardRotation(&s[i + 3], &s[63 - i - 3], false); + } + //-- end dct 64 stages + + if (is_row) { + const int16x8_t v_row_shift = vdupq_n_s16(-row_shift); + for (int idx = 0; idx < 64; idx += 8) { + int16x8_t output[8]; + Transpose8x8(&s[idx], output); + for (auto& o : output) { + o = vqrshlq_s16(o, v_row_shift); + } + StoreDst<16, 8>(dst, step, idx, output); + } + } else { + StoreDst<16, 64>(dst, step, 0, s); + } +} + +//------------------------------------------------------------------------------ +// Asymmetric Discrete Sine Transforms (ADST). +template +LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step, + bool transpose) { + auto* const dst = static_cast(dest); + int32x4_t s[8]; + int16x8_t x[4]; + + if (stage_is_rectangular) { + if (transpose) { + int16x8_t input[8]; + LoadSrc<8, 8>(dst, step, 0, input); + Transpose4x8To8x4(input, x); + } else { + LoadSrc<16, 4>(dst, step, 0, x); + } + } else { + LoadSrc<8, 4>(dst, step, 0, x); + if (transpose) { + Transpose4x4(x, x); + } + } + + // stage 1. + s[5] = vmull_n_s16(vget_low_s16(x[3]), kAdst4Multiplier[1]); + s[6] = vmull_n_s16(vget_low_s16(x[3]), kAdst4Multiplier[3]); + + // stage 2. + const int32x4_t a7 = vsubl_s16(vget_low_s16(x[0]), vget_low_s16(x[2])); + const int32x4_t b7 = vaddw_s16(a7, vget_low_s16(x[3])); + + // stage 3. + s[0] = vmull_n_s16(vget_low_s16(x[0]), kAdst4Multiplier[0]); + s[1] = vmull_n_s16(vget_low_s16(x[0]), kAdst4Multiplier[1]); + // s[0] = s[0] + s[3] + s[0] = vmlal_n_s16(s[0], vget_low_s16(x[2]), kAdst4Multiplier[3]); + // s[1] = s[1] - s[4] + s[1] = vmlsl_n_s16(s[1], vget_low_s16(x[2]), kAdst4Multiplier[0]); + + s[3] = vmull_n_s16(vget_low_s16(x[1]), kAdst4Multiplier[2]); + s[2] = vmulq_n_s32(b7, kAdst4Multiplier[2]); + + // stage 4. + s[0] = vaddq_s32(s[0], s[5]); + s[1] = vsubq_s32(s[1], s[6]); + + // stages 5 and 6. 
+  const int32x4_t x0 = vaddq_s32(s[0], s[3]);
+  const int32x4_t x1 = vaddq_s32(s[1], s[3]);
+  const int32x4_t x3_a = vaddq_s32(s[0], s[1]);
+  const int32x4_t x3 = vsubq_s32(x3_a, s[3]);
+  const int16x4_t dst_0 = vqrshrn_n_s32(x0, 12);
+  const int16x4_t dst_1 = vqrshrn_n_s32(x1, 12);
+  const int16x4_t dst_2 = vqrshrn_n_s32(s[2], 12);
+  const int16x4_t dst_3 = vqrshrn_n_s32(x3, 12);
+
+  x[0] = vcombine_s16(dst_0, dst_0);
+  x[1] = vcombine_s16(dst_1, dst_1);
+  x[2] = vcombine_s16(dst_2, dst_2);
+  x[3] = vcombine_s16(dst_3, dst_3);
+
+  if (stage_is_rectangular) {
+    if (transpose) {
+      int16x8_t output[8];
+      Transpose8x4To4x8(x, output);
+      StoreDst<8, 8>(dst, step, 0, output);
+    } else {
+      StoreDst<16, 4>(dst, step, 0, x);
+    }
+  } else {
+    if (transpose) {
+      Transpose4x4(x, x);
+    }
+    StoreDst<8, 4>(dst, step, 0, x);
+  }
+}
+
+alignas(8) constexpr int16_t kAdst4DcOnlyMultiplier[4] = {1321, 2482, 3344,
+                                                          2482};
+
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, int adjusted_tx_height,
+                                       bool should_round, int row_shift) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  int32x4_t s[2];
+
+  const int16x4_t v_src0 = vdup_n_s16(dst[0]);
+  const uint16x4_t v_mask = vdup_n_u16(should_round ? 0xffff : 0);
+  const int16x4_t v_src_round =
+      vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3);
+  const int16x4_t v_src = vbsl_s16(v_mask, v_src_round, v_src0);
+  const int16x4_t kAdst4DcOnlyMultipliers = vld1_s16(kAdst4DcOnlyMultiplier);
+  s[1] = vdupq_n_s32(0);
+
+  // s0*k0 s0*k1 s0*k2 s0*k1
+  s[0] = vmull_s16(kAdst4DcOnlyMultipliers, v_src);
+  // 0 0 0 s0*k0
+  s[1] = vextq_s32(s[1], s[0], 1);
+
+  const int32x4_t x3 = vaddq_s32(s[0], s[1]);
+  const int16x4_t dst_0 = vqrshrn_n_s32(x3, 12);
+
+  // vqrshlq_s16 will shift right if shift value is negative.
+  vst1_s16(dst, vqrshl_s16(dst_0, vdup_n_s16(-row_shift)));
+
+  return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnlyColumn(void* dest, int adjusted_tx_height,
+                                             int width) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  int32x4_t s[4];
+
+  int i = 0;
+  do {
+    const int16x4_t v_src = vld1_s16(&dst[i]);
+
+    s[0] = vmull_n_s16(v_src, kAdst4Multiplier[0]);
+    s[1] = vmull_n_s16(v_src, kAdst4Multiplier[1]);
+    s[2] = vmull_n_s16(v_src, kAdst4Multiplier[2]);
+
+    const int32x4_t x0 = s[0];
+    const int32x4_t x1 = s[1];
+    const int32x4_t x2 = s[2];
+    const int32x4_t x3 = vaddq_s32(s[0], s[1]);
+    const int16x4_t dst_0 = vqrshrn_n_s32(x0, 12);
+    const int16x4_t dst_1 = vqrshrn_n_s32(x1, 12);
+    const int16x4_t dst_2 = vqrshrn_n_s32(x2, 12);
+    const int16x4_t dst_3 = vqrshrn_n_s32(x3, 12);
+
+    vst1_s16(&dst[i], dst_0);
+    vst1_s16(&dst[i + width * 1], dst_1);
+    vst1_s16(&dst[i + width * 2], dst_2);
+    vst1_s16(&dst[i + width * 3], dst_3);
+
+    i += 4;
+  } while (i < width);
+
+  return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Adst8_NEON(void* dest, int32_t step,
+                                      bool transpose) {
+  auto* const dst = static_cast<int16_t*>(dest);
+  int16x8_t s[8], x[8];
+
+  if (stage_is_rectangular) {
+    if (transpose) {
+      int16x8_t input[4];
+      LoadSrc<16, 4>(dst, step, 0, input);
+      Transpose8x4To4x8(input, x);
+    } else {
+      LoadSrc<8, 8>(dst, step, 0, x);
+    }
+  } else {
+    if (transpose) {
+      LoadSrc<16, 8>(dst, step, 0, x);
+      dsp::Transpose8x8(x);
+    } else {
+      LoadSrc<16, 8>(dst, step, 0, x);
+    }
+  }
+
+  // stage 1.
+  s[0] = x[7];
+  s[1] = x[0];
+  s[2] = x[5];
+  s[3] = x[2];
+  s[4] = x[3];
+  s[5] = x[4];
+  s[6] = x[1];
+  s[7] = x[6];
+
+  // stage 2.
+ butterfly_rotation(&s[0], &s[1], 60 - 0, true); + butterfly_rotation(&s[2], &s[3], 60 - 16, true); + butterfly_rotation(&s[4], &s[5], 60 - 32, true); + butterfly_rotation(&s[6], &s[7], 60 - 48, true); + + // stage 3. + HadamardRotation(&s[0], &s[4], false); + HadamardRotation(&s[1], &s[5], false); + HadamardRotation(&s[2], &s[6], false); + HadamardRotation(&s[3], &s[7], false); + + // stage 4. + butterfly_rotation(&s[4], &s[5], 48 - 0, true); + butterfly_rotation(&s[7], &s[6], 48 - 32, true); + + // stage 5. + HadamardRotation(&s[0], &s[2], false); + HadamardRotation(&s[4], &s[6], false); + HadamardRotation(&s[1], &s[3], false); + HadamardRotation(&s[5], &s[7], false); + + // stage 6. + butterfly_rotation(&s[2], &s[3], 32, true); + butterfly_rotation(&s[6], &s[7], 32, true); + + // stage 7. + x[0] = s[0]; + x[1] = vqnegq_s16(s[4]); + x[2] = s[6]; + x[3] = vqnegq_s16(s[2]); + x[4] = s[3]; + x[5] = vqnegq_s16(s[7]); + x[6] = s[5]; + x[7] = vqnegq_s16(s[1]); + + if (stage_is_rectangular) { + if (transpose) { + int16x8_t output[4]; + Transpose4x8To8x4(x, output); + StoreDst<16, 4>(dst, step, 0, output); + } else { + StoreDst<8, 8>(dst, step, 0, x); + } + } else { + if (transpose) { + dsp::Transpose8x8(x); + StoreDst<16, 8>(dst, step, 0, x); + } else { + StoreDst<16, 8>(dst, step, 0, x); + } + } +} + +LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, int adjusted_tx_height, + bool should_round, int row_shift) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast(dest); + int16x8_t s[8]; + + const int16x8_t v_src = vdupq_n_s16(dst[0]); + const uint16x8_t v_mask = vdupq_n_u16(should_round ? 0xffff : 0); + const int16x8_t v_src_round = + vqrdmulhq_n_s16(v_src, kTransformRowMultiplier << 3); + // stage 1. + s[1] = vbslq_s16(v_mask, v_src_round, v_src); + + // stage 2. + ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true); + + // stage 3. + s[4] = s[0]; + s[5] = s[1]; + + // stage 4. + ButterflyRotation_4(&s[4], &s[5], 48, true); + + // stage 5. + s[2] = s[0]; + s[3] = s[1]; + s[6] = s[4]; + s[7] = s[5]; + + // stage 6. + ButterflyRotation_4(&s[2], &s[3], 32, true); + ButterflyRotation_4(&s[6], &s[7], 32, true); + + // stage 7. + int16x8_t x[8]; + x[0] = s[0]; + x[1] = vqnegq_s16(s[4]); + x[2] = s[6]; + x[3] = vqnegq_s16(s[2]); + x[4] = s[3]; + x[5] = vqnegq_s16(s[7]); + x[6] = s[5]; + x[7] = vqnegq_s16(s[1]); + + for (int i = 0; i < 8; ++i) { + // vqrshlq_s16 will shift right if shift value is negative. + x[i] = vqrshlq_s16(x[i], vdupq_n_s16(-row_shift)); + vst1q_lane_s16(&dst[i], x[i], 0); + } + + return true; +} + +LIBGAV1_ALWAYS_INLINE bool Adst8DcOnlyColumn(void* dest, int adjusted_tx_height, + int width) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast(dest); + int16x8_t s[8]; + + int i = 0; + do { + const int16x8_t v_src = vld1q_s16(dst); + // stage 1. + s[1] = v_src; + + // stage 2. + ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true); + + // stage 3. + s[4] = s[0]; + s[5] = s[1]; + + // stage 4. + ButterflyRotation_4(&s[4], &s[5], 48, true); + + // stage 5. + s[2] = s[0]; + s[3] = s[1]; + s[6] = s[4]; + s[7] = s[5]; + + // stage 6. + ButterflyRotation_4(&s[2], &s[3], 32, true); + ButterflyRotation_4(&s[6], &s[7], 32, true); + + // stage 7. 
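+    // stage 7 routes the internal registers into output order: even outputs
+    // are copied, odd outputs are negated. vqnegq_s16 saturates, so
+    // -INT16_MIN becomes INT16_MAX rather than wrapping.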
+ int16x8_t x[8]; + x[0] = s[0]; + x[1] = vqnegq_s16(s[4]); + x[2] = s[6]; + x[3] = vqnegq_s16(s[2]); + x[4] = s[3]; + x[5] = vqnegq_s16(s[7]); + x[6] = s[5]; + x[7] = vqnegq_s16(s[1]); + + for (int j = 0; j < 8; ++j) { + vst1_s16(&dst[j * width], vget_low_s16(x[j])); + } + i += 4; + dst += 4; + } while (i < width); + + return true; +} + +template +LIBGAV1_ALWAYS_INLINE void Adst16_NEON(void* dest, int32_t step, bool is_row, + int row_shift) { + auto* const dst = static_cast(dest); + int16x8_t s[16], x[16]; + + if (stage_is_rectangular) { + if (is_row) { + int16x8_t input[4]; + LoadSrc<16, 4>(dst, step, 0, input); + Transpose8x4To4x8(input, x); + LoadSrc<16, 4>(dst, step, 8, input); + Transpose8x4To4x8(input, &x[8]); + } else { + LoadSrc<8, 16>(dst, step, 0, x); + } + } else { + if (is_row) { + for (int idx = 0; idx < 16; idx += 8) { + LoadSrc<16, 8>(dst, step, idx, &x[idx]); + dsp::Transpose8x8(&x[idx]); + } + } else { + LoadSrc<16, 16>(dst, step, 0, x); + } + } + + // stage 1. + s[0] = x[15]; + s[1] = x[0]; + s[2] = x[13]; + s[3] = x[2]; + s[4] = x[11]; + s[5] = x[4]; + s[6] = x[9]; + s[7] = x[6]; + s[8] = x[7]; + s[9] = x[8]; + s[10] = x[5]; + s[11] = x[10]; + s[12] = x[3]; + s[13] = x[12]; + s[14] = x[1]; + s[15] = x[14]; + + // stage 2. + butterfly_rotation(&s[0], &s[1], 62 - 0, true); + butterfly_rotation(&s[2], &s[3], 62 - 8, true); + butterfly_rotation(&s[4], &s[5], 62 - 16, true); + butterfly_rotation(&s[6], &s[7], 62 - 24, true); + butterfly_rotation(&s[8], &s[9], 62 - 32, true); + butterfly_rotation(&s[10], &s[11], 62 - 40, true); + butterfly_rotation(&s[12], &s[13], 62 - 48, true); + butterfly_rotation(&s[14], &s[15], 62 - 56, true); + + // stage 3. + HadamardRotation(&s[0], &s[8], false); + HadamardRotation(&s[1], &s[9], false); + HadamardRotation(&s[2], &s[10], false); + HadamardRotation(&s[3], &s[11], false); + HadamardRotation(&s[4], &s[12], false); + HadamardRotation(&s[5], &s[13], false); + HadamardRotation(&s[6], &s[14], false); + HadamardRotation(&s[7], &s[15], false); + + // stage 4. + butterfly_rotation(&s[8], &s[9], 56 - 0, true); + butterfly_rotation(&s[13], &s[12], 8 + 0, true); + butterfly_rotation(&s[10], &s[11], 56 - 32, true); + butterfly_rotation(&s[15], &s[14], 8 + 32, true); + + // stage 5. + HadamardRotation(&s[0], &s[4], false); + HadamardRotation(&s[8], &s[12], false); + HadamardRotation(&s[1], &s[5], false); + HadamardRotation(&s[9], &s[13], false); + HadamardRotation(&s[2], &s[6], false); + HadamardRotation(&s[10], &s[14], false); + HadamardRotation(&s[3], &s[7], false); + HadamardRotation(&s[11], &s[15], false); + + // stage 6. + butterfly_rotation(&s[4], &s[5], 48 - 0, true); + butterfly_rotation(&s[12], &s[13], 48 - 0, true); + butterfly_rotation(&s[7], &s[6], 48 - 32, true); + butterfly_rotation(&s[15], &s[14], 48 - 32, true); + + // stage 7. + HadamardRotation(&s[0], &s[2], false); + HadamardRotation(&s[4], &s[6], false); + HadamardRotation(&s[8], &s[10], false); + HadamardRotation(&s[12], &s[14], false); + HadamardRotation(&s[1], &s[3], false); + HadamardRotation(&s[5], &s[7], false); + HadamardRotation(&s[9], &s[11], false); + HadamardRotation(&s[13], &s[15], false); + + // stage 8. + butterfly_rotation(&s[2], &s[3], 32, true); + butterfly_rotation(&s[6], &s[7], 32, true); + butterfly_rotation(&s[10], &s[11], 32, true); + butterfly_rotation(&s[14], &s[15], 32, true); + + // stage 9. 
+ x[0] = s[0]; + x[1] = vqnegq_s16(s[8]); + x[2] = s[12]; + x[3] = vqnegq_s16(s[4]); + x[4] = s[6]; + x[5] = vqnegq_s16(s[14]); + x[6] = s[10]; + x[7] = vqnegq_s16(s[2]); + x[8] = s[3]; + x[9] = vqnegq_s16(s[11]); + x[10] = s[15]; + x[11] = vqnegq_s16(s[7]); + x[12] = s[5]; + x[13] = vqnegq_s16(s[13]); + x[14] = s[9]; + x[15] = vqnegq_s16(s[1]); + + if (stage_is_rectangular) { + if (is_row) { + const int16x8_t v_row_shift = vdupq_n_s16(-row_shift); + int16x8_t output[4]; + Transpose4x8To8x4(x, output); + for (auto& o : output) { + o = vqrshlq_s16(o, v_row_shift); + } + StoreDst<16, 4>(dst, step, 0, output); + Transpose4x8To8x4(&x[8], output); + for (auto& o : output) { + o = vqrshlq_s16(o, v_row_shift); + } + StoreDst<16, 4>(dst, step, 8, output); + } else { + StoreDst<8, 16>(dst, step, 0, x); + } + } else { + if (is_row) { + const int16x8_t v_row_shift = vdupq_n_s16(-row_shift); + for (int idx = 0; idx < 16; idx += 8) { + int16x8_t output[8]; + Transpose8x8(&x[idx], output); + for (auto& o : output) { + o = vqrshlq_s16(o, v_row_shift); + } + StoreDst<16, 8>(dst, step, idx, output); + } + } else { + StoreDst<16, 16>(dst, step, 0, x); + } + } +} + +LIBGAV1_ALWAYS_INLINE void Adst16DcOnlyInternal(int16x8_t* s, int16x8_t* x) { + // stage 2. + ButterflyRotation_FirstIsZero(&s[0], &s[1], 62, true); + + // stage 3. + s[8] = s[0]; + s[9] = s[1]; + + // stage 4. + ButterflyRotation_4(&s[8], &s[9], 56, true); + + // stage 5. + s[4] = s[0]; + s[12] = s[8]; + s[5] = s[1]; + s[13] = s[9]; + + // stage 6. + ButterflyRotation_4(&s[4], &s[5], 48, true); + ButterflyRotation_4(&s[12], &s[13], 48, true); + + // stage 7. + s[2] = s[0]; + s[6] = s[4]; + s[10] = s[8]; + s[14] = s[12]; + s[3] = s[1]; + s[7] = s[5]; + s[11] = s[9]; + s[15] = s[13]; + + // stage 8. + ButterflyRotation_4(&s[2], &s[3], 32, true); + ButterflyRotation_4(&s[6], &s[7], 32, true); + ButterflyRotation_4(&s[10], &s[11], 32, true); + ButterflyRotation_4(&s[14], &s[15], 32, true); + + // stage 9. + x[0] = s[0]; + x[1] = vqnegq_s16(s[8]); + x[2] = s[12]; + x[3] = vqnegq_s16(s[4]); + x[4] = s[6]; + x[5] = vqnegq_s16(s[14]); + x[6] = s[10]; + x[7] = vqnegq_s16(s[2]); + x[8] = s[3]; + x[9] = vqnegq_s16(s[11]); + x[10] = s[15]; + x[11] = vqnegq_s16(s[7]); + x[12] = s[5]; + x[13] = vqnegq_s16(s[13]); + x[14] = s[9]; + x[15] = vqnegq_s16(s[1]); +} + +LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, int adjusted_tx_height, + bool should_round, int row_shift) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast(dest); + int16x8_t s[16]; + int16x8_t x[16]; + + const int16x8_t v_src = vdupq_n_s16(dst[0]); + const uint16x8_t v_mask = vdupq_n_u16(should_round ? 0xffff : 0); + const int16x8_t v_src_round = + vqrdmulhq_n_s16(v_src, kTransformRowMultiplier << 3); + // stage 1. + s[1] = vbslq_s16(v_mask, v_src_round, v_src); + + Adst16DcOnlyInternal(s, x); + + for (int i = 0; i < 16; ++i) { + // vqrshlq_s16 will shift right if shift value is negative. + x[i] = vqrshlq_s16(x[i], vdupq_n_s16(-row_shift)); + vst1q_lane_s16(&dst[i], x[i], 0); + } + + return true; +} + +LIBGAV1_ALWAYS_INLINE bool Adst16DcOnlyColumn(void* dest, + int adjusted_tx_height, + int width) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast(dest); + int i = 0; + do { + int16x8_t s[16]; + int16x8_t x[16]; + const int16x8_t v_src = vld1q_s16(dst); + // stage 1. 
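+    // With only the DC row nonzero, stage 1 reduces to routing the input
+    // into s[1]; the remaining inputs are all zero, which
+    // Adst16DcOnlyInternal() exploits via ButterflyRotation_FirstIsZero.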
+    s[1] = v_src;
+
+    Adst16DcOnlyInternal(s, x);
+
+    for (int j = 0; j < 16; ++j) {
+      vst1_s16(&dst[j * width], vget_low_s16(x[j]));
+    }
+    i += 4;
+    dst += 4;
+  } while (i < width);
+
+  return true;
+}
+
+//------------------------------------------------------------------------------
+// Identity Transforms.
+
+template <bool is_row_shift>
+LIBGAV1_ALWAYS_INLINE void Identity4_NEON(void* dest, int32_t step) {
+  auto* const dst = static_cast<int16_t*>(dest);
+
+  if (is_row_shift) {
+    const int shift = 1;
+    const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+    const int16x4_t v_multiplier = vdup_n_s16(kIdentity4Multiplier);
+    const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
+    for (int i = 0; i < 4; i += 2) {
+      const int16x8_t v_src = vld1q_s16(&dst[i * step]);
+      const int32x4_t v_src_mult_lo =
+          vmlal_s16(v_dual_round, vget_low_s16(v_src), v_multiplier);
+      const int32x4_t v_src_mult_hi =
+          vmlal_s16(v_dual_round, vget_high_s16(v_src), v_multiplier);
+      const int32x4_t shift_lo = vqshlq_s32(v_src_mult_lo, v_shift);
+      const int32x4_t shift_hi = vqshlq_s32(v_src_mult_hi, v_shift);
+      vst1q_s16(&dst[i * step],
+                vcombine_s16(vqmovn_s32(shift_lo), vqmovn_s32(shift_hi)));
+    }
+  } else {
+    for (int i = 0; i < 4; i += 2) {
+      const int16x8_t v_src = vld1q_s16(&dst[i * step]);
+      const int16x8_t a =
+          vqrdmulhq_n_s16(v_src, kIdentity4MultiplierFraction << 3);
+      const int16x8_t b = vqaddq_s16(v_src, a);
+      vst1q_s16(&dst[i * step], b);
+    }
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height,
+                                           bool should_round, int tx_height) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  const int16x4_t v_src0 = vdup_n_s16(dst[0]);
+  const uint16x4_t v_mask = vdup_n_u16(should_round ? 0xffff : 0);
+  const int16x4_t v_src_round =
+      vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3);
+  const int16x4_t v_src = vbsl_s16(v_mask, v_src_round, v_src0);
+  const int shift = tx_height < 16 ? 0 : 1;
+  const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+  const int16x4_t v_multiplier = vdup_n_s16(kIdentity4Multiplier);
+  const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
+  const int32x4_t v_src_mult_lo = vmlal_s16(v_dual_round, v_src, v_multiplier);
+  const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, v_shift);
+  vst1_lane_s16(dst, vqmovn_s32(dst_0), 0);
+  return true;
+}
+
+template <int identity_size>
+LIBGAV1_ALWAYS_INLINE void IdentityColumnStoreToFrame(
+    Array2DView<uint8_t> frame, const int start_x, const int start_y,
+    const int tx_width, const int tx_height,
+    const int16_t* LIBGAV1_RESTRICT source) {
+  const int stride = frame.columns();
+  uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
+
+  if (identity_size < 32) {
+    if (tx_width == 4) {
+      uint8x8_t frame_data = vdup_n_u8(0);
+      int i = 0;
+      do {
+        const int16x4_t v_src = vld1_s16(&source[i * tx_width]);
+
+        int16x4_t v_dst_i;
+        if (identity_size == 4) {
+          const int16x4_t v_src_fraction =
+              vqrdmulh_n_s16(v_src, kIdentity4MultiplierFraction << 3);
+          v_dst_i = vqadd_s16(v_src, v_src_fraction);
+        } else if (identity_size == 8) {
+          v_dst_i = vqadd_s16(v_src, v_src);
+        } else {  // identity_size == 16
+          const int16x4_t v_src_mult =
+              vqrdmulh_n_s16(v_src, kIdentity4MultiplierFraction << 4);
+          const int16x4_t v_srcx2 = vqadd_s16(v_src, v_src);
+          v_dst_i = vqadd_s16(v_srcx2, v_src_mult);
+        }
+
+        frame_data = Load4<0>(dst, frame_data);
+        const int16x4_t a = vrshr_n_s16(v_dst_i, 4);
+        const uint16x8_t b =
+            vaddw_u8(vreinterpretq_u16_s16(vcombine_s16(a, a)), frame_data);
+        const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b));
+        StoreLo4(dst, d);
+        dst += stride;
+      } while (++i < tx_height);
+    } else {
+      int i = 0;
+      do {
+        const int row = i * tx_width;
+        int j = 0;
+        do {
+          const int16x8_t v_src = vld1q_s16(&source[row + j]);
+
+          int16x8_t v_dst_i;
+          if (identity_size == 4) {
+            const int16x8_t v_src_fraction =
+                vqrdmulhq_n_s16(v_src, kIdentity4MultiplierFraction << 3);
+            v_dst_i = vqaddq_s16(v_src, v_src_fraction);
+          } else if (identity_size == 8) {
+            v_dst_i = vqaddq_s16(v_src, v_src);
+          } else {  // identity_size == 16
+            const int16x8_t v_src_mult =
+                vqrdmulhq_n_s16(v_src, kIdentity4MultiplierFraction << 4);
+            const int16x8_t v_srcx2 = vqaddq_s16(v_src, v_src);
+            v_dst_i = vqaddq_s16(v_src_mult, v_srcx2);
+          }
+
+          const uint8x8_t frame_data = vld1_u8(dst + j);
+          const int16x8_t a = vrshrq_n_s16(v_dst_i, 4);
+          const uint16x8_t b = vaddw_u8(vreinterpretq_u16_s16(a), frame_data);
+          const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b));
+          vst1_u8(dst + j, d);
+          j += 8;
+        } while (j < tx_width);
+        dst += stride;
+      } while (++i < tx_height);
+    }
+  } else {
+    int i = 0;
+    do {
+      const int row = i * tx_width;
+      int j = 0;
+      do {
+        const int16x8_t v_dst_i = vld1q_s16(&source[row + j]);
+        const uint8x8_t frame_data = vld1_u8(dst + j);
+        const int16x8_t a = vrshrq_n_s16(v_dst_i, 2);
+        const uint16x8_t b = vaddw_u8(vreinterpretq_u16_s16(a), frame_data);
+        const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b));
+        vst1_u8(dst + j, d);
+        j += 8;
+      } while (j < tx_width);
+      dst += stride;
+    } while (++i < tx_height);
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity4RowColumnStoreToFrame(
+    Array2DView<uint8_t> frame, const int start_x, const int start_y,
+    const int tx_width, const int tx_height,
+    const int16_t* LIBGAV1_RESTRICT source) {
+  const int stride = frame.columns();
+  uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
+
+  if (tx_width == 4) {
+    uint8x8_t frame_data = vdup_n_u8(0);
+    int i = 0;
+    do {
+      const int16x4_t
v_src = vld1_s16(&source[i * tx_width]); + const int16x4_t v_src_mult = + vqrdmulh_n_s16(v_src, kIdentity4MultiplierFraction << 3); + const int16x4_t v_dst_row = vqadd_s16(v_src, v_src_mult); + const int16x4_t v_src_mult2 = + vqrdmulh_n_s16(v_dst_row, kIdentity4MultiplierFraction << 3); + const int16x4_t v_dst_col = vqadd_s16(v_dst_row, v_src_mult2); + frame_data = Load4<0>(dst, frame_data); + const int16x4_t a = vrshr_n_s16(v_dst_col, 4); + const uint16x8_t b = + vaddw_u8(vreinterpretq_u16_s16(vcombine_s16(a, a)), frame_data); + const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b)); + StoreLo4(dst, d); + dst += stride; + } while (++i < tx_height); + } else { + int i = 0; + do { + const int row = i * tx_width; + int j = 0; + do { + const int16x8_t v_src = vld1q_s16(&source[row + j]); + const int16x8_t v_src_round = + vqrdmulhq_n_s16(v_src, kTransformRowMultiplier << 3); + const int16x8_t v_dst_row = vqaddq_s16(v_src_round, v_src_round); + const int16x8_t v_src_mult2 = + vqrdmulhq_n_s16(v_dst_row, kIdentity4MultiplierFraction << 3); + const int16x8_t v_dst_col = vqaddq_s16(v_dst_row, v_src_mult2); + const uint8x8_t frame_data = vld1_u8(dst + j); + const int16x8_t a = vrshrq_n_s16(v_dst_col, 4); + const uint16x8_t b = vaddw_u8(vreinterpretq_u16_s16(a), frame_data); + const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b)); + vst1_u8(dst + j, d); + j += 8; + } while (j < tx_width); + dst += stride; + } while (++i < tx_height); + } +} + +LIBGAV1_ALWAYS_INLINE void Identity8Row32_NEON(void* dest, int32_t step) { + auto* const dst = static_cast(dest); + + // When combining the identity8 multiplier with the row shift, the + // calculations for tx_height equal to 32 can be simplified from + // ((A * 2) + 2) >> 2) to ((A + 1) >> 1). + for (int i = 0; i < 4; ++i) { + const int16x8_t v_src = vld1q_s16(&dst[i * step]); + const int16x8_t a = vrshrq_n_s16(v_src, 1); + vst1q_s16(&dst[i * step], a); + } +} + +LIBGAV1_ALWAYS_INLINE void Identity8Row4_NEON(void* dest, int32_t step) { + auto* const dst = static_cast(dest); + + for (int i = 0; i < 4; ++i) { + const int16x8_t v_src = vld1q_s16(&dst[i * step]); + // For bitdepth == 8, the identity row clamps to a signed 16bit value, so + // saturating add here is ok. + const int16x8_t v_srcx2 = vqaddq_s16(v_src, v_src); + vst1q_s16(&dst[i * step], v_srcx2); + } +} + +LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height, + bool should_round, int row_shift) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast(dest); + const int16x4_t v_src0 = vdup_n_s16(dst[0]); + const uint16x4_t v_mask = vdup_n_u16(should_round ? 
0xffff : 0); + const int16x4_t v_src_round = + vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3); + const int16x4_t v_src = vbsl_s16(v_mask, v_src_round, v_src0); + const int32x4_t v_srcx2 = vaddl_s16(v_src, v_src); + const int32x4_t dst_0 = vqrshlq_s32(v_srcx2, vdupq_n_s32(-row_shift)); + vst1_lane_s16(dst, vqmovn_s32(dst_0), 0); + return true; +} + +LIBGAV1_ALWAYS_INLINE void Identity16Row_NEON(void* dest, int32_t step, + int shift) { + auto* const dst = static_cast(dest); + const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11); + const int32x4_t v_shift = vdupq_n_s32(-(12 + shift)); + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 2; ++j) { + const int16x8_t v_src = vld1q_s16(&dst[i * step + j * 8]); + const int32x4_t v_src_mult_lo = + vmlal_n_s16(v_dual_round, vget_low_s16(v_src), kIdentity16Multiplier); + const int32x4_t v_src_mult_hi = vmlal_n_s16( + v_dual_round, vget_high_s16(v_src), kIdentity16Multiplier); + const int32x4_t shift_lo = vqshlq_s32(v_src_mult_lo, v_shift); + const int32x4_t shift_hi = vqshlq_s32(v_src_mult_hi, v_shift); + vst1q_s16(&dst[i * step + j * 8], + vcombine_s16(vqmovn_s32(shift_lo), vqmovn_s32(shift_hi))); + } + } +} + +LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height, + bool should_round, int shift) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast(dest); + const int16x4_t v_src0 = vdup_n_s16(dst[0]); + const uint16x4_t v_mask = vdup_n_u16(should_round ? 0xffff : 0); + const int16x4_t v_src_round = + vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3); + const int16x4_t v_src = vbsl_s16(v_mask, v_src_round, v_src0); + const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11); + const int16x4_t v_multiplier = vdup_n_s16(kIdentity16Multiplier); + const int32x4_t v_shift = vdupq_n_s32(-(12 + shift)); + const int32x4_t v_src_mult_lo = + vmlal_s16(v_dual_round, (v_src), v_multiplier); + const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, v_shift); + vst1_lane_s16(dst, vqmovn_s32(dst_0), 0); + return true; +} + +LIBGAV1_ALWAYS_INLINE void Identity32Row16_NEON(void* dest, + const int32_t step) { + auto* const dst = static_cast(dest); + + // When combining the identity32 multiplier with the row shift, the + // calculation for tx_height equal to 16 can be simplified from + // ((A * 4) + 1) >> 1) to (A * 2). + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 32; j += 8) { + const int16x8_t v_src = vld1q_s16(&dst[i * step + j]); + // For bitdepth == 8, the identity row clamps to a signed 16bit value, so + // saturating add here is ok. + const int16x8_t v_dst_i = vqaddq_s16(v_src, v_src); + vst1q_s16(&dst[i * step + j], v_dst_i); + } + } +} + +LIBGAV1_ALWAYS_INLINE bool Identity32DcOnly(void* dest, + int adjusted_tx_height) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast(dest); + const int16x4_t v_src0 = vdup_n_s16(dst[0]); + const int16x4_t v_src = vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3); + // When combining the identity32 multiplier with the row shift, the + // calculation for tx_height equal to 16 can be simplified from + // ((A * 4) + 1) >> 1) to (A * 2). + const int16x4_t v_dst_0 = vqadd_s16(v_src, v_src); + vst1_lane_s16(dst, v_dst_0, 0); + return true; +} + +//------------------------------------------------------------------------------ +// Walsh Hadamard Transform. + +// Transposes a 4x4 matrix and then permutes the rows of the transposed matrix +// for the WHT. The input matrix is in two "wide" int16x8_t variables. 
The +// output matrix is in four int16x4_t variables. +// +// Input: +// in[0]: 00 01 02 03 10 11 12 13 +// in[1]: 20 21 22 23 30 31 32 33 +// Output: +// out[0]: 00 10 20 30 +// out[1]: 03 13 23 33 +// out[2]: 01 11 21 31 +// out[3]: 02 12 22 32 +LIBGAV1_ALWAYS_INLINE void TransposeAndPermute4x4WideInput( + const int16x8_t in[2], int16x4_t out[4]) { + // Swap 32 bit elements. Goes from: + // in[0]: 00 01 02 03 10 11 12 13 + // in[1]: 20 21 22 23 30 31 32 33 + // to: + // b0.val[0]: 00 01 20 21 10 11 30 31 + // b0.val[1]: 02 03 22 23 12 13 32 33 + + const int32x4x2_t b0 = + vtrnq_s32(vreinterpretq_s32_s16(in[0]), vreinterpretq_s32_s16(in[1])); + + // Swap 16 bit elements. Goes from: + // vget_low_s32(b0.val[0]): 00 01 20 21 + // vget_high_s32(b0.val[0]): 10 11 30 31 + // vget_low_s32(b0.val[1]): 02 03 22 23 + // vget_high_s32(b0.val[1]): 12 13 32 33 + // to: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 01 11 21 32 + // c1.val[0]: 02 12 22 32 + // c1.val[1]: 03 13 23 33 + + const int16x4x2_t c0 = + vtrn_s16(vreinterpret_s16_s32(vget_low_s32(b0.val[0])), + vreinterpret_s16_s32(vget_high_s32(b0.val[0]))); + const int16x4x2_t c1 = + vtrn_s16(vreinterpret_s16_s32(vget_low_s32(b0.val[1])), + vreinterpret_s16_s32(vget_high_s32(b0.val[1]))); + + out[0] = c0.val[0]; + out[1] = c1.val[1]; + out[2] = c0.val[1]; + out[3] = c1.val[0]; +} + +// Process 4 wht4 rows and columns. +LIBGAV1_ALWAYS_INLINE void Wht4_NEON(uint8_t* LIBGAV1_RESTRICT dst, + const int dst_stride, + const void* LIBGAV1_RESTRICT source, + const int adjusted_tx_height) { + const auto* const src = static_cast(source); + int16x4_t s[4]; + + if (adjusted_tx_height == 1) { + // Special case: only src[0] is nonzero. + // src[0] 0 0 0 + // 0 0 0 0 + // 0 0 0 0 + // 0 0 0 0 + // + // After the row and column transforms are applied, we have: + // f h h h + // g i i i + // g i i i + // g i i i + // where f, g, h, i are computed as follows. + int16_t f = (src[0] >> 2) - (src[0] >> 3); + const int16_t g = f >> 1; + f = f - (f >> 1); + const int16_t h = (src[0] >> 3) - (src[0] >> 4); + const int16_t i = (src[0] >> 4); + s[0] = vdup_n_s16(h); + s[0] = vset_lane_s16(f, s[0], 0); + s[1] = vdup_n_s16(i); + s[1] = vset_lane_s16(g, s[1], 0); + s[2] = s[3] = s[1]; + } else { + // Load the 4x4 source in transposed form. + int16x4x4_t columns = vld4_s16(src); + // Shift right and permute the columns for the WHT. + s[0] = vshr_n_s16(columns.val[0], 2); + s[2] = vshr_n_s16(columns.val[1], 2); + s[3] = vshr_n_s16(columns.val[2], 2); + s[1] = vshr_n_s16(columns.val[3], 2); + + // Row transforms. + s[0] = vadd_s16(s[0], s[2]); + s[3] = vsub_s16(s[3], s[1]); + int16x4_t e = vhsub_s16(s[0], s[3]); // e = (s[0] - s[3]) >> 1 + s[1] = vsub_s16(e, s[1]); + s[2] = vsub_s16(e, s[2]); + s[0] = vsub_s16(s[0], s[1]); + s[3] = vadd_s16(s[3], s[2]); + + int16x8_t x[2]; + x[0] = vcombine_s16(s[0], s[1]); + x[1] = vcombine_s16(s[2], s[3]); + TransposeAndPermute4x4WideInput(x, s); + + // Column transforms. + s[0] = vadd_s16(s[0], s[2]); + s[3] = vsub_s16(s[3], s[1]); + e = vhsub_s16(s[0], s[3]); // e = (s[0] - s[3]) >> 1 + s[1] = vsub_s16(e, s[1]); + s[2] = vsub_s16(e, s[2]); + s[0] = vsub_s16(s[0], s[1]); + s[3] = vadd_s16(s[3], s[2]); + } + + // Store to frame. 
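+  // A scalar sketch of the store that follows, the usual pattern in this
+  // file:
+  //   dst[i] = Clip3(dst[i] + residual[i], 0, 255);
+  // vaddw_u8 widens the frame pixels to 16 bits for the add and vqmovun_s16
+  // saturates the sums back to [0, 255]; the WHT residual needs no further
+  // descaling here.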
+  uint8x8_t frame_data = vdup_n_u8(0);
+  for (int row = 0; row < 4; row += 2) {
+    frame_data = Load4<0>(dst, frame_data);
+    frame_data = Load4<1>(dst + dst_stride, frame_data);
+    const int16x8_t residual = vcombine_s16(s[row], s[row + 1]);
+    const uint16x8_t b = vaddw_u8(vreinterpretq_u16_s16(residual), frame_data);
+    frame_data = vqmovun_s16(vreinterpretq_s16_u16(b));
+    StoreLo4(dst, frame_data);
+    dst += dst_stride;
+    StoreHi4(dst, frame_data);
+    dst += dst_stride;
+  }
+}
+
+//------------------------------------------------------------------------------
+// row/column transform loops
+
+template <int tx_height>
+LIBGAV1_ALWAYS_INLINE void FlipColumns(int16_t* source, int tx_width) {
+  if (tx_width >= 16) {
+    int i = 0;
+    do {
+      const int16x8_t a = vld1q_s16(&source[i]);
+      const int16x8_t b = vld1q_s16(&source[i + 8]);
+      const int16x8_t c = vrev64q_s16(a);
+      const int16x8_t d = vrev64q_s16(b);
+      vst1q_s16(&source[i], vcombine_s16(vget_high_s16(d), vget_low_s16(d)));
+      vst1q_s16(&source[i + 8],
+                vcombine_s16(vget_high_s16(c), vget_low_s16(c)));
+      i += 16;
+    } while (i < tx_width * tx_height);
+  } else if (tx_width == 8) {
+    for (int i = 0; i < 8 * tx_height; i += 8) {
+      const int16x8_t a = vld1q_s16(&source[i]);
+      const int16x8_t b = vrev64q_s16(a);
+      vst1q_s16(&source[i], vcombine_s16(vget_high_s16(b), vget_low_s16(b)));
+    }
+  } else {
+    // Process two rows per iteration.
+    for (int i = 0; i < 4 * tx_height; i += 8) {
+      const int16x8_t a = vld1q_s16(&source[i]);
+      vst1q_s16(&source[i], vrev64q_s16(a));
+    }
+  }
+}
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void ApplyRounding(int16_t* source, int num_rows) {
+  if (tx_width == 4) {
+    // Process two rows per iteration.
+    int i = 0;
+    do {
+      const int16x8_t a = vld1q_s16(&source[i]);
+      const int16x8_t b = vqrdmulhq_n_s16(a, kTransformRowMultiplier << 3);
+      vst1q_s16(&source[i], b);
+      i += 8;
+    } while (i < tx_width * num_rows);
+  } else {
+    int i = 0;
+    do {
+      // The last 32 values of every row are always zero if the |tx_width| is
+      // 64.
+      const int non_zero_width = (tx_width < 64) ? tx_width : 32;
+      int j = 0;
+      do {
+        const int16x8_t a = vld1q_s16(&source[i * tx_width + j]);
+        const int16x8_t b = vqrdmulhq_n_s16(a, kTransformRowMultiplier << 3);
+        vst1q_s16(&source[i * tx_width + j], b);
+        j += 8;
+      } while (j < non_zero_width);
+    } while (++i < num_rows);
+  }
+}
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void RowShift(int16_t* source, int num_rows,
+                                    int row_shift) {
+  // vqrshlq_s16 will shift right if shift value is negative.
+  row_shift = -row_shift;
+
+  if (tx_width == 4) {
+    // Process two rows per iteration.
+    int i = 0;
+    do {
+      const int16x8_t residual = vld1q_s16(&source[i]);
+      vst1q_s16(&source[i], vqrshlq_s16(residual, vdupq_n_s16(row_shift)));
+      i += 8;
+    } while (i < tx_width * num_rows);
+  } else {
+    int i = 0;
+    do {
+      for (int j = 0; j < tx_width; j += 8) {
+        const int16x8_t residual = vld1q_s16(&source[i * tx_width + j]);
+        const int16x8_t residual_shifted =
+            vqrshlq_s16(residual, vdupq_n_s16(row_shift));
+        vst1q_s16(&source[i * tx_width + j], residual_shifted);
+      }
+    } while (++i < num_rows);
+  }
+}
+
+template <int tx_height, bool enable_flip_rows = false>
+LIBGAV1_ALWAYS_INLINE void StoreToFrameWithRound(
+    Array2DView<uint8_t> frame, const int start_x, const int start_y,
+    const int tx_width, const int16_t* LIBGAV1_RESTRICT source,
+    TransformType tx_type) {
+  const bool flip_rows =
+      enable_flip_rows ? kTransformFlipRowsMask.Contains(tx_type) : false;
+  const int stride = frame.columns();
+  uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
+
+  // Enable for 4x4, 4x8, 4x16
+  if (tx_height < 32 && tx_width == 4) {
+    uint8x8_t frame_data = vdup_n_u8(0);
+    for (int i = 0; i < tx_height; ++i) {
+      const int row = flip_rows ? (tx_height - i - 1) * 4 : i * 4;
+      const int16x4_t residual = vld1_s16(&source[row]);
+      frame_data = Load4<0>(dst, frame_data);
+      const int16x4_t a = vrshr_n_s16(residual, 4);
+      const uint16x8_t b =
+          vaddw_u8(vreinterpretq_u16_s16(vcombine_s16(a, a)), frame_data);
+      const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b));
+      StoreLo4(dst, d);
+      dst += stride;
+    }
+    // Enable for 8x4, 8x8, 8x16, 8x32
+  } else if (tx_height < 64 && tx_width == 8) {
+    for (int i = 0; i < tx_height; ++i) {
+      const int row = flip_rows ? (tx_height - i - 1) * 8 : i * 8;
+      const int16x8_t residual = vld1q_s16(&source[row]);
+      const uint8x8_t frame_data = vld1_u8(dst);
+      const int16x8_t a = vrshrq_n_s16(residual, 4);
+      const uint16x8_t b = vaddw_u8(vreinterpretq_u16_s16(a), frame_data);
+      const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b));
+      vst1_u8(dst, d);
+      dst += stride;
+    }
+    // Remaining widths >= 16.
+  } else {
+    for (int i = 0; i < tx_height; ++i) {
+      const int y = start_y + i;
+      const int row = flip_rows ? (tx_height - i - 1) * tx_width : i * tx_width;
+      int j = 0;
+      do {
+        const int x = start_x + j;
+        const int16x8_t residual = vld1q_s16(&source[row + j]);
+        const int16x8_t residual_hi = vld1q_s16(&source[row + j + 8]);
+        const uint8x16_t frame_data = vld1q_u8(frame[y] + x);
+        const int16x8_t a = vrshrq_n_s16(residual, 4);
+        const int16x8_t a_hi = vrshrq_n_s16(residual_hi, 4);
+        const uint16x8_t b =
+            vaddw_u8(vreinterpretq_u16_s16(a), vget_low_u8(frame_data));
+        const uint16x8_t b_hi =
+            vaddw_u8(vreinterpretq_u16_s16(a_hi), vget_high_u8(frame_data));
+        vst1q_u8(frame[y] + x,
+                 vcombine_u8(vqmovun_s16(vreinterpretq_s16_u16(b)),
+                             vqmovun_s16(vreinterpretq_s16_u16(b_hi))));
+        j += 16;
+      } while (j < tx_width);
+    }
+  }
+}
+
+void Dct4TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
+                               int adjusted_tx_height, void* src_buffer,
+                               int /*start_x*/, int /*start_y*/,
+                               void* /*dst_frame*/) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = (tx_height == 8);
+  const int row_shift = static_cast<int>(tx_height == 16);
+
+  if (DctDcOnly<4>(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<4>(src, adjusted_tx_height);
+  }
+
+  if (adjusted_tx_height == 4) {
+    // Process 4 1d dct4 rows in parallel.
+    Dct4_NEON<ButterflyRotation_4, false>(src, /*step=*/4, /*transpose=*/true);
+  } else {
+    // Process 8 1d dct4 rows in parallel per iteration.
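+    // Each pass below handles 8 rows of 4 coefficients, so the source
+    // pointer advances by 8 * 4 = 32 int16 values per iteration.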
+    int i = adjusted_tx_height;
+    auto* data = src;
+    do {
+      Dct4_NEON<ButterflyRotation_8, true>(data, /*step=*/4,
+                                           /*transpose=*/true);
+      data += 32;
+      i -= 8;
+    } while (i != 0);
+  }
+  if (tx_height == 16) {
+    RowShift<4>(src, adjusted_tx_height, 1);
+  }
+}
+
+void Dct4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+                                  int adjusted_tx_height,
+                                  void* LIBGAV1_RESTRICT src_buffer,
+                                  int start_x, int start_y,
+                                  void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<4>(src, tx_width);
+  }
+
+  if (!DctDcOnlyColumn<4>(src, adjusted_tx_height, tx_width)) {
+    if (tx_width == 4) {
+      // Process 4 1d dct4 columns in parallel.
+      Dct4_NEON<ButterflyRotation_4, false>(src, tx_width,
+                                            /*transpose=*/false);
+    } else {
+      // Process 8 1d dct4 columns in parallel per iteration.
+      int i = tx_width;
+      auto* data = src;
+      do {
+        Dct4_NEON<ButterflyRotation_8, true>(data, tx_width,
+                                             /*transpose=*/false);
+        data += 8;
+        i -= 8;
+      } while (i != 0);
+    }
+  }
+
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  StoreToFrameWithRound<4>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct8TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
+                               int adjusted_tx_height, void* src_buffer,
+                               int /*start_x*/, int /*start_y*/,
+                               void* /*dst_frame*/) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (DctDcOnly<8>(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<8>(src, adjusted_tx_height);
+  }
+
+  if (adjusted_tx_height == 4) {
+    // Process 4 1d dct8 rows in parallel.
+    Dct8_NEON<ButterflyRotation_4, true>(src, /*step=*/8, /*transpose=*/true);
+  } else {
+    // Process 8 1d dct8 rows in parallel per iteration.
+    assert(adjusted_tx_height % 8 == 0);
+    int i = adjusted_tx_height;
+    auto* data = src;
+    do {
+      Dct8_NEON<ButterflyRotation_8, false>(data, /*step=*/8,
+                                            /*transpose=*/true);
+      data += 64;
+      i -= 8;
+    } while (i != 0);
+  }
+  if (row_shift > 0) {
+    RowShift<8>(src, adjusted_tx_height, row_shift);
+  }
+}
+
+void Dct8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+                                  int adjusted_tx_height,
+                                  void* LIBGAV1_RESTRICT src_buffer,
+                                  int start_x, int start_y,
+                                  void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<8>(src, tx_width);
+  }
+
+  if (!DctDcOnlyColumn<8>(src, adjusted_tx_height, tx_width)) {
+    if (tx_width == 4) {
+      // Process 4 1d dct8 columns in parallel.
+      Dct8_NEON<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
+    } else {
+      // Process 8 1d dct8 columns in parallel per iteration.
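+      // In the column pass the buffer is tx_width coefficients wide, so a
+      // batch of 8 columns starts 8 values further along each row and each
+      // 1d transform walks down its column with step == tx_width.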
+ int i = tx_width; + auto* data = src; + do { + Dct8_NEON(data, tx_width, + /*transpose=*/false); + data += 8; + i -= 8; + } while (i != 0); + } + } + auto& frame = *static_cast*>(dst_frame); + StoreToFrameWithRound<8>(frame, start_x, start_y, tx_width, src, tx_type); +} + +void Dct16TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, int adjusted_tx_height, + void* src_buffer, int /*start_x*/, + int /*start_y*/, void* /*dst_frame*/) { + auto* src = static_cast(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (DctDcOnly<16>(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<16>(src, adjusted_tx_height); + } + + if (adjusted_tx_height == 4) { + // Process 4 1d dct16 rows in parallel. + Dct16_NEON(src, 16, /*is_row=*/true, row_shift); + } else { + assert(adjusted_tx_height % 8 == 0); + int i = adjusted_tx_height; + do { + // Process 8 1d dct16 rows in parallel per iteration. + Dct16_NEON(src, 16, /*is_row=*/true, + row_shift); + src += 128; + i -= 8; + } while (i != 0); + } +} + +void Dct16TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, + void* LIBGAV1_RESTRICT src_buffer, + int start_x, int start_y, + void* LIBGAV1_RESTRICT dst_frame) { + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<16>(src, tx_width); + } + + if (!DctDcOnlyColumn<16>(src, adjusted_tx_height, tx_width)) { + if (tx_width == 4) { + // Process 4 1d dct16 columns in parallel. + Dct16_NEON(src, 4, /*is_row=*/false, + /*row_shift=*/0); + } else { + int i = tx_width; + auto* data = src; + do { + // Process 8 1d dct16 columns in parallel per iteration. + Dct16_NEON(data, tx_width, /*is_row=*/false, + /*row_shift=*/0); + data += 8; + i -= 8; + } while (i != 0); + } + } + auto& frame = *static_cast*>(dst_frame); + StoreToFrameWithRound<16>(frame, start_x, start_y, tx_width, src, tx_type); +} + +void Dct32TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, int adjusted_tx_height, + void* src_buffer, int /*start_x*/, + int /*start_y*/, void* /*dst_frame*/) { + auto* src = static_cast(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (DctDcOnly<32>(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<32>(src, adjusted_tx_height); + } + // Process 8 1d dct32 rows in parallel per iteration. + int i = 0; + do { + Dct32_NEON(&src[i * 32], 32, /*is_row=*/true, row_shift); + i += 8; + } while (i < adjusted_tx_height); +} + +void Dct32TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, + void* LIBGAV1_RESTRICT src_buffer, + int start_x, int start_y, + void* LIBGAV1_RESTRICT dst_frame) { + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (!DctDcOnlyColumn<32>(src, adjusted_tx_height, tx_width)) { + // Process 8 1d dct32 columns in parallel per iteration. 
+ int i = tx_width; + auto* data = src; + do { + Dct32_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0); + data += 8; + i -= 8; + } while (i != 0); + } + auto& frame = *static_cast*>(dst_frame); + StoreToFrameWithRound<32>(frame, start_x, start_y, tx_width, src, tx_type); +} + +void Dct64TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, int adjusted_tx_height, + void* src_buffer, int /*start_x*/, + int /*start_y*/, void* /*dst_frame*/) { + auto* src = static_cast(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (DctDcOnly<64>(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<64>(src, adjusted_tx_height); + } + // Process 8 1d dct64 rows in parallel per iteration. + int i = 0; + do { + Dct64_NEON(&src[i * 64], 64, /*is_row=*/true, row_shift); + i += 8; + } while (i < adjusted_tx_height); +} + +void Dct64TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, + void* LIBGAV1_RESTRICT src_buffer, + int start_x, int start_y, + void* LIBGAV1_RESTRICT dst_frame) { + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (!DctDcOnlyColumn<64>(src, adjusted_tx_height, tx_width)) { + // Process 8 1d dct64 columns in parallel per iteration. + int i = tx_width; + auto* data = src; + do { + Dct64_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0); + data += 8; + i -= 8; + } while (i != 0); + } + auto& frame = *static_cast*>(dst_frame); + StoreToFrameWithRound<64>(frame, start_x, start_y, tx_width, src, tx_type); +} + +void Adst4TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, int adjusted_tx_height, + void* src_buffer, int /*start_x*/, + int /*start_y*/, void* /*dst_frame*/) { + auto* src = static_cast(src_buffer); + const int tx_height = kTransformHeight[tx_size]; + const int row_shift = static_cast(tx_height == 16); + const bool should_round = (tx_height == 8); + + if (Adst4DcOnly(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<4>(src, adjusted_tx_height); + } + + // Process 4 1d adst4 rows in parallel per iteration. + int i = adjusted_tx_height; + auto* data = src; + do { + Adst4_NEON(data, /*step=*/4, /*transpose=*/true); + data += 16; + i -= 4; + } while (i != 0); + + if (tx_height == 16) { + RowShift<4>(src, adjusted_tx_height, 1); + } +} + +void Adst4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, + void* LIBGAV1_RESTRICT src_buffer, + int start_x, int start_y, + void* LIBGAV1_RESTRICT dst_frame) { + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<4>(src, tx_width); + } + + if (!Adst4DcOnlyColumn(src, adjusted_tx_height, tx_width)) { + // Process 4 1d adst4 columns in parallel per iteration. 
+ int i = tx_width; + auto* data = src; + do { + Adst4_NEON(data, tx_width, /*transpose=*/false); + data += 4; + i -= 4; + } while (i != 0); + } + + auto& frame = *static_cast*>(dst_frame); + StoreToFrameWithRound<4, /*enable_flip_rows=*/true>(frame, start_x, start_y, + tx_width, src, tx_type); +} + +void Adst8TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, int adjusted_tx_height, + void* src_buffer, int /*start_x*/, + int /*start_y*/, void* /*dst_frame*/) { + auto* src = static_cast(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (Adst8DcOnly(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<8>(src, adjusted_tx_height); + } + + if (adjusted_tx_height == 4) { + // Process 4 1d adst8 rows in parallel. + Adst8_NEON(src, /*step=*/8, /*transpose=*/true); + } else { + // Process 8 1d adst8 rows in parallel per iteration. + assert(adjusted_tx_height % 8 == 0); + int i = adjusted_tx_height; + auto* data = src; + do { + Adst8_NEON(data, /*step=*/8, + /*transpose=*/true); + data += 64; + i -= 8; + } while (i != 0); + } + if (row_shift > 0) { + RowShift<8>(src, adjusted_tx_height, row_shift); + } +} + +void Adst8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, + void* LIBGAV1_RESTRICT src_buffer, + int start_x, int start_y, + void* LIBGAV1_RESTRICT dst_frame) { + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<8>(src, tx_width); + } + + if (!Adst8DcOnlyColumn(src, adjusted_tx_height, tx_width)) { + if (tx_width == 4) { + // Process 4 1d adst8 columns in parallel. + Adst8_NEON(src, 4, /*transpose=*/false); + } else { + // Process 8 1d adst8 columns in parallel per iteration. + int i = tx_width; + auto* data = src; + do { + Adst8_NEON(data, tx_width, + /*transpose=*/false); + data += 8; + i -= 8; + } while (i != 0); + } + } + auto& frame = *static_cast*>(dst_frame); + StoreToFrameWithRound<8, /*enable_flip_rows=*/true>(frame, start_x, start_y, + tx_width, src, tx_type); +} + +void Adst16TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, int adjusted_tx_height, + void* src_buffer, int /*start_x*/, + int /*start_y*/, void* /*dst_frame*/) { + auto* src = static_cast(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (Adst16DcOnly(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<16>(src, adjusted_tx_height); + } + + if (adjusted_tx_height == 4) { + // Process 4 1d adst16 rows in parallel. + Adst16_NEON(src, 16, /*is_row=*/true, row_shift); + } else { + assert(adjusted_tx_height % 8 == 0); + int i = adjusted_tx_height; + do { + // Process 8 1d adst16 rows in parallel per iteration. 
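+      // 8 rows of 16 coefficients per pass: src advances by 8 * 16 = 128
+      // int16 values each iteration.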
+ Adst16_NEON(src, 16, /*is_row=*/true, + row_shift); + src += 128; + i -= 8; + } while (i != 0); + } +} + +void Adst16TransformLoopColumn_NEON(TransformType tx_type, + TransformSize tx_size, + int adjusted_tx_height, + void* LIBGAV1_RESTRICT src_buffer, + int start_x, int start_y, + void* LIBGAV1_RESTRICT dst_frame) { + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<16>(src, tx_width); + } + + if (!Adst16DcOnlyColumn(src, adjusted_tx_height, tx_width)) { + if (tx_width == 4) { + // Process 4 1d adst16 columns in parallel. + Adst16_NEON(src, 4, /*is_row=*/false, + /*row_shift=*/0); + } else { + int i = tx_width; + auto* data = src; + do { + // Process 8 1d adst16 columns in parallel per iteration. + Adst16_NEON( + data, tx_width, /*is_row=*/false, /*row_shift=*/0); + data += 8; + i -= 8; + } while (i != 0); + } + } + auto& frame = *static_cast*>(dst_frame); + StoreToFrameWithRound<16, /*enable_flip_rows=*/true>(frame, start_x, start_y, + tx_width, src, tx_type); +} + +void Identity4TransformLoopRow_NEON(TransformType tx_type, + TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int /*start_x*/, int /*start_y*/, + void* /*dst_frame*/) { + // Special case: Process row calculations during column transform call. + // Improves performance. + if (tx_type == kTransformTypeIdentityIdentity && + tx_size == kTransformSize4x4) { + return; + } + + auto* src = static_cast(src_buffer); + const int tx_height = kTransformHeight[tx_size]; + const bool should_round = (tx_height == 8); + + if (Identity4DcOnly(src, adjusted_tx_height, should_round, tx_height)) { + return; + } + + if (should_round) { + ApplyRounding<4>(src, adjusted_tx_height); + } + if (tx_height < 16) { + int i = adjusted_tx_height; + do { + Identity4_NEON(src, /*step=*/4); + src += 16; + i -= 4; + } while (i != 0); + } else { + int i = adjusted_tx_height; + do { + Identity4_NEON(src, /*step=*/4); + src += 16; + i -= 4; + } while (i != 0); + } +} + +void Identity4TransformLoopColumn_NEON(TransformType tx_type, + TransformSize tx_size, + int adjusted_tx_height, + void* LIBGAV1_RESTRICT src_buffer, + int start_x, int start_y, + void* LIBGAV1_RESTRICT dst_frame) { + auto& frame = *static_cast*>(dst_frame); + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + // Special case: Process row calculations during column transform call. + if (tx_type == kTransformTypeIdentityIdentity && + (tx_size == kTransformSize4x4 || tx_size == kTransformSize8x4)) { + Identity4RowColumnStoreToFrame(frame, start_x, start_y, tx_width, + adjusted_tx_height, src); + return; + } + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<4>(src, tx_width); + } + + IdentityColumnStoreToFrame<4>(frame, start_x, start_y, tx_width, + adjusted_tx_height, src); +} + +void Identity8TransformLoopRow_NEON(TransformType tx_type, + TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int /*start_x*/, int /*start_y*/, + void* /*dst_frame*/) { + // Special case: Process row calculations during column transform call. + // Improves performance. 
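+  // For the fully identity 8x4 case the row scaling is folded into
+  // Identity4RowColumnStoreToFrame() during the column pass, so the row
+  // pass becomes a no-op here.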
+ if (tx_type == kTransformTypeIdentityIdentity && + tx_size == kTransformSize8x4) { + return; + } + + auto* src = static_cast(src_buffer); + const int tx_height = kTransformHeight[tx_size]; + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (Identity8DcOnly(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<8>(src, adjusted_tx_height); + } + + // When combining the identity8 multiplier with the row shift, the + // calculations for tx_height == 8 and tx_height == 16 can be simplified + // from ((A * 2) + 1) >> 1) to A. + if ((tx_height & 0x18) != 0) { + return; + } + if (tx_height == 32) { + int i = adjusted_tx_height; + do { + Identity8Row32_NEON(src, /*step=*/8); + src += 32; + i -= 4; + } while (i != 0); + return; + } + + assert(tx_size == kTransformSize8x4); + int i = adjusted_tx_height; + do { + Identity8Row4_NEON(src, /*step=*/8); + src += 32; + i -= 4; + } while (i != 0); +} + +void Identity8TransformLoopColumn_NEON(TransformType tx_type, + TransformSize tx_size, + int adjusted_tx_height, + void* LIBGAV1_RESTRICT src_buffer, + int start_x, int start_y, + void* LIBGAV1_RESTRICT dst_frame) { + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<8>(src, tx_width); + } + auto& frame = *static_cast*>(dst_frame); + IdentityColumnStoreToFrame<8>(frame, start_x, start_y, tx_width, + adjusted_tx_height, src); +} + +void Identity16TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int /*start_x*/, int /*start_y*/, + void* /*dst_frame*/) { + auto* src = static_cast(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (Identity16DcOnly(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<16>(src, adjusted_tx_height); + } + int i = adjusted_tx_height; + do { + Identity16Row_NEON(src, /*step=*/16, kTransformRowShift[tx_size]); + src += 64; + i -= 4; + } while (i != 0); +} + +void Identity16TransformLoopColumn_NEON(TransformType tx_type, + TransformSize tx_size, + int adjusted_tx_height, + void* LIBGAV1_RESTRICT src_buffer, + int start_x, int start_y, + void* LIBGAV1_RESTRICT dst_frame) { + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<16>(src, tx_width); + } + auto& frame = *static_cast*>(dst_frame); + IdentityColumnStoreToFrame<16>(frame, start_x, start_y, tx_width, + adjusted_tx_height, src); +} + +void Identity32TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int /*start_x*/, int /*start_y*/, + void* /*dst_frame*/) { + const int tx_height = kTransformHeight[tx_size]; + + // When combining the identity32 multiplier with the row shift, the + // calculations for tx_height == 8 and tx_height == 32 can be simplified + // from ((A * 4) + 2) >> 2) to A. + if ((tx_height & 0x28) != 0) { + return; + } + + // Process kTransformSize32x16. The src is always rounded before the + // identity transform and shifted by 1 afterwards. 
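+  // The rounding multiplies each coefficient by roughly 1/sqrt(2):
+  // vqrdmulhq_n_s16(x, kTransformRowMultiplier << 3) evaluates
+  // (2 * x * (m << 3) + (1 << 15)) >> 16 == (x * m + 2048) >> 12, i.e.
+  // Round2(x * m, 12), where m is kTransformRowMultiplier (2896, about
+  // 4096 / sqrt(2), defined in the companion .inc file).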
+ auto* src = static_cast(src_buffer); + if (Identity32DcOnly(src, adjusted_tx_height)) { + return; + } + + assert(tx_size == kTransformSize32x16); + ApplyRounding<32>(src, adjusted_tx_height); + int i = adjusted_tx_height; + do { + Identity32Row16_NEON(src, /*step=*/32); + src += 128; + i -= 4; + } while (i != 0); +} + +void Identity32TransformLoopColumn_NEON(TransformType /*tx_type*/, + TransformSize tx_size, + int adjusted_tx_height, + void* LIBGAV1_RESTRICT src_buffer, + int start_x, int start_y, + void* LIBGAV1_RESTRICT dst_frame) { + auto& frame = *static_cast*>(dst_frame); + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + IdentityColumnStoreToFrame<32>(frame, start_x, start_y, tx_width, + adjusted_tx_height, src); +} + +void Wht4TransformLoopRow_NEON(TransformType tx_type, TransformSize tx_size, + int /*adjusted_tx_height*/, void* /*src_buffer*/, + int /*start_x*/, int /*start_y*/, + void* /*dst_frame*/) { + assert(tx_type == kTransformTypeDctDct); + assert(tx_size == kTransformSize4x4); + static_cast(tx_type); + static_cast(tx_size); + // Do both row and column transforms in the column-transform pass. +} + +void Wht4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, + void* LIBGAV1_RESTRICT src_buffer, + int start_x, int start_y, + void* LIBGAV1_RESTRICT dst_frame) { + assert(tx_type == kTransformTypeDctDct); + assert(tx_size == kTransformSize4x4); + static_cast(tx_type); + static_cast(tx_size); + + // Process 4 1d wht4 rows and columns in parallel. + const auto* src = static_cast(src_buffer); + auto& frame = *static_cast*>(dst_frame); + uint8_t* dst = frame[start_y] + start_x; + const int dst_stride = frame.columns(); + Wht4_NEON(dst, dst_stride, src, adjusted_tx_height); +} + +//------------------------------------------------------------------------------ + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + // Maximum transform size for Dct is 64. + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kRow] = + Dct4TransformLoopRow_NEON; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kColumn] = + Dct4TransformLoopColumn_NEON; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kRow] = + Dct8TransformLoopRow_NEON; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kColumn] = + Dct8TransformLoopColumn_NEON; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kRow] = + Dct16TransformLoopRow_NEON; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kColumn] = + Dct16TransformLoopColumn_NEON; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kRow] = + Dct32TransformLoopRow_NEON; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kColumn] = + Dct32TransformLoopColumn_NEON; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kRow] = + Dct64TransformLoopRow_NEON; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kColumn] = + Dct64TransformLoopColumn_NEON; + + // Maximum transform size for Adst is 16. 
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kRow] = + Adst4TransformLoopRow_NEON; + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kColumn] = + Adst4TransformLoopColumn_NEON; + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kRow] = + Adst8TransformLoopRow_NEON; + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kColumn] = + Adst8TransformLoopColumn_NEON; + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kRow] = + Adst16TransformLoopRow_NEON; + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kColumn] = + Adst16TransformLoopColumn_NEON; + + // Maximum transform size for Identity transform is 32. + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kRow] = + Identity4TransformLoopRow_NEON; + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kColumn] = + Identity4TransformLoopColumn_NEON; + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kRow] = + Identity8TransformLoopRow_NEON; + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kColumn] = + Identity8TransformLoopColumn_NEON; + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kRow] = + Identity16TransformLoopRow_NEON; + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kColumn] = + Identity16TransformLoopColumn_NEON; + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kRow] = + Identity32TransformLoopRow_NEON; + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kColumn] = + Identity32TransformLoopColumn_NEON; + + // Maximum transform size for Wht is 4. + dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kRow] = + Wht4TransformLoopRow_NEON; + dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kColumn] = + Wht4TransformLoopColumn_NEON; +} + +} // namespace +} // namespace low_bitdepth + +void InverseTransformInit_NEON() { low_bitdepth::Init8bpp(); } + +} // namespace dsp +} // namespace libgav1 +#else // !LIBGAV1_ENABLE_NEON +namespace libgav1 { +namespace dsp { + +void InverseTransformInit_NEON() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_ENABLE_NEON diff --git a/src/dsp/arm/inverse_transform_neon.h b/src/dsp/arm/inverse_transform_neon.h new file mode 100644 index 0000000..ebd7cf4 --- /dev/null +++ b/src/dsp/arm/inverse_transform_neon.h @@ -0,0 +1,71 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_ARM_INVERSE_TRANSFORM_NEON_H_ +#define LIBGAV1_SRC_DSP_ARM_INVERSE_TRANSFORM_NEON_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::inverse_transforms, see the defines below for specifics. +// This function is not thread-safe. 
+void InverseTransformInit_NEON(); +void InverseTransformInit10bpp_NEON(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_ENABLE_NEON +#define LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dDct LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dDct LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dDct LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_Transform1dSize32_Transform1dDct LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_Transform1dSize64_Transform1dDct LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dAdst LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dAdst LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dAdst LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dIdentity LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dIdentity LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dIdentity LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_Transform1dSize32_Transform1dIdentity LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dWht LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp10bpp_Transform1dSize4_Transform1dDct LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_Transform1dSize8_Transform1dDct LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_Transform1dSize16_Transform1dDct LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_Transform1dSize32_Transform1dDct LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_Transform1dSize64_Transform1dDct LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp10bpp_Transform1dSize4_Transform1dAdst LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_Transform1dSize8_Transform1dAdst LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_Transform1dSize16_Transform1dAdst LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp10bpp_Transform1dSize4_Transform1dIdentity LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_Transform1dSize8_Transform1dIdentity LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_Transform1dSize16_Transform1dIdentity LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_Transform1dSize32_Transform1dIdentity LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp10bpp_Transform1dSize4_Transform1dWht LIBGAV1_CPU_NEON + +#endif // LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_SRC_DSP_ARM_INVERSE_TRANSFORM_NEON_H_ diff --git a/src/dsp/arm/loop_filter_neon.cc b/src/dsp/arm/loop_filter_neon.cc new file mode 100644 index 0000000..8c03928 --- /dev/null +++ b/src/dsp/arm/loop_filter_neon.cc @@ -0,0 +1,2454 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
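+//
+// Scalar reference for the vector predicates implemented below, restating
+// the per-function comments in one place:
+//   Hev():            |p1 - p0| > hev_thresh || |q1 - q0| > hev_thresh
+//   OuterThreshold(): |p0 - q0| * 2 + |p1 - q1| / 2 <= outer_thresh
+//   NeedsFilter4():   |p1 - p0| <= inner_thresh &&
+//                     |q1 - q0| <= inner_thresh && OuterThreshold()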
+ +#include "src/dsp/loop_filter.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_ENABLE_NEON + +#include + +#include +#include +#include + +#include "src/dsp/arm/common_neon.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" + +namespace libgav1 { +namespace dsp { +namespace low_bitdepth { +namespace { + +// (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh) +inline uint8x8_t Hev(const uint8x8_t abd_p0p1_q0q1, const uint8_t thresh) { + const uint8x8_t a = vcgt_u8(abd_p0p1_q0q1, vdup_n_u8(thresh)); + return vorr_u8(a, RightShiftVector<32>(a)); +} + +// abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh +inline uint8x8_t OuterThreshold(const uint8x8_t p0q0, const uint8x8_t p1q1, + const uint8_t outer_thresh) { + const uint8x8x2_t a = Interleave32(p0q0, p1q1); + const uint8x8_t b = vabd_u8(a.val[0], a.val[1]); + const uint8x8_t p0q0_double = vqadd_u8(b, b); + const uint8x8_t p1q1_half = RightShiftVector<32>(vshr_n_u8(b, 1)); + const uint8x8_t c = vqadd_u8(p0q0_double, p1q1_half); + return vcle_u8(c, vdup_n_u8(outer_thresh)); +} + +// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh && +// OuterThreshold() +inline uint8x8_t NeedsFilter4(const uint8x8_t abd_p0p1_q0q1, + const uint8x8_t p0q0, const uint8x8_t p1q1, + const uint8_t inner_thresh, + const uint8_t outer_thresh) { + const uint8x8_t a = vcle_u8(abd_p0p1_q0q1, vdup_n_u8(inner_thresh)); + const uint8x8_t inner_mask = vand_u8(a, RightShiftVector<32>(a)); + const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh); + return vand_u8(inner_mask, outer_mask); +} + +inline void Filter4Masks(const uint8x8_t p0q0, const uint8x8_t p1q1, + const uint8_t hev_thresh, const uint8_t outer_thresh, + const uint8_t inner_thresh, uint8x8_t* const hev_mask, + uint8x8_t* const needs_filter4_mask) { + // First half is |p0 - p1|, second half is |q0 - q1|. + const uint8x8_t p0p1_q0q1 = vabd_u8(p0q0, p1q1); + // This includes cases where NeedsFilter4() is not true and so Filter2() will + // not be applied. + const uint8x8_t hev_tmp_mask = Hev(p0p1_q0q1, hev_thresh); + + *needs_filter4_mask = + NeedsFilter4(p0p1_q0q1, p0q0, p1q1, inner_thresh, outer_thresh); + + // Filter2() will only be applied if both NeedsFilter4() and Hev() are true. + *hev_mask = vand_u8(hev_tmp_mask, *needs_filter4_mask); +} + +// Calculate Filter4() or Filter2() based on |hev_mask|. +inline void Filter4(const uint8x8_t q0p1, const uint8x8_t p0q1, + const uint8x8_t hev_mask, uint8x8_t* const p1q1_result, + uint8x8_t* const p0q0_result) { + const int16x4_t zero = vdup_n_s16(0); + + // a = 3 * (q0 - p0) + Clip3(p1 - q1, min_signed_val, max_signed_val); + const int16x8_t q0mp0_p1mq1 = vreinterpretq_s16_u16(vsubl_u8(q0p1, p0q1)); + const int16x4_t q0mp0_3 = vmul_n_s16(vget_low_s16(q0mp0_p1mq1), 3); + + // If this is for Filter2() then include |p1mq1|. Otherwise zero it. + const int16x4_t p1mq1 = vget_high_s16(q0mp0_p1mq1); + const int8x8_t p1mq1_saturated = vqmovn_s16(vcombine_s16(p1mq1, zero)); + const int8x8_t hev_option = + vand_s8(vreinterpret_s8_u8(hev_mask), p1mq1_saturated); + + const int16x4_t a = + vget_low_s16(vaddw_s8(vcombine_s16(q0mp0_3, zero), hev_option)); + + // We can not shift with rounding because the clamp comes *before* the + // shifting. 
a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3; a2 = + // Clip3(a + 3, min_signed_val, max_signed_val) >> 3; + const int16x4_t plus_four = vadd_s16(a, vdup_n_s16(4)); + const int16x4_t plus_three = vadd_s16(a, vdup_n_s16(3)); + const int8x8_t a2_a1 = + vshr_n_s8(vqmovn_s16(vcombine_s16(plus_three, plus_four)), 3); + + // a3 is in the high 4 values. + // a3 = (a1 + 1) >> 1; + const int8x8_t a3 = vrshr_n_s8(a2_a1, 1); + + const int16x8_t p0q1_l = vreinterpretq_s16_u16(vmovl_u8(p0q1)); + const int16x8_t q0p1_l = vreinterpretq_s16_u16(vmovl_u8(q0p1)); + + const int16x8_t p1q1_l = + vcombine_s16(vget_high_s16(q0p1_l), vget_high_s16(p0q1_l)); + + const int8x8_t a3_ma3 = InterleaveHigh32(a3, vneg_s8(a3)); + const int16x8_t p1q1_a3 = vaddw_s8(p1q1_l, a3_ma3); + + const int16x8_t p0q0_l = + vcombine_s16(vget_low_s16(p0q1_l), vget_low_s16(q0p1_l)); + // Need to shift the second term or we end up with a2_ma2. + const int8x8_t a2_ma1 = + InterleaveLow32(a2_a1, RightShiftVector<32>(vneg_s8(a2_a1))); + const int16x8_t p0q0_a = vaddw_s8(p0q0_l, a2_ma1); + + *p1q1_result = vqmovun_s16(p1q1_a3); + *p0q0_result = vqmovun_s16(p0q0_a); +} + +void Horizontal4_NEON(void* const dest, const ptrdiff_t stride, + const int outer_thresh, const int inner_thresh, + const int hev_thresh) { + auto* dst = static_cast(dest); + + const uint8x8_t p1_v = Load4(dst - 2 * stride); + const uint8x8_t p0_v = Load4(dst - stride); + const uint8x8_t p0q0 = Load4<1>(dst, p0_v); + const uint8x8_t p1q1 = Load4<1>(dst + stride, p1_v); + + uint8x8_t hev_mask; + uint8x8_t needs_filter4_mask; + Filter4Masks(p0q0, p1q1, hev_thresh, outer_thresh, inner_thresh, &hev_mask, + &needs_filter4_mask); + + // Copy the masks to the high bits for packed comparisons later. + hev_mask = InterleaveLow32(hev_mask, hev_mask); + needs_filter4_mask = InterleaveLow32(needs_filter4_mask, needs_filter4_mask); + +#if defined(__aarch64__) + // This provides a good speedup for the unit test. Not sure how applicable it + // is to valid streams though. + // Consider doing this on armv7 if there is a quick way to check if a vector + // is zero. + if (vaddv_u8(needs_filter4_mask) == 0) { + // None of the values will be filtered. + return; + } +#endif // defined(__aarch64__) + + uint8x8_t f_p1q1; + uint8x8_t f_p0q0; + const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1); + Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0); + + // Already integrated the Hev mask when calculating the filtered values. + const uint8x8_t p0q0_output = vbsl_u8(needs_filter4_mask, f_p0q0, p0q0); + + // p1/q1 are unmodified if only Hev() is true. This works because it was and'd + // with |needs_filter4_mask| previously. + const uint8x8_t p1q1_mask = veor_u8(hev_mask, needs_filter4_mask); + const uint8x8_t p1q1_output = vbsl_u8(p1q1_mask, f_p1q1, p1q1); + + StoreLo4(dst - 2 * stride, p1q1_output); + StoreLo4(dst - stride, p0q0_output); + StoreHi4(dst, p0q0_output); + StoreHi4(dst + stride, p1q1_output); +} + +void Vertical4_NEON(void* const dest, const ptrdiff_t stride, + const int outer_thresh, const int inner_thresh, + const int hev_thresh) { + auto* dst = static_cast(dest); + + // Move |dst| to the left side of the filter window. + dst -= 2; + + // |p1q0| and |p0q1| are named for the values they will contain after the + // transpose. 
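+  // Loading four 4-pixel rows and transposing them lets the vertical edge
+  // reuse the horizontal filter logic; the result is transposed back before
+  // the final stores.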
+ const uint8x8_t row0 = Load4(dst); + uint8x8_t p1q0 = Load4<1>(dst + stride, row0); + const uint8x8_t row2 = Load4(dst + 2 * stride); + uint8x8_t p0q1 = Load4<1>(dst + 3 * stride, row2); + + Transpose4x4(&p1q0, &p0q1); + // Rearrange. + const uint8x8x2_t p1q1xq0p0 = Interleave32(p1q0, Transpose32(p0q1)); + const uint8x8x2_t p1q1xp0q0 = {p1q1xq0p0.val[0], + Transpose32(p1q1xq0p0.val[1])}; + + uint8x8_t hev_mask; + uint8x8_t needs_filter4_mask; + Filter4Masks(p1q1xp0q0.val[1], p1q1xp0q0.val[0], hev_thresh, outer_thresh, + inner_thresh, &hev_mask, &needs_filter4_mask); + + // Copy the masks to the high bits for packed comparisons later. + hev_mask = InterleaveLow32(hev_mask, hev_mask); + needs_filter4_mask = InterleaveLow32(needs_filter4_mask, needs_filter4_mask); + +#if defined(__aarch64__) + // This provides a good speedup for the unit test. Not sure how applicable it + // is to valid streams though. + // Consider doing this on armv7 if there is a quick way to check if a vector + // is zero. + if (vaddv_u8(needs_filter4_mask) == 0) { + // None of the values will be filtered. + return; + } +#endif // defined(__aarch64__) + + uint8x8_t f_p1q1; + uint8x8_t f_p0q0; + Filter4(Transpose32(p1q0), p0q1, hev_mask, &f_p1q1, &f_p0q0); + + // Already integrated the Hev mask when calculating the filtered values. + const uint8x8_t p0q0_output = + vbsl_u8(needs_filter4_mask, f_p0q0, p1q1xp0q0.val[1]); + + // p1/q1 are unmodified if only Hev() is true. This works because it was and'd + // with |needs_filter4_mask| previously. + const uint8x8_t p1q1_mask = veor_u8(hev_mask, needs_filter4_mask); + const uint8x8_t p1q1_output = vbsl_u8(p1q1_mask, f_p1q1, p1q1xp0q0.val[0]); + + // Put things back in order to reverse the transpose. + const uint8x8x2_t p1p0xq1q0 = Interleave32(p1q1_output, p0q0_output); + uint8x8_t output_0 = p1p0xq1q0.val[0], + output_1 = Transpose32(p1p0xq1q0.val[1]); + + Transpose4x4(&output_0, &output_1); + + StoreLo4(dst, output_0); + StoreLo4(dst + stride, output_1); + StoreHi4(dst + 2 * stride, output_0); + StoreHi4(dst + 3 * stride, output_1); +} + +// abs(p1 - p0) <= flat_thresh && abs(q1 - q0) <= flat_thresh && +// abs(p2 - p0) <= flat_thresh && abs(q2 - q0) <= flat_thresh +// |flat_thresh| == 1 for 8 bit decode. 
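+// The low half of each input vector holds the p-side absolute differences and
+// the high half the q-side, so and'ing the comparison result with a copy of
+// itself shifted by 32 bits tests both sides at once.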
+inline uint8x8_t IsFlat3(const uint8x8_t abd_p0p1_q0q1,
+                         const uint8x8_t abd_p0p2_q0q2) {
+  const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p0p2_q0q2);
+  const uint8x8_t b = vcle_u8(a, vdup_n_u8(1));
+  return vand_u8(b, RightShiftVector<32>(b));
+}
+
+// abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh &&
+// abs(q1 - q0) <= inner_thresh && abs(q2 - q1) <= inner_thresh &&
+// OuterThreshold()
+inline uint8x8_t NeedsFilter6(const uint8x8_t abd_p0p1_q0q1,
+                              const uint8x8_t abd_p1p2_q1q2,
+                              const uint8x8_t p0q0, const uint8x8_t p1q1,
+                              const uint8_t inner_thresh,
+                              const uint8_t outer_thresh) {
+  const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p1p2_q1q2);
+  const uint8x8_t b = vcle_u8(a, vdup_n_u8(inner_thresh));
+  const uint8x8_t inner_mask = vand_u8(b, RightShiftVector<32>(b));
+  const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh);
+  return vand_u8(inner_mask, outer_mask);
+}
+
+inline void Filter6Masks(const uint8x8_t p2q2, const uint8x8_t p1q1,
+                         const uint8x8_t p0q0, const uint8_t hev_thresh,
+                         const uint8_t outer_thresh,
+                         const uint8_t inner_thresh,
+                         uint8x8_t* const needs_filter6_mask,
+                         uint8x8_t* const is_flat3_mask,
+                         uint8x8_t* const hev_mask) {
+  const uint8x8_t p0p1_q0q1 = vabd_u8(p0q0, p1q1);
+  *hev_mask = Hev(p0p1_q0q1, hev_thresh);
+  *is_flat3_mask = IsFlat3(p0p1_q0q1, vabd_u8(p0q0, p2q2));
+  *needs_filter6_mask = NeedsFilter6(p0p1_q0q1, vabd_u8(p1q1, p2q2), p0q0,
+                                     p1q1, inner_thresh, outer_thresh);
+}
+
+inline void Filter6(const uint8x8_t p2q2, const uint8x8_t p1q1,
+                    const uint8x8_t p0q0, uint8x8_t* const p1q1_output,
+                    uint8x8_t* const p0q0_output) {
+  // Sum p1 and q1 output from opposite directions
+  // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
+  //      ^^^^^^^^
+  // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
+  //                                 ^^^^^^^^
+  const uint16x8_t p2q2_double = vaddl_u8(p2q2, p2q2);
+  uint16x8_t sum = vaddw_u8(p2q2_double, p2q2);
+
+  // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
+  //                 ^^^^^^^^
+  // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
+  //                      ^^^^^^^^
+  sum = vaddq_u16(vaddl_u8(p1q1, p1q1), sum);
+
+  // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
+  //                            ^^^^^^^^
+  // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
+  //           ^^^^^^^^
+  sum = vaddq_u16(vaddl_u8(p0q0, p0q0), sum);
+
+  // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
+  //                                       ^^
+  // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
+  //      ^^
+  const uint8x8_t q0p0 = Transpose32(p0q0);
+  sum = vaddw_u8(sum, q0p0);
+
+  *p1q1_output = vrshrn_n_u16(sum, 3);
+
+  // Convert to p0 and q0 output:
+  // p0 = p1 - (2 * p2) + q0 + q1
+  // q0 = q1 - (2 * q2) + p0 + p1
+  sum = vsubq_u16(sum, p2q2_double);
+  const uint8x8_t q1p1 = Transpose32(p1q1);
+  sum = vaddq_u16(vaddl_u8(q0p0, q1p1), sum);
+
+  *p0q0_output = vrshrn_n_u16(sum, 3);
+}
+
+void Horizontal6_NEON(void* const dest, const ptrdiff_t stride,
+                      const int outer_thresh, const int inner_thresh,
+                      const int hev_thresh) {
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  const uint8x8_t p2_v = Load4(dst - 3 * stride);
+  const uint8x8_t p1_v = Load4(dst - 2 * stride);
+  const uint8x8_t p0_v = Load4(dst - stride);
+  const uint8x8_t p0q0 = Load4<1>(dst, p0_v);
+  const uint8x8_t p1q1 = Load4<1>(dst + stride, p1_v);
+  const uint8x8_t p2q2 = Load4<1>(dst + 2 * stride, p2_v);
+
+  uint8x8_t needs_filter6_mask, is_flat3_mask, hev_mask;
+  Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
+               &needs_filter6_mask, &is_flat3_mask, &hev_mask);
+
+  needs_filter6_mask = InterleaveLow32(needs_filter6_mask, needs_filter6_mask);
+  is_flat3_mask = InterleaveLow32(is_flat3_mask, is_flat3_mask);
+  hev_mask = InterleaveLow32(hev_mask, hev_mask);
+
+#if defined(__aarch64__)
+  // This provides a good speedup for the unit test. Not sure how applicable it
+  // is to valid streams though.
+  // Consider doing this on armv7 if there is a quick way to check if a vector
+  // is zero.
+  if (vaddv_u8(needs_filter6_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  uint8x8_t f_p1q1;
+  uint8x8_t f_p0q0;
+  const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
+  Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
+  // Reset the outer values if only a Hev() mask was required.
+  f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
+
+  uint8x8_t f6_p1q1, f6_p0q0;
+#if defined(__aarch64__)
+  if (vaddv_u8(vand_u8(is_flat3_mask, needs_filter6_mask)) == 0) {
+    // Filter6() does not apply.
+    const uint8x8_t zero = vdup_n_u8(0);
+    f6_p1q1 = zero;
+    f6_p0q0 = zero;
+  } else {
+#endif  // defined(__aarch64__)
+    Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
+#if defined(__aarch64__)
+  }
+#endif  // defined(__aarch64__)
+
+  uint8x8_t p1q1_output = vbsl_u8(is_flat3_mask, f6_p1q1, f_p1q1);
+  p1q1_output = vbsl_u8(needs_filter6_mask, p1q1_output, p1q1);
+  StoreLo4(dst - 2 * stride, p1q1_output);
+  StoreHi4(dst + stride, p1q1_output);
+
+  uint8x8_t p0q0_output = vbsl_u8(is_flat3_mask, f6_p0q0, f_p0q0);
+  p0q0_output = vbsl_u8(needs_filter6_mask, p0q0_output, p0q0);
+  StoreLo4(dst - stride, p0q0_output);
+  StoreHi4(dst, p0q0_output);
+}
+
+void Vertical6_NEON(void* const dest, const ptrdiff_t stride,
+                    const int outer_thresh, const int inner_thresh,
+                    const int hev_thresh) {
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  // Move |dst| to the left side of the filter window.
+  dst -= 3;
+
+  // |p2q1|, |p1q2|, |p0xx| and |q0xx| are named for the values they will
+  // contain after the transpose.
+  // These over-read by 2 bytes. We only need 6.
+  uint8x8_t p2q1 = vld1_u8(dst);
+  uint8x8_t p1q2 = vld1_u8(dst + stride);
+  uint8x8_t p0xx = vld1_u8(dst + 2 * stride);
+  uint8x8_t q0xx = vld1_u8(dst + 3 * stride);
+
+  Transpose8x4(&p2q1, &p1q2, &p0xx, &q0xx);
+
+  const uint8x8x2_t p2q2xq1p1 = Interleave32(p2q1, Transpose32(p1q2));
+  const uint8x8_t p2q2 = p2q2xq1p1.val[0];
+  const uint8x8_t p1q1 = Transpose32(p2q2xq1p1.val[1]);
+  const uint8x8_t p0q0 = InterleaveLow32(p0xx, q0xx);
+
+  uint8x8_t needs_filter6_mask, is_flat3_mask, hev_mask;
+  Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
+               &needs_filter6_mask, &is_flat3_mask, &hev_mask);
+
+  needs_filter6_mask = InterleaveLow32(needs_filter6_mask, needs_filter6_mask);
+  is_flat3_mask = InterleaveLow32(is_flat3_mask, is_flat3_mask);
+  hev_mask = InterleaveLow32(hev_mask, hev_mask);
+
+#if defined(__aarch64__)
+  // This provides a good speedup for the unit test. Not sure how applicable it
+  // is to valid streams though.
+  // Consider doing this on armv7 if there is a quick way to check if a vector
+  // is zero.
+  if (vaddv_u8(needs_filter6_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  uint8x8_t f_p1q1;
+  uint8x8_t f_p0q0;
+  const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
+  Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
+  // Reset the outer values if only a Hev() mask was required.
+  f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
+
+  uint8x8_t f6_p1q1, f6_p0q0;
+#if defined(__aarch64__)
+  if (vaddv_u8(vand_u8(is_flat3_mask, needs_filter6_mask)) == 0) {
+    // Filter6() does not apply.
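+    // Zeroing the unused outputs is cheaper than predicating the call; the
+    // selects below never let these lanes reach the stores.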
+    const uint8x8_t zero = vdup_n_u8(0);
+    f6_p1q1 = zero;
+    f6_p0q0 = zero;
+  } else {
+#endif  // defined(__aarch64__)
+    Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
+#if defined(__aarch64__)
+  }
+#endif  // defined(__aarch64__)
+
+  uint8x8_t p1q1_output = vbsl_u8(is_flat3_mask, f6_p1q1, f_p1q1);
+  p1q1_output = vbsl_u8(needs_filter6_mask, p1q1_output, p1q1);
+
+  uint8x8_t p0q0_output = vbsl_u8(is_flat3_mask, f6_p0q0, f_p0q0);
+  p0q0_output = vbsl_u8(needs_filter6_mask, p0q0_output, p0q0);
+
+  // The six tap filter spans six input columns, but only p1-q1 are written
+  // back.
+  dst += 1;
+  // Put things back in order to reverse the transpose.
+  const uint8x8x2_t p1p0xq1q0 = Interleave32(p1q1_output, p0q0_output);
+  uint8x8_t output_0 = p1p0xq1q0.val[0];
+  uint8x8_t output_1 = Transpose32(p1p0xq1q0.val[1]);
+
+  Transpose4x4(&output_0, &output_1);
+
+  StoreLo4(dst, output_0);
+  StoreLo4(dst + stride, output_1);
+  StoreHi4(dst + 2 * stride, output_0);
+  StoreHi4(dst + 3 * stride, output_1);
+}
+
+// IsFlat4 uses N=1, IsFlatOuter4 uses N=4.
+// abs(p[N] - p0) <= flat_thresh && abs(q[N] - q0) <= flat_thresh &&
+// abs(p[N+1] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh &&
+// abs(p[N+2] - p0) <= flat_thresh && abs(q[N+2] - q0) <= flat_thresh
+// |flat_thresh| == 1 for 8 bit decode.
+inline uint8x8_t IsFlat4(const uint8x8_t abd_p0n0_q0n0,
+                         const uint8x8_t abd_p0n1_q0n1,
+                         const uint8x8_t abd_p0n2_q0n2) {
+  const uint8x8_t a = vmax_u8(abd_p0n0_q0n0, abd_p0n1_q0n1);
+  const uint8x8_t b = vmax_u8(a, abd_p0n2_q0n2);
+  const uint8x8_t c = vcle_u8(b, vdup_n_u8(1));
+  return vand_u8(c, RightShiftVector<32>(c));
+}
+
+// abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh &&
+// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
+// abs(q2 - q1) <= inner_thresh && abs(q3 - q2) <= inner_thresh
+// OuterThreshold()
+inline uint8x8_t NeedsFilter8(const uint8x8_t abd_p0p1_q0q1,
+                              const uint8x8_t abd_p1p2_q1q2,
+                              const uint8x8_t abd_p2p3_q2q3,
+                              const uint8x8_t p0q0, const uint8x8_t p1q1,
+                              const uint8_t inner_thresh,
+                              const uint8_t outer_thresh) {
+  const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p1p2_q1q2);
+  const uint8x8_t b = vmax_u8(a, abd_p2p3_q2q3);
+  const uint8x8_t c = vcle_u8(b, vdup_n_u8(inner_thresh));
+  const uint8x8_t inner_mask = vand_u8(c, RightShiftVector<32>(c));
+  const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh);
+  return vand_u8(inner_mask, outer_mask);
+}
+
+inline void Filter8Masks(const uint8x8_t p3q3, const uint8x8_t p2q2,
+                         const uint8x8_t p1q1, const uint8x8_t p0q0,
+                         const uint8_t hev_thresh, const uint8_t outer_thresh,
+                         const uint8_t inner_thresh,
+                         uint8x8_t* const needs_filter8_mask,
+                         uint8x8_t* const is_flat4_mask,
+                         uint8x8_t* const hev_mask) {
+  const uint8x8_t p0p1_q0q1 = vabd_u8(p0q0, p1q1);
+  *hev_mask = Hev(p0p1_q0q1, hev_thresh);
+  *is_flat4_mask = IsFlat4(p0p1_q0q1, vabd_u8(p0q0, p2q2), vabd_u8(p0q0, p3q3));
+  *needs_filter8_mask =
+      NeedsFilter8(p0p1_q0q1, vabd_u8(p1q1, p2q2), vabd_u8(p2q2, p3q3), p0q0,
+                   p1q1, inner_thresh, outer_thresh);
+}
+
+inline void Filter8(const uint8x8_t p3q3, const uint8x8_t p2q2,
+                    const uint8x8_t p1q1, const uint8x8_t p0q0,
+                    uint8x8_t* const p2q2_output, uint8x8_t* const p1q1_output,
+                    uint8x8_t* const p0q0_output) {
+  // Sum p2 and q2 output from opposite directions.
+  // The formula is regrouped to allow 2 doubling operations to be combined.
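+  // (3 * p3q3) is built as p3q3 + 2 * p3q3 so the doubling can be shared with
+  // (2 * p2q2) via a single shift of their sum.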
+  // p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0
+  //      ^^^^^^^^
+  // q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3)
+  //                                ^^^^^^^^
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //                    ^^^^^^^^^^^
+  const uint16x8_t p23q23 = vaddl_u8(p3q3, p2q2);
+
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //               ^^^^^
+  uint16x8_t sum = vshlq_n_u16(p23q23, 1);
+
+  // Add two other terms to make dual issue with shift more likely.
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //                                   ^^^^^^^^^^^
+  const uint16x8_t p01q01 = vaddl_u8(p0q0, p1q1);
+
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //                                   ^^^^^^^^^^^^^
+  sum = vaddq_u16(sum, p01q01);
+
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //        ^^^^^^
+  sum = vaddw_u8(sum, p3q3);
+
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //                                                 ^^^^^^
+  const uint8x8_t q0p0 = Transpose32(p0q0);
+  sum = vaddw_u8(sum, q0p0);
+
+  *p2q2_output = vrshrn_n_u16(sum, 3);
+
+  // Convert to p1 and q1 output:
+  // p1 = p2 - p3 - p2 + p1 + q1
+  // q1 = q2 - q3 - q2 + q0 + p1
+  sum = vsubq_u16(sum, p23q23);
+  const uint8x8_t q1p1 = Transpose32(p1q1);
+  sum = vaddq_u16(sum, vaddl_u8(p1q1, q1p1));
+
+  *p1q1_output = vrshrn_n_u16(sum, 3);
+
+  // Convert to p0 and q0 output:
+  // p0 = p1 - p3 - p1 + p0 + q2
+  // q0 = q1 - q3 - q1 + q0 + p2
+  sum = vsubq_u16(sum, vaddl_u8(p3q3, p1q1));
+  const uint8x8_t q2p2 = Transpose32(p2q2);
+  sum = vaddq_u16(sum, vaddl_u8(p0q0, q2p2));
+
+  *p0q0_output = vrshrn_n_u16(sum, 3);
+}
+
+void Horizontal8_NEON(void* const dest, const ptrdiff_t stride,
+                      const int outer_thresh, const int inner_thresh,
+                      const int hev_thresh) {
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  const uint8x8_t p3_v = Load4(dst - 4 * stride);
+  const uint8x8_t p2_v = Load4(dst - 3 * stride);
+  const uint8x8_t p1_v = Load4(dst - 2 * stride);
+  const uint8x8_t p0_v = Load4(dst - stride);
+  const uint8x8_t p0q0 = Load4<1>(dst, p0_v);
+  const uint8x8_t p1q1 = Load4<1>(dst + stride, p1_v);
+  const uint8x8_t p2q2 = Load4<1>(dst + 2 * stride, p2_v);
+  const uint8x8_t p3q3 = Load4<1>(dst + 3 * stride, p3_v);
+
+  uint8x8_t needs_filter8_mask, is_flat4_mask, hev_mask;
+  Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
+               &needs_filter8_mask, &is_flat4_mask, &hev_mask);
+
+  needs_filter8_mask = InterleaveLow32(needs_filter8_mask, needs_filter8_mask);
+  is_flat4_mask = vand_u8(is_flat4_mask, needs_filter8_mask);
+  is_flat4_mask = InterleaveLow32(is_flat4_mask, is_flat4_mask);
+  hev_mask = InterleaveLow32(hev_mask, hev_mask);
+
+#if defined(__aarch64__)
+  // This provides a good speedup for the unit test. Not sure how applicable it
+  // is to valid streams though.
+  // Consider doing this on armv7 if there is a quick way to check if a vector
+  // is zero.
+  if (vaddv_u8(needs_filter8_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  uint8x8_t f_p1q1;
+  uint8x8_t f_p0q0;
+  const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
+  Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
+  // Reset the outer values if only a Hev() mask was required.
+  f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
+
+  uint8x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+#if defined(__aarch64__)
+  if (vaddv_u8(is_flat4_mask) == 0) {
+    // Filter8() does not apply.
+    const uint8x8_t zero = vdup_n_u8(0);
+    f8_p2q2 = zero;
+    f8_p1q1 = zero;
+    f8_p0q0 = zero;
+  } else {
+#endif  // defined(__aarch64__)
+    Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+
+    const uint8x8_t p2p2_output = vbsl_u8(is_flat4_mask, f8_p2q2, p2q2);
+    StoreLo4(dst - 3 * stride, p2p2_output);
+    StoreHi4(dst + 2 * stride, p2p2_output);
+#if defined(__aarch64__)
+  }
+#endif  // defined(__aarch64__)
+
+  uint8x8_t p1q1_output = vbsl_u8(is_flat4_mask, f8_p1q1, f_p1q1);
+  p1q1_output = vbsl_u8(needs_filter8_mask, p1q1_output, p1q1);
+  StoreLo4(dst - 2 * stride, p1q1_output);
+  StoreHi4(dst + stride, p1q1_output);
+
+  uint8x8_t p0q0_output = vbsl_u8(is_flat4_mask, f8_p0q0, f_p0q0);
+  p0q0_output = vbsl_u8(needs_filter8_mask, p0q0_output, p0q0);
+  StoreLo4(dst - stride, p0q0_output);
+  StoreHi4(dst, p0q0_output);
+}
+
+void Vertical8_NEON(void* const dest, const ptrdiff_t stride,
+                    const int outer_thresh, const int inner_thresh,
+                    const int hev_thresh) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  // Move |dst| to the left side of the filter window.
+  dst -= 4;
+
+  // |p3q0|, |p2q1|, |p1q2| and |p0q3| are named for the values they will
+  // contain after the transpose.
+  uint8x8_t p3q0 = vld1_u8(dst);
+  uint8x8_t p2q1 = vld1_u8(dst + stride);
+  uint8x8_t p1q2 = vld1_u8(dst + 2 * stride);
+  uint8x8_t p0q3 = vld1_u8(dst + 3 * stride);
+
+  Transpose8x4(&p3q0, &p2q1, &p1q2, &p0q3);
+  const uint8x8x2_t p3q3xq0p0 = Interleave32(p3q0, Transpose32(p0q3));
+  const uint8x8_t p3q3 = p3q3xq0p0.val[0];
+  const uint8x8_t p0q0 = Transpose32(p3q3xq0p0.val[1]);
+  const uint8x8x2_t p2q2xq1p1 = Interleave32(p2q1, Transpose32(p1q2));
+  const uint8x8_t p2q2 = p2q2xq1p1.val[0];
+  const uint8x8_t p1q1 = Transpose32(p2q2xq1p1.val[1]);
+
+  uint8x8_t needs_filter8_mask, is_flat4_mask, hev_mask;
+  Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
+               &needs_filter8_mask, &is_flat4_mask, &hev_mask);
+
+  needs_filter8_mask = InterleaveLow32(needs_filter8_mask, needs_filter8_mask);
+  is_flat4_mask = vand_u8(is_flat4_mask, needs_filter8_mask);
+  is_flat4_mask = InterleaveLow32(is_flat4_mask, is_flat4_mask);
+  hev_mask = InterleaveLow32(hev_mask, hev_mask);
+
+#if defined(__aarch64__)
+  // This provides a good speedup for the unit test. Not sure how applicable it
+  // is to valid streams though.
+  // Consider doing this on armv7 if there is a quick way to check if a vector
+  // is zero.
+  if (vaddv_u8(needs_filter8_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  uint8x8_t f_p1q1;
+  uint8x8_t f_p0q0;
+  const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
+  Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
+  // Reset the outer values if only a Hev() mask was required.
+  f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
+
+  uint8x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+#if defined(__aarch64__)
+  if (vaddv_u8(is_flat4_mask) == 0) {
+    // Filter8() does not apply.
+    const uint8x8_t zero = vdup_n_u8(0);
+    f8_p2q2 = zero;
+    f8_p1q1 = zero;
+    f8_p0q0 = zero;
+  } else {
+#endif  // defined(__aarch64__)
+    Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+
+#if defined(__aarch64__)
+  }
+#endif  // defined(__aarch64__)
+
+  // Always prepare and store p2/q2 because we need to transpose it anyway.
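+  // Where |is_flat4_mask| is unset, the select simply restores the original
+  // p2/q2 values.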
+ const uint8x8_t p2q2_output = vbsl_u8(is_flat4_mask, f8_p2q2, p2q2); + + uint8x8_t p1q1_output = vbsl_u8(is_flat4_mask, f8_p1q1, f_p1q1); + p1q1_output = vbsl_u8(needs_filter8_mask, p1q1_output, p1q1); + + uint8x8_t p0q0_output = vbsl_u8(is_flat4_mask, f8_p0q0, f_p0q0); + p0q0_output = vbsl_u8(needs_filter8_mask, p0q0_output, p0q0); + + // Write out p3/q3 as well. There isn't a good way to write out 6 bytes. + // Variable names reflect the values before transposition. + const uint8x8x2_t p3q0xq3p0_output = + Interleave32(p3q3, Transpose32(p0q0_output)); + uint8x8_t p3q0_output = p3q0xq3p0_output.val[0]; + uint8x8_t p0q3_output = Transpose32(p3q0xq3p0_output.val[1]); + const uint8x8x2_t p2q1xq2p1_output = + Interleave32(p2q2_output, Transpose32(p1q1_output)); + uint8x8_t p2q1_output = p2q1xq2p1_output.val[0]; + uint8x8_t p1q2_output = Transpose32(p2q1xq2p1_output.val[1]); + + Transpose8x4(&p3q0_output, &p2q1_output, &p1q2_output, &p0q3_output); + + vst1_u8(dst, p3q0_output); + vst1_u8(dst + stride, p2q1_output); + vst1_u8(dst + 2 * stride, p1q2_output); + vst1_u8(dst + 3 * stride, p0q3_output); +} + +inline void Filter14(const uint8x8_t p6q6, const uint8x8_t p5q5, + const uint8x8_t p4q4, const uint8x8_t p3q3, + const uint8x8_t p2q2, const uint8x8_t p1q1, + const uint8x8_t p0q0, uint8x8_t* const p5q5_output, + uint8x8_t* const p4q4_output, uint8x8_t* const p3q3_output, + uint8x8_t* const p2q2_output, uint8x8_t* const p1q1_output, + uint8x8_t* const p0q0_output) { + // Sum p5 and q5 output from opposite directions + // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 + // ^^^^^^^^ + // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) + // ^^^^^^^^ + uint16x8_t sum = vsubw_u8(vshll_n_u8(p6q6, 3), p6q6); + + // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 + // ^^^^^^^^ + // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) + // ^^^^^^^^ + sum = vaddq_u16(vaddl_u8(p5q5, p5q5), sum); + + // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 + // ^^^^^^^^ + // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) + // ^^^^^^^^ + sum = vaddq_u16(vaddl_u8(p4q4, p4q4), sum); + + // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 + // ^^^^^^^ + // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) + // ^^^^^^^ + sum = vaddq_u16(vaddl_u8(p3q3, p2q2), sum); + + // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 + // ^^^^^^^ + // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) + // ^^^^^^^ + sum = vaddq_u16(vaddl_u8(p1q1, p0q0), sum); + + // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 + // ^^ + // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) + // ^^ + const uint8x8_t q0p0 = Transpose32(p0q0); + sum = vaddw_u8(sum, q0p0); + + *p5q5_output = vrshrn_n_u16(sum, 4); + + // Convert to p4 and q4 output: + // p4 = p5 - (2 * p6) + p3 + q1 + // q4 = q5 - (2 * q6) + q3 + p1 + sum = vsubq_u16(sum, vaddl_u8(p6q6, p6q6)); + const uint8x8_t q1p1 = Transpose32(p1q1); + sum = vaddq_u16(vaddl_u8(p3q3, q1p1), sum); + + *p4q4_output = vrshrn_n_u16(sum, 4); + + // Convert to p3 and q3 output: + // p3 = p4 - p6 - p5 + p2 + q2 + // q3 = q4 - q6 - q5 + q2 + p2 + sum = vsubq_u16(sum, vaddl_u8(p6q6, p5q5)); + const uint8x8_t q2p2 = Transpose32(p2q2); + sum = vaddq_u16(vaddl_u8(p2q2, q2p2), sum); + + *p3q3_output = vrshrn_n_u16(sum, 4); + + // Convert to p2 and q2 output: + // p2 = p3 - p6 - p4 + p1 + q3 + // q2 = q3 - q6 - q4 + q1 + p3 + sum = vsubq_u16(sum, vaddl_u8(p6q6, p4q4)); 
+  const uint8x8_t q3p3 = Transpose32(p3q3);
+  sum = vaddq_u16(vaddl_u8(p1q1, q3p3), sum);
+
+  *p2q2_output = vrshrn_n_u16(sum, 4);
+
+  // Convert to p1 and q1 output:
+  // p1 = p2 - p6 - p3 + p0 + q4
+  // q1 = q2 - q6 - q3 + q0 + p4
+  sum = vsubq_u16(sum, vaddl_u8(p6q6, p3q3));
+  const uint8x8_t q4p4 = Transpose32(p4q4);
+  sum = vaddq_u16(vaddl_u8(p0q0, q4p4), sum);
+
+  *p1q1_output = vrshrn_n_u16(sum, 4);
+
+  // Convert to p0 and q0 output:
+  // p0 = p1 - p6 - p2 + q0 + q5
+  // q0 = q1 - q6 - q2 + p0 + p5
+  sum = vsubq_u16(sum, vaddl_u8(p6q6, p2q2));
+  const uint8x8_t q5p5 = Transpose32(p5q5);
+  sum = vaddq_u16(vaddl_u8(q0p0, q5p5), sum);
+
+  *p0q0_output = vrshrn_n_u16(sum, 4);
+}
+
+void Horizontal14_NEON(void* const dest, const ptrdiff_t stride,
+                       const int outer_thresh, const int inner_thresh,
+                       const int hev_thresh) {
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  const uint8x8_t p6_v = Load4(dst - 7 * stride);
+  const uint8x8_t p5_v = Load4(dst - 6 * stride);
+  const uint8x8_t p4_v = Load4(dst - 5 * stride);
+  const uint8x8_t p3_v = Load4(dst - 4 * stride);
+  const uint8x8_t p2_v = Load4(dst - 3 * stride);
+  const uint8x8_t p1_v = Load4(dst - 2 * stride);
+  const uint8x8_t p0_v = Load4(dst - stride);
+  const uint8x8_t p0q0 = Load4<1>(dst, p0_v);
+  const uint8x8_t p1q1 = Load4<1>(dst + stride, p1_v);
+  const uint8x8_t p2q2 = Load4<1>(dst + 2 * stride, p2_v);
+  const uint8x8_t p3q3 = Load4<1>(dst + 3 * stride, p3_v);
+  const uint8x8_t p4q4 = Load4<1>(dst + 4 * stride, p4_v);
+  const uint8x8_t p5q5 = Load4<1>(dst + 5 * stride, p5_v);
+  const uint8x8_t p6q6 = Load4<1>(dst + 6 * stride, p6_v);
+
+  uint8x8_t needs_filter8_mask, is_flat4_mask, hev_mask;
+  Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
+               &needs_filter8_mask, &is_flat4_mask, &hev_mask);
+
+  needs_filter8_mask = InterleaveLow32(needs_filter8_mask, needs_filter8_mask);
+  is_flat4_mask = vand_u8(is_flat4_mask, needs_filter8_mask);
+  is_flat4_mask = InterleaveLow32(is_flat4_mask, is_flat4_mask);
+  hev_mask = InterleaveLow32(hev_mask, hev_mask);
+
+#if defined(__aarch64__)
+  // This provides a good speedup for the unit test. Not sure how applicable it
+  // is to valid streams though.
+  // Consider doing this on armv7 if there is a quick way to check if a vector
+  // is zero.
+  if (vaddv_u8(needs_filter8_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  // Decide between Filter8() and Filter14().
+  uint8x8_t is_flat_outer4_mask =
+      IsFlat4(vabd_u8(p0q0, p4q4), vabd_u8(p0q0, p5q5), vabd_u8(p0q0, p6q6));
+  is_flat_outer4_mask = vand_u8(is_flat4_mask, is_flat_outer4_mask);
+  is_flat_outer4_mask =
+      InterleaveLow32(is_flat_outer4_mask, is_flat_outer4_mask);
+
+  uint8x8_t f_p1q1;
+  uint8x8_t f_p0q0;
+  const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
+  Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
+  // Reset the outer values if only a Hev() mask was required.
+  f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
+
+  uint8x8_t f8_p1q1, f8_p0q0;
+  uint8x8_t f14_p2q2, f14_p1q1, f14_p0q0;
+#if defined(__aarch64__)
+  if (vaddv_u8(is_flat4_mask) == 0) {
+    // Filter8() and Filter14() do not apply.
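+    // Only the values consumed after both branches close need placeholder
+    // zeroes; the wider taps are stored inside the branch that computes them.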
+    const uint8x8_t zero = vdup_n_u8(0);
+    f8_p1q1 = zero;
+    f8_p0q0 = zero;
+    f14_p1q1 = zero;
+    f14_p0q0 = zero;
+  } else {
+#endif  // defined(__aarch64__)
+    uint8x8_t f8_p2q2;
+    Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+
+#if defined(__aarch64__)
+    if (vaddv_u8(is_flat_outer4_mask) == 0) {
+      // Filter14() does not apply.
+      const uint8x8_t zero = vdup_n_u8(0);
+      f14_p2q2 = zero;
+      f14_p1q1 = zero;
+      f14_p0q0 = zero;
+    } else {
+#endif  // defined(__aarch64__)
+      uint8x8_t f14_p5q5, f14_p4q4, f14_p3q3;
+      Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
+               &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
+
+      const uint8x8_t p5q5_output =
+          vbsl_u8(is_flat_outer4_mask, f14_p5q5, p5q5);
+      StoreLo4(dst - 6 * stride, p5q5_output);
+      StoreHi4(dst + 5 * stride, p5q5_output);
+
+      const uint8x8_t p4q4_output =
+          vbsl_u8(is_flat_outer4_mask, f14_p4q4, p4q4);
+      StoreLo4(dst - 5 * stride, p4q4_output);
+      StoreHi4(dst + 4 * stride, p4q4_output);
+
+      const uint8x8_t p3q3_output =
+          vbsl_u8(is_flat_outer4_mask, f14_p3q3, p3q3);
+      StoreLo4(dst - 4 * stride, p3q3_output);
+      StoreHi4(dst + 3 * stride, p3q3_output);
+#if defined(__aarch64__)
+    }
+#endif  // defined(__aarch64__)
+
+    uint8x8_t p2q2_output = vbsl_u8(is_flat_outer4_mask, f14_p2q2, f8_p2q2);
+    p2q2_output = vbsl_u8(is_flat4_mask, p2q2_output, p2q2);
+    StoreLo4(dst - 3 * stride, p2q2_output);
+    StoreHi4(dst + 2 * stride, p2q2_output);
+#if defined(__aarch64__)
+  }
+#endif  // defined(__aarch64__)
+
+  uint8x8_t p1q1_output = vbsl_u8(is_flat_outer4_mask, f14_p1q1, f8_p1q1);
+  p1q1_output = vbsl_u8(is_flat4_mask, p1q1_output, f_p1q1);
+  p1q1_output = vbsl_u8(needs_filter8_mask, p1q1_output, p1q1);
+  StoreLo4(dst - 2 * stride, p1q1_output);
+  StoreHi4(dst + stride, p1q1_output);
+
+  uint8x8_t p0q0_output = vbsl_u8(is_flat_outer4_mask, f14_p0q0, f8_p0q0);
+  p0q0_output = vbsl_u8(is_flat4_mask, p0q0_output, f_p0q0);
+  p0q0_output = vbsl_u8(needs_filter8_mask, p0q0_output, p0q0);
+  StoreLo4(dst - stride, p0q0_output);
+  StoreHi4(dst, p0q0_output);
+}
+
+void Vertical14_NEON(void* const dest, const ptrdiff_t stride,
+                     const int outer_thresh, const int inner_thresh,
+                     const int hev_thresh) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  dst -= 8;
+  // input
+  // p7 p6 p5 p4 p3 p2 p1 p0 q0 q1 q2 q3 q4 q5 q6 q7
+  const uint8x16_t x0 = vld1q_u8(dst);
+  dst += stride;
+  const uint8x16_t x1 = vld1q_u8(dst);
+  dst += stride;
+  const uint8x16_t x2 = vld1q_u8(dst);
+  dst += stride;
+  const uint8x16_t x3 = vld1q_u8(dst);
+  dst -= (stride * 3);
+
+  // re-order input
+#if defined(__aarch64__)
+  const uint8x8_t index_qp3toqp0 = vcreate_u8(0x0b0a090804050607);
+  const uint8x8_t index_qp7toqp4 = vcreate_u8(0x0f0e0d0c00010203);
+  const uint8x16_t index_qp7toqp0 =
+      vcombine_u8(index_qp3toqp0, index_qp7toqp4);
+
+  uint8x16_t input_0 = vqtbl1q_u8(x0, index_qp7toqp0);
+  uint8x16_t input_1 = vqtbl1q_u8(x1, index_qp7toqp0);
+  uint8x16_t input_2 = vqtbl1q_u8(x2, index_qp7toqp0);
+  uint8x16_t input_3 = vqtbl1q_u8(x3, index_qp7toqp0);
+#else
+  const uint8x8_t index_qp3toqp0 = vcreate_u8(0x0b0a090804050607);
+  const uint8x8_t index_qp7toqp4 = vcreate_u8(0x0f0e0d0c00010203);
+
+  const uint8x8_t x0_qp3qp0 = VQTbl1U8(x0, index_qp3toqp0);
+  const uint8x8_t x1_qp3qp0 = VQTbl1U8(x1, index_qp3toqp0);
+  const uint8x8_t x2_qp3qp0 = VQTbl1U8(x2, index_qp3toqp0);
+  const uint8x8_t x3_qp3qp0 = VQTbl1U8(x3, index_qp3toqp0);
+
+  const uint8x8_t x0_qp7qp4 = VQTbl1U8(x0, index_qp7toqp4);
+  const uint8x8_t x1_qp7qp4 = VQTbl1U8(x1, index_qp7toqp4);
+  const uint8x8_t x2_qp7qp4 = VQTbl1U8(x2, index_qp7toqp4);
+  const uint8x8_t x3_qp7qp4 = VQTbl1U8(x3, index_qp7toqp4);
+
+  const uint8x16_t input_0 = vcombine_u8(x0_qp3qp0, x0_qp7qp4);
+  const uint8x16_t input_1 = vcombine_u8(x1_qp3qp0, x1_qp7qp4);
+  const uint8x16_t input_2 = vcombine_u8(x2_qp3qp0, x2_qp7qp4);
+  const uint8x16_t input_3 = vcombine_u8(x3_qp3qp0, x3_qp7qp4);
+#endif
+  // input after re-order
+  // p0 p1 p2 p3 q0 q1 q2 q3 p4 p5 p6 p7 q4 q5 q6 q7
+
+  const uint8x16x2_t in01 = vtrnq_u8(input_0, input_1);
+  const uint8x16x2_t in23 = vtrnq_u8(input_2, input_3);
+  const uint16x8x2_t in02 = vtrnq_u16(vreinterpretq_u16_u8(in01.val[0]),
+                                      vreinterpretq_u16_u8(in23.val[0]));
+  const uint16x8x2_t in13 = vtrnq_u16(vreinterpretq_u16_u8(in01.val[1]),
+                                      vreinterpretq_u16_u8(in23.val[1]));
+
+  const uint8x8_t p0q0 = vget_low_u8(vreinterpretq_u8_u16(in02.val[0]));
+  const uint8x8_t p1q1 = vget_low_u8(vreinterpretq_u8_u16(in13.val[0]));
+
+  const uint8x8_t p2q2 = vget_low_u8(vreinterpretq_u8_u16(in02.val[1]));
+  const uint8x8_t p3q3 = vget_low_u8(vreinterpretq_u8_u16(in13.val[1]));
+
+  const uint8x8_t p4q4 = vget_high_u8(vreinterpretq_u8_u16(in02.val[0]));
+  const uint8x8_t p5q5 = vget_high_u8(vreinterpretq_u8_u16(in13.val[0]));
+
+  const uint8x8_t p6q6 = vget_high_u8(vreinterpretq_u8_u16(in02.val[1]));
+  const uint8x8_t p7q7 = vget_high_u8(vreinterpretq_u8_u16(in13.val[1]));
+
+  uint8x8_t needs_filter8_mask, is_flat4_mask, hev_mask;
+  Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
+               &needs_filter8_mask, &is_flat4_mask, &hev_mask);
+
+  needs_filter8_mask = InterleaveLow32(needs_filter8_mask, needs_filter8_mask);
+  is_flat4_mask = vand_u8(is_flat4_mask, needs_filter8_mask);
+  is_flat4_mask = InterleaveLow32(is_flat4_mask, is_flat4_mask);
+  hev_mask = InterleaveLow32(hev_mask, hev_mask);
+
+#if defined(__aarch64__)
+  // This provides a good speedup for the unit test. Not sure how applicable it
+  // is to valid streams though.
+  // Consider doing this on armv7 if there is a quick way to check if a vector
+  // is zero.
+  if (vaddv_u8(needs_filter8_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  // Decide between Filter8() and Filter14().
+  uint8x8_t is_flat_outer4_mask =
+      IsFlat4(vabd_u8(p0q0, p4q4), vabd_u8(p0q0, p5q5), vabd_u8(p0q0, p6q6));
+  is_flat_outer4_mask = vand_u8(is_flat4_mask, is_flat_outer4_mask);
+  is_flat_outer4_mask =
+      InterleaveLow32(is_flat_outer4_mask, is_flat_outer4_mask);
+
+  uint8x8_t f_p0q0, f_p1q1;
+  const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
+  Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
+  // Reset the outer values if only a Hev() mask was required.
+  f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
+
+  uint8x8_t p1q1_output, p0q0_output;
+  uint8x8_t p5q5_output, p4q4_output, p3q3_output, p2q2_output;
+
+#if defined(__aarch64__)
+  if (vaddv_u8(is_flat4_mask) == 0) {
+    // Filter8() and Filter14() do not apply.
+    p1q1_output = p1q1;
+    p0q0_output = p0q0;
+
+    p5q5_output = p5q5;
+    p4q4_output = p4q4;
+    p3q3_output = p3q3;
+    p2q2_output = p2q2;
+  } else {
+#endif  // defined(__aarch64__)
+    uint8x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+    Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+
+#if defined(__aarch64__)
+    if (vaddv_u8(is_flat_outer4_mask) == 0) {
+      // Filter14() does not apply.
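+      // Keep the outer taps unchanged and fall back to the Filter8() results
+      // for p2, p1 and p0.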
+ p5q5_output = p5q5; + p4q4_output = p4q4; + p3q3_output = p3q3; + p2q2_output = f8_p2q2; + p1q1_output = f8_p1q1; + p0q0_output = f8_p0q0; + } else { +#endif // defined(__aarch64__) + uint8x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0; + Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4, + &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0); + + p5q5_output = vbsl_u8(is_flat_outer4_mask, f14_p5q5, p5q5); + p4q4_output = vbsl_u8(is_flat_outer4_mask, f14_p4q4, p4q4); + p3q3_output = vbsl_u8(is_flat_outer4_mask, f14_p3q3, p3q3); + p2q2_output = vbsl_u8(is_flat_outer4_mask, f14_p2q2, f8_p2q2); + p1q1_output = vbsl_u8(is_flat_outer4_mask, f14_p1q1, f8_p1q1); + p0q0_output = vbsl_u8(is_flat_outer4_mask, f14_p0q0, f8_p0q0); +#if defined(__aarch64__) + } +#endif // defined(__aarch64__) + p2q2_output = vbsl_u8(is_flat4_mask, p2q2_output, p2q2); +#if defined(__aarch64__) + } +#endif // defined(__aarch64__) + + p1q1_output = vbsl_u8(is_flat4_mask, p1q1_output, f_p1q1); + p1q1_output = vbsl_u8(needs_filter8_mask, p1q1_output, p1q1); + p0q0_output = vbsl_u8(is_flat4_mask, p0q0_output, f_p0q0); + p0q0_output = vbsl_u8(needs_filter8_mask, p0q0_output, p0q0); + + const uint8x16_t p0q0_p4q4 = vcombine_u8(p0q0_output, p4q4_output); + const uint8x16_t p2q2_p6q6 = vcombine_u8(p2q2_output, p6q6); + const uint8x16_t p1q1_p5q5 = vcombine_u8(p1q1_output, p5q5_output); + const uint8x16_t p3q3_p7q7 = vcombine_u8(p3q3_output, p7q7); + + const uint16x8x2_t out02 = vtrnq_u16(vreinterpretq_u16_u8(p0q0_p4q4), + vreinterpretq_u16_u8(p2q2_p6q6)); + const uint16x8x2_t out13 = vtrnq_u16(vreinterpretq_u16_u8(p1q1_p5q5), + vreinterpretq_u16_u8(p3q3_p7q7)); + const uint8x16x2_t out01 = vtrnq_u8(vreinterpretq_u8_u16(out02.val[0]), + vreinterpretq_u8_u16(out13.val[0])); + const uint8x16x2_t out23 = vtrnq_u8(vreinterpretq_u8_u16(out02.val[1]), + vreinterpretq_u8_u16(out13.val[1])); + +#if defined(__aarch64__) + const uint8x8_t index_p7top0 = vcreate_u8(0x0001020308090a0b); + const uint8x8_t index_q7toq0 = vcreate_u8(0x0f0e0d0c07060504); + const uint8x16_t index_p7toq7 = vcombine_u8(index_p7top0, index_q7toq0); + + const uint8x16_t output_0 = vqtbl1q_u8(out01.val[0], index_p7toq7); + const uint8x16_t output_1 = vqtbl1q_u8(out01.val[1], index_p7toq7); + const uint8x16_t output_2 = vqtbl1q_u8(out23.val[0], index_p7toq7); + const uint8x16_t output_3 = vqtbl1q_u8(out23.val[1], index_p7toq7); +#else + const uint8x8_t index_p7top0 = vcreate_u8(0x0001020308090a0b); + const uint8x8_t index_q7toq0 = vcreate_u8(0x0f0e0d0c07060504); + + const uint8x8_t x0_p7p0 = VQTbl1U8(out01.val[0], index_p7top0); + const uint8x8_t x1_p7p0 = VQTbl1U8(out01.val[1], index_p7top0); + const uint8x8_t x2_p7p0 = VQTbl1U8(out23.val[0], index_p7top0); + const uint8x8_t x3_p7p0 = VQTbl1U8(out23.val[1], index_p7top0); + + const uint8x8_t x0_q7q0 = VQTbl1U8(out01.val[0], index_q7toq0); + const uint8x8_t x1_q7q0 = VQTbl1U8(out01.val[1], index_q7toq0); + const uint8x8_t x2_q7q0 = VQTbl1U8(out23.val[0], index_q7toq0); + const uint8x8_t x3_q7q0 = VQTbl1U8(out23.val[1], index_q7toq0); + + const uint8x16_t output_0 = vcombine_u8(x0_p7p0, x0_q7q0); + const uint8x16_t output_1 = vcombine_u8(x1_p7p0, x1_q7q0); + const uint8x16_t output_2 = vcombine_u8(x2_p7p0, x2_q7q0); + const uint8x16_t output_3 = vcombine_u8(x3_p7p0, x3_q7q0); +#endif + + vst1q_u8(dst, output_0); + dst += stride; + vst1q_u8(dst, output_1); + dst += stride; + vst1q_u8(dst, output_2); + dst += stride; + vst1q_u8(dst, output_3); +} + +void Init8bpp() { + Dsp* const dsp = 
dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] = + Horizontal4_NEON; + dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = Vertical4_NEON; + + dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] = + Horizontal6_NEON; + dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = Vertical6_NEON; + + dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] = + Horizontal8_NEON; + dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = Vertical8_NEON; + + dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] = + Horizontal14_NEON; + dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] = + Vertical14_NEON; +} +} // namespace +} // namespace low_bitdepth + +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +// (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh) +inline uint16x4_t Hev(const uint16x8_t abd_p0p1_q0q1, const uint16_t thresh) { + const uint16x8_t a = vcgtq_u16(abd_p0p1_q0q1, vdupq_n_u16(thresh)); + return vorr_u16(vget_low_u16(a), vget_high_u16(a)); +} + +// abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh +inline uint16x4_t OuterThreshold(const uint16x4_t p1, const uint16x4_t p0, + const uint16x4_t q0, const uint16x4_t q1, + const uint16_t outer_thresh) { + const uint16x4_t abd_p0q0 = vabd_u16(p0, q0); + const uint16x4_t abd_p1q1 = vabd_u16(p1, q1); + const uint16x4_t p0q0_double = vshl_n_u16(abd_p0q0, 1); + const uint16x4_t p1q1_half = vshr_n_u16(abd_p1q1, 1); + const uint16x4_t sum = vadd_u16(p0q0_double, p1q1_half); + return vcle_u16(sum, vdup_n_u16(outer_thresh)); +} + +// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh && +// OuterThreshold() +inline uint16x4_t NeedsFilter4(const uint16x8_t abd_p0p1_q0q1, + const uint16_t inner_thresh, + const uint16x4_t outer_mask) { + const uint16x8_t a = vcleq_u16(abd_p0p1_q0q1, vdupq_n_u16(inner_thresh)); + const uint16x4_t inner_mask = vand_u16(vget_low_u16(a), vget_high_u16(a)); + return vand_u16(inner_mask, outer_mask); +} + +// abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh && +// abs(q1 - q0) <= inner_thresh && abs(q2 - q1) <= inner_thresh && +// OuterThreshold() +inline uint16x4_t NeedsFilter6(const uint16x8_t abd_p0p1_q0q1, + const uint16x8_t abd_p1p2_q1q2, + const uint16_t inner_thresh, + const uint16x4_t outer_mask) { + const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2); + const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(inner_thresh)); + const uint16x4_t inner_mask = vand_u16(vget_low_u16(b), vget_high_u16(b)); + return vand_u16(inner_mask, outer_mask); +} + +// abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh && +// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh && +// abs(q2 - q1) <= inner_thresh && abs(q3 - q2) <= inner_thresh +// OuterThreshold() +inline uint16x4_t NeedsFilter8(const uint16x8_t abd_p0p1_q0q1, + const uint16x8_t abd_p1p2_q1q2, + const uint16x8_t abd_p2p3_q2q3, + const uint16_t inner_thresh, + const uint16x4_t outer_mask) { + const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2); + const uint16x8_t b = vmaxq_u16(a, abd_p2p3_q2q3); + const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(inner_thresh)); + const uint16x4_t inner_mask = vand_u16(vget_low_u16(c), vget_high_u16(c)); + return vand_u16(inner_mask, outer_mask); +} + +// ----------------------------------------------------------------------------- +// FilterNMasks functions. 
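+// In the high bitdepth path each mask is a uint16x4_t covering four pixels;
+// the p- and q-side comparisons live in the two halves of a uint16x8_t and
+// are folded together before use.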
+
+inline void Filter4Masks(const uint16x8_t p0q0, const uint16x8_t p1q1,
+                         const uint16_t hev_thresh,
+                         const uint16x4_t outer_mask,
+                         const uint16_t inner_thresh,
+                         uint16x4_t* const hev_mask,
+                         uint16x4_t* const needs_filter4_mask) {
+  const uint16x8_t p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
+  // This includes cases where NeedsFilter4() is not true and so Filter2() will
+  // not be applied.
+  const uint16x4_t hev_tmp_mask = Hev(p0p1_q0q1, hev_thresh);
+
+  *needs_filter4_mask = NeedsFilter4(p0p1_q0q1, inner_thresh, outer_mask);
+
+  // Filter2() will only be applied if both NeedsFilter4() and Hev() are true.
+  *hev_mask = vand_u16(hev_tmp_mask, *needs_filter4_mask);
+}
+
+// abs(p1 - p0) <= flat_thresh && abs(q1 - q0) <= flat_thresh &&
+// abs(p2 - p0) <= flat_thresh && abs(q2 - q0) <= flat_thresh
+// |flat_thresh| == 4 for 10 bit decode.
+inline uint16x4_t IsFlat3(const uint16x8_t abd_p0p1_q0q1,
+                          const uint16x8_t abd_p0p2_q0q2) {
+  constexpr int flat_thresh = 1 << 2;
+  const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p0p2_q0q2);
+  const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(flat_thresh));
+  return vand_u16(vget_low_u16(b), vget_high_u16(b));
+}
+
+inline void Filter6Masks(const uint16x8_t p2q2, const uint16x8_t p1q1,
+                         const uint16x8_t p0q0, const uint16_t hev_thresh,
+                         const uint16x4_t outer_mask,
+                         const uint16_t inner_thresh,
+                         uint16x4_t* const needs_filter6_mask,
+                         uint16x4_t* const is_flat3_mask,
+                         uint16x4_t* const hev_mask) {
+  const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
+  *hev_mask = Hev(abd_p0p1_q0q1, hev_thresh);
+  *is_flat3_mask = IsFlat3(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2));
+  *needs_filter6_mask = NeedsFilter6(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2),
+                                     inner_thresh, outer_mask);
+}
+
+// IsFlat4 uses N=1, IsFlatOuter4 uses N=4.
+// abs(p[N] - p0) <= flat_thresh && abs(q[N] - q0) <= flat_thresh &&
+// abs(p[N+1] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh &&
+// abs(p[N+2] - p0) <= flat_thresh && abs(q[N+2] - q0) <= flat_thresh
+// |flat_thresh| == 4 for 10 bit decode.
+inline uint16x4_t IsFlat4(const uint16x8_t abd_pnp0_qnq0,
+                          const uint16x8_t abd_pn1p0_qn1q0,
+                          const uint16x8_t abd_pn2p0_qn2q0) {
+  constexpr int flat_thresh = 1 << 2;
+  const uint16x8_t a = vmaxq_u16(abd_pnp0_qnq0, abd_pn1p0_qn1q0);
+  const uint16x8_t b = vmaxq_u16(a, abd_pn2p0_qn2q0);
+  const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(flat_thresh));
+  return vand_u16(vget_low_u16(c), vget_high_u16(c));
+}
+
+inline void Filter8Masks(const uint16x8_t p3q3, const uint16x8_t p2q2,
+                         const uint16x8_t p1q1, const uint16x8_t p0q0,
+                         const uint16_t hev_thresh,
+                         const uint16x4_t outer_mask,
+                         const uint16_t inner_thresh,
+                         uint16x4_t* const needs_filter8_mask,
+                         uint16x4_t* const is_flat4_mask,
+                         uint16x4_t* const hev_mask) {
+  const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
+  *hev_mask = Hev(abd_p0p1_q0q1, hev_thresh);
+  const uint16x4_t is_flat4 =
+      IsFlat4(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2), vabdq_u16(p0q0, p3q3));
+  *needs_filter8_mask =
+      NeedsFilter8(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2), vabdq_u16(p2q2, p3q3),
+                   inner_thresh, outer_mask);
+  // |is_flat4_mask| is used to decide where to use the result of Filter8.
+  // In rare cases, |is_flat4| can be true where |needs_filter8_mask| is false,
+  // overriding the question of whether to use Filter8. Because Filter4 doesn't
+  // apply to p2q2, |is_flat4_mask| chooses directly between Filter8 and the
+  // source value. To be correct, the mask must account for this override.
+  *is_flat4_mask = vand_u16(is_flat4, *needs_filter8_mask);
+}
+
+// -----------------------------------------------------------------------------
+// FilterN functions.
+
+// Calculate Filter4() or Filter2() based on |hev_mask|.
+inline void Filter4(const uint16x8_t p0q0, const uint16x8_t p0q1,
+                    const uint16x8_t p1q1, const uint16x4_t hev_mask,
+                    uint16x8_t* const p1q1_result,
+                    uint16x8_t* const p0q0_result) {
+  const uint16x8_t q0p1 = vextq_u16(p0q0, p1q1, 4);
+  // a = 3 * (q0 - p0) + Clip3(p1 - q1, min_signed_val, max_signed_val);
+  // q0mp0 means "q0 minus p0".
+  const int16x8_t q0mp0_p1mq1 = vreinterpretq_s16_u16(vsubq_u16(q0p1, p0q1));
+  const int16x4_t q0mp0_3 = vmul_n_s16(vget_low_s16(q0mp0_p1mq1), 3);
+
+  // If this is for Filter2() then include |p1mq1|. Otherwise zero it.
+  const int16x4_t min_signed_pixel = vdup_n_s16(-(1 << (9 /*bitdepth-1*/)));
+  const int16x4_t max_signed_pixel = vdup_n_s16((1 << (9 /*bitdepth-1*/)) - 1);
+  const int16x4_t p1mq1 = vget_high_s16(q0mp0_p1mq1);
+  const int16x4_t p1mq1_saturated =
+      Clip3S16(p1mq1, min_signed_pixel, max_signed_pixel);
+  const int16x4_t hev_option =
+      vand_s16(vreinterpret_s16_u16(hev_mask), p1mq1_saturated);
+
+  const int16x4_t a = vadd_s16(q0mp0_3, hev_option);
+
+  // Need to figure out what's going on here because there are some unnecessary
+  // tricks to accommodate 8x8 as the smallest 8bpp vector.
+
+  // We cannot shift with rounding because the clamp comes *before* the
+  // shifting. a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3;
+  // a2 = Clip3(a + 3, min_signed_val, max_signed_val) >> 3;
+  const int16x4_t plus_four =
+      Clip3S16(vadd_s16(a, vdup_n_s16(4)), min_signed_pixel, max_signed_pixel);
+  const int16x4_t plus_three =
+      Clip3S16(vadd_s16(a, vdup_n_s16(3)), min_signed_pixel, max_signed_pixel);
+  const int16x4_t a1 = vshr_n_s16(plus_four, 3);
+  const int16x4_t a2 = vshr_n_s16(plus_three, 3);
+
+  // a3 = (a1 + 1) >> 1;
+  const int16x4_t a3 = vrshr_n_s16(a1, 1);
+
+  const int16x8_t a3_ma3 = vcombine_s16(a3, vneg_s16(a3));
+  const int16x8_t p1q1_a3 = vaddq_s16(vreinterpretq_s16_u16(p1q1), a3_ma3);
+
+  // Need to shift the second term or we end up with a2_ma2.
+  const int16x8_t a2_ma1 = vcombine_s16(a2, vneg_s16(a1));
+  const int16x8_t p0q0_a = vaddq_s16(vreinterpretq_s16_u16(p0q0), a2_ma1);
+  *p1q1_result = ConvertToUnsignedPixelU16(p1q1_a3, kBitdepth10);
+  *p0q0_result = ConvertToUnsignedPixelU16(p0q0_a, kBitdepth10);
+}
+
+void Horizontal4_NEON(void* const dest, const ptrdiff_t stride,
+                      int outer_thresh, int inner_thresh, int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest);
+  auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
+  auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
+  auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
+  auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
+
+  const uint16x4_t src[4] = {vld1_u16(dst_p1), vld1_u16(dst_p0),
+                             vld1_u16(dst_q0), vld1_u16(dst_q1)};
+
+  // Adjust thresholds to bitdepth.
+  outer_thresh <<= 2;
+  inner_thresh <<= 2;
+  hev_thresh <<= 2;
+  const uint16x4_t outer_mask =
+      OuterThreshold(src[0], src[1], src[2], src[3], outer_thresh);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter4_mask;
+  const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]);
+  const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]);
+  Filter4Masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask,
+               &needs_filter4_mask);
+
+#if defined(__aarch64__)
+  // This provides a good speedup for the unit test, but may not come up often
+  // enough to warrant it.
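+  // vaddv is only available on A64, hence the guard; the alternate path below
+  // tests the mask through a 64-bit scalar move instead.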
+  if (vaddv_u16(needs_filter4_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#else   // !defined(__aarch64__)
+  const uint64x1_t needs_filter4_mask64 =
+      vreinterpret_u64_u16(needs_filter4_mask);
+  if (vget_lane_u64(needs_filter4_mask64, 0) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  // Copy the masks to the high bits for packed comparisons later.
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t needs_filter4_mask_8 =
+      vcombine_u16(needs_filter4_mask, needs_filter4_mask);
+
+  uint16x8_t f_p1q1;
+  uint16x8_t f_p0q0;
+  const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]);
+  Filter4(p0q0, p0q1, p1q1, hev_mask, &f_p1q1, &f_p0q0);
+
+  // Already integrated the Hev mask when calculating the filtered values.
+  const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0);
+
+  // p1/q1 are unmodified if only Hev() is true. This works because it was
+  // and'd with |needs_filter4_mask| previously.
+  const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8);
+  const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1);
+
+  vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+  vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+  vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+  vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+}
+
+void Vertical4_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
+                    int inner_thresh, int hev_thresh) {
+  // Offset by 2 uint16_t values to load from first p1 position.
+  auto* dst = static_cast<uint8_t*>(dest) - 4;
+  auto* dst_p1 = reinterpret_cast<uint16_t*>(dst);
+  auto* dst_p0 = reinterpret_cast<uint16_t*>(dst + stride);
+  auto* dst_q0 = reinterpret_cast<uint16_t*>(dst + stride * 2);
+  auto* dst_q1 = reinterpret_cast<uint16_t*>(dst + stride * 3);
+
+  uint16x4_t src[4] = {vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0),
+                       vld1_u16(dst_q1)};
+  Transpose4x4(src);
+
+  // Adjust thresholds to bitdepth.
+  outer_thresh <<= 2;
+  inner_thresh <<= 2;
+  hev_thresh <<= 2;
+  const uint16x4_t outer_mask =
+      OuterThreshold(src[0], src[1], src[2], src[3], outer_thresh);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter4_mask;
+  const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]);
+  const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]);
+  Filter4Masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask,
+               &needs_filter4_mask);
+
+#if defined(__aarch64__)
+  // This provides a good speedup for the unit test. Not sure how applicable it
+  // is to valid streams though.
+  // Consider doing this on armv7 if there is a quick way to check if a vector
+  // is zero.
+  if (vaddv_u16(needs_filter4_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#else   // !defined(__aarch64__)
+  const uint64x1_t needs_filter4_mask64 =
+      vreinterpret_u64_u16(needs_filter4_mask);
+  if (vget_lane_u64(needs_filter4_mask64, 0) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  // Copy the masks to the high bits for packed comparisons later.
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t needs_filter4_mask_8 =
+      vcombine_u16(needs_filter4_mask, needs_filter4_mask);
+
+  uint16x8_t f_p1q1;
+  uint16x8_t f_p0q0;
+  const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]);
+  Filter4(p0q0, p0q1, p1q1, hev_mask, &f_p1q1, &f_p0q0);
+
+  // Already integrated the Hev mask when calculating the filtered values.
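+  // |f_p0q0| already reflects the Filter2()/Filter4() choice, so only
+  // |needs_filter4_mask| gates the final select.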
+  const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0);
+
+  // p1/q1 are unmodified if only Hev() is true. This works because it was
+  // AND'd with |needs_filter4_mask| previously.
+  const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8);
+  const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1);
+
+  uint16x4_t output[4] = {
+      vget_low_u16(p1q1_output),
+      vget_low_u16(p0q0_output),
+      vget_high_u16(p0q0_output),
+      vget_high_u16(p1q1_output),
+  };
+  Transpose4x4(output);
+
+  vst1_u16(dst_p1, output[0]);
+  vst1_u16(dst_p0, output[1]);
+  vst1_u16(dst_q0, output[2]);
+  vst1_u16(dst_q1, output[3]);
+}
+
+inline void Filter6(const uint16x8_t p2q2, const uint16x8_t p1q1,
+                    const uint16x8_t p0q0, uint16x8_t* const p1q1_output,
+                    uint16x8_t* const p0q0_output) {
+  // Sum p1 and q1 output from opposite directions.
+  // The formula is regrouped to allow 3 doubling operations to be combined.
+  //
+  // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
+  //      ^^^^^^^^
+  // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
+  //                                 ^^^^^^^^
+  // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+  //                    ^^^^^^^^^^^
+  uint16x8_t sum = vaddq_u16(p2q2, p1q1);
+
+  // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+  //                                 ^^^^^^
+  sum = vaddq_u16(sum, p0q0);
+
+  // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+  //               ^^^^^
+  sum = vshlq_n_u16(sum, 1);
+
+  // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+  //        ^^^^^^                            ^^^^^^
+  // Should dual issue with the left shift.
+  const uint16x8_t q0p0 = Transpose64(p0q0);
+  const uint16x8_t outer_sum = vaddq_u16(p2q2, q0p0);
+  sum = vaddq_u16(sum, outer_sum);
+
+  *p1q1_output = vrshrq_n_u16(sum, 3);
+
+  // Convert to p0 and q0 output:
+  // p0 = p1 - (2 * p2) + q0 + q1
+  // q0 = q1 - (2 * q2) + p0 + p1
+  // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1
+  //               ^^^^^^^^^^
+  const uint16x8_t p2q2_double = vshlq_n_u16(p2q2, 1);
+  // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1
+  //        ^^^^^^^^^^^^^^^^^
+  sum = vsubq_u16(sum, p2q2_double);
+  const uint16x8_t q1p1 = Transpose64(p1q1);
+  sum = vaddq_u16(sum, vaddq_u16(q0p0, q1p1));
+
+  *p0q0_output = vrshrq_n_u16(sum, 3);
+}
+
+void Horizontal6_NEON(void* const dest, const ptrdiff_t stride,
+                      int outer_thresh, int inner_thresh, int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest);
+  auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride);
+  auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
+  auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
+  auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
+  auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
+  auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+
+  const uint16x4_t src[6] = {vld1_u16(dst_p2), vld1_u16(dst_p1),
+                             vld1_u16(dst_p0), vld1_u16(dst_q0),
+                             vld1_u16(dst_q1), vld1_u16(dst_q2)};
+
+  // Adjust thresholds to bitdepth.
+  outer_thresh <<= 2;
+  inner_thresh <<= 2;
+  hev_thresh <<= 2;
+  const uint16x4_t outer_mask =
+      OuterThreshold(src[1], src[2], src[3], src[4], outer_thresh);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter_mask;
+  uint16x4_t is_flat3_mask;
+  const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]);
+  const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]);
+  const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]);
+  Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+               &needs_filter_mask, &is_flat3_mask, &hev_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u16(needs_filter_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#else   // !defined(__aarch64__)
+  // This might be faster than vaddv (latency 3) because mov to general
+  // register has latency 2.
+  const uint64x1_t needs_filter_mask64 =
+      vreinterpret_u64_u16(needs_filter_mask);
+  if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  // Copy the masks to the high bits for packed comparisons later.
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t is_flat3_mask_8 =
+      vcombine_u16(is_flat3_mask, is_flat3_mask);
+  const uint16x8_t needs_filter_mask_8 =
+      vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+  uint16x8_t f4_p1q1;
+  uint16x8_t f4_p0q0;
+  // ZIP1 p0q0, p1q1 may perform better here.
+  const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]);
+  Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+  f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+  uint16x8_t p0q0_output, p1q1_output;
+  // Because we did not return after testing |needs_filter_mask| we know it is
+  // nonzero. |is_flat3_mask| controls whether the needed filter is Filter4 or
+  // Filter6. Therefore if it is false when |needs_filter_mask| is true,
+  // Filter6 output is not used.
+  uint16x8_t f6_p1q1, f6_p0q0;
+  const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask);
+  if (vget_lane_u64(need_filter6, 0) == 0) {
+    // Filter6() does not apply, but Filter4() applies to one or more values.
+    p0q0_output = p0q0;
+    p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+  } else {
+    Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
+    p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1);
+    p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+    p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+  }
+
+  vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+  vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+  vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+  vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+}
+
+void Vertical6_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
+                    int inner_thresh, int hev_thresh) {
+  // Left side of the filter window.
+  auto* const dst = static_cast<uint8_t*>(dest) - 3 * sizeof(uint16_t);
+  auto* const dst_0 = reinterpret_cast<uint16_t*>(dst);
+  auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride);
+  auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+  auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
+
+  // Overread by 2 values. These overreads become the high halves of src_raw[2]
+  // and src_raw[3] after transpose.
+  uint16x8_t src_raw[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1),
+                           vld1q_u16(dst_2), vld1q_u16(dst_3)};
+  Transpose4x8(src_raw);
+  // p2, p1, p0, q0, q1, q2
+  const uint16x4_t src[6] = {
+      vget_low_u16(src_raw[0]),  vget_low_u16(src_raw[1]),
+      vget_low_u16(src_raw[2]),  vget_low_u16(src_raw[3]),
+      vget_high_u16(src_raw[0]), vget_high_u16(src_raw[1]),
+  };
+
+  // Adjust thresholds to bitdepth.
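+  // The thresholds are specified for 8-bit pixels; 10-bit pixel values are 4x
+  // larger, so each threshold is scaled by 1 << (bitdepth - 8) = 4.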
+ outer_thresh <<= 2; + inner_thresh <<= 2; + hev_thresh <<= 2; + const uint16x4_t outer_mask = + OuterThreshold(src[1], src[2], src[3], src[4], outer_thresh); + uint16x4_t hev_mask; + uint16x4_t needs_filter_mask; + uint16x4_t is_flat3_mask; + const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]); + const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]); + const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]); + Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, + &needs_filter_mask, &is_flat3_mask, &hev_mask); + +#if defined(__aarch64__) + if (vaddv_u16(needs_filter_mask) == 0) { + // None of the values will be filtered. + return; + } +#else // !defined(__aarch64__) + // This might be faster than vaddv (latency 3) because mov to general register + // has latency 2. + const uint64x1_t needs_filter_mask64 = + vreinterpret_u64_u16(needs_filter_mask); + if (vget_lane_u64(needs_filter_mask64, 0) == 0) { + // None of the values will be filtered. + return; + } +#endif // defined(__aarch64__) + + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + // ZIP1 p0q0, p1q1 may perform better here. + const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]); + Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + + uint16x8_t p0q0_output, p1q1_output; + // Because we did not return after testing |needs_filter_mask| we know it is + // nonzero. |is_flat3_mask| controls whether the needed filter is Filter4 or + // Filter6. Therefore if it is false when |needs_filter_mask| is true, Filter6 + // output is not used. + uint16x8_t f6_p1q1, f6_p0q0; + const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask); + if (vget_lane_u64(need_filter6, 0) == 0) { + // Filter6() does not apply, but Filter4() applies to one or more values. + p0q0_output = p0q0; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + } else { + Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0); + p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } + + uint16x4_t output[4] = { + vget_low_u16(p1q1_output), + vget_low_u16(p0q0_output), + vget_high_u16(p0q0_output), + vget_high_u16(p1q1_output), + }; + Transpose4x4(output); + + // dst_n starts at p2, so adjust to p1. + vst1_u16(dst_0 + 1, output[0]); + vst1_u16(dst_1 + 1, output[1]); + vst1_u16(dst_2 + 1, output[2]); + vst1_u16(dst_3 + 1, output[3]); +} + +inline void Filter8(const uint16x8_t p3q3, const uint16x8_t p2q2, + const uint16x8_t p1q1, const uint16x8_t p0q0, + uint16x8_t* const p2q2_output, + uint16x8_t* const p1q1_output, + uint16x8_t* const p0q0_output) { + // Sum p2 and q2 output from opposite directions. + // The formula is regrouped to allow 2 doubling operations to be combined. 
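+  // The regrouping uses the identity
+  //   (3 * p3) + (2 * p2) = 2 * (p3 + p2) + p3,
+  // which lets both doublings collapse into the single vector shift below.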
+  // p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0
+  //      ^^^^^^^^
+  // q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3)
+  //                                ^^^^^^^^
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //                    ^^^^^^^^^^^
+  const uint16x8_t p23q23 = vaddq_u16(p3q3, p2q2);
+
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //               ^^^^^
+  uint16x8_t sum = vshlq_n_u16(p23q23, 1);
+
+  // Add two other terms to make dual issue with shift more likely.
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //                                   ^^^^^^^^^^^
+  const uint16x8_t p01q01 = vaddq_u16(p0q0, p1q1);
+
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //                                   ^^^^^^^^^^^^^
+  sum = vaddq_u16(sum, p01q01);
+
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //        ^^^^^^
+  sum = vaddq_u16(sum, p3q3);
+
+  // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+  //                                                 ^^^^^^
+  const uint16x8_t q0p0 = Transpose64(p0q0);
+  sum = vaddq_u16(sum, q0p0);
+
+  *p2q2_output = vrshrq_n_u16(sum, 3);
+
+  // Convert to p1 and q1 output:
+  // p1 = p2 - p3 - p2 + p1 + q1
+  // q1 = q2 - q3 - q2 + q0 + p1
+  sum = vsubq_u16(sum, p23q23);
+  const uint16x8_t q1p1 = Transpose64(p1q1);
+  sum = vaddq_u16(sum, vaddq_u16(p1q1, q1p1));
+
+  *p1q1_output = vrshrq_n_u16(sum, 3);
+
+  // Convert to p0 and q0 output:
+  // p0 = p1 - p3 - p1 + p0 + q2
+  // q0 = q1 - q3 - q1 + q0 + p2
+  sum = vsubq_u16(sum, vaddq_u16(p3q3, p1q1));
+  const uint16x8_t q2p2 = Transpose64(p2q2);
+  sum = vaddq_u16(sum, vaddq_u16(p0q0, q2p2));
+
+  *p0q0_output = vrshrq_n_u16(sum, 3);
+}
+
+void Horizontal8_NEON(void* const dest, const ptrdiff_t stride,
+                      int outer_thresh, int inner_thresh, int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest);
+  auto* const dst_p3 = reinterpret_cast<uint16_t*>(dst - 4 * stride);
+  auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride);
+  auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
+  auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
+  auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
+  auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
+  auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+  auto* const dst_q3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
+
+  const uint16x4_t src[8] = {
+      vld1_u16(dst_p3), vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0),
+      vld1_u16(dst_q0), vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3)};
+
+  // Adjust thresholds to bitdepth.
+  outer_thresh <<= 2;
+  inner_thresh <<= 2;
+  hev_thresh <<= 2;
+  const uint16x4_t outer_mask =
+      OuterThreshold(src[2], src[3], src[4], src[5], outer_thresh);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter_mask;
+  uint16x4_t is_flat4_mask;
+  const uint16x8_t p0q0 = vcombine_u16(src[3], src[4]);
+  const uint16x8_t p1q1 = vcombine_u16(src[2], src[5]);
+  const uint16x8_t p2q2 = vcombine_u16(src[1], src[6]);
+  const uint16x8_t p3q3 = vcombine_u16(src[0], src[7]);
+  Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+               &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u16(needs_filter_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#else   // !defined(__aarch64__)
+  // This might be faster than vaddv (latency 3) because mov to general
+  // register has latency 2.
+  const uint64x1_t needs_filter_mask64 =
+      vreinterpret_u64_u16(needs_filter_mask);
+  if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  // Copy the masks to the high bits for packed comparisons later.
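+  // Duplicating each 4-lane mask into both halves of a 128-bit vector lets a
+  // single vbslq_u16 select the p and q sides of a pNqN pair at once.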
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t needs_filter_mask_8 =
+      vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+  uint16x8_t f4_p1q1;
+  uint16x8_t f4_p0q0;
+  // ZIP1 p0q0, p1q1 may perform better here.
+  const uint16x8_t p0q1 = vcombine_u16(src[3], src[5]);
+  Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+  f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+  uint16x8_t p0q0_output, p1q1_output, p2q2_output;
+  // Because we did not return after testing |needs_filter_mask| we know it is
+  // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or
+  // Filter8. Therefore if it is false when |needs_filter_mask| is true,
+  // Filter8 output is not used.
+  uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+  const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+  if (vget_lane_u64(need_filter8, 0) == 0) {
+    // Filter8() does not apply, but Filter4() applies to one or more values.
+    p2q2_output = p2q2;
+    p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+  } else {
+    const uint16x8_t is_flat4_mask_8 =
+        vcombine_u16(is_flat4_mask, is_flat4_mask);
+    Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+    p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2);
+    p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1);
+    p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+    p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+  }
+
+  vst1_u16(dst_p2, vget_low_u16(p2q2_output));
+  vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+  vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+  vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+  vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+  vst1_u16(dst_q2, vget_high_u16(p2q2_output));
+}
+
+inline uint16x8_t ReverseLowHalf(const uint16x8_t a) {
+  return vcombine_u16(vrev64_u16(vget_low_u16(a)), vget_high_u16(a));
+}
+
+void Vertical8_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
+                    int inner_thresh, int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest) - 4 * sizeof(uint16_t);
+  auto* const dst_0 = reinterpret_cast<uint16_t*>(dst);
+  auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride);
+  auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+  auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
+
+  // src[n] contains p3, p2, p1, p0, q0, q1, q2, q3 for row n.
+  // To get the desired pNqN pairs, LoopFilterTranspose4x8 reverses one half of
+  // each row; ReverseLowHalf restores the order before storing.
+  uint16x8_t src[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
+                       vld1q_u16(dst_3)};
+
+  // src[0] = p0q0
+  // src[1] = p1q1
+  // src[2] = p2q2
+  // src[3] = p3q3
+  LoopFilterTranspose4x8(src);
+
+  // Adjust thresholds to bitdepth.
+  outer_thresh <<= 2;
+  inner_thresh <<= 2;
+  hev_thresh <<= 2;
+  const uint16x4_t outer_mask = OuterThreshold(
+      vget_low_u16(src[1]), vget_low_u16(src[0]), vget_high_u16(src[0]),
+      vget_high_u16(src[1]), outer_thresh);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter_mask;
+  uint16x4_t is_flat4_mask;
+  const uint16x8_t p0q0 = src[0];
+  const uint16x8_t p1q1 = src[1];
+  const uint16x8_t p2q2 = src[2];
+  const uint16x8_t p3q3 = src[3];
+  Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+               &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u16(needs_filter_mask) == 0) {
+    // None of the values will be filtered.
+ return; + } +#else // !defined(__aarch64__) + // This might be faster than vaddv (latency 3) because mov to general register + // has latency 2. + const uint64x1_t needs_filter_mask64 = + vreinterpret_u64_u16(needs_filter_mask); + if (vget_lane_u64(needs_filter_mask64, 0) == 0) { + // None of the values will be filtered. + return; + } +#endif // defined(__aarch64__) + + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1)); + Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + + uint16x8_t p0q0_output, p1q1_output, p2q2_output; + // Because we did not return after testing |needs_filter_mask| we know it is + // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or + // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8 + // output is not used. + const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); + if (vget_lane_u64(need_filter8, 0) == 0) { + // Filter8() does not apply, but Filter4() applies to one or more values. + p2q2_output = p2q2; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + } else { + const uint16x8_t is_flat4_mask_8 = + vcombine_u16(is_flat4_mask, is_flat4_mask); + uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; + Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); + p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2); + p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } + + uint16x8_t output[4] = {p0q0_output, p1q1_output, p2q2_output, p3q3}; + // After transpose, |output| will contain rows of the form: + // p0 p1 p2 p3 q0 q1 q2 q3 + Transpose4x8(output); + + // Reverse p values to produce original order: + // p3 p2 p1 p0 q0 q1 q2 q3 + vst1q_u16(dst_0, ReverseLowHalf(output[0])); + vst1q_u16(dst_1, ReverseLowHalf(output[1])); + vst1q_u16(dst_2, ReverseLowHalf(output[2])); + vst1q_u16(dst_3, ReverseLowHalf(output[3])); +} +inline void Filter14(const uint16x8_t p6q6, const uint16x8_t p5q5, + const uint16x8_t p4q4, const uint16x8_t p3q3, + const uint16x8_t p2q2, const uint16x8_t p1q1, + const uint16x8_t p0q0, uint16x8_t* const p5q5_output, + uint16x8_t* const p4q4_output, + uint16x8_t* const p3q3_output, + uint16x8_t* const p2q2_output, + uint16x8_t* const p1q1_output, + uint16x8_t* const p0q0_output) { + // Sum p5 and q5 output from opposite directions. 
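+  // Filter14 keeps a running sum: each later output row is derived from the
+  // previous sum by subtracting the taps that drop out and adding the taps
+  // that enter, so every row costs only a few adds/subs plus one rounding
+  // shift by 4.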
+  // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+  //      ^^^^^^^^
+  // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+  //                                                     ^^^^^^^^
+  const uint16x8_t p6q6_x7 = vsubq_u16(vshlq_n_u16(p6q6, 3), p6q6);
+
+  // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+  //                 ^^^^^^^^^^^^^^^^^^^
+  // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+  //                               ^^^^^^^^^^^^^^^^^^^
+  uint16x8_t sum = vshlq_n_u16(vaddq_u16(p5q5, p4q4), 1);
+  sum = vaddq_u16(sum, p6q6_x7);
+
+  // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+  //                                       ^^^^^^^
+  // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+  //                     ^^^^^^^
+  sum = vaddq_u16(vaddq_u16(p3q3, p2q2), sum);
+
+  // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+  //                                                 ^^^^^^^
+  // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+  //      ^^^^^^^
+  sum = vaddq_u16(vaddq_u16(p1q1, p0q0), sum);
+
+  // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+  //                                                           ^^
+  // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+  //      ^^
+  const uint16x8_t q0p0 = Transpose64(p0q0);
+  sum = vaddq_u16(sum, q0p0);
+
+  *p5q5_output = vrshrq_n_u16(sum, 4);
+
+  // Convert to p4 and q4 output:
+  // p4 = p5 - (2 * p6) + p3 + q1
+  // q4 = q5 - (2 * q6) + q3 + p1
+  sum = vsubq_u16(sum, vshlq_n_u16(p6q6, 1));
+  const uint16x8_t q1p1 = Transpose64(p1q1);
+  sum = vaddq_u16(vaddq_u16(p3q3, q1p1), sum);
+
+  *p4q4_output = vrshrq_n_u16(sum, 4);
+
+  // Convert to p3 and q3 output:
+  // p3 = p4 - p6 - p5 + p2 + q2
+  // q3 = q4 - q6 - q5 + q2 + p2
+  sum = vsubq_u16(sum, vaddq_u16(p6q6, p5q5));
+  const uint16x8_t q2p2 = Transpose64(p2q2);
+  sum = vaddq_u16(vaddq_u16(p2q2, q2p2), sum);
+
+  *p3q3_output = vrshrq_n_u16(sum, 4);
+
+  // Convert to p2 and q2 output:
+  // p2 = p3 - p6 - p4 + p1 + q3
+  // q2 = q3 - q6 - q4 + q1 + p3
+  sum = vsubq_u16(sum, vaddq_u16(p6q6, p4q4));
+  const uint16x8_t q3p3 = Transpose64(p3q3);
+  sum = vaddq_u16(vaddq_u16(p1q1, q3p3), sum);
+
+  *p2q2_output = vrshrq_n_u16(sum, 4);
+
+  // Convert to p1 and q1 output:
+  // p1 = p2 - p6 - p3 + p0 + q4
+  // q1 = q2 - q6 - q3 + q0 + p4
+  sum = vsubq_u16(sum, vaddq_u16(p6q6, p3q3));
+  const uint16x8_t q4p4 = Transpose64(p4q4);
+  sum = vaddq_u16(vaddq_u16(p0q0, q4p4), sum);
+
+  *p1q1_output = vrshrq_n_u16(sum, 4);
+
+  // Convert to p0 and q0 output:
+  // p0 = p1 - p6 - p2 + q0 + q5
+  // q0 = q1 - q6 - q2 + p0 + p5
+  sum = vsubq_u16(sum, vaddq_u16(p6q6, p2q2));
+  const uint16x8_t q5p5 = Transpose64(p5q5);
+  sum = vaddq_u16(vaddq_u16(q0p0, q5p5), sum);
+
+  *p0q0_output = vrshrq_n_u16(sum, 4);
+}
+
+void Horizontal14_NEON(void* const dest, const ptrdiff_t stride,
+                       int outer_thresh, int inner_thresh, int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest);
+  auto* const dst_p6 = reinterpret_cast<uint16_t*>(dst - 7 * stride);
+  auto* const dst_p5 = reinterpret_cast<uint16_t*>(dst - 6 * stride);
+  auto* const dst_p4 = reinterpret_cast<uint16_t*>(dst - 5 * stride);
+  auto* const dst_p3 = reinterpret_cast<uint16_t*>(dst - 4 * stride);
+  auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride);
+  auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
+  auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
+  auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
+  auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
+  auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+  auto* const dst_q3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
+  auto* const dst_q4 = reinterpret_cast<uint16_t*>(dst + 4 * stride);
+  auto* const dst_q5 = reinterpret_cast<uint16_t*>(dst + 5 * stride);
+  auto* const dst_q6 = reinterpret_cast<uint16_t*>(dst + 6
* stride); + + const uint16x4_t src[14] = { + vld1_u16(dst_p6), vld1_u16(dst_p5), vld1_u16(dst_p4), vld1_u16(dst_p3), + vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0), + vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3), vld1_u16(dst_q4), + vld1_u16(dst_q5), vld1_u16(dst_q6)}; + + // Adjust thresholds to bitdepth. + outer_thresh <<= 2; + inner_thresh <<= 2; + hev_thresh <<= 2; + const uint16x4_t outer_mask = + OuterThreshold(src[5], src[6], src[7], src[8], outer_thresh); + uint16x4_t hev_mask; + uint16x4_t needs_filter_mask; + uint16x4_t is_flat4_mask; + const uint16x8_t p0q0 = vcombine_u16(src[6], src[7]); + const uint16x8_t p1q1 = vcombine_u16(src[5], src[8]); + const uint16x8_t p2q2 = vcombine_u16(src[4], src[9]); + const uint16x8_t p3q3 = vcombine_u16(src[3], src[10]); + Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, + &needs_filter_mask, &is_flat4_mask, &hev_mask); + +#if defined(__aarch64__) + if (vaddv_u16(needs_filter_mask) == 0) { + // None of the values will be filtered. + return; + } +#else // !defined(__aarch64__) + // This might be faster than vaddv (latency 3) because mov to general register + // has latency 2. + const uint64x1_t needs_filter_mask64 = + vreinterpret_u64_u16(needs_filter_mask); + if (vget_lane_u64(needs_filter_mask64, 0) == 0) { + // None of the values will be filtered. + return; + } +#endif // defined(__aarch64__) + const uint16x8_t p4q4 = vcombine_u16(src[2], src[11]); + const uint16x8_t p5q5 = vcombine_u16(src[1], src[12]); + const uint16x8_t p6q6 = vcombine_u16(src[0], src[13]); + // Mask to choose between the outputs of Filter8 and Filter14. + // As with the derivation of |is_flat4_mask|, the question of whether to use + // Filter14 is only raised where |is_flat4_mask| is true. + const uint16x4_t is_flat4_outer_mask = vand_u16( + is_flat4_mask, IsFlat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5), + vabdq_u16(p0q0, p6q6))); + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + // ZIP1 p0q0, p1q1 may perform better here. + const uint16x8_t p0q1 = vcombine_u16(src[6], src[8]); + Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + + uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output, + p5q5_output; + // Because we did not return after testing |needs_filter_mask| we know it is + // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or + // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8 + // output is not used. + uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; + const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); + if (vget_lane_u64(need_filter8, 0) == 0) { + // Filter8() and Filter14() do not apply, but Filter4() applies to one or + // more values. 
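+    // Only p1/p0 and q1/q0 can change when just Filter4() applies; the wider
+    // rows keep their original values.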
+ p5q5_output = p5q5; + p4q4_output = p4q4; + p3q3_output = p3q3; + p2q2_output = p2q2; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + } else { + const uint16x8_t use_filter8_mask = + vcombine_u16(is_flat4_mask, is_flat4_mask); + Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); + const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask); + if (vget_lane_u64(need_filter14, 0) == 0) { + // Filter14() does not apply, but Filter8() and Filter4() apply to one or + // more values. + p5q5_output = p5q5; + p4q4_output = p4q4; + p3q3_output = p3q3; + p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2); + p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } else { + // All filters may contribute values to final outputs. + const uint16x8_t use_filter14_mask = + vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask); + uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0; + Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4, + &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0); + p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5); + p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4); + p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3); + p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2); + p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2); + p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2); + p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1); + p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0); + p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } + } + + vst1_u16(dst_p5, vget_low_u16(p5q5_output)); + vst1_u16(dst_p4, vget_low_u16(p4q4_output)); + vst1_u16(dst_p3, vget_low_u16(p3q3_output)); + vst1_u16(dst_p2, vget_low_u16(p2q2_output)); + vst1_u16(dst_p1, vget_low_u16(p1q1_output)); + vst1_u16(dst_p0, vget_low_u16(p0q0_output)); + vst1_u16(dst_q0, vget_high_u16(p0q0_output)); + vst1_u16(dst_q1, vget_high_u16(p1q1_output)); + vst1_u16(dst_q2, vget_high_u16(p2q2_output)); + vst1_u16(dst_q3, vget_high_u16(p3q3_output)); + vst1_u16(dst_q4, vget_high_u16(p4q4_output)); + vst1_u16(dst_q5, vget_high_u16(p5q5_output)); +} + +inline uint16x8x2_t PermuteACDB64(const uint16x8_t ab, const uint16x8_t cd) { + uint16x8x2_t acdb; +#if defined(__aarch64__) + // a[b] <- [c]d + acdb.val[0] = vreinterpretq_u16_u64( + vtrn1q_u64(vreinterpretq_u64_u16(ab), vreinterpretq_u64_u16(cd))); + // [a]b <- c[d] + acdb.val[1] = vreinterpretq_u16_u64( + vtrn2q_u64(vreinterpretq_u64_u16(cd), vreinterpretq_u64_u16(ab))); +#else + // a[b] <- [c]d + acdb.val[0] = vreinterpretq_u16_u64( + vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 0), + vreinterpretq_u64_u16(ab), 1)); + // [a]b <- c[d] + acdb.val[1] = vreinterpretq_u16_u64( + vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 1), + vreinterpretq_u64_u16(ab), 0)); +#endif // defined(__aarch64__) + return acdb; +} + +void Vertical14_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh, + int inner_thresh, int 
hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest) - 8 * sizeof(uint16_t);
+  auto* const dst_0 = reinterpret_cast<uint16_t*>(dst);
+  auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride);
+  auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+  auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
+
+  // Low halves:  p7 p6 p5 p4
+  // High halves: p3 p2 p1 p0
+  uint16x8_t src_p[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
+                         vld1q_u16(dst_3)};
+  // p7 will be the low half of src_p[0]. Not used until the end.
+  Transpose4x8(src_p);
+
+  // Low halves:  q0 q1 q2 q3
+  // High halves: q4 q5 q6 q7
+  uint16x8_t src_q[4] = {vld1q_u16(dst_0 + 8), vld1q_u16(dst_1 + 8),
+                         vld1q_u16(dst_2 + 8), vld1q_u16(dst_3 + 8)};
+  // q7 will be the high half of src_q[3]. Not used until the end.
+  Transpose4x8(src_q);
+
+  // Adjust thresholds to bitdepth.
+  outer_thresh <<= 2;
+  inner_thresh <<= 2;
+  hev_thresh <<= 2;
+  const uint16x4_t outer_mask = OuterThreshold(
+      vget_high_u16(src_p[2]), vget_high_u16(src_p[3]), vget_low_u16(src_q[0]),
+      vget_low_u16(src_q[1]), outer_thresh);
+  const uint16x8_t p0q0 = vextq_u16(src_p[3], src_q[0], 4);
+  const uint16x8_t p1q1 = vextq_u16(src_p[2], src_q[1], 4);
+  const uint16x8_t p2q2 = vextq_u16(src_p[1], src_q[2], 4);
+  const uint16x8_t p3q3 = vextq_u16(src_p[0], src_q[3], 4);
+  uint16x4_t hev_mask;
+  uint16x4_t needs_filter_mask;
+  uint16x4_t is_flat4_mask;
+  Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+               &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if defined(__aarch64__)
+  if (vaddv_u16(needs_filter_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#else   // !defined(__aarch64__)
+  // This might be faster than vaddv (latency 3) because mov to general
+  // register has latency 2.
+  const uint64x1_t needs_filter_mask64 =
+      vreinterpret_u64_u16(needs_filter_mask);
+  if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+  const uint16x8_t p4q4 =
+      vcombine_u16(vget_low_u16(src_p[3]), vget_high_u16(src_q[0]));
+  const uint16x8_t p5q5 =
+      vcombine_u16(vget_low_u16(src_p[2]), vget_high_u16(src_q[1]));
+  const uint16x8_t p6q6 =
+      vcombine_u16(vget_low_u16(src_p[1]), vget_high_u16(src_q[2]));
+  const uint16x8_t p7q7 =
+      vcombine_u16(vget_low_u16(src_p[0]), vget_high_u16(src_q[3]));
+  // Mask to choose between the outputs of Filter8 and Filter14.
+  // As with the derivation of |is_flat4_mask|, the question of whether to use
+  // Filter14 is only raised where |is_flat4_mask| is true.
+  const uint16x4_t is_flat4_outer_mask = vand_u16(
+      is_flat4_mask, IsFlat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5),
+                             vabdq_u16(p0q0, p6q6)));
+  // Copy the masks to the high bits for packed comparisons later.
+  const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+  const uint16x8_t needs_filter_mask_8 =
+      vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+  uint16x8_t f4_p1q1;
+  uint16x8_t f4_p0q0;
+  const uint16x8_t p0q1 =
+      vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1));
+  Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+  f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+  uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output,
+      p5q5_output;
+  // Because we did not return after testing |needs_filter_mask| we know it is
+  // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or
+  // Filter8. Therefore if it is false when |needs_filter_mask| is true,
+  // Filter8 output is not used.
+  uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+  const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+  if (vget_lane_u64(need_filter8, 0) == 0) {
+    // Filter8() and Filter14() do not apply, but Filter4() applies to one or
+    // more values.
+    p5q5_output = p5q5;
+    p4q4_output = p4q4;
+    p3q3_output = p3q3;
+    p2q2_output = p2q2;
+    p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+    p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+  } else {
+    const uint16x8_t use_filter8_mask =
+        vcombine_u16(is_flat4_mask, is_flat4_mask);
+    Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+    const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask);
+    if (vget_lane_u64(need_filter14, 0) == 0) {
+      // Filter14() does not apply, but Filter8() and Filter4() apply to one
+      // or more values.
+      p5q5_output = p5q5;
+      p4q4_output = p4q4;
+      p3q3_output = p3q3;
+      p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2);
+      p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1);
+      p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+      p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0);
+      p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+    } else {
+      // All filters may contribute values to final outputs.
+      const uint16x8_t use_filter14_mask =
+          vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask);
+      uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
+      Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
+               &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
+      p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5);
+      p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4);
+      p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3);
+      p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2);
+      p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2);
+      p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2);
+      p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1);
+      p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1);
+      p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+      p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0);
+      p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0);
+      p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+    }
+  }
+  // To get the correctly ordered rows from the transpose, we need:
+  // p7p3 p6p2 p5p1 p4p0
+  // q0q4 q1q5 q2q6 q3q7
+  const uint16x8x2_t p7p3_q3q7 = PermuteACDB64(p7q7, p3q3_output);
+  const uint16x8x2_t p6p2_q2q6 = PermuteACDB64(p6q6, p2q2_output);
+  const uint16x8x2_t p5p1_q1q5 = PermuteACDB64(p5q5_output, p1q1_output);
+  const uint16x8x2_t p4p0_q0q4 = PermuteACDB64(p4q4_output, p0q0_output);
+  uint16x8_t output_p[4] = {p7p3_q3q7.val[0], p6p2_q2q6.val[0],
+                            p5p1_q1q5.val[0], p4p0_q0q4.val[0]};
+  Transpose4x8(output_p);
+  uint16x8_t output_q[4] = {p4p0_q0q4.val[1], p5p1_q1q5.val[1],
+                            p6p2_q2q6.val[1], p7p3_q3q7.val[1]};
+  Transpose4x8(output_q);
+
+  // After the permute and transposes the rows are already in the original
+  // memory order:
+  // p7 p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 q7
+  vst1q_u16(dst_0, output_p[0]);
+  vst1q_u16(dst_0 + 8, output_q[0]);
+  vst1q_u16(dst_1, output_p[1]);
+  vst1q_u16(dst_1 + 8, output_q[1]);
+  vst1q_u16(dst_2, output_p[2]);
+  vst1q_u16(dst_2 + 8, output_q[2]);
+  vst1q_u16(dst_3, output_p[3]);
+  vst1q_u16(dst_3 + 8, output_q[3]);
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
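+  // Each assignment below overrides the entry in the 10bpp Dsp table
+  // (assuming the portable C versions were registered first); callers
+  // dispatch by loop filter size and edge direction.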
dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] = + Horizontal4_NEON; + dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = Vertical4_NEON; + dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] = + Horizontal6_NEON; + dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = Vertical6_NEON; + dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] = + Horizontal8_NEON; + dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = Vertical8_NEON; + dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] = + Horizontal14_NEON; + dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] = + Vertical14_NEON; +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void LoopFilterInit_NEON() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_ENABLE_NEON +namespace libgav1 { +namespace dsp { + +void LoopFilterInit_NEON() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_ENABLE_NEON diff --git a/src/dsp/arm/loop_filter_neon.h b/src/dsp/arm/loop_filter_neon.h new file mode 100644 index 0000000..540defc --- /dev/null +++ b/src/dsp/arm/loop_filter_neon.h @@ -0,0 +1,70 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_ARM_LOOP_FILTER_NEON_H_ +#define LIBGAV1_SRC_DSP_ARM_LOOP_FILTER_NEON_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::loop_filters, see the defines below for specifics. This +// function is not thread-safe. 
+void LoopFilterInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+
+#define LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeHorizontal \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeVertical LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeHorizontal \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeVertical LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeHorizontal \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeVertical LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeHorizontal \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeVertical LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeHorizontal \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeVertical LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeHorizontal \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeVertical LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeHorizontal \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeVertical LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeHorizontal \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeVertical \
+  LIBGAV1_CPU_NEON
+
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_LOOP_FILTER_NEON_H_
diff --git a/src/dsp/arm/loop_restoration_10bit_neon.cc b/src/dsp/arm/loop_restoration_10bit_neon.cc
new file mode 100644
index 0000000..410bc20
--- /dev/null
+++ b/src/dsp/arm/loop_restoration_10bit_neon.cc
@@ -0,0 +1,2652 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+//------------------------------------------------------------------------------
+// Wiener
+
+// Must make a local copy of coefficients to help the compiler know that they
+// have no overlap with other buffers. Using the 'const' keyword is not enough.
+// In practice the compiler doesn't make a copy, since there are enough
+// registers in this case.
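+// For reference, a scalar sketch of the symmetric 7-tap Wiener filter the
+// NEON code below vectorizes; filter[3] is the center tap (hypothetical
+// helper, not part of the library):
+//   inline int32_t WienerTap7(const uint16_t* s, const int16_t filter[4]) {
+//     return filter[0] * (s[0] + s[6]) + filter[1] * (s[1] + s[5]) +
+//            filter[2] * (s[2] + s[4]) + filter[3] * s[3];
+//   }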
+inline void PopulateWienerCoefficients( + const RestorationUnitInfo& restoration_info, const int direction, + int16_t filter[4]) { + for (int i = 0; i < 4; ++i) { + filter[i] = restoration_info.wiener_info.filter[direction][i]; + } +} + +inline int32x4x2_t WienerHorizontal2(const uint16x8_t s0, const uint16x8_t s1, + const int16_t filter, + const int32x4x2_t sum) { + const int16x8_t ss = vreinterpretq_s16_u16(vaddq_u16(s0, s1)); + int32x4x2_t res; + res.val[0] = vmlal_n_s16(sum.val[0], vget_low_s16(ss), filter); + res.val[1] = vmlal_n_s16(sum.val[1], vget_high_s16(ss), filter); + return res; +} + +inline void WienerHorizontalSum(const uint16x8_t s[3], const int16_t filter[4], + int32x4x2_t sum, int16_t* const wiener_buffer) { + constexpr int offset = + 1 << (kBitdepth10 + kWienerFilterBits - kInterRoundBitsHorizontal - 1); + constexpr int limit = (offset << 2) - 1; + const int16x8_t s_0_2 = vreinterpretq_s16_u16(vaddq_u16(s[0], s[2])); + const int16x8_t s_1 = vreinterpretq_s16_u16(s[1]); + int16x4x2_t sum16; + sum.val[0] = vmlal_n_s16(sum.val[0], vget_low_s16(s_0_2), filter[2]); + sum.val[0] = vmlal_n_s16(sum.val[0], vget_low_s16(s_1), filter[3]); + sum16.val[0] = vqshrn_n_s32(sum.val[0], kInterRoundBitsHorizontal); + sum16.val[0] = vmax_s16(sum16.val[0], vdup_n_s16(-offset)); + sum16.val[0] = vmin_s16(sum16.val[0], vdup_n_s16(limit - offset)); + vst1_s16(wiener_buffer, sum16.val[0]); + sum.val[1] = vmlal_n_s16(sum.val[1], vget_high_s16(s_0_2), filter[2]); + sum.val[1] = vmlal_n_s16(sum.val[1], vget_high_s16(s_1), filter[3]); + sum16.val[1] = vqshrn_n_s32(sum.val[1], kInterRoundBitsHorizontal); + sum16.val[1] = vmax_s16(sum16.val[1], vdup_n_s16(-offset)); + sum16.val[1] = vmin_s16(sum16.val[1], vdup_n_s16(limit - offset)); + vst1_s16(wiener_buffer + 4, sum16.val[1]); +} + +inline void WienerHorizontalTap7(const uint16_t* src, + const ptrdiff_t src_stride, + const ptrdiff_t wiener_stride, + const ptrdiff_t width, const int height, + const int16_t filter[4], + int16_t** const wiener_buffer) { + const ptrdiff_t src_width = + width + ((kRestorationHorizontalBorder - 1) * sizeof(*src)); + for (int y = height; y != 0; --y) { + const uint16_t* src_ptr = src; + uint16x8_t s[8]; + s[0] = vld1q_u16(src_ptr); + ptrdiff_t x = wiener_stride; + ptrdiff_t valid_bytes = src_width * 2; + do { + src_ptr += 8; + valid_bytes -= 16; + s[7] = Load1QMsanU16(src_ptr, 16 - valid_bytes); + s[1] = vextq_u16(s[0], s[7], 1); + s[2] = vextq_u16(s[0], s[7], 2); + s[3] = vextq_u16(s[0], s[7], 3); + s[4] = vextq_u16(s[0], s[7], 4); + s[5] = vextq_u16(s[0], s[7], 5); + s[6] = vextq_u16(s[0], s[7], 6); + int32x4x2_t sum; + sum.val[0] = sum.val[1] = + vdupq_n_s32(1 << (kInterRoundBitsHorizontal - 1)); + sum = WienerHorizontal2(s[0], s[6], filter[0], sum); + sum = WienerHorizontal2(s[1], s[5], filter[1], sum); + WienerHorizontalSum(s + 2, filter, sum, *wiener_buffer); + s[0] = s[7]; + *wiener_buffer += 8; + x -= 8; + } while (x != 0); + src += src_stride; + } +} + +inline void WienerHorizontalTap5(const uint16_t* src, + const ptrdiff_t src_stride, + const ptrdiff_t wiener_stride, + const ptrdiff_t width, const int height, + const int16_t filter[4], + int16_t** const wiener_buffer) { + const ptrdiff_t src_width = + width + ((kRestorationHorizontalBorder - 1) * sizeof(*src)); + for (int y = height; y != 0; --y) { + const uint16_t* src_ptr = src; + uint16x8_t s[6]; + s[0] = vld1q_u16(src_ptr); + ptrdiff_t x = wiener_stride; + ptrdiff_t valid_bytes = src_width * 2; + do { + src_ptr += 8; + valid_bytes -= 16; + s[5] = 
Load1QMsanU16(src_ptr, 16 - valid_bytes); + s[1] = vextq_u16(s[0], s[5], 1); + s[2] = vextq_u16(s[0], s[5], 2); + s[3] = vextq_u16(s[0], s[5], 3); + s[4] = vextq_u16(s[0], s[5], 4); + + int32x4x2_t sum; + sum.val[0] = sum.val[1] = + vdupq_n_s32(1 << (kInterRoundBitsHorizontal - 1)); + sum = WienerHorizontal2(s[0], s[4], filter[1], sum); + WienerHorizontalSum(s + 1, filter, sum, *wiener_buffer); + s[0] = s[5]; + *wiener_buffer += 8; + x -= 8; + } while (x != 0); + src += src_stride; + } +} + +inline void WienerHorizontalTap3(const uint16_t* src, + const ptrdiff_t src_stride, + const ptrdiff_t width, const int height, + const int16_t filter[4], + int16_t** const wiener_buffer) { + for (int y = height; y != 0; --y) { + const uint16_t* src_ptr = src; + uint16x8_t s[3]; + ptrdiff_t x = width; + do { + s[0] = vld1q_u16(src_ptr); + s[1] = vld1q_u16(src_ptr + 1); + s[2] = vld1q_u16(src_ptr + 2); + + int32x4x2_t sum; + sum.val[0] = sum.val[1] = + vdupq_n_s32(1 << (kInterRoundBitsHorizontal - 1)); + WienerHorizontalSum(s, filter, sum, *wiener_buffer); + src_ptr += 8; + *wiener_buffer += 8; + x -= 8; + } while (x != 0); + src += src_stride; + } +} + +inline void WienerHorizontalTap1(const uint16_t* src, + const ptrdiff_t src_stride, + const ptrdiff_t width, const int height, + int16_t** const wiener_buffer) { + for (int y = height; y != 0; --y) { + ptrdiff_t x = 0; + do { + const uint16x8_t s = vld1q_u16(src + x); + const int16x8_t d = vreinterpretq_s16_u16(vshlq_n_u16(s, 4)); + vst1q_s16(*wiener_buffer + x, d); + x += 8; + } while (x < width); + src += src_stride; + *wiener_buffer += width; + } +} + +inline int32x4x2_t WienerVertical2(const int16x8_t a0, const int16x8_t a1, + const int16_t filter, + const int32x4x2_t sum) { + int32x4x2_t d; + d.val[0] = vmlal_n_s16(sum.val[0], vget_low_s16(a0), filter); + d.val[1] = vmlal_n_s16(sum.val[1], vget_high_s16(a0), filter); + d.val[0] = vmlal_n_s16(d.val[0], vget_low_s16(a1), filter); + d.val[1] = vmlal_n_s16(d.val[1], vget_high_s16(a1), filter); + return d; +} + +inline uint16x8_t WienerVertical(const int16x8_t a[3], const int16_t filter[4], + const int32x4x2_t sum) { + int32x4x2_t d = WienerVertical2(a[0], a[2], filter[2], sum); + d.val[0] = vmlal_n_s16(d.val[0], vget_low_s16(a[1]), filter[3]); + d.val[1] = vmlal_n_s16(d.val[1], vget_high_s16(a[1]), filter[3]); + const uint16x4_t sum_lo_16 = vqrshrun_n_s32(d.val[0], 11); + const uint16x4_t sum_hi_16 = vqrshrun_n_s32(d.val[1], 11); + return vcombine_u16(sum_lo_16, sum_hi_16); +} + +inline uint16x8_t WienerVerticalTap7Kernel(const int16_t* const wiener_buffer, + const ptrdiff_t wiener_stride, + const int16_t filter[4], + int16x8_t a[7]) { + int32x4x2_t sum; + a[0] = vld1q_s16(wiener_buffer + 0 * wiener_stride); + a[1] = vld1q_s16(wiener_buffer + 1 * wiener_stride); + a[5] = vld1q_s16(wiener_buffer + 5 * wiener_stride); + a[6] = vld1q_s16(wiener_buffer + 6 * wiener_stride); + sum.val[0] = sum.val[1] = vdupq_n_s32(0); + sum = WienerVertical2(a[0], a[6], filter[0], sum); + sum = WienerVertical2(a[1], a[5], filter[1], sum); + a[2] = vld1q_s16(wiener_buffer + 2 * wiener_stride); + a[3] = vld1q_s16(wiener_buffer + 3 * wiener_stride); + a[4] = vld1q_s16(wiener_buffer + 4 * wiener_stride); + return WienerVertical(a + 2, filter, sum); +} + +inline uint16x8x2_t WienerVerticalTap7Kernel2( + const int16_t* const wiener_buffer, const ptrdiff_t wiener_stride, + const int16_t filter[4]) { + int16x8_t a[8]; + int32x4x2_t sum; + uint16x8x2_t d; + d.val[0] = WienerVerticalTap7Kernel(wiener_buffer, wiener_stride, filter, a); 
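+  // The second output row shares six of the seven input rows with the first;
+  // only row 7 needs an extra load before re-running the symmetric taps.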
+ a[7] = vld1q_s16(wiener_buffer + 7 * wiener_stride); + sum.val[0] = sum.val[1] = vdupq_n_s32(0); + sum = WienerVertical2(a[1], a[7], filter[0], sum); + sum = WienerVertical2(a[2], a[6], filter[1], sum); + d.val[1] = WienerVertical(a + 3, filter, sum); + return d; +} + +inline void WienerVerticalTap7(const int16_t* wiener_buffer, + const ptrdiff_t width, const int height, + const int16_t filter[4], uint16_t* dst, + const ptrdiff_t dst_stride) { + const uint16x8_t v_max_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1); + for (int y = height >> 1; y != 0; --y) { + uint16_t* dst_ptr = dst; + ptrdiff_t x = width; + do { + uint16x8x2_t d[2]; + d[0] = WienerVerticalTap7Kernel2(wiener_buffer + 0, width, filter); + d[1] = WienerVerticalTap7Kernel2(wiener_buffer + 8, width, filter); + vst1q_u16(dst_ptr, vminq_u16(d[0].val[0], v_max_bitdepth)); + vst1q_u16(dst_ptr + 8, vminq_u16(d[1].val[0], v_max_bitdepth)); + vst1q_u16(dst_ptr + dst_stride, vminq_u16(d[0].val[1], v_max_bitdepth)); + vst1q_u16(dst_ptr + 8 + dst_stride, + vminq_u16(d[1].val[1], v_max_bitdepth)); + wiener_buffer += 16; + dst_ptr += 16; + x -= 16; + } while (x != 0); + wiener_buffer += width; + dst += 2 * dst_stride; + } + + if ((height & 1) != 0) { + ptrdiff_t x = width; + do { + int16x8_t a[7]; + const uint16x8_t d0 = + WienerVerticalTap7Kernel(wiener_buffer + 0, width, filter, a); + const uint16x8_t d1 = + WienerVerticalTap7Kernel(wiener_buffer + 8, width, filter, a); + vst1q_u16(dst, vminq_u16(d0, v_max_bitdepth)); + vst1q_u16(dst + 8, vminq_u16(d1, v_max_bitdepth)); + wiener_buffer += 16; + dst += 16; + x -= 16; + } while (x != 0); + } +} + +inline uint16x8_t WienerVerticalTap5Kernel(const int16_t* const wiener_buffer, + const ptrdiff_t wiener_stride, + const int16_t filter[4], + int16x8_t a[5]) { + a[0] = vld1q_s16(wiener_buffer + 0 * wiener_stride); + a[1] = vld1q_s16(wiener_buffer + 1 * wiener_stride); + a[2] = vld1q_s16(wiener_buffer + 2 * wiener_stride); + a[3] = vld1q_s16(wiener_buffer + 3 * wiener_stride); + a[4] = vld1q_s16(wiener_buffer + 4 * wiener_stride); + int32x4x2_t sum; + sum.val[0] = sum.val[1] = vdupq_n_s32(0); + sum = WienerVertical2(a[0], a[4], filter[1], sum); + return WienerVertical(a + 1, filter, sum); +} + +inline uint16x8x2_t WienerVerticalTap5Kernel2( + const int16_t* const wiener_buffer, const ptrdiff_t wiener_stride, + const int16_t filter[4]) { + int16x8_t a[6]; + int32x4x2_t sum; + uint16x8x2_t d; + d.val[0] = WienerVerticalTap5Kernel(wiener_buffer, wiener_stride, filter, a); + a[5] = vld1q_s16(wiener_buffer + 5 * wiener_stride); + sum.val[0] = sum.val[1] = vdupq_n_s32(0); + sum = WienerVertical2(a[1], a[5], filter[1], sum); + d.val[1] = WienerVertical(a + 2, filter, sum); + return d; +} + +inline void WienerVerticalTap5(const int16_t* wiener_buffer, + const ptrdiff_t width, const int height, + const int16_t filter[4], uint16_t* dst, + const ptrdiff_t dst_stride) { + const uint16x8_t v_max_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1); + for (int y = height >> 1; y != 0; --y) { + uint16_t* dst_ptr = dst; + ptrdiff_t x = width; + do { + uint16x8x2_t d[2]; + d[0] = WienerVerticalTap5Kernel2(wiener_buffer + 0, width, filter); + d[1] = WienerVerticalTap5Kernel2(wiener_buffer + 8, width, filter); + vst1q_u16(dst_ptr, vminq_u16(d[0].val[0], v_max_bitdepth)); + vst1q_u16(dst_ptr + 8, vminq_u16(d[1].val[0], v_max_bitdepth)); + vst1q_u16(dst_ptr + dst_stride, vminq_u16(d[0].val[1], v_max_bitdepth)); + vst1q_u16(dst_ptr + 8 + dst_stride, + vminq_u16(d[1].val[1], v_max_bitdepth)); + wiener_buffer += 16; + 
dst_ptr += 16; + x -= 16; + } while (x != 0); + wiener_buffer += width; + dst += 2 * dst_stride; + } + + if ((height & 1) != 0) { + ptrdiff_t x = width; + do { + int16x8_t a[5]; + const uint16x8_t d0 = + WienerVerticalTap5Kernel(wiener_buffer + 0, width, filter, a); + const uint16x8_t d1 = + WienerVerticalTap5Kernel(wiener_buffer + 8, width, filter, a); + vst1q_u16(dst, vminq_u16(d0, v_max_bitdepth)); + vst1q_u16(dst + 8, vminq_u16(d1, v_max_bitdepth)); + wiener_buffer += 16; + dst += 16; + x -= 16; + } while (x != 0); + } +} + +inline uint16x8_t WienerVerticalTap3Kernel(const int16_t* const wiener_buffer, + const ptrdiff_t wiener_stride, + const int16_t filter[4], + int16x8_t a[3]) { + a[0] = vld1q_s16(wiener_buffer + 0 * wiener_stride); + a[1] = vld1q_s16(wiener_buffer + 1 * wiener_stride); + a[2] = vld1q_s16(wiener_buffer + 2 * wiener_stride); + int32x4x2_t sum; + sum.val[0] = sum.val[1] = vdupq_n_s32(0); + return WienerVertical(a, filter, sum); +} + +inline uint16x8x2_t WienerVerticalTap3Kernel2( + const int16_t* const wiener_buffer, const ptrdiff_t wiener_stride, + const int16_t filter[4]) { + int16x8_t a[4]; + int32x4x2_t sum; + uint16x8x2_t d; + d.val[0] = WienerVerticalTap3Kernel(wiener_buffer, wiener_stride, filter, a); + a[3] = vld1q_s16(wiener_buffer + 3 * wiener_stride); + sum.val[0] = sum.val[1] = vdupq_n_s32(0); + d.val[1] = WienerVertical(a + 1, filter, sum); + return d; +} + +inline void WienerVerticalTap3(const int16_t* wiener_buffer, + const ptrdiff_t width, const int height, + const int16_t filter[4], uint16_t* dst, + const ptrdiff_t dst_stride) { + const uint16x8_t v_max_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1); + + for (int y = height >> 1; y != 0; --y) { + uint16_t* dst_ptr = dst; + ptrdiff_t x = width; + do { + uint16x8x2_t d[2]; + d[0] = WienerVerticalTap3Kernel2(wiener_buffer + 0, width, filter); + d[1] = WienerVerticalTap3Kernel2(wiener_buffer + 8, width, filter); + + vst1q_u16(dst_ptr, vminq_u16(d[0].val[0], v_max_bitdepth)); + vst1q_u16(dst_ptr + 8, vminq_u16(d[1].val[0], v_max_bitdepth)); + vst1q_u16(dst_ptr + dst_stride, vminq_u16(d[0].val[1], v_max_bitdepth)); + vst1q_u16(dst_ptr + 8 + dst_stride, + vminq_u16(d[1].val[1], v_max_bitdepth)); + + wiener_buffer += 16; + dst_ptr += 16; + x -= 16; + } while (x != 0); + wiener_buffer += width; + dst += 2 * dst_stride; + } + + if ((height & 1) != 0) { + ptrdiff_t x = width; + do { + int16x8_t a[3]; + const uint16x8_t d0 = + WienerVerticalTap3Kernel(wiener_buffer + 0, width, filter, a); + const uint16x8_t d1 = + WienerVerticalTap3Kernel(wiener_buffer + 8, width, filter, a); + vst1q_u16(dst, vminq_u16(d0, v_max_bitdepth)); + vst1q_u16(dst + 8, vminq_u16(d1, v_max_bitdepth)); + wiener_buffer += 16; + dst += 16; + x -= 16; + } while (x != 0); + } +} + +inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer, + uint16_t* const dst) { + const uint16x8_t v_max_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1); + const int16x8_t a0 = vld1q_s16(wiener_buffer + 0); + const int16x8_t a1 = vld1q_s16(wiener_buffer + 8); + const int16x8_t d0 = vrshrq_n_s16(a0, 4); + const int16x8_t d1 = vrshrq_n_s16(a1, 4); + vst1q_u16(dst, vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(d0, vdupq_n_s16(0))), + v_max_bitdepth)); + vst1q_u16(dst + 8, + vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(d1, vdupq_n_s16(0))), + v_max_bitdepth)); +} + +inline void WienerVerticalTap1(const int16_t* wiener_buffer, + const ptrdiff_t width, const int height, + uint16_t* dst, const ptrdiff_t dst_stride) { + for (int y = height >> 1; y != 0; --y) { + 
uint16_t* dst_ptr = dst;
+    ptrdiff_t x = width;
+    do {
+      WienerVerticalTap1Kernel(wiener_buffer, dst_ptr);
+      WienerVerticalTap1Kernel(wiener_buffer + width, dst_ptr + dst_stride);
+      wiener_buffer += 16;
+      dst_ptr += 16;
+      x -= 16;
+    } while (x != 0);
+    wiener_buffer += width;
+    dst += 2 * dst_stride;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = width;
+    do {
+      WienerVerticalTap1Kernel(wiener_buffer, dst);
+      wiener_buffer += 16;
+      dst += 16;
+      x -= 16;
+    } while (x != 0);
+  }
+}
+
+// For width 16 and up, store the horizontal results, and then do the vertical
+// filter row by row. This is faster than doing it column by column when
+// considering cache issues.
+void WienerFilter_NEON(
+    const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+    const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_border,
+    const ptrdiff_t top_border_stride,
+    const void* LIBGAV1_RESTRICT const bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+    void* LIBGAV1_RESTRICT const dest) {
+  const int16_t* const number_leading_zero_coefficients =
+      restoration_info.wiener_info.number_leading_zero_coefficients;
+  const int number_rows_to_skip = std::max(
+      static_cast<int>(
+          number_leading_zero_coefficients[WienerInfo::kVertical]),
+      1);
+  const ptrdiff_t wiener_stride = Align(width, 16);
+  int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
+  // The values are saturated to 13 bits before storing.
+  int16_t* wiener_buffer_horizontal =
+      wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
+  int16_t filter_horizontal[(kWienerFilterTaps + 1) / 2];
+  int16_t filter_vertical[(kWienerFilterTaps + 1) / 2];
+  PopulateWienerCoefficients(restoration_info, WienerInfo::kHorizontal,
+                             filter_horizontal);
+  PopulateWienerCoefficients(restoration_info, WienerInfo::kVertical,
+                             filter_vertical);
+  // horizontal filtering.
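+  // The vertical pass consumes kWienerFilterTaps - 1 = 6 extra rows around
+  // the block, less one row at the top and bottom per skipped (leading zero)
+  // vertical coefficient, hence height + 6 - 2 * number_rows_to_skip below.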
+  const int height_horizontal =
+      height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+  const int height_extra = (height_horizontal - height) >> 1;
+  assert(height_extra <= 2);
+  const auto* const src = static_cast<const uint16_t*>(source);
+  const auto* const top = static_cast<const uint16_t*>(top_border);
+  const auto* const bottom = static_cast<const uint16_t*>(bottom_border);
+  if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+    WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
+                         top_border_stride, wiener_stride, width, height_extra,
+                         filter_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap7(src - 3, stride, wiener_stride, width, height,
+                         filter_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride,
+                         width, height_extra, filter_horizontal,
+                         &wiener_buffer_horizontal);
+  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+    WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
+                         top_border_stride, wiener_stride, width, height_extra,
+                         filter_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap5(src - 2, stride, wiener_stride, width, height,
+                         filter_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride,
+                         width, height_extra, filter_horizontal,
+                         &wiener_buffer_horizontal);
+  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+    WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
+                         top_border_stride, wiener_stride, height_extra,
+                         filter_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+                         filter_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
+                         height_extra, filter_horizontal,
+                         &wiener_buffer_horizontal);
+  } else {
+    assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+    WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
+                         top_border_stride, wiener_stride, height_extra,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap1(src, stride, wiener_stride, height,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
+                         height_extra, &wiener_buffer_horizontal);
+  }
+
+  // vertical filtering.
+  auto* dst = static_cast<uint16_t*>(dest);
+  if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+    // Because the top row of |source| is a duplicate of the second row, and
+    // the bottom row of |source| is a duplicate of its above row, we can
+    // duplicate the top and bottom row of |wiener_buffer| accordingly.
+    memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
+           sizeof(*wiener_buffer_horizontal) * wiener_stride);
+    memcpy(restoration_buffer->wiener_buffer,
+           restoration_buffer->wiener_buffer + wiener_stride,
+           sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
+    WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
+                       filter_vertical, dst, stride);
+  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+    WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
+                       height, filter_vertical, dst, stride);
+  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+    WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
+                       wiener_stride, height, filter_vertical, dst, stride);
+  } else {
+    assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+    WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
+                       wiener_stride, height, dst, stride);
+  }
+}
+
+//------------------------------------------------------------------------------
+// SGR
+
+// SIMD overreads 8 - (width % 8) - 2 * padding pixels, where padding is 3 for
+// Pass 1 and 2 for Pass 2.
+constexpr int kOverreadInBytesPass1 = 4;
+constexpr int kOverreadInBytesPass2 = 8;
+
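+// The two constants capture the worst case, width % 8 == 0: 2 overread pixels
+// (4 bytes) for Pass 1 and 4 pixels (8 bytes) for Pass 2. Each load site below
+// adds back the bytes that are actually valid, and the Load1QMsan*() helpers
+// (from common_neon.h) use the result as the number of trailing bytes of the
+// 16-byte load that may be uninitialized, masking them off under
+// MemorySanitizer. A non-positive value means the whole vector is valid; e.g.
+// the first load in BoxSum() passes
+// kOverreadInBytesPass1 - sizeof(uint16_t) * width, which is -12 or less for
+// width >= 8. (This reading of the helpers is an assumption based on their
+// call sites here.)
+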
+inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+                               uint16x8_t dst[2]) {
+  dst[0] = vld1q_u16(src[0] + x);
+  dst[1] = vld1q_u16(src[1] + x);
+}
+
+inline void LoadAligned16x2U16Msan(const uint16_t* const src[2],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   uint16x8_t dst[2]) {
+  dst[0] = Load1QMsanU16(src[0] + x, sizeof(**src) * (x + 8 - border));
+  dst[1] = Load1QMsanU16(src[1] + x, sizeof(**src) * (x + 8 - border));
+}
+
+inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+                               uint16x8_t dst[3]) {
+  dst[0] = vld1q_u16(src[0] + x);
+  dst[1] = vld1q_u16(src[1] + x);
+  dst[2] = vld1q_u16(src[2] + x);
+}
+
+inline void LoadAligned16x3U16Msan(const uint16_t* const src[3],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   uint16x8_t dst[3]) {
+  dst[0] = Load1QMsanU16(src[0] + x, sizeof(**src) * (x + 8 - border));
+  dst[1] = Load1QMsanU16(src[1] + x, sizeof(**src) * (x + 8 - border));
+  dst[2] = Load1QMsanU16(src[2] + x, sizeof(**src) * (x + 8 - border));
+}
+
+inline void LoadAligned32U32(const uint32_t* const src, uint32x4_t dst[2]) {
+  dst[0] = vld1q_u32(src + 0);
+  dst[1] = vld1q_u32(src + 4);
+}
+
+inline void LoadAligned32U32Msan(const uint32_t* const src, const ptrdiff_t x,
+                                 const ptrdiff_t border, uint32x4_t dst[2]) {
+  dst[0] = Load1QMsanU32(src + x + 0, sizeof(*src) * (x + 4 - border));
+  dst[1] = Load1QMsanU32(src + x + 4, sizeof(*src) * (x + 8 - border));
+}
+
+inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+                               uint32x4_t dst[2][2]) {
+  LoadAligned32U32(src[0] + x, dst[0]);
+  LoadAligned32U32(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned32x2U32Msan(const uint32_t* const src[2],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   uint32x4_t dst[2][2]) {
+  LoadAligned32U32Msan(src[0], x, border, dst[0]);
+  LoadAligned32U32Msan(src[1], x, border, dst[1]);
+}
+
+inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+                               uint32x4_t dst[3][2]) {
+  LoadAligned32U32(src[0] + x, dst[0]);
+  LoadAligned32U32(src[1] + x, dst[1]);
+  LoadAligned32U32(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned32x3U32Msan(const uint32_t* const src[3],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   uint32x4_t dst[3][2]) {
+  LoadAligned32U32Msan(src[0], x, border, dst[0]);
+  LoadAligned32U32Msan(src[1], x, border, dst[1]);
+  LoadAligned32U32Msan(src[2], x, border, dst[2]);
+}
+
+inline void StoreAligned32U16(uint16_t* const dst, const uint16x8_t src[2]) {
+  vst1q_u16(dst + 0, src[0]);
+  vst1q_u16(dst + 8, src[1]);
+}
+
+inline void StoreAligned32U32(uint32_t* const dst, const uint32x4_t src[2]) {
+  vst1q_u32(dst + 0, src[0]);
+  vst1q_u32(dst + 4, src[1]);
+}
+
+inline void StoreAligned64U32(uint32_t* const dst, const uint32x4_t src[4]) {
+  StoreAligned32U32(dst + 0, src + 0);
+  StoreAligned32U32(dst + 8, src + 2);
+}
+
+inline uint16x8_t VaddwLo8(const uint16x8_t src0, const uint8x16_t src1) {
+  const uint8x8_t s1 = vget_low_u8(src1);
+  return vaddw_u8(src0, s1);
+}
+
+inline uint16x8_t VaddwHi8(const uint16x8_t src0, const uint8x16_t src1) {
+  const uint8x8_t s1 = vget_high_u8(src1);
+  return vaddw_u8(src0, s1);
+}
+
+inline uint32x4_t VmullLo16(const uint16x8_t src0, const uint16x8_t src1) {
+  return vmull_u16(vget_low_u16(src0), vget_low_u16(src1));
+}
+
+inline uint32x4_t VmullHi16(const uint16x8_t src0, const uint16x8_t src1) {
+  return vmull_u16(vget_high_u16(src0), vget_high_u16(src1));
+}
+
+template <int bytes>
+inline uint8x8_t VshrU128(const uint8x8x2_t src) {
+  return vext_u8(src.val[0], src.val[1], bytes);
+}
+
+template <int bytes>
+inline uint8x8_t VshrU128(const uint8x8_t src[2]) {
+  return vext_u8(src[0], src[1], bytes);
+}
+
+template <int bytes>
+inline uint8x16_t VshrU128(const uint8x16_t src[2]) {
+  return vextq_u8(src[0], src[1], bytes);
+}
+
+template <int bytes>
+inline uint16x8_t VshrU128(const uint16x8x2_t src) {
+  return vextq_u16(src.val[0], src.val[1], bytes / 2);
+}
+
+template <int bytes>
+inline uint16x8_t VshrU128(const uint16x8_t src[2]) {
+  return vextq_u16(src[0], src[1], bytes / 2);
+}
+
+inline uint32x4_t Square(uint16x4_t s) { return vmull_u16(s, s); }
+
+inline void Square(const uint16x8_t src, uint32x4_t dst[2]) {
+  const uint16x4_t s_lo = vget_low_u16(src);
+  const uint16x4_t s_hi = vget_high_u16(src);
+  dst[0] = Square(s_lo);
+  dst[1] = Square(s_hi);
+}
+
+template <int offset>
+inline void Prepare3_8(const uint8x16_t src[2], uint8x16_t dst[3]) {
+  dst[0] = VshrU128<offset + 0>(src);
+  dst[1] = VshrU128<offset + 1>(src);
+  dst[2] = VshrU128<offset + 2>(src);
+}
+
+inline void Prepare3_16(const uint16x8_t src[2], uint16x8_t dst[3]) {
+  dst[0] = src[0];
+  dst[1] = vextq_u16(src[0], src[1], 1);
+  dst[2] = vextq_u16(src[0], src[1], 2);
+}
+
+template <int offset>
+inline void Prepare5_8(const uint8x16_t src[2], uint8x16_t dst[5]) {
+  dst[0] = VshrU128<offset + 0>(src);
+  dst[1] = VshrU128<offset + 1>(src);
+  dst[2] = VshrU128<offset + 2>(src);
+  dst[3] = VshrU128<offset + 3>(src);
+  dst[4] = VshrU128<offset + 4>(src);
+}
+
+inline void Prepare5_16(const uint16x8_t src[2], uint16x8_t dst[5]) {
+  dst[0] = src[0];
+  dst[1] = vextq_u16(src[0], src[1], 1);
+  dst[2] = vextq_u16(src[0], src[1], 2);
+  dst[3] = vextq_u16(src[0], src[1], 3);
+  dst[4] = vextq_u16(src[0], src[1], 4);
+}
+
+inline void Prepare3_32(const uint32x4_t src[2], uint32x4_t dst[3]) {
+  dst[0] = src[0];
+  dst[1] = vextq_u32(src[0], src[1], 1);
+  dst[2] = vextq_u32(src[0], src[1], 2);
+}
+
+inline void Prepare5_32(const uint32x4_t src[2], uint32x4_t dst[5]) {
+  Prepare3_32(src, dst);
+  dst[3] = vextq_u32(src[0], src[1], 3);
+  dst[4] = src[1];
+}
+
+inline uint16x8_t Sum3WLo16(const uint8x16_t src[3]) {
+  const uint16x8_t sum = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[1]));
+  return vaddw_u8(sum, vget_low_u8(src[2]));
+}
+
+inline uint16x8_t Sum3WHi16(const uint8x16_t src[3]) {
+  const uint16x8_t sum = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[1]));
+  return vaddw_u8(sum, vget_high_u8(src[2]));
+}
+
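+// The Prepare*/Sum* helpers below implement box sums as shifted whole-vector
+// windows rather than per-pixel gathers: Prepare5_16() builds src[x .. x+7],
+// src[x+1 .. x+8], ..., src[x+4 .. x+11] with vextq_u16(), and Sum5_16() adds
+// the five vectors. Scalar equivalent of one output lane (sketch):
+//
+//   sum5[x] = s[x] + s[x + 1] + s[x + 2] + s[x + 3] + s[x + 4];
+//
+// so eight 5-tap horizontal box sums cost four vector additions.
+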
+inline uint16x8_t Sum3_16(const uint16x8_t src0, const uint16x8_t src1,
+                          const uint16x8_t src2) {
+  const uint16x8_t sum = vaddq_u16(src0, src1);
+  return vaddq_u16(sum, src2);
+}
+
+inline uint16x8_t Sum3_16(const uint16x8_t src[3]) {
+  return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline uint32x4_t Sum3_32(const uint32x4_t src0, const uint32x4_t src1,
+                          const uint32x4_t src2) {
+  const uint32x4_t sum = vaddq_u32(src0, src1);
+  return vaddq_u32(sum, src2);
+}
+
+inline uint32x4_t Sum3_32(const uint32x4_t src[3]) {
+  return Sum3_32(src[0], src[1], src[2]);
+}
+
+inline void Sum3_32(const uint32x4_t src[3][2], uint32x4_t dst[2]) {
+  dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+  dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline uint16x8_t Sum5_16(const uint16x8_t src[5]) {
+  const uint16x8_t sum01 = vaddq_u16(src[0], src[1]);
+  const uint16x8_t sum23 = vaddq_u16(src[2], src[3]);
+  const uint16x8_t sum = vaddq_u16(sum01, sum23);
+  return vaddq_u16(sum, src[4]);
+}
+
+inline uint32x4_t Sum5_32(const uint32x4_t* src0, const uint32x4_t* src1,
+                          const uint32x4_t* src2, const uint32x4_t* src3,
+                          const uint32x4_t* src4) {
+  const uint32x4_t sum01 = vaddq_u32(*src0, *src1);
+  const uint32x4_t sum23 = vaddq_u32(*src2, *src3);
+  const uint32x4_t sum = vaddq_u32(sum01, sum23);
+  return vaddq_u32(sum, *src4);
+}
+
+inline uint32x4_t Sum5_32(const uint32x4_t src[5]) {
+  return Sum5_32(&src[0], &src[1], &src[2], &src[3], &src[4]);
+}
+
+inline void Sum5_32(const uint32x4_t src[5][2], uint32x4_t dst[2]) {
+  dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+  dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline uint16x8_t Sum3Horizontal16(const uint16x8_t src[2]) {
+  uint16x8_t s[3];
+  Prepare3_16(src, s);
+  return Sum3_16(s);
+}
+
+inline void Sum3Horizontal32(const uint32x4_t src[3], uint32x4_t dst[2]) {
+  uint32x4_t s[3];
+  Prepare3_32(src + 0, s);
+  dst[0] = Sum3_32(s);
+  Prepare3_32(src + 1, s);
+  dst[1] = Sum3_32(s);
+}
+
+inline uint16x8_t Sum5Horizontal16(const uint16x8_t src[2]) {
+  uint16x8_t s[5];
+  Prepare5_16(src, s);
+  return Sum5_16(s);
+}
+
+inline void Sum5Horizontal32(const uint32x4_t src[3], uint32x4_t dst[2]) {
+  uint32x4_t s[5];
+  Prepare5_32(src + 0, s);
+  dst[0] = Sum5_32(s);
+  Prepare5_32(src + 1, s);
+  dst[1] = Sum5_32(s);
+}
+
+void SumHorizontal16(const uint16x8_t src[2], uint16x8_t* const row3,
+                     uint16x8_t* const row5) {
+  uint16x8_t s[5];
+  Prepare5_16(src, s);
+  const uint16x8_t sum04 = vaddq_u16(s[0], s[4]);
+  *row3 = Sum3_16(s + 1);
+  *row5 = vaddq_u16(sum04, *row3);
+}
+
+inline void SumHorizontal16(const uint16x8_t src[3], uint16x8_t* const row3_0,
+                            uint16x8_t* const row3_1, uint16x8_t* const row5_0,
+                            uint16x8_t* const row5_1) {
+  SumHorizontal16(src + 0, row3_0, row5_0);
+  SumHorizontal16(src + 1, row3_1, row5_1);
+}
+
+void SumHorizontal32(const uint32x4_t src[5], uint32x4_t* const row_sq3,
+                     uint32x4_t* const row_sq5) {
+  const uint32x4_t sum04 = vaddq_u32(src[0], src[4]);
+  *row_sq3 = Sum3_32(src + 1);
+  *row_sq5 = vaddq_u32(sum04, *row_sq3);
+}
+
+inline void SumHorizontal32(const uint32x4_t src[3],
+                            uint32x4_t* const row_sq3_0,
+                            uint32x4_t* const row_sq3_1,
+                            uint32x4_t* const row_sq5_0,
+                            uint32x4_t* const row_sq5_1) {
+  uint32x4_t s[5];
+  Prepare5_32(src + 0, s);
+  SumHorizontal32(s, row_sq3_0, row_sq5_0);
+  Prepare5_32(src + 1, s);
+  SumHorizontal32(s, row_sq3_1, row_sq5_1);
+}
+
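+// The 343/444/565 helpers below build the weighted cross-row combinations the
+// SGR passes use when the |ma|/|b| intermediates are computed on every other
+// row only. From the code:
+//   Sum343(a, b, c) = 3 * (a + b + c) + b = 3a + 4b + 3c
+//   Sum565(a, b, c) = 5 * (a + b + c) + b = 5a + 6b + 5c
+// and the 444 variant in Store343_444() is 4 * (a + b + c). Folding the
+// skipped row's weight into its neighbors halves the rows of intermediates
+// that have to be kept around.
+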
+inline uint16x8_t Sum343Lo(const uint8x16_t ma3[3]) {
+  const uint16x8_t sum = Sum3WLo16(ma3);
+  const uint16x8_t sum3 = Sum3_16(sum, sum, sum);
+  return VaddwLo8(sum3, ma3[1]);
+}
+
+inline uint16x8_t Sum343Hi(const uint8x16_t ma3[3]) {
+  const uint16x8_t sum = Sum3WHi16(ma3);
+  const uint16x8_t sum3 = Sum3_16(sum, sum, sum);
+  return VaddwHi8(sum3, ma3[1]);
+}
+
+inline uint32x4_t Sum343(const uint32x4_t src[3]) {
+  const uint32x4_t sum = Sum3_32(src);
+  const uint32x4_t sum3 = Sum3_32(sum, sum, sum);
+  return vaddq_u32(sum3, src[1]);
+}
+
+inline void Sum343(const uint32x4_t src[3], uint32x4_t dst[2]) {
+  uint32x4_t s[3];
+  Prepare3_32(src + 0, s);
+  dst[0] = Sum343(s);
+  Prepare3_32(src + 1, s);
+  dst[1] = Sum343(s);
+}
+
+inline uint16x8_t Sum565Lo(const uint8x16_t src[3]) {
+  const uint16x8_t sum = Sum3WLo16(src);
+  const uint16x8_t sum4 = vshlq_n_u16(sum, 2);
+  const uint16x8_t sum5 = vaddq_u16(sum4, sum);
+  return VaddwLo8(sum5, src[1]);
+}
+
+inline uint16x8_t Sum565Hi(const uint8x16_t src[3]) {
+  const uint16x8_t sum = Sum3WHi16(src);
+  const uint16x8_t sum4 = vshlq_n_u16(sum, 2);
+  const uint16x8_t sum5 = vaddq_u16(sum4, sum);
+  return VaddwHi8(sum5, src[1]);
+}
+
+inline uint32x4_t Sum565(const uint32x4_t src[3]) {
+  const uint32x4_t sum = Sum3_32(src);
+  const uint32x4_t sum4 = vshlq_n_u32(sum, 2);
+  const uint32x4_t sum5 = vaddq_u32(sum4, sum);
+  return vaddq_u32(sum5, src[1]);
+}
+
+inline void Sum565(const uint32x4_t src[3], uint32x4_t dst[2]) {
+  uint32x4_t s[3];
+  Prepare3_32(src + 0, s);
+  dst[0] = Sum565(s);
+  Prepare3_32(src + 1, s);
+  dst[1] = Sum565(s);
+}
+
+inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride,
+                   const ptrdiff_t width, const ptrdiff_t sum_stride,
+                   const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
+                   uint32_t* square_sum3, uint32_t* square_sum5) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1 - sizeof(*src) * width;
+  int y = 2;
+  do {
+    uint16x8_t s[3];
+    uint32x4_t sq[6];
+    s[0] = Load1QMsanU16(src, overread_in_bytes);
+    Square(s[0], sq);
+    ptrdiff_t x = sum_width;
+    do {
+      uint16x8_t row3[2], row5[2];
+      uint32x4_t row_sq3[2], row_sq5[2];
+      s[1] = Load1QMsanU16(
+          src + 8, overread_in_bytes + sizeof(*src) * (sum_width - x + 8));
+      x -= 16;
+      src += 16;
+      s[2] = Load1QMsanU16(src,
+                           overread_in_bytes + sizeof(*src) * (sum_width - x));
+      Square(s[1], sq + 2);
+      Square(s[2], sq + 4);
+      SumHorizontal16(s, &row3[0], &row3[1], &row5[0], &row5[1]);
+      StoreAligned32U16(sum3, row3);
+      StoreAligned32U16(sum5, row5);
+      SumHorizontal32(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0],
+                      &row_sq5[1]);
+      StoreAligned32U32(square_sum3 + 0, row_sq3);
+      StoreAligned32U32(square_sum5 + 0, row_sq5);
+      SumHorizontal32(sq + 2, &row_sq3[0], &row_sq3[1], &row_sq5[0],
+                      &row_sq5[1]);
+      StoreAligned32U32(square_sum3 + 8, row_sq3);
+      StoreAligned32U32(square_sum5 + 8, row_sq5);
+      s[0] = s[2];
+      sq[0] = sq[4];
+      sq[1] = sq[5];
+      sum3 += 16;
+      sum5 += 16;
+      square_sum3 += 16;
+      square_sum5 += 16;
+    } while (x != 0);
+    src += src_stride - sum_width;
+    sum3 += sum_stride - sum_width;
+    sum5 += sum_stride - sum_width;
+    square_sum3 += sum_stride - sum_width;
+    square_sum5 += sum_stride - sum_width;
+  } while (--y != 0);
+}
+
+template <int size>
+inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride,
+                   const ptrdiff_t width, const ptrdiff_t sum_stride,
+                   const ptrdiff_t sum_width, uint16_t* sums,
+                   uint32_t* square_sums) {
+  static_assert(size == 3 || size == 5, "");
+  const ptrdiff_t overread_in_bytes =
+      ((size == 5) ? kOverreadInBytesPass1 : kOverreadInBytesPass2) -
+      sizeof(*src) * width;
+  int y = 2;
+  do {
+    uint16x8_t s[3];
+    uint32x4_t sq[6];
+    s[0] = Load1QMsanU16(src, overread_in_bytes);
+    Square(s[0], sq);
+    ptrdiff_t x = sum_width;
+    do {
+      uint16x8_t row[2];
+      uint32x4_t row_sq[4];
+      s[1] = Load1QMsanU16(
+          src + 8, overread_in_bytes + sizeof(*src) * (sum_width - x + 8));
+      x -= 16;
+      src += 16;
+      s[2] = Load1QMsanU16(src,
+                           overread_in_bytes + sizeof(*src) * (sum_width - x));
+      Square(s[1], sq + 2);
+      Square(s[2], sq + 4);
+      if (size == 3) {
+        row[0] = Sum3Horizontal16(s + 0);
+        row[1] = Sum3Horizontal16(s + 1);
+        Sum3Horizontal32(sq + 0, row_sq + 0);
+        Sum3Horizontal32(sq + 2, row_sq + 2);
+      } else {
+        row[0] = Sum5Horizontal16(s + 0);
+        row[1] = Sum5Horizontal16(s + 1);
+        Sum5Horizontal32(sq + 0, row_sq + 0);
+        Sum5Horizontal32(sq + 2, row_sq + 2);
+      }
+      StoreAligned32U16(sums, row);
+      StoreAligned64U32(square_sums, row_sq);
+      s[0] = s[2];
+      sq[0] = sq[4];
+      sq[1] = sq[5];
+      sums += 16;
+      square_sums += 16;
+    } while (x != 0);
+    src += src_stride - sum_width;
+    sums += sum_stride - sum_width;
+    square_sums += sum_stride - sum_width;
+  } while (--y != 0);
+}
+
+template <int n>
+inline uint16x4_t CalculateMa(const uint16x4_t sum, const uint32x4_t sum_sq,
+                              const uint32_t scale) {
+  // a = |sum_sq|
+  // d = |sum|
+  // p = (a * n < d * d) ? 0 : a * n - d * d;
+  const uint32x4_t dxd = vmull_u16(sum, sum);
+  const uint32x4_t axn = vmulq_n_u32(sum_sq, n);
+  // Ensure |p| does not underflow by using saturating subtraction.
+  const uint32x4_t p = vqsubq_u32(axn, dxd);
+  const uint32x4_t pxs = vmulq_n_u32(p, scale);
+  // vrshrn_n_u32() (narrowing shift) can only shift by up to 16, and
+  // kSgrProjScaleBits is 20.
+  const uint32x4_t shifted = vrshrq_n_u32(pxs, kSgrProjScaleBits);
+  return vmovn_u32(shifted);
+}
+
+template <int n>
+inline uint16x8_t CalculateMa(const uint16x8_t sum, const uint32x4_t sum_sq[2],
+                              const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
+  const uint16x8_t b = vrshrq_n_u16(sum, 2);
+  const uint16x4_t sum_lo = vget_low_u16(b);
+  const uint16x4_t sum_hi = vget_high_u16(b);
+  const uint16x4_t z0 =
+      CalculateMa<n>(sum_lo, vrshrq_n_u32(sum_sq[0], 4), scale);
+  const uint16x4_t z1 =
+      CalculateMa<n>(sum_hi, vrshrq_n_u32(sum_sq[1], 4), scale);
+  return vcombine_u16(z0, z1);
+}
+
+inline void CalculateB5(const uint16x8_t sum, const uint16x8_t ma,
+                        uint32x4_t b[2]) {
+  // one_over_n == 164.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+  // one_over_n_quarter == 41.
+  constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+  static_assert(one_over_n == one_over_n_quarter << 2, "");
+  // |ma| is in range [0, 255].
+  const uint32x4_t m2 = VmullLo16(ma, sum);
+  const uint32x4_t m3 = VmullHi16(ma, sum);
+  const uint32x4_t m0 = vmulq_n_u32(m2, one_over_n_quarter);
+  const uint32x4_t m1 = vmulq_n_u32(m3, one_over_n_quarter);
+  b[0] = vrshrq_n_u32(m0, kSgrProjReciprocalBits - 2);
+  b[1] = vrshrq_n_u32(m1, kSgrProjReciprocalBits - 2);
+}
+
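+// A rough sanity check of CalculateMa(): in a perfectly flat window every
+// pixel equals v, so d * d == a * n (625 * v * v in the 5x5 case) and the
+// saturating subtraction leaves p == 0. Index 0 then selects kSgrMaLookup[0],
+// the largest |ma|, and the output leans almost entirely on the windowed
+// mean; a high-variance window yields a large index, a small |ma| and |b|,
+// and the source pixel passes through nearly unchanged. The extra >> 2 and
+// >> 4 shifts in the 8-lane overload rescale the 10-bit sums into the range
+// the 8-bit-oriented lookup table expects.
+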
+inline void CalculateB3(const uint16x8_t sum, const uint16x8_t ma,
+                        uint32x4_t b[2]) {
+  // one_over_n == 455.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
+  const uint32x4_t m0 = VmullLo16(ma, sum);
+  const uint32x4_t m1 = VmullHi16(ma, sum);
+  const uint32x4_t m2 = vmulq_n_u32(m0, one_over_n);
+  const uint32x4_t m3 = vmulq_n_u32(m1, one_over_n);
+  b[0] = vrshrq_n_u32(m2, kSgrProjReciprocalBits);
+  b[1] = vrshrq_n_u32(m3, kSgrProjReciprocalBits);
+}
+
+inline void CalculateSumAndIndex3(const uint16x8_t s3[3],
+                                  const uint32x4_t sq3[3][2],
+                                  const uint32_t scale, uint16x8_t* const sum,
+                                  uint16x8_t* const index) {
+  uint32x4_t sum_sq[2];
+  *sum = Sum3_16(s3);
+  Sum3_32(sq3, sum_sq);
+  *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex5(const uint16x8_t s5[5],
+                                  const uint32x4_t sq5[5][2],
+                                  const uint32_t scale, uint16x8_t* const sum,
+                                  uint16x8_t* const index) {
+  uint32x4_t sum_sq[2];
+  *sum = Sum5_16(s5);
+  Sum5_32(sq5, sum_sq);
+  *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+template <int n, int offset>
+inline void LookupIntermediate(const uint16x8_t sum, const uint16x8_t index,
+                               uint8x16_t* const ma, uint32x4_t b[2]) {
+  static_assert(n == 9 || n == 25, "");
+  static_assert(offset == 0 || offset == 8, "");
+
+  const uint8x8_t idx = vqmovn_u16(index);
+  uint8_t temp[8];
+  vst1_u8(temp, idx);
+  *ma = vsetq_lane_u8(kSgrMaLookup[temp[0]], *ma, offset + 0);
+  *ma = vsetq_lane_u8(kSgrMaLookup[temp[1]], *ma, offset + 1);
+  *ma = vsetq_lane_u8(kSgrMaLookup[temp[2]], *ma, offset + 2);
+  *ma = vsetq_lane_u8(kSgrMaLookup[temp[3]], *ma, offset + 3);
+  *ma = vsetq_lane_u8(kSgrMaLookup[temp[4]], *ma, offset + 4);
+  *ma = vsetq_lane_u8(kSgrMaLookup[temp[5]], *ma, offset + 5);
+  *ma = vsetq_lane_u8(kSgrMaLookup[temp[6]], *ma, offset + 6);
+  *ma = vsetq_lane_u8(kSgrMaLookup[temp[7]], *ma, offset + 7);
+  // b = ma * b * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+  const uint16x8_t maq =
+      vmovl_u8((offset == 0) ? vget_low_u8(*ma) : vget_high_u8(*ma));
+  if (n == 9) {
+    CalculateB3(sum, maq, b);
+  } else {
+    CalculateB5(sum, maq, b);
+  }
+}
+
+inline uint8x8_t AdjustValue(const uint8x8_t value, const uint8x8_t index,
+                             const int threshold) {
+  const uint8x8_t thresholds = vdup_n_u8(threshold);
+  const uint8x8_t offset = vcgt_u8(index, thresholds);
+  // Adding 255 is equivalent to subtracting 1 for 8-bit data.
+  return vadd_u8(value, offset);
+}
+
+inline uint8x8_t MaLookupAndAdjust(const uint8x8x4_t table0,
+                                   const uint8x8x2_t table1,
+                                   const uint16x8_t index) {
+  const uint8x8_t idx = vqmovn_u16(index);
+  // All elements whose indices are out of range [0, 47] are set to 0.
+  uint8x8_t val = vtbl4_u8(table0, idx);  // Range [0, 31].
+  // Subtract 32 to shuffle the next index range.
+  const uint8x8_t sub_idx = vsub_u8(idx, vdup_n_u8(32));
+  const uint8x8_t res = vtbl2_u8(table1, sub_idx);  // Range [32, 47].
+  // Use OR instruction to combine shuffle results together.
+  val = vorr_u8(val, res);
+
+  // For elements whose indices are larger than 47, since their values seldom
+  // change as the index increases, we use comparison and arithmetic
+  // operations to calculate them.
+  // Elements whose indices are larger than 47 (with value 0) are set to 5.
+  val = vmax_u8(val, vdup_n_u8(5));
+  val = AdjustValue(val, idx, 55);   // 55 is the last index whose value is 5.
+  val = AdjustValue(val, idx, 72);   // 72 is the last index whose value is 4.
+  val = AdjustValue(val, idx, 101);  // 101 is the last index whose value is 3.
+  val = AdjustValue(val, idx, 169);  // 169 is the last index whose value is 2.
+  val = AdjustValue(val, idx, 254);  // 254 is the last index whose value is 1.
+  return val;
+}
+
+inline void CalculateIntermediate(const uint16x8_t sum[2],
+                                  const uint16x8_t index[2],
+                                  uint8x16_t* const ma, uint32x4_t b0[2],
+                                  uint32x4_t b1[2]) {
+  // Use table lookup to read elements whose indices are less than 48.
+  // Using one uint8x8x4_t vector and one uint8x8x2_t vector is faster than
+  // using two uint8x8x3_t vectors.
+  uint8x8x4_t table0;
+  uint8x8x2_t table1;
+  table0.val[0] = vld1_u8(kSgrMaLookup + 0 * 8);
+  table0.val[1] = vld1_u8(kSgrMaLookup + 1 * 8);
+  table0.val[2] = vld1_u8(kSgrMaLookup + 2 * 8);
+  table0.val[3] = vld1_u8(kSgrMaLookup + 3 * 8);
+  table1.val[0] = vld1_u8(kSgrMaLookup + 4 * 8);
+  table1.val[1] = vld1_u8(kSgrMaLookup + 5 * 8);
+  const uint8x8_t ma_lo = MaLookupAndAdjust(table0, table1, index[0]);
+  const uint8x8_t ma_hi = MaLookupAndAdjust(table0, table1, index[1]);
+  *ma = vcombine_u8(ma_lo, ma_hi);
+  // b = ma * b * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
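+  // Both halves below go through CalculateB3() because this overload only
+  // serves the 3x3 (Pass 2) box: its callers feed it sums from
+  // CalculateSumAndIndex3(). The 5x5 box takes the
+  // LookupIntermediate<25, offset>() route instead. Per lane, with
+  // kSgrProjReciprocalBits == 12, this computes (sketch):
+  //   b = (ma * sum * 455 + (1 << 11)) >> 12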
+  const uint16x8_t maq0 = vmovl_u8(vget_low_u8(*ma));
+  CalculateB3(sum[0], maq0, b0);
+  const uint16x8_t maq1 = vmovl_u8(vget_high_u8(*ma));
+  CalculateB3(sum[1], maq1, b1);
+}
+
+inline void CalculateIntermediate(const uint16x8_t sum[2],
+                                  const uint16x8_t index[2], uint8x16_t ma[2],
+                                  uint32x4_t b[4]) {
+  uint8x16_t mas;
+  CalculateIntermediate(sum, index, &mas, b + 0, b + 2);
+  ma[0] = vcombine_u8(vget_low_u8(ma[0]), vget_low_u8(mas));
+  ma[1] = vextq_u8(mas, vdupq_n_u8(0), 8);
+}
+
+template <int offset>
+inline void CalculateIntermediate5(const uint16x8_t s5[5],
+                                   const uint32x4_t sq5[5][2],
+                                   const uint32_t scale, uint8x16_t* const ma,
+                                   uint32x4_t b[2]) {
+  static_assert(offset == 0 || offset == 8, "");
+  uint16x8_t sum, index;
+  CalculateSumAndIndex5(s5, sq5, scale, &sum, &index);
+  LookupIntermediate<25, offset>(sum, index, ma, b);
+}
+
+inline void CalculateIntermediate3(const uint16x8_t s3[3],
+                                   const uint32x4_t sq3[3][2],
+                                   const uint32_t scale, uint8x16_t* const ma,
+                                   uint32x4_t b[2]) {
+  uint16x8_t sum, index;
+  CalculateSumAndIndex3(s3, sq3, scale, &sum, &index);
+  LookupIntermediate<9, 0>(sum, index, ma, b);
+}
+
+inline void Store343_444(const uint32x4_t b3[3], const ptrdiff_t x,
+                         uint32x4_t sum_b343[2], uint32x4_t sum_b444[2],
+                         uint32_t* const b343, uint32_t* const b444) {
+  uint32x4_t b[3], sum_b111[2];
+  Prepare3_32(b3 + 0, b);
+  sum_b111[0] = Sum3_32(b);
+  sum_b444[0] = vshlq_n_u32(sum_b111[0], 2);
+  sum_b343[0] = vsubq_u32(sum_b444[0], sum_b111[0]);
+  sum_b343[0] = vaddq_u32(sum_b343[0], b[1]);
+  Prepare3_32(b3 + 1, b);
+  sum_b111[1] = Sum3_32(b);
+  sum_b444[1] = vshlq_n_u32(sum_b111[1], 2);
+  sum_b343[1] = vsubq_u32(sum_b444[1], sum_b111[1]);
+  sum_b343[1] = vaddq_u32(sum_b343[1], b[1]);
+  StoreAligned32U32(b444 + x, sum_b444);
+  StoreAligned32U32(b343 + x, sum_b343);
+}
+
+inline void Store343_444Lo(const uint8x16_t ma3[3], const uint32x4_t b3[3],
+                           const ptrdiff_t x, uint16x8_t* const sum_ma343,
+                           uint16x8_t* const sum_ma444, uint32x4_t sum_b343[2],
+                           uint32x4_t sum_b444[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  const uint16x8_t sum_ma111 = Sum3WLo16(ma3);
+  *sum_ma444 = vshlq_n_u16(sum_ma111, 2);
+  vst1q_u16(ma444 + x, *sum_ma444);
+  const uint16x8_t sum333 = vsubq_u16(*sum_ma444, sum_ma111);
+  *sum_ma343 = VaddwLo8(sum333, ma3[1]);
+  vst1q_u16(ma343 + x, *sum_ma343);
+  Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Hi(const uint8x16_t ma3[3], const uint32x4_t b3[2],
+                           const ptrdiff_t x, uint16x8_t* const sum_ma343,
+                           uint16x8_t* const sum_ma444, uint32x4_t sum_b343[2],
+                           uint32x4_t sum_b444[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  const uint16x8_t sum_ma111 = Sum3WHi16(ma3);
+  *sum_ma444 = vshlq_n_u16(sum_ma111, 2);
+  vst1q_u16(ma444 + x, *sum_ma444);
+  const uint16x8_t sum333 = vsubq_u16(*sum_ma444, sum_ma111);
+  *sum_ma343 = VaddwHi8(sum333, ma3[1]);
+  vst1q_u16(ma343 + x, *sum_ma343);
+  Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Lo(const uint8x16_t ma3[3], const uint32x4_t b3[2],
+                           const ptrdiff_t x, uint16x8_t* const sum_ma343,
+                           uint32x4_t sum_b343[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  uint16x8_t sum_ma444;
+  uint32x4_t sum_b444[2];
+  Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+                 ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const uint8x16_t ma3[3], const uint32x4_t b3[2],
+                           const ptrdiff_t x, uint16x8_t* const sum_ma343,
+                           uint32x4_t sum_b343[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  uint16x8_t sum_ma444;
+  uint32x4_t sum_b444[2];
+  Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+                 ma444, b343, b444);
+}
+
+inline void Store343_444Lo(const uint8x16_t ma3[3], const uint32x4_t b3[2],
+                           const ptrdiff_t x, uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  uint16x8_t sum_ma343;
+  uint32x4_t sum_b343[2];
+  Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const uint8x16_t ma3[3], const uint32x4_t b3[2],
+                           const ptrdiff_t x, uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  uint16x8_t sum_ma343;
+  uint32x4_t sum_b343[2];
+  Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
+    const uint16x8_t s[2][4], const uint32_t scale, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], uint32x4_t sq[2][8], uint8x16_t* const ma,
+    uint32x4_t b[2]) {
+  uint16x8_t s5[2][5];
+  uint32x4_t sq5[5][2];
+  Square(s[0][1], sq[0] + 2);
+  Square(s[1][1], sq[1] + 2);
+  s5[0][3] = Sum5Horizontal16(s[0]);
+  vst1q_u16(sum5[3], s5[0][3]);
+  s5[0][4] = Sum5Horizontal16(s[1]);
+  vst1q_u16(sum5[4], s5[0][4]);
+  Sum5Horizontal32(sq[0], sq5[3]);
+  StoreAligned32U32(square_sum5[3], sq5[3]);
+  Sum5Horizontal32(sq[1], sq5[4]);
+  StoreAligned32U32(square_sum5[4], sq5[4]);
+  LoadAligned16x3U16(sum5, 0, s5[0]);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  CalculateIntermediate5<0>(s5[0], sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+    const uint16x8_t s[2][4], const ptrdiff_t sum_width, const ptrdiff_t x,
+    const uint32_t scale, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], uint32x4_t sq[2][8], uint8x16_t ma[2],
+    uint32x4_t b[6]) {
+  uint16x8_t s5[2][5];
+  uint32x4_t sq5[5][2];
+  Square(s[0][2], sq[0] + 4);
+  Square(s[1][2], sq[1] + 4);
+  s5[0][3] = Sum5Horizontal16(s[0] + 1);
+  s5[1][3] = Sum5Horizontal16(s[0] + 2);
+  vst1q_u16(sum5[3] + x + 0, s5[0][3]);
+  vst1q_u16(sum5[3] + x + 8, s5[1][3]);
+  s5[0][4] = Sum5Horizontal16(s[1] + 1);
+  s5[1][4] = Sum5Horizontal16(s[1] + 2);
+  vst1q_u16(sum5[4] + x + 0, s5[0][4]);
+  vst1q_u16(sum5[4] + x + 8, s5[1][4]);
+  Sum5Horizontal32(sq[0] + 2, sq5[3]);
+  StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+  Sum5Horizontal32(sq[1] + 2, sq5[4]);
+  StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+  LoadAligned16x3U16(sum5, x, s5[0]);
+  LoadAligned32x3U32(square_sum5, x, sq5);
+  CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], b + 2);
+
+  Square(s[0][3], sq[0] + 6);
+  Square(s[1][3], sq[1] + 6);
+  Sum5Horizontal32(sq[0] + 4, sq5[3]);
+  StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+  Sum5Horizontal32(sq[1] + 4, sq5[4]);
+  StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
+  LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+  LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+  CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], b + 4);
+}
+
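+// The *LastRow* variants below handle the bottom border, where the row below
+// the last one does not exist: the row-4 sums are simply copies of the row-3
+// sums (s5[4] = s5[3], sq5[4] = sq5[3]), i.e. the 5-row column sum degenerates
+// to a + b + c + d + d, matching the duplication-based border extension used
+// elsewhere in this file.
+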
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
+    const uint16x8_t s[2], const uint32_t scale, const uint16_t* const sum5[5],
+    const uint32_t* const square_sum5[5], uint32x4_t sq[4],
+    uint8x16_t* const ma, uint32x4_t b[2]) {
+  uint16x8_t s5[5];
+  uint32x4_t sq5[5][2];
+  Square(s[1], sq + 2);
+  s5[3] = s5[4] = Sum5Horizontal16(s);
+  Sum5Horizontal32(sq, sq5[3]);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  LoadAligned16x3U16(sum5, 0, s5);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  CalculateIntermediate5<0>(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
+    const uint16x8_t s[4], const ptrdiff_t sum_width, const ptrdiff_t x,
+    const uint32_t scale, const uint16_t* const sum5[5],
+    const uint32_t* const square_sum5[5], uint32x4_t sq[8], uint8x16_t ma[2],
+    uint32x4_t b[6]) {
+  uint16x8_t s5[2][5];
+  uint32x4_t sq5[5][2];
+  Square(s[2], sq + 4);
+  s5[0][3] = Sum5Horizontal16(s + 1);
+  s5[1][3] = Sum5Horizontal16(s + 2);
+  s5[0][4] = s5[0][3];
+  s5[1][4] = s5[1][3];
+  Sum5Horizontal32(sq + 2, sq5[3]);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  LoadAligned16x3U16(sum5, x, s5[0]);
+  LoadAligned32x3U32(square_sum5, x, sq5);
+  CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], b + 2);
+
+  Square(s[3], sq + 6);
+  Sum5Horizontal32(sq + 4, sq5[3]);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+  LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+  CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], b + 4);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
+    const uint16x8_t s[2], const uint32_t scale, uint16_t* const sum3[3],
+    uint32_t* const square_sum3[3], uint32x4_t sq[4], uint8x16_t* const ma,
+    uint32x4_t b[2]) {
+  uint16x8_t s3[3];
+  uint32x4_t sq3[3][2];
+  Square(s[1], sq + 2);
+  s3[2] = Sum3Horizontal16(s);
+  vst1q_u16(sum3[2], s3[2]);
+  Sum3Horizontal32(sq, sq3[2]);
+  StoreAligned32U32(square_sum3[2], sq3[2]);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  CalculateIntermediate3(s3, sq3, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+    const uint16x8_t s[4], const ptrdiff_t x, const ptrdiff_t sum_width,
+    const uint32_t scale, uint16_t* const sum3[3],
+    uint32_t* const square_sum3[3], uint32x4_t sq[8], uint8x16_t ma[2],
+    uint32x4_t b[6]) {
+  uint16x8_t s3[4], sum[2], index[2];
+  uint32x4_t sq3[3][2];
+
+  Square(s[2], sq + 4);
+  s3[2] = Sum3Horizontal16(s + 1);
+  s3[3] = Sum3Horizontal16(s + 2);
+  StoreAligned32U16(sum3[2] + x, s3 + 2);
+  Sum3Horizontal32(sq + 2, sq3[2]);
+  StoreAligned32U32(square_sum3[2] + x + 0, sq3[2]);
+  LoadAligned16x2U16(sum3, x, s3);
+  LoadAligned32x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]);
+
+  Square(s[3], sq + 6);
+  Sum3Horizontal32(sq + 4, sq3[2]);
+  StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
+  LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3 + 1);
+  LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+  CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]);
+  CalculateIntermediate(sum, index, ma, b + 2);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
+    const uint16x8_t s[2][4], const uint16_t scales[2], uint16_t* const sum3[4],
+    uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+    uint32_t* const square_sum5[5], uint32x4_t sq[2][8], uint8x16_t ma3[2][2],
+    uint32x4_t b3[2][6], uint8x16_t* const ma5, uint32x4_t b5[2]) {
+  uint16x8_t s3[4], s5[5], sum[2], index[2];
+  uint32x4_t sq3[4][2], sq5[5][2];
+
+  Square(s[0][1], sq[0] + 2);
+  Square(s[1][1], sq[1] + 2);
+  SumHorizontal16(s[0], &s3[2], &s5[3]);
+  SumHorizontal16(s[1], &s3[3], &s5[4]);
+  vst1q_u16(sum3[2], s3[2]);
+  vst1q_u16(sum3[3], s3[3]);
+  vst1q_u16(sum5[3], s5[3]);
+  vst1q_u16(sum5[4], s5[4]);
+  SumHorizontal32(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  StoreAligned32U32(square_sum3[2], sq3[2]);
+  StoreAligned32U32(square_sum5[3], sq5[3]);
+  SumHorizontal32(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+  StoreAligned32U32(square_sum3[3], sq3[3]);
+  StoreAligned32U32(square_sum5[4], sq5[4]);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  LoadAligned16x3U16(sum5, 0, s5);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  CalculateSumAndIndex3(s3 + 0, sq3 + 0, scales[1], &sum[0], &index[0]);
+  CalculateSumAndIndex3(s3 + 1, sq3 + 1, scales[1], &sum[1], &index[1]);
+  CalculateIntermediate(sum, index, &ma3[0][0], b3[0], b3[1]);
+  ma3[1][0] = vextq_u8(ma3[0][0], vdupq_n_u8(0), 8);
+  CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
+    const uint16x8_t s[2][4], const ptrdiff_t x, const uint16_t scales[2],
+    uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    const ptrdiff_t sum_width, uint32x4_t sq[2][8], uint8x16_t ma3[2][2],
+    uint32x4_t b3[2][6], uint8x16_t ma5[2], uint32x4_t b5[6]) {
+  uint16x8_t s3[2][4], s5[2][5], sum[2][2], index[2][2];
+  uint32x4_t sq3[4][2], sq5[5][2];
+
+  SumHorizontal16(s[0] + 1, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+  vst1q_u16(sum3[2] + x + 0, s3[0][2]);
+  vst1q_u16(sum3[2] + x + 8, s3[1][2]);
+  vst1q_u16(sum5[3] + x + 0, s5[0][3]);
+  vst1q_u16(sum5[3] + x + 8, s5[1][3]);
+  SumHorizontal16(s[1] + 1, &s3[0][3], &s3[1][3], &s5[0][4], &s5[1][4]);
+  vst1q_u16(sum3[3] + x + 0, s3[0][3]);
+  vst1q_u16(sum3[3] + x + 8, s3[1][3]);
+  vst1q_u16(sum5[4] + x + 0, s5[0][4]);
+  vst1q_u16(sum5[4] + x + 8, s5[1][4]);
+  Square(s[0][2], sq[0] + 4);
+  Square(s[1][2], sq[1] + 4);
+  SumHorizontal32(sq[0] + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  StoreAligned32U32(square_sum3[2] + x, sq3[2]);
+  StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+  SumHorizontal32(sq[1] + 2, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+  StoreAligned32U32(square_sum3[3] + x, sq3[3]);
+  StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+  LoadAligned16x2U16(sum3, x, s3[0]);
+  LoadAligned32x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0][0], &index[0][0]);
+  CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum[1][0],
+                        &index[1][0]);
+  LoadAligned16x3U16(sum5, x, s5[0]);
+  LoadAligned32x3U32(square_sum5, x, sq5);
+  CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], b5 + 2);
+
+  Square(s[0][3], sq[0] + 6);
+  Square(s[1][3], sq[1] + 6);
+  SumHorizontal32(sq[0] + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
+  StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+  SumHorizontal32(sq[1] + 4, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+  StoreAligned32U32(square_sum3[3] + x + 8, sq3[3]);
+  StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
+  LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
+  LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+  CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[0][1], &index[0][1]);
+  CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum[1][1],
+                        &index[1][1]);
+  CalculateIntermediate(sum[0], index[0], ma3[0], b3[0] + 2);
+  CalculateIntermediate(sum[1], index[1], ma3[1], b3[1] + 2);
+  LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+  LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+  CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], b5 + 4);
+}
+
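+// When both SGR passes run, BoxFilterPreProcessLo()/BoxFilterPreProcess()
+// above produce the 3x3 and 5x5 statistics in a single sweep.
+// SumHorizontal16() and SumHorizontal32() exploit the window overlap: with
+// shifted windows s[0..4],
+//   row3 = s[1] + s[2] + s[3]
+//   row5 = row3 + s[0] + s[4]
+// so each 5-tap sum costs only two additions on top of the 3-tap sum.
+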
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
+    const uint16x8_t s[2], const uint16_t scales[2],
+    const uint16_t* const sum3[4], const uint16_t* const sum5[5],
+    const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5],
+    uint32x4_t sq[4], uint8x16_t* const ma3, uint8x16_t* const ma5,
+    uint32x4_t b3[2], uint32x4_t b5[2]) {
+  uint16x8_t s3[3], s5[5];
+  uint32x4_t sq3[3][2], sq5[5][2];
+
+  Square(s[1], sq + 2);
+  SumHorizontal16(s, &s3[2], &s5[3]);
+  SumHorizontal32(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned16x3U16(sum5, 0, s5);
+  s5[4] = s5[3];
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
+    const uint16x8_t s[4], const ptrdiff_t sum_width, const ptrdiff_t x,
+    const uint16_t scales[2], const uint16_t* const sum3[4],
+    const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+    const uint32_t* const square_sum5[5], uint32x4_t sq[8], uint8x16_t ma3[2],
+    uint8x16_t ma5[2], uint32x4_t b3[6], uint32x4_t b5[6]) {
+  uint16x8_t s3[2][3], s5[2][5], sum[2], index[2];
+  uint32x4_t sq3[3][2], sq5[5][2];
+
+  Square(s[2], sq + 4);
+  SumHorizontal16(s + 1, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+  SumHorizontal32(sq + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned16x3U16(sum5, x, s5[0]);
+  s5[0][4] = s5[0][3];
+  LoadAligned32x3U32(square_sum5, x, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateIntermediate5<8>(s5[0], sq5, scales[0], ma5, b5 + 2);
+  LoadAligned16x2U16(sum3, x, s3[0]);
+  LoadAligned32x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0], &index[0]);
+
+  Square(s[3], sq + 6);
+  SumHorizontal32(sq + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+  s5[1][4] = s5[1][3];
+  LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateIntermediate5<0>(s5[1], sq5, scales[0], ma5 + 1, b5 + 4);
+  LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
+  LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+  CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[1], &index[1]);
+  CalculateIntermediate(sum, index, ma3, b3 + 2);
+}
+
+inline void BoxSumFilterPreProcess5(const uint16_t* const src0,
+                                    const uint16_t* const src1, const int width,
+                                    const uint32_t scale,
+                                    uint16_t* const sum5[5],
+                                    uint32_t* const square_sum5[5],
+                                    const ptrdiff_t sum_width, uint16_t* ma565,
+                                    uint32_t* b565) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1 - sizeof(*src0) * width;
+  uint16x8_t s[2][4];
+  uint8x16_t mas[2];
+  uint32x4_t sq[2][8], bs[6];
+
+  s[0][0] = Load1QMsanU16(src0 + 0, overread_in_bytes + 0);
+  s[0][1] = Load1QMsanU16(src0 + 8, overread_in_bytes + 16);
+  s[1][0] = Load1QMsanU16(src1 + 0, overread_in_bytes + 0);
+  s[1][1] = Load1QMsanU16(src1 + 8, overread_in_bytes + 16);
+  Square(s[0][0], sq[0]);
+  Square(s[1][0], sq[1]);
+  BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], bs);
+
+  int x = 0;
+  do {
+    uint8x16_t ma5[3];
+    uint16x8_t ma[2];
+    uint32x4_t b[4];
+
+    s[0][2] = Load1QMsanU16(src0 + x + 16,
+                            overread_in_bytes + sizeof(*src0) * (x + 16));
+    s[0][3] = Load1QMsanU16(src0 + x + 24,
+                            overread_in_bytes + sizeof(*src0) * (x + 24));
+    s[1][2] = Load1QMsanU16(src1 + x + 16,
+                            overread_in_bytes + sizeof(*src1) * (x + 16));
+    s[1][3] = Load1QMsanU16(src1 + x + 24,
+                            overread_in_bytes + sizeof(*src1) * (x + 24));
+
+    BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
+                         bs);
+    Prepare3_8<0>(mas, ma5);
+    ma[0] = Sum565Lo(ma5);
+    ma[1] = Sum565Hi(ma5);
+    StoreAligned32U16(ma565, ma);
+    Sum565(bs + 0, b + 0);
+    Sum565(bs + 2, b + 2);
+    StoreAligned64U32(b565, b);
+    s[0][0] = s[0][2];
+    s[0][1] = s[0][3];
+    s[1][0] = s[1][2];
+    s[1][1] = s[1][3];
+    sq[0][2] = sq[0][6];
+    sq[0][3] = sq[0][7];
+    sq[1][2] = sq[1][6];
+    sq[1][3] = sq[1][7];
+    mas[0] = mas[1];
+    bs[0] = bs[4];
+    bs[1] = bs[5];
+    ma565 += 16;
+    b565 += 16;
+    x += 16;
+  } while (x < width);
+}
+
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+    const uint16_t* const src, const int width, const uint32_t scale,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+    const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343,
+    uint32_t* b444) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass2 - sizeof(*src) * width;
+  uint16x8_t s[4];
+  uint8x16_t mas[2];
+  uint32x4_t sq[8], bs[6];
+
+  s[0] = Load1QMsanU16(src + 0, overread_in_bytes + 0);
+  s[1] = Load1QMsanU16(src + 8, overread_in_bytes + 16);
+  Square(s[0], sq);
+  // Quiet "may be used uninitialized" warning.
+  mas[0] = mas[1] = vdupq_n_u8(0);
+  BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq, &mas[0], bs);
+
+  int x = 0;
+  do {
+    s[2] = Load1QMsanU16(src + x + 16,
+                         overread_in_bytes + sizeof(*src) * (x + 16));
+    s[3] = Load1QMsanU16(src + x + 24,
+                         overread_in_bytes + sizeof(*src) * (x + 24));
+    BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+                         bs);
+    uint8x16_t ma3[3];
+    Prepare3_8<0>(mas, ma3);
+    if (calculate444) {  // NOLINT(readability-simplify-boolean-expr)
+      Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444);
+      Store343_444Hi(ma3, bs + 2, 8, ma343, ma444, b343, b444);
+      ma444 += 16;
+      b444 += 16;
+    } else {
+      uint16x8_t ma[2];
+      uint32x4_t b[4];
+      ma[0] = Sum343Lo(ma3);
+      ma[1] = Sum343Hi(ma3);
+      StoreAligned32U16(ma343, ma);
+      Sum343(bs + 0, b + 0);
+      Sum343(bs + 2, b + 2);
+      StoreAligned64U32(b343, b);
+    }
+    s[1] = s[3];
+    sq[2] = sq[6];
+    sq[3] = sq[7];
+    mas[0] = mas[1];
+    bs[0] = bs[4];
+    bs[1] = bs[5];
+    ma343 += 16;
+    b343 += 16;
+    x += 16;
+  } while (x < width);
+}
+
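+// In BoxSumFilterPreProcess3() above, |calculate444| selects what the setup
+// row stores: false writes only the 343-weighted row (Sum343Lo()/Hi()), while
+// true goes through Store343_444Lo()/Hi() and emits the 343 and 444 rows
+// together. Passing the flag as a template parameter lets the compiler strip
+// the untaken branch out of the 16-pixel inner loop.
+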
+inline void BoxSumFilterPreProcess(
+    const uint16_t* const src0, const uint16_t* const src1, const int width,
+    const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444,
+    uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444,
+    uint32_t* b565) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1 - sizeof(*src0) * width;
+  uint16x8_t s[2][4];
+  uint8x16_t ma3[2][2], ma5[2];
+  uint32x4_t sq[2][8], b3[2][6], b5[6];
+
+  s[0][0] = Load1QMsanU16(src0 + 0, overread_in_bytes + 0);
+  s[0][1] = Load1QMsanU16(src0 + 8, overread_in_bytes + 16);
+  s[1][0] = Load1QMsanU16(src1 + 0, overread_in_bytes + 0);
+  s[1][1] = Load1QMsanU16(src1 + 8, overread_in_bytes + 16);
+  Square(s[0][0], sq[0]);
+  Square(s[1][0], sq[1]);
+  BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+                        ma3, b3, &ma5[0], b5);
+
+  int x = 0;
+  do {
+    uint16x8_t ma[2];
+    uint32x4_t b[4];
+    uint8x16_t ma3x[3], ma5x[3];
+
+    s[0][2] = Load1QMsanU16(src0 + x + 16,
+                            overread_in_bytes + sizeof(*src0) * (x + 16));
+    s[0][3] = Load1QMsanU16(src0 + x + 24,
+                            overread_in_bytes + sizeof(*src0) * (x + 24));
+    s[1][2] = Load1QMsanU16(src1 + x + 16,
+                            overread_in_bytes + sizeof(*src1) * (x + 16));
+    s[1][3] = Load1QMsanU16(src1 + x + 24,
+                            overread_in_bytes + sizeof(*src1) * (x + 24));
+    BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+                        sum_width, sq, ma3, b3, ma5, b5);
+
+    Prepare3_8<0>(ma3[0], ma3x);
+    ma[0] = Sum343Lo(ma3x);
+    ma[1] = Sum343Hi(ma3x);
+    StoreAligned32U16(ma343[0] + x, ma);
+    Sum343(b3[0] + 0, b + 0);
+    Sum343(b3[0] + 2, b + 2);
+    StoreAligned64U32(b343[0] + x, b);
+    Sum565(b5 + 0, b + 0);
+    Sum565(b5 + 2, b + 2);
+    StoreAligned64U32(b565, b);
+    Prepare3_8<0>(ma3[1], ma3x);
+    Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
+    Store343_444Hi(ma3x, b3[1] + 2, x + 8, ma343[1], ma444, b343[1], b444);
+    Prepare3_8<0>(ma5, ma5x);
+    ma[0] = Sum565Lo(ma5x);
+    ma[1] = Sum565Hi(ma5x);
+    StoreAligned32U16(ma565, ma);
+    s[0][0] = s[0][2];
+    s[0][1] = s[0][3];
+    s[1][0] = s[1][2];
+    s[1][1] = s[1][3];
+    sq[0][2] = sq[0][6];
+    sq[0][3] = sq[0][7];
+    sq[1][2] = sq[1][6];
+    sq[1][3] = sq[1][7];
+    ma3[0][0] = ma3[0][1];
+    ma3[1][0] = ma3[1][1];
+    ma5[0] = ma5[1];
+    b3[0][0] = b3[0][4];
+    b3[0][1] = b3[0][5];
+    b3[1][0] = b3[1][4];
+    b3[1][1] = b3[1][5];
+    b5[0] = b5[4];
+    b5[1] = b5[5];
+    ma565 += 16;
+    b565 += 16;
+    x += 16;
+  } while (x < width);
+}
+
+template <int shift>
+inline int16x4_t FilterOutput(const uint32x4_t ma_x_src, const uint32x4_t b) {
+  // ma: 255 * 32 = 8160 (13 bits)
+  // b: 65088 * 32 = 2082816 (21 bits)
+  // v: b - ma * 255 (22 bits)
+  const int32x4_t v = vreinterpretq_s32_u32(vsubq_u32(b, ma_x_src));
+  // kSgrProjSgrBits = 8
+  // kSgrProjRestoreBits = 4
+  // shift = 4 or 5
+  // v >> 8 or 9 (13 bits)
+  return vqrshrn_n_s32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <int shift>
+inline int16x8_t CalculateFilteredOutput(const uint16x8_t src,
+                                         const uint16x8_t ma,
+                                         const uint32x4_t b[2]) {
+  const uint32x4_t ma_x_src_lo = VmullLo16(ma, src);
+  const uint32x4_t ma_x_src_hi = VmullHi16(ma, src);
+  const int16x4_t dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
+  const int16x4_t dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
+  return vcombine_s16(dst_lo, dst_hi);  // 13 bits
+}
+
+inline int16x8_t CalculateFilteredOutputPass1(const uint16x8_t src,
+                                              const uint16x8_t ma[2],
+                                              const uint32x4_t b[2][2]) {
+  const uint16x8_t ma_sum = vaddq_u16(ma[0], ma[1]);
+  uint32x4_t b_sum[2];
+  b_sum[0] = vaddq_u32(b[0][0], b[1][0]);
+  b_sum[1] = vaddq_u32(b[0][1], b[1][1]);
+  return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline int16x8_t CalculateFilteredOutputPass2(const uint16x8_t src,
+                                              const uint16x8_t ma[3],
+                                              const uint32x4_t b[3][2]) {
+  const uint16x8_t ma_sum = Sum3_16(ma);
+  uint32x4_t b_sum[2];
+  Sum3_32(b, b_sum);
+  return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline int16x8_t SelfGuidedFinal(const uint16x8_t src, const int32x4_t v[2]) {
+  const int16x4_t v_lo =
+      vqrshrn_n_s32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  const int16x4_t v_hi =
+      vqrshrn_n_s32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  const int16x8_t vv = vcombine_s16(v_lo, v_hi);
+  return vaddq_s16(vreinterpretq_s16_u16(src), vv);
+}
+
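+// SelfGuidedFinal() applies the last combine step. In scalar form, assuming
+// kSgrProjPrecisionBits == 7 (its AV1 value; kSgrProjRestoreBits == 4 per the
+// comment in FilterOutput()), the total rounded shift is 11:
+//
+//   dst = src + ((w0 * p0 + w2 * p2 + (1 << 10)) >> 11)
+//
+// where |p0|/|p2| are the Pass 1 / Pass 2 filter differences. The two
+// multiplier helpers below only differ in whether the w2 product is present.
+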
+inline int16x8_t SelfGuidedDoubleMultiplier(const uint16x8_t src,
+                                            const int16x8_t filter[2],
+                                            const int w0, const int w2) {
+  int32x4_t v[2];
+  v[0] = vmull_n_s16(vget_low_s16(filter[0]), w0);
+  v[1] = vmull_n_s16(vget_high_s16(filter[0]), w0);
+  v[0] = vmlal_n_s16(v[0], vget_low_s16(filter[1]), w2);
+  v[1] = vmlal_n_s16(v[1], vget_high_s16(filter[1]), w2);
+  return SelfGuidedFinal(src, v);
+}
+
+inline int16x8_t SelfGuidedSingleMultiplier(const uint16x8_t src,
+                                            const int16x8_t filter,
+                                            const int w0) {
+  // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+  int32x4_t v[2];
+  v[0] = vmull_n_s16(vget_low_s16(filter), w0);
+  v[1] = vmull_n_s16(vget_high_s16(filter), w0);
+  return SelfGuidedFinal(src, v);
+}
+
+inline void ClipAndStore(uint16_t* const dst, const int16x8_t val) {
+  const uint16x8_t val0 =
+      vreinterpretq_u16_s16(vmaxq_s16(val, vdupq_n_s16(0)));
+  const uint16x8_t val1 = vminq_u16(val0, vdupq_n_u16((1 << kBitdepth10) - 1));
+  vst1q_u16(dst, val1);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
+    const uint16_t* const src, const uint16_t* const src0,
+    const uint16_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width,
+    const uint32_t scale, const int16_t w0, uint16_t* const ma565[2],
+    uint32_t* const b565[2], uint16_t* const dst) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1 - sizeof(*src0) * width;
+  uint16x8_t s[2][4];
+  uint8x16_t mas[2];
+  uint32x4_t sq[2][8], bs[6];
+
+  s[0][0] = Load1QMsanU16(src0 + 0, overread_in_bytes + 0);
+  s[0][1] = Load1QMsanU16(src0 + 8, overread_in_bytes + 16);
+  s[1][0] = Load1QMsanU16(src1 + 0, overread_in_bytes + 0);
+  s[1][1] = Load1QMsanU16(src1 + 8, overread_in_bytes + 16);
+
+  Square(s[0][0], sq[0]);
+  Square(s[1][0], sq[1]);
+  BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], bs);
+
+  int x = 0;
+  do {
+    uint16x8_t ma[2];
+    uint32x4_t b[2][2];
+    uint8x16_t ma5[3];
+    int16x8_t p[2];
+
+    s[0][2] = Load1QMsanU16(src0 + x + 16,
+                            overread_in_bytes + sizeof(*src0) * (x + 16));
+    s[0][3] = Load1QMsanU16(src0 + x + 24,
+                            overread_in_bytes + sizeof(*src0) * (x + 24));
+    s[1][2] = Load1QMsanU16(src1 + x + 16,
+                            overread_in_bytes + sizeof(*src1) * (x + 16));
+    s[1][3] = Load1QMsanU16(src1 + x + 24,
+                            overread_in_bytes + sizeof(*src1) * (x + 24));
+    BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
+                         bs);
+    Prepare3_8<0>(mas, ma5);
+    ma[1] = Sum565Lo(ma5);
+    vst1q_u16(ma565[1] + x, ma[1]);
+    Sum565(bs, b[1]);
+    StoreAligned32U32(b565[1] + x, b[1]);
+    const uint16x8_t sr0_lo = vld1q_u16(src + x + 0);
+    const uint16x8_t sr1_lo = vld1q_u16(src + stride + x + 0);
+    ma[0] = vld1q_u16(ma565[0] + x);
+    LoadAligned32U32(b565[0] + x, b[0]);
+    p[0] = CalculateFilteredOutputPass1(sr0_lo, ma, b);
+    p[1] = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[1]);
+    const int16x8_t d00 = SelfGuidedSingleMultiplier(sr0_lo, p[0], w0);
+    const int16x8_t d10 = SelfGuidedSingleMultiplier(sr1_lo, p[1], w0);
+
+    ma[1] = Sum565Hi(ma5);
+    vst1q_u16(ma565[1] + x + 8, ma[1]);
+    Sum565(bs + 2, b[1]);
+    StoreAligned32U32(b565[1] + x + 8, b[1]);
+    const uint16x8_t sr0_hi = vld1q_u16(src + x + 8);
+    const uint16x8_t sr1_hi = vld1q_u16(src + stride + x + 8);
+    ma[0] = vld1q_u16(ma565[0] + x + 8);
+    LoadAligned32U32(b565[0] + x + 8, b[0]);
+    p[0] = CalculateFilteredOutputPass1(sr0_hi, ma, b);
+    p[1] = CalculateFilteredOutput<4>(sr1_hi, ma[1], b[1]);
+    const int16x8_t d01 = SelfGuidedSingleMultiplier(sr0_hi, p[0], w0);
+    ClipAndStore(dst + x + 0, d00);
+    ClipAndStore(dst + x + 8, d01);
+    const int16x8_t d11 = SelfGuidedSingleMultiplier(sr1_hi, p[1], w0);
+    ClipAndStore(dst + stride + x + 0, d10);
+    ClipAndStore(dst + stride + x + 8, d11);
+    s[0][0] = s[0][2];
+    s[0][1] = s[0][3];
+    s[1][0] = s[1][2];
+    s[1][1] = s[1][3];
+    sq[0][2] = sq[0][6];
+    sq[0][3] = sq[0][7];
+    sq[1][2] = sq[1][6];
+    sq[1][3] = sq[1][7];
+    mas[0] = mas[1];
+    bs[0] = bs[4];
+    bs[1] = bs[5];
+    x += 16;
+  } while (x < width);
+}
+
+inline void BoxFilterPass1LastRow(
+    const uint16_t* const src, const uint16_t* const src0, const int width,
+    const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+    uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565,
+    uint32_t* b565, uint16_t* const dst) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1 - sizeof(*src0) * width;
+  uint16x8_t s[4];
+  uint8x16_t mas[2];
+  uint32x4_t sq[8], bs[6];
+
+  s[0] = Load1QMsanU16(src0 + 0, overread_in_bytes + 0);
+  s[1] = Load1QMsanU16(src0 + 8, overread_in_bytes + 16);
+  Square(s[0], sq);
+  BoxFilterPreProcess5LastRowLo(s, scale, sum5, square_sum5, sq, &mas[0], bs);
+
+  int x = 0;
+  do {
+    uint16x8_t ma[2];
+    uint32x4_t b[2][2];
+    uint8x16_t ma5[3];
+
+    s[2] = Load1QMsanU16(src0 + x + 16,
+                         overread_in_bytes + sizeof(*src0) * (x + 16));
+    s[3] = Load1QMsanU16(src0 + x + 24,
+                         overread_in_bytes + sizeof(*src0) * (x + 24));
+    BoxFilterPreProcess5LastRow(s, sum_width, x + 8, scale, sum5, square_sum5,
+                                sq, mas, bs);
+    Prepare3_8<0>(mas, ma5);
+    ma[1] = Sum565Lo(ma5);
+    Sum565(bs, b[1]);
+    ma[0] = vld1q_u16(ma565);
+    LoadAligned32U32(b565, b[0]);
+    const uint16x8_t sr_lo = vld1q_u16(src + x + 0);
+    int16x8_t p = CalculateFilteredOutputPass1(sr_lo, ma, b);
+    const int16x8_t d0 = SelfGuidedSingleMultiplier(sr_lo, p, w0);
+
+    ma[1] = Sum565Hi(ma5);
+    Sum565(bs + 2, b[1]);
+    ma[0] = vld1q_u16(ma565 + 8);
+    LoadAligned32U32(b565 + 8, b[0]);
+    const uint16x8_t sr_hi = vld1q_u16(src + x + 8);
+    p = CalculateFilteredOutputPass1(sr_hi, ma, b);
+    const int16x8_t d1 = SelfGuidedSingleMultiplier(sr_hi, p, w0);
+    ClipAndStore(dst + x + 0, d0);
+    ClipAndStore(dst + x + 8, d1);
+    s[1] = s[3];
+    sq[2] = sq[6];
+    sq[3] = sq[7];
+    mas[0] = mas[1];
+    bs[0] = bs[4];
+    bs[1] = bs[5];
+    ma565 += 16;
+    b565 += 16;
+    x += 16;
+  } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
+    const uint16_t* const src, const uint16_t* const src0, const int width,
+    const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+    uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3],
+    uint32_t* const b444[2], uint16_t* const dst) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass2 - sizeof(*src0) * width;
+  uint16x8_t s[4];
+  uint8x16_t mas[2];
+  uint32x4_t sq[8], bs[6];
+
+  s[0] = Load1QMsanU16(src0 + 0, overread_in_bytes + 0);
+  s[1] = Load1QMsanU16(src0 + 8, overread_in_bytes + 16);
+  Square(s[0], sq);
+  // Quiet "may be used uninitialized" warning.
+  mas[0] = mas[1] = vdupq_n_u8(0);
+  BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq, &mas[0], bs);
+
+  int x = 0;
+  do {
+    s[2] = Load1QMsanU16(src0 + x + 16,
+                         overread_in_bytes + sizeof(*src0) * (x + 16));
+    s[3] = Load1QMsanU16(src0 + x + 24,
+                         overread_in_bytes + sizeof(*src0) * (x + 24));
+    BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+                         bs);
+    uint16x8_t ma[3];
+    uint32x4_t b[3][2];
+    uint8x16_t ma3[3];
+
+    Prepare3_8<0>(mas, ma3);
+    Store343_444Lo(ma3, bs + 0, x, &ma[2], b[2], ma343[2], ma444[1], b343[2],
+                   b444[1]);
+    const uint16x8_t sr_lo = vld1q_u16(src + x + 0);
+    ma[0] = vld1q_u16(ma343[0] + x);
+    ma[1] = vld1q_u16(ma444[0] + x);
+    LoadAligned32U32(b343[0] + x, b[0]);
+    LoadAligned32U32(b444[0] + x, b[1]);
+    const int16x8_t p0 = CalculateFilteredOutputPass2(sr_lo, ma, b);
+
+    Store343_444Hi(ma3, bs + 2, x + 8, &ma[2], b[2], ma343[2], ma444[1],
+                   b343[2], b444[1]);
+    const uint16x8_t sr_hi = vld1q_u16(src + x + 8);
+    ma[0] = vld1q_u16(ma343[0] + x + 8);
+    ma[1] = vld1q_u16(ma444[0] + x + 8);
+    LoadAligned32U32(b343[0] + x + 8, b[0]);
+    LoadAligned32U32(b444[0] + x + 8, b[1]);
+    const int16x8_t p1 = CalculateFilteredOutputPass2(sr_hi, ma, b);
+    const int16x8_t d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
+    const int16x8_t d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
+    ClipAndStore(dst + x + 0, d0);
+    ClipAndStore(dst + x + 8, d1);
+    s[1] = s[3];
+    sq[2] = sq[6];
+    sq[3] = sq[7];
+    mas[0] = mas[1];
+    bs[0] = bs[4];
+    bs[1] = bs[5];
+    x += 16;
+  } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilter(
+    const uint16_t* const src, const uint16_t* const src0,
+    const uint16_t* const src1, const ptrdiff_t stride, const int width,
+    const uint16_t scales[2], const int16_t w0, const int16_t w2,
+    uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    const ptrdiff_t sum_width, uint16_t* const ma343[4],
+    uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4],
+    uint32_t* const b444[3], uint32_t* const b565[2], uint16_t* const dst) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1 - sizeof(*src0) * width;
+  uint16x8_t s[2][4];
+  uint8x16_t ma3[2][2], ma5[2];
+  uint32x4_t sq[2][8], b3[2][6], b5[6];
+
+  s[0][0] = Load1QMsanU16(src0 + 0, overread_in_bytes + 0);
+  s[0][1] = Load1QMsanU16(src0 + 8, overread_in_bytes + 16);
+  s[1][0] = Load1QMsanU16(src1 + 0, overread_in_bytes + 0);
+  s[1][1] = Load1QMsanU16(src1 + 8, overread_in_bytes + 16);
+  Square(s[0][0], sq[0]);
+  Square(s[1][0], sq[1]);
+  BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+                        ma3, b3, &ma5[0], b5);
+
+  int x = 0;
+  do {
+    uint16x8_t ma[3][3];
+    uint32x4_t b[3][3][2];
+    uint8x16_t ma3x[2][3], ma5x[3];
+    int16x8_t p[2][2];
+
+    s[0][2] = Load1QMsanU16(src0 + x + 16,
+                            overread_in_bytes + sizeof(*src0) * (x + 16));
+    s[0][3] = Load1QMsanU16(src0 + x + 24,
+                            overread_in_bytes + sizeof(*src0) * (x + 24));
+    s[1][2] = Load1QMsanU16(src1 + x + 16,
+                            overread_in_bytes + sizeof(*src1) * (x + 16));
+    s[1][3] = Load1QMsanU16(src1 + x + 24,
+                            overread_in_bytes + sizeof(*src1) * (x + 24));
+
+    BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+                        sum_width, sq, ma3, b3, ma5, b5);
+    Prepare3_8<0>(ma3[0], ma3x[0]);
+    Prepare3_8<0>(ma3[1], ma3x[1]);
+    Prepare3_8<0>(ma5, ma5x);
+    Store343_444Lo(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], b[1][2], b[2][1],
+                   ma343[2], ma444[1], b343[2], b444[1]);
b[2][2], ma343[3], ma444[2], + b343[3], b444[2]); + ma[0][1] = Sum565Lo(ma5x); + vst1q_u16(ma565[1] + x, ma[0][1]); + Sum565(b5, b[0][1]); + StoreAligned32U32(b565[1] + x, b[0][1]); + const uint16x8_t sr0_lo = vld1q_u16(src + x); + const uint16x8_t sr1_lo = vld1q_u16(src + stride + x); + ma[0][0] = vld1q_u16(ma565[0] + x); + LoadAligned32U32(b565[0] + x, b[0][0]); + p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]); + p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]); + ma[1][0] = vld1q_u16(ma343[0] + x); + ma[1][1] = vld1q_u16(ma444[0] + x); + LoadAligned32U32(b343[0] + x, b[1][0]); + LoadAligned32U32(b444[0] + x, b[1][1]); + p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]); + const int16x8_t d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2); + ma[2][0] = vld1q_u16(ma343[1] + x); + LoadAligned32U32(b343[1] + x, b[2][0]); + p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]); + const int16x8_t d10 = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2); + + Store343_444Hi(ma3x[0], b3[0] + 2, x + 8, &ma[1][2], &ma[2][1], b[1][2], + b[2][1], ma343[2], ma444[1], b343[2], b444[1]); + Store343_444Hi(ma3x[1], b3[1] + 2, x + 8, &ma[2][2], b[2][2], ma343[3], + ma444[2], b343[3], b444[2]); + ma[0][1] = Sum565Hi(ma5x); + vst1q_u16(ma565[1] + x + 8, ma[0][1]); + Sum565(b5 + 2, b[0][1]); + StoreAligned32U32(b565[1] + x + 8, b[0][1]); + const uint16x8_t sr0_hi = Load1QMsanU16( + src + x + 8, overread_in_bytes + 4 + sizeof(*src) * (x + 8)); + const uint16x8_t sr1_hi = Load1QMsanU16( + src + stride + x + 8, overread_in_bytes + 4 + sizeof(*src) * (x + 8)); + ma[0][0] = vld1q_u16(ma565[0] + x + 8); + LoadAligned32U32(b565[0] + x + 8, b[0][0]); + p[0][0] = CalculateFilteredOutputPass1(sr0_hi, ma[0], b[0]); + p[1][0] = CalculateFilteredOutput<4>(sr1_hi, ma[0][1], b[0][1]); + ma[1][0] = vld1q_u16(ma343[0] + x + 8); + ma[1][1] = vld1q_u16(ma444[0] + x + 8); + LoadAligned32U32(b343[0] + x + 8, b[1][0]); + LoadAligned32U32(b444[0] + x + 8, b[1][1]); + p[0][1] = CalculateFilteredOutputPass2(sr0_hi, ma[1], b[1]); + const int16x8_t d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2); + ClipAndStore(dst + x + 0, d00); + ClipAndStore(dst + x + 8, d01); + ma[2][0] = vld1q_u16(ma343[1] + x + 8); + LoadAligned32U32(b343[1] + x + 8, b[2][0]); + p[1][1] = CalculateFilteredOutputPass2(sr1_hi, ma[2], b[2]); + const int16x8_t d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2); + ClipAndStore(dst + stride + x + 0, d10); + ClipAndStore(dst + stride + x + 8, d11); + s[0][0] = s[0][2]; + s[0][1] = s[0][3]; + s[1][0] = s[1][2]; + s[1][1] = s[1][3]; + sq[0][2] = sq[0][6]; + sq[0][3] = sq[0][7]; + sq[1][2] = sq[1][6]; + sq[1][3] = sq[1][7]; + ma3[0][0] = ma3[0][1]; + ma3[1][0] = ma3[1][1]; + ma5[0] = ma5[1]; + b3[0][0] = b3[0][4]; + b3[0][1] = b3[0][5]; + b3[1][0] = b3[1][4]; + b3[1][1] = b3[1][5]; + b5[0] = b5[4]; + b5[1] = b5[5]; + x += 16; + } while (x < width); +} + +inline void BoxFilterLastRow( + const uint16_t* const src, const uint16_t* const src0, const int width, + const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0, + const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565, + uint32_t* const b343, uint32_t* const b444, uint32_t* const b565, + uint16_t* const dst) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass1 - sizeof(*src0) * width; + uint16x8_t s[4]; + uint8x16_t ma3[2], ma5[2]; + 
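// Note (inferred from the usage below, not an upstream comment): the ma3/b3 + // arrays hold the radius-1 (3x3 box, pass 2) intermediates, while ma5/b5 hold + // the radius-2 (5x5 box, pass 1) intermediates. +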
uint32x4_t sq[8], b3[6], b5[6]; + uint16x8_t ma[3]; + uint32x4_t b[3][2]; + + s[0] = Load1QMsanU16(src0 + 0, overread_in_bytes + 0); + s[1] = Load1QMsanU16(src0 + 8, overread_in_bytes + 16); + Square(s[0], sq); + // Quiet "may be used uninitialized" warning. + ma3[0] = ma3[1] = vdupq_n_u8(0); + BoxFilterPreProcessLastRowLo(s, scales, sum3, sum5, square_sum3, square_sum5, + sq, &ma3[0], &ma5[0], b3, b5); + + int x = 0; + do { + uint8x16_t ma3x[3], ma5x[3]; + int16x8_t p[2]; + + s[2] = Load1QMsanU16(src0 + x + 16, + overread_in_bytes + sizeof(*src0) * (x + 16)); + s[3] = Load1QMsanU16(src0 + x + 24, + overread_in_bytes + sizeof(*src0) * (x + 24)); + BoxFilterPreProcessLastRow(s, sum_width, x + 8, scales, sum3, sum5, + square_sum3, square_sum5, sq, ma3, ma5, b3, b5); + Prepare3_8<0>(ma3, ma3x); + Prepare3_8<0>(ma5, ma5x); + ma[1] = Sum565Lo(ma5x); + Sum565(b5, b[1]); + ma[2] = Sum343Lo(ma3x); + Sum343(b3, b[2]); + const uint16x8_t sr_lo = vld1q_u16(src + x + 0); + ma[0] = vld1q_u16(ma565 + x); + LoadAligned32U32(b565 + x, b[0]); + p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b); + ma[0] = vld1q_u16(ma343 + x); + ma[1] = vld1q_u16(ma444 + x); + LoadAligned32U32(b343 + x, b[0]); + LoadAligned32U32(b444 + x, b[1]); + p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b); + const int16x8_t d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2); + + ma[1] = Sum565Hi(ma5x); + Sum565(b5 + 2, b[1]); + ma[2] = Sum343Hi(ma3x); + Sum343(b3 + 2, b[2]); + const uint16x8_t sr_hi = Load1QMsanU16( + src + x + 8, overread_in_bytes + 4 + sizeof(*src) * (x + 8)); + ma[0] = vld1q_u16(ma565 + x + 8); + LoadAligned32U32(b565 + x + 8, b[0]); + p[0] = CalculateFilteredOutputPass1(sr_hi, ma, b); + ma[0] = vld1q_u16(ma343 + x + 8); + ma[1] = vld1q_u16(ma444 + x + 8); + LoadAligned32U32(b343 + x + 8, b[0]); + LoadAligned32U32(b444 + x + 8, b[1]); + p[1] = CalculateFilteredOutputPass2(sr_hi, ma, b); + const int16x8_t d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2); + ClipAndStore(dst + x + 0, d0); + ClipAndStore(dst + x + 8, d1); + s[1] = s[3]; + sq[2] = sq[6]; + sq[3] = sq[7]; + ma3[0] = ma3[1]; + ma5[0] = ma5[1]; + b3[0] = b3[4]; + b3[1] = b3[5]; + b5[0] = b5[4]; + b5[1] = b5[5]; + x += 16; + } while (x < width); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( + const RestorationUnitInfo& restoration_info, const uint16_t* src, + const ptrdiff_t stride, const uint16_t* const top_border, + const ptrdiff_t top_border_stride, const uint16_t* bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, + SgrBuffer* const sgr_buffer, uint16_t* dst) { + const auto temp_stride = Align(width, 16); + const auto sum_width = Align(width + 8, 16); + const auto sum_stride = temp_stride + 16; + const int sgr_proj_index = restoration_info.sgr_proj_info.index; + const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12. 
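+ // Note (illustration, not from the upstream sources): |w0|, |w1| and the + // derived |w2| sum to (1 << kSgrProjPrecisionBits) == 128, so the output is a + // 7-bit fixed-point blend of the source and the two box-filter passes. For + // example, w0 = 26 and w1 = 70 leave w2 = 128 - 26 - 70 = 32.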
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0]; + const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1]; + const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1; + uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2]; + uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2]; + sum3[0] = sgr_buffer->sum3; + square_sum3[0] = sgr_buffer->square_sum3; + ma343[0] = sgr_buffer->ma343; + b343[0] = sgr_buffer->b343; + for (int i = 1; i <= 3; ++i) { + sum3[i] = sum3[i - 1] + sum_stride; + square_sum3[i] = square_sum3[i - 1] + sum_stride; + ma343[i] = ma343[i - 1] + temp_stride; + b343[i] = b343[i - 1] + temp_stride; + } + sum5[0] = sgr_buffer->sum5; + square_sum5[0] = sgr_buffer->square_sum5; + for (int i = 1; i <= 4; ++i) { + sum5[i] = sum5[i - 1] + sum_stride; + square_sum5[i] = square_sum5[i - 1] + sum_stride; + } + ma444[0] = sgr_buffer->ma444; + b444[0] = sgr_buffer->b444; + for (int i = 1; i <= 2; ++i) { + ma444[i] = ma444[i - 1] + temp_stride; + b444[i] = b444[i - 1] + temp_stride; + } + ma565[0] = sgr_buffer->ma565; + ma565[1] = ma565[0] + temp_stride; + b565[0] = sgr_buffer->b565; + b565[1] = b565[0] + temp_stride; + assert(scales[0] != 0); + assert(scales[1] != 0); + BoxSum(top_border, top_border_stride, width, sum_stride, sum_width, sum3[0], + sum5[1], square_sum3[0], square_sum5[1]); + sum5[0] = sum5[1]; + square_sum5[0] = square_sum5[1]; + const uint16_t* const s = (height > 1) ? src + stride : bottom_border; + BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3, + square_sum5, sum_width, ma343, ma444[0], ma565[0], + b343, b444[0], b565[0]); + sum5[0] = sgr_buffer->sum5; + square_sum5[0] = sgr_buffer->square_sum5; + + for (int y = (height >> 1) - 1; y > 0; --y) { + Circulate4PointersBy2(sum3); + Circulate4PointersBy2(square_sum3); + Circulate5PointersBy2(sum5); + Circulate5PointersBy2(square_sum5); + BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width, + scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width, + ma343, ma444, ma565, b343, b444, b565, dst); + src += 2 * stride; + dst += 2 * stride; + Circulate4PointersBy2(ma343); + Circulate4PointersBy2(b343); + std::swap(ma444[0], ma444[2]); + std::swap(b444[0], b444[2]); + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + } + + Circulate4PointersBy2(sum3); + Circulate4PointersBy2(square_sum3); + Circulate5PointersBy2(sum5); + Circulate5PointersBy2(square_sum5); + if ((height & 1) == 0 || height > 1) { + const uint16_t* sr[2]; + if ((height & 1) == 0) { + sr[0] = bottom_border; + sr[1] = bottom_border + bottom_border_stride; + } else { + sr[0] = src + 2 * stride; + sr[1] = bottom_border; + } + BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5, + square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343, + b444, b565, dst); + } + if ((height & 1) != 0) { + if (height > 1) { + src += 2 * stride; + dst += 2 * stride; + Circulate4PointersBy2(sum3); + Circulate4PointersBy2(square_sum3); + Circulate5PointersBy2(sum5); + Circulate5PointersBy2(square_sum5); + Circulate4PointersBy2(ma343); + Circulate4PointersBy2(b343); + std::swap(ma444[0], ma444[2]); + std::swap(b444[0], b444[2]); + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + } + BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width, + sum_width, scales, w0, w2, sum3, sum5, square_sum3, + square_sum5, ma343[0], ma444[0], ma565[0], b343[0], + b444[0], b565[0], dst); + } +} + +inline void BoxFilterProcessPass1(const 
RestorationUnitInfo& restoration_info, + const uint16_t* src, const ptrdiff_t stride, + const uint16_t* const top_border, + const ptrdiff_t top_border_stride, + const uint16_t* bottom_border, + const ptrdiff_t bottom_border_stride, + const int width, const int height, + SgrBuffer* const sgr_buffer, uint16_t* dst) { + const auto temp_stride = Align(width, 16); + const auto sum_width = Align(width + 8, 16); + const auto sum_stride = temp_stride + 16; + const int sgr_proj_index = restoration_info.sgr_proj_info.index; + const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12. + const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0]; + uint16_t *sum5[5], *ma565[2]; + uint32_t *square_sum5[5], *b565[2]; + sum5[0] = sgr_buffer->sum5; + square_sum5[0] = sgr_buffer->square_sum5; + for (int i = 1; i <= 4; ++i) { + sum5[i] = sum5[i - 1] + sum_stride; + square_sum5[i] = square_sum5[i - 1] + sum_stride; + } + ma565[0] = sgr_buffer->ma565; + ma565[1] = ma565[0] + temp_stride; + b565[0] = sgr_buffer->b565; + b565[1] = b565[0] + temp_stride; + assert(scale != 0); + + BoxSum<5>(top_border, top_border_stride, width, sum_stride, sum_width, + sum5[1], square_sum5[1]); + sum5[0] = sum5[1]; + square_sum5[0] = square_sum5[1]; + const uint16_t* const s = (height > 1) ? src + stride : bottom_border; + BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width, + ma565[0], b565[0]); + sum5[0] = sgr_buffer->sum5; + square_sum5[0] = sgr_buffer->square_sum5; + + for (int y = (height >> 1) - 1; y > 0; --y) { + Circulate5PointersBy2(sum5); + Circulate5PointersBy2(square_sum5); + BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5, + square_sum5, width, sum_width, scale, w0, ma565, b565, dst); + src += 2 * stride; + dst += 2 * stride; + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + } + + Circulate5PointersBy2(sum5); + Circulate5PointersBy2(square_sum5); + if ((height & 1) == 0 || height > 1) { + const uint16_t* sr[2]; + if ((height & 1) == 0) { + sr[0] = bottom_border; + sr[1] = bottom_border + bottom_border_stride; + } else { + sr[0] = src + 2 * stride; + sr[1] = bottom_border; + } + BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width, + sum_width, scale, w0, ma565, b565, dst); + } + if ((height & 1) != 0) { + src += 3; + if (height > 1) { + src += 2 * stride; + dst += 2 * stride; + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + Circulate5PointersBy2(sum5); + Circulate5PointersBy2(square_sum5); + } + BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width, + sum_width, scale, w0, sum5, square_sum5, ma565[0], + b565[0], dst); + } +} + +inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, + const uint16_t* src, const ptrdiff_t stride, + const uint16_t* const top_border, + const ptrdiff_t top_border_stride, + const uint16_t* bottom_border, + const ptrdiff_t bottom_border_stride, + const int width, const int height, + SgrBuffer* const sgr_buffer, uint16_t* dst) { + assert(restoration_info.sgr_proj_info.multiplier[0] == 0); + const auto temp_stride = Align(width, 16); + const auto sum_width = Align(width + 8, 16); + const auto sum_stride = temp_stride + 16; + const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1]; + const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1; + const int sgr_proj_index = restoration_info.sgr_proj_info.index; + const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1]; // < 2^12. 
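+ // Note (a sketch, assuming the usual semantics of Circulate3PointersBy1() + // from src/utils/common.h): the pointer arrays set up below act as ring + // buffers of rows; each rotation does p0 <- p1 <- p2 <- p0 so the last slot + // always receives the row about to be computed, and std::swap() plays the + // same role for the two-entry ma444/b444 rings.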
+ uint16_t *sum3[3], *ma343[3], *ma444[2]; + uint32_t *square_sum3[3], *b343[3], *b444[2]; + sum3[0] = sgr_buffer->sum3; + square_sum3[0] = sgr_buffer->square_sum3; + ma343[0] = sgr_buffer->ma343; + b343[0] = sgr_buffer->b343; + for (int i = 1; i <= 2; ++i) { + sum3[i] = sum3[i - 1] + sum_stride; + square_sum3[i] = square_sum3[i - 1] + sum_stride; + ma343[i] = ma343[i - 1] + temp_stride; + b343[i] = b343[i - 1] + temp_stride; + } + ma444[0] = sgr_buffer->ma444; + ma444[1] = ma444[0] + temp_stride; + b444[0] = sgr_buffer->b444; + b444[1] = b444[0] + temp_stride; + assert(scale != 0); + BoxSum<3>(top_border, top_border_stride, width, sum_stride, sum_width, + sum3[0], square_sum3[0]); + BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3, + sum_width, ma343[0], nullptr, b343[0], + nullptr); + Circulate3PointersBy1(sum3); + Circulate3PointersBy1(square_sum3); + const uint16_t* s; + if (height > 1) { + s = src + stride; + } else { + s = bottom_border; + bottom_border += bottom_border_stride; + } + BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width, + ma343[1], ma444[0], b343[1], b444[0]); + + for (int y = height - 2; y > 0; --y) { + Circulate3PointersBy1(sum3); + Circulate3PointersBy1(square_sum3); + BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3, + square_sum3, ma343, ma444, b343, b444, dst); + src += stride; + dst += stride; + Circulate3PointersBy1(ma343); + Circulate3PointersBy1(b343); + std::swap(ma444[0], ma444[1]); + std::swap(b444[0], b444[1]); + } + + int y = std::min(height, 2); + src += 2; + do { + Circulate3PointersBy1(sum3); + Circulate3PointersBy1(square_sum3); + BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3, + square_sum3, ma343, ma444, b343, b444, dst); + src += stride; + dst += stride; + bottom_border += bottom_border_stride; + Circulate3PointersBy1(ma343); + Circulate3PointersBy1(b343); + std::swap(ma444[0], ma444[1]); + std::swap(b444[0], b444[1]); + } while (--y != 0); +} + +// If |width| is non-multiple of 8, up to 7 more pixels are written to |dest| in +// the end of each row. It is safe to overwrite the output as it will not be +// part of the visible frame. +void SelfGuidedFilter_NEON( + const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info, + const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_border, + const ptrdiff_t top_border_stride, + const void* LIBGAV1_RESTRICT const bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, + RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer, + void* LIBGAV1_RESTRICT const dest) { + const int index = restoration_info.sgr_proj_info.index; + const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0 + const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0 + const auto* const src = static_cast<const uint16_t*>(source); + const auto* top = static_cast<const uint16_t*>(top_border); + const auto* bottom = static_cast<const uint16_t*>(bottom_border); + auto* const dst = static_cast<uint16_t*>(dest); + SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer; + if (radius_pass_1 == 0) { + // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the + // following assertion.
+ assert(radius_pass_0 != 0); + BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3, + top_border_stride, bottom - 3, bottom_border_stride, + width, height, sgr_buffer, dst); + } else if (radius_pass_0 == 0) { + BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2, + top_border_stride, bottom - 2, bottom_border_stride, + width, height, sgr_buffer, dst); + } else { + BoxFilterProcess(restoration_info, src - 3, stride, top - 3, + top_border_stride, bottom - 3, bottom_border_stride, width, + height, sgr_buffer, dst); + } +} + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + dsp->loop_restorations[0] = WienerFilter_NEON; + dsp->loop_restorations[1] = SelfGuidedFilter_NEON; +} + +} // namespace + +void LoopRestorationInit10bpp_NEON() { Init10bpp(); } + +} // namespace dsp +} // namespace libgav1 + +#else // !(LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10) +namespace libgav1 { +namespace dsp { + +void LoopRestorationInit10bpp_NEON() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10 diff --git a/src/dsp/arm/loop_restoration_neon.cc b/src/dsp/arm/loop_restoration_neon.cc new file mode 100644 index 0000000..2db137f --- /dev/null +++ b/src/dsp/arm/loop_restoration_neon.cc @@ -0,0 +1,2424 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/loop_restoration.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_ENABLE_NEON +#include <arm_neon.h> + +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <cstring> + +#include "src/dsp/arm/common_neon.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/compiler_attributes.h" +#include "src/utils/constants.h" + +namespace libgav1 { +namespace dsp { +namespace low_bitdepth { +namespace { + +template <int bytes> +inline uint8x8_t VshrU128(const uint8x8x2_t src) { + return vext_u8(src.val[0], src.val[1], bytes); +} + +template <int bytes> +inline uint8x8_t VshrU128(const uint8x8_t src[2]) { + return vext_u8(src[0], src[1], bytes); +} + +template <int bytes> +inline uint8x16_t VshrU128(const uint8x16_t src[2]) { + return vextq_u8(src[0], src[1], bytes); +} + +template <int bytes> +inline uint16x8_t VshrU128(const uint16x8x2_t src) { + return vextq_u16(src.val[0], src.val[1], bytes / 2); +} + +template <int bytes> +inline uint16x8_t VshrU128(const uint16x8_t src[2]) { + return vextq_u16(src[0], src[1], bytes / 2); +} + +// Wiener + +// Must make a local copy of coefficients to help compiler know that they have +// no overlap with other buffers. Using 'const' keyword is not enough. Actually +// compiler doesn't make a copy, since there is enough registers in this case. +inline void PopulateWienerCoefficients( + const RestorationUnitInfo& restoration_info, const int direction, + int16_t filter[4]) { + // In order to keep the horizontal pass intermediate values within 16 bits we + // offset |filter[3]| by 128. The 128 offset will be added back in the loop.
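+ // A worked bound (assuming the AV1 Wiener tap ranges; not an upstream + // comment): the seven taps sum to 1 << kWienerFilterBits == 128 and the outer + // taps may be negative, so the center tap can reach roughly 218. + // 255 * 218 = 55590 overflows int16_t, while the offset tap gives + // 255 * (218 - 128) = 22950, which fits.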
+ for (int i = 0; i < 4; ++i) { + filter[i] = restoration_info.wiener_info.filter[direction][i]; + } + if (direction == WienerInfo::kHorizontal) { + filter[3] -= 128; + } +} + +inline int16x8_t WienerHorizontal2(const uint8x8_t s0, const uint8x8_t s1, + const int16_t filter, const int16x8_t sum) { + const int16x8_t ss = vreinterpretq_s16_u16(vaddl_u8(s0, s1)); + return vmlaq_n_s16(sum, ss, filter); +} + +inline int16x8x2_t WienerHorizontal2(const uint8x16_t s0, const uint8x16_t s1, + const int16_t filter, + const int16x8x2_t sum) { + int16x8x2_t d; + d.val[0] = + WienerHorizontal2(vget_low_u8(s0), vget_low_u8(s1), filter, sum.val[0]); + d.val[1] = + WienerHorizontal2(vget_high_u8(s0), vget_high_u8(s1), filter, sum.val[1]); + return d; +} + +inline void WienerHorizontalSum(const uint8x8_t s[3], const int16_t filter[4], + int16x8_t sum, int16_t* const wiener_buffer) { + constexpr int offset = + 1 << (8 + kWienerFilterBits - kInterRoundBitsHorizontal - 1); + constexpr int limit = (offset << 2) - 1; + const int16x8_t s_0_2 = vreinterpretq_s16_u16(vaddl_u8(s[0], s[2])); + const int16x8_t s_1 = ZeroExtend(s[1]); + sum = vmlaq_n_s16(sum, s_0_2, filter[2]); + sum = vmlaq_n_s16(sum, s_1, filter[3]); + // Calculate scaled down offset correction, and add to sum here to prevent + // signed 16 bit outranging. + sum = vrsraq_n_s16(vshlq_n_s16(s_1, 7 - kInterRoundBitsHorizontal), sum, + kInterRoundBitsHorizontal); + sum = vmaxq_s16(sum, vdupq_n_s16(-offset)); + sum = vminq_s16(sum, vdupq_n_s16(limit - offset)); + vst1q_s16(wiener_buffer, sum); +} + +inline void WienerHorizontalSum(const uint8x16_t src[3], + const int16_t filter[4], int16x8x2_t sum, + int16_t* const wiener_buffer) { + uint8x8_t s[3]; + s[0] = vget_low_u8(src[0]); + s[1] = vget_low_u8(src[1]); + s[2] = vget_low_u8(src[2]); + WienerHorizontalSum(s, filter, sum.val[0], wiener_buffer); + s[0] = vget_high_u8(src[0]); + s[1] = vget_high_u8(src[1]); + s[2] = vget_high_u8(src[2]); + WienerHorizontalSum(s, filter, sum.val[1], wiener_buffer + 8); +} + +inline void WienerHorizontalTap7(const uint8_t* src, const ptrdiff_t src_stride, + const ptrdiff_t width, const int height, + const int16_t filter[4], + int16_t** const wiener_buffer) { + for (int y = height; y != 0; --y) { + const uint8_t* src_ptr = src; + uint8x16_t s[8]; + s[0] = vld1q_u8(src_ptr); + ptrdiff_t x = width; + do { + src_ptr += 16; + s[7] = vld1q_u8(src_ptr); + s[1] = vextq_u8(s[0], s[7], 1); + s[2] = vextq_u8(s[0], s[7], 2); + s[3] = vextq_u8(s[0], s[7], 3); + s[4] = vextq_u8(s[0], s[7], 4); + s[5] = vextq_u8(s[0], s[7], 5); + s[6] = vextq_u8(s[0], s[7], 6); + int16x8x2_t sum; + sum.val[0] = sum.val[1] = vdupq_n_s16(0); + sum = WienerHorizontal2(s[0], s[6], filter[0], sum); + sum = WienerHorizontal2(s[1], s[5], filter[1], sum); + WienerHorizontalSum(s + 2, filter, sum, *wiener_buffer); + s[0] = s[7]; + *wiener_buffer += 16; + x -= 16; + } while (x != 0); + src += src_stride; + } +} + +inline void WienerHorizontalTap5(const uint8_t* src, const ptrdiff_t src_stride, + const ptrdiff_t width, const int height, + const int16_t filter[4], + int16_t** const wiener_buffer) { + for (int y = height; y != 0; --y) { + const uint8_t* src_ptr = src; + uint8x16_t s[6]; + s[0] = vld1q_u8(src_ptr); + ptrdiff_t x = width; + do { + src_ptr += 16; + s[5] = vld1q_u8(src_ptr); + s[1] = vextq_u8(s[0], s[5], 1); + s[2] = vextq_u8(s[0], s[5], 2); + s[3] = vextq_u8(s[0], s[5], 3); + s[4] = vextq_u8(s[0], s[5], 4); + int16x8x2_t sum; + sum.val[0] = sum.val[1] = vdupq_n_s16(0); + sum = WienerHorizontal2(s[0], 
s[4], filter[1], sum); + WienerHorizontalSum(s + 1, filter, sum, *wiener_buffer); + s[0] = s[5]; + *wiener_buffer += 16; + x -= 16; + } while (x != 0); + src += src_stride; + } +} + +inline void WienerHorizontalTap3(const uint8_t* src, const ptrdiff_t src_stride, + const ptrdiff_t width, const int height, + const int16_t filter[4], + int16_t** const wiener_buffer) { + for (int y = height; y != 0; --y) { + const uint8_t* src_ptr = src; + uint8x16_t s[3]; + ptrdiff_t x = width; + do { + // Slightly faster than using vextq_u8(). + s[0] = vld1q_u8(src_ptr); + s[1] = vld1q_u8(src_ptr + 1); + s[2] = vld1q_u8(src_ptr + 2); + int16x8x2_t sum; + sum.val[0] = sum.val[1] = vdupq_n_s16(0); + WienerHorizontalSum(s, filter, sum, *wiener_buffer); + src_ptr += 16; + *wiener_buffer += 16; + x -= 16; + } while (x != 0); + src += src_stride; + } +} + +inline void WienerHorizontalTap1(const uint8_t* src, const ptrdiff_t src_stride, + const ptrdiff_t width, const int height, + int16_t** const wiener_buffer) { + for (int y = height; y != 0; --y) { + const uint8_t* src_ptr = src; + ptrdiff_t x = width; + do { + const uint8x16_t s = vld1q_u8(src_ptr); + const uint8x8_t s0 = vget_low_u8(s); + const uint8x8_t s1 = vget_high_u8(s); + const int16x8_t d0 = vreinterpretq_s16_u16(vshll_n_u8(s0, 4)); + const int16x8_t d1 = vreinterpretq_s16_u16(vshll_n_u8(s1, 4)); + vst1q_s16(*wiener_buffer + 0, d0); + vst1q_s16(*wiener_buffer + 8, d1); + src_ptr += 16; + *wiener_buffer += 16; + x -= 16; + } while (x != 0); + src += src_stride; + } +} + +inline int32x4x2_t WienerVertical2(const int16x8_t a0, const int16x8_t a1, + const int16_t filter, + const int32x4x2_t sum) { + const int16x8_t a = vaddq_s16(a0, a1); + int32x4x2_t d; + d.val[0] = vmlal_n_s16(sum.val[0], vget_low_s16(a), filter); + d.val[1] = vmlal_n_s16(sum.val[1], vget_high_s16(a), filter); + return d; +} + +inline uint8x8_t WienerVertical(const int16x8_t a[3], const int16_t filter[4], + const int32x4x2_t sum) { + int32x4x2_t d = WienerVertical2(a[0], a[2], filter[2], sum); + d.val[0] = vmlal_n_s16(d.val[0], vget_low_s16(a[1]), filter[3]); + d.val[1] = vmlal_n_s16(d.val[1], vget_high_s16(a[1]), filter[3]); + const uint16x4_t sum_lo_16 = vqrshrun_n_s32(d.val[0], 11); + const uint16x4_t sum_hi_16 = vqrshrun_n_s32(d.val[1], 11); + return vqmovn_u16(vcombine_u16(sum_lo_16, sum_hi_16)); +} + +inline uint8x8_t WienerVerticalTap7Kernel(const int16_t* const wiener_buffer, + const ptrdiff_t wiener_stride, + const int16_t filter[4], + int16x8_t a[7]) { + int32x4x2_t sum; + a[0] = vld1q_s16(wiener_buffer + 0 * wiener_stride); + a[1] = vld1q_s16(wiener_buffer + 1 * wiener_stride); + a[5] = vld1q_s16(wiener_buffer + 5 * wiener_stride); + a[6] = vld1q_s16(wiener_buffer + 6 * wiener_stride); + sum.val[0] = sum.val[1] = vdupq_n_s32(0); + sum = WienerVertical2(a[0], a[6], filter[0], sum); + sum = WienerVertical2(a[1], a[5], filter[1], sum); + a[2] = vld1q_s16(wiener_buffer + 2 * wiener_stride); + a[3] = vld1q_s16(wiener_buffer + 3 * wiener_stride); + a[4] = vld1q_s16(wiener_buffer + 4 * wiener_stride); + return WienerVertical(a + 2, filter, sum); +} + +inline uint8x8x2_t WienerVerticalTap7Kernel2(const int16_t* const wiener_buffer, + const ptrdiff_t wiener_stride, + const int16_t filter[4]) { + int16x8_t a[8]; + int32x4x2_t sum; + uint8x8x2_t d; + d.val[0] = WienerVerticalTap7Kernel(wiener_buffer, wiener_stride, filter, a); + a[7] = vld1q_s16(wiener_buffer + 7 * wiener_stride); + sum.val[0] = sum.val[1] = vdupq_n_s32(0); + sum = WienerVertical2(a[1], a[7], filter[0], sum); + sum = 
WienerVertical2(a[2], a[6], filter[1], sum); + d.val[1] = WienerVertical(a + 3, filter, sum); + return d; +} + +inline void WienerVerticalTap7(const int16_t* wiener_buffer, + const ptrdiff_t width, const int height, + const int16_t filter[4], uint8_t* dst, + const ptrdiff_t dst_stride) { + for (int y = height >> 1; y != 0; --y) { + uint8_t* dst_ptr = dst; + ptrdiff_t x = width; + do { + uint8x8x2_t d[2]; + d[0] = WienerVerticalTap7Kernel2(wiener_buffer + 0, width, filter); + d[1] = WienerVerticalTap7Kernel2(wiener_buffer + 8, width, filter); + vst1q_u8(dst_ptr, vcombine_u8(d[0].val[0], d[1].val[0])); + vst1q_u8(dst_ptr + dst_stride, vcombine_u8(d[0].val[1], d[1].val[1])); + wiener_buffer += 16; + dst_ptr += 16; + x -= 16; + } while (x != 0); + wiener_buffer += width; + dst += 2 * dst_stride; + } + + if ((height & 1) != 0) { + ptrdiff_t x = width; + do { + int16x8_t a[7]; + const uint8x8_t d0 = + WienerVerticalTap7Kernel(wiener_buffer + 0, width, filter, a); + const uint8x8_t d1 = + WienerVerticalTap7Kernel(wiener_buffer + 8, width, filter, a); + vst1q_u8(dst, vcombine_u8(d0, d1)); + wiener_buffer += 16; + dst += 16; + x -= 16; + } while (x != 0); + } +} + +inline uint8x8_t WienerVerticalTap5Kernel(const int16_t* const wiener_buffer, + const ptrdiff_t wiener_stride, + const int16_t filter[4], + int16x8_t a[5]) { + a[0] = vld1q_s16(wiener_buffer + 0 * wiener_stride); + a[1] = vld1q_s16(wiener_buffer + 1 * wiener_stride); + a[2] = vld1q_s16(wiener_buffer + 2 * wiener_stride); + a[3] = vld1q_s16(wiener_buffer + 3 * wiener_stride); + a[4] = vld1q_s16(wiener_buffer + 4 * wiener_stride); + int32x4x2_t sum; + sum.val[0] = sum.val[1] = vdupq_n_s32(0); + sum = WienerVertical2(a[0], a[4], filter[1], sum); + return WienerVertical(a + 1, filter, sum); +} + +inline uint8x8x2_t WienerVerticalTap5Kernel2(const int16_t* const wiener_buffer, + const ptrdiff_t wiener_stride, + const int16_t filter[4]) { + int16x8_t a[6]; + int32x4x2_t sum; + uint8x8x2_t d; + d.val[0] = WienerVerticalTap5Kernel(wiener_buffer, wiener_stride, filter, a); + a[5] = vld1q_s16(wiener_buffer + 5 * wiener_stride); + sum.val[0] = sum.val[1] = vdupq_n_s32(0); + sum = WienerVertical2(a[1], a[5], filter[1], sum); + d.val[1] = WienerVertical(a + 2, filter, sum); + return d; +} + +inline void WienerVerticalTap5(const int16_t* wiener_buffer, + const ptrdiff_t width, const int height, + const int16_t filter[4], uint8_t* dst, + const ptrdiff_t dst_stride) { + for (int y = height >> 1; y != 0; --y) { + uint8_t* dst_ptr = dst; + ptrdiff_t x = width; + do { + uint8x8x2_t d[2]; + d[0] = WienerVerticalTap5Kernel2(wiener_buffer + 0, width, filter); + d[1] = WienerVerticalTap5Kernel2(wiener_buffer + 8, width, filter); + vst1q_u8(dst_ptr, vcombine_u8(d[0].val[0], d[1].val[0])); + vst1q_u8(dst_ptr + dst_stride, vcombine_u8(d[0].val[1], d[1].val[1])); + wiener_buffer += 16; + dst_ptr += 16; + x -= 16; + } while (x != 0); + wiener_buffer += width; + dst += 2 * dst_stride; + } + + if ((height & 1) != 0) { + ptrdiff_t x = width; + do { + int16x8_t a[5]; + const uint8x8_t d0 = + WienerVerticalTap5Kernel(wiener_buffer + 0, width, filter, a); + const uint8x8_t d1 = + WienerVerticalTap5Kernel(wiener_buffer + 8, width, filter, a); + vst1q_u8(dst, vcombine_u8(d0, d1)); + wiener_buffer += 16; + dst += 16; + x -= 16; + } while (x != 0); + } +} + +inline uint8x8_t WienerVerticalTap3Kernel(const int16_t* const wiener_buffer, + const ptrdiff_t wiener_stride, + const int16_t filter[4], + int16x8_t a[3]) { + a[0] = vld1q_s16(wiener_buffer + 0 * wiener_stride); + a[1] 
= vld1q_s16(wiener_buffer + 1 * wiener_stride); + a[2] = vld1q_s16(wiener_buffer + 2 * wiener_stride); + int32x4x2_t sum; + sum.val[0] = sum.val[1] = vdupq_n_s32(0); + return WienerVertical(a, filter, sum); +} + +inline uint8x8x2_t WienerVerticalTap3Kernel2(const int16_t* const wiener_buffer, + const ptrdiff_t wiener_stride, + const int16_t filter[4]) { + int16x8_t a[4]; + int32x4x2_t sum; + uint8x8x2_t d; + d.val[0] = WienerVerticalTap3Kernel(wiener_buffer, wiener_stride, filter, a); + a[3] = vld1q_s16(wiener_buffer + 3 * wiener_stride); + sum.val[0] = sum.val[1] = vdupq_n_s32(0); + d.val[1] = WienerVertical(a + 1, filter, sum); + return d; +} + +inline void WienerVerticalTap3(const int16_t* wiener_buffer, + const ptrdiff_t width, const int height, + const int16_t filter[4], uint8_t* dst, + const ptrdiff_t dst_stride) { + for (int y = height >> 1; y != 0; --y) { + uint8_t* dst_ptr = dst; + ptrdiff_t x = width; + do { + uint8x8x2_t d[2]; + d[0] = WienerVerticalTap3Kernel2(wiener_buffer + 0, width, filter); + d[1] = WienerVerticalTap3Kernel2(wiener_buffer + 8, width, filter); + vst1q_u8(dst_ptr, vcombine_u8(d[0].val[0], d[1].val[0])); + vst1q_u8(dst_ptr + dst_stride, vcombine_u8(d[0].val[1], d[1].val[1])); + wiener_buffer += 16; + dst_ptr += 16; + x -= 16; + } while (x != 0); + wiener_buffer += width; + dst += 2 * dst_stride; + } + + if ((height & 1) != 0) { + ptrdiff_t x = width; + do { + int16x8_t a[3]; + const uint8x8_t d0 = + WienerVerticalTap3Kernel(wiener_buffer + 0, width, filter, a); + const uint8x8_t d1 = + WienerVerticalTap3Kernel(wiener_buffer + 8, width, filter, a); + vst1q_u8(dst, vcombine_u8(d0, d1)); + wiener_buffer += 16; + dst += 16; + x -= 16; + } while (x != 0); + } +} + +inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer, + uint8_t* const dst) { + const int16x8_t a0 = vld1q_s16(wiener_buffer + 0); + const int16x8_t a1 = vld1q_s16(wiener_buffer + 8); + const uint8x8_t d0 = vqrshrun_n_s16(a0, 4); + const uint8x8_t d1 = vqrshrun_n_s16(a1, 4); + vst1q_u8(dst, vcombine_u8(d0, d1)); +} + +inline void WienerVerticalTap1(const int16_t* wiener_buffer, + const ptrdiff_t width, const int height, + uint8_t* dst, const ptrdiff_t dst_stride) { + for (int y = height >> 1; y != 0; --y) { + uint8_t* dst_ptr = dst; + ptrdiff_t x = width; + do { + WienerVerticalTap1Kernel(wiener_buffer, dst_ptr); + WienerVerticalTap1Kernel(wiener_buffer + width, dst_ptr + dst_stride); + wiener_buffer += 16; + dst_ptr += 16; + x -= 16; + } while (x != 0); + wiener_buffer += width; + dst += 2 * dst_stride; + } + + if ((height & 1) != 0) { + ptrdiff_t x = width; + do { + WienerVerticalTap1Kernel(wiener_buffer, dst); + wiener_buffer += 16; + dst += 16; + x -= 16; + } while (x != 0); + } +} + +// For width 16 and up, store the horizontal results, and then do the vertical +// filter row by row. This is faster than doing it column by column when +// considering cache issues. 
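+// A sizing sketch (derived from the code below, not an upstream comment): the +// 7-tap vertical pass consumes kWienerFilterTaps - 1 == 6 extra rows of +// horizontal output (up to 3 above and 3 below each output row), stored with a +// stride of Align(width, 16) int16_t values in |wiener_buffer|.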
+void WienerFilter_NEON( + const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info, + const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_border, + const ptrdiff_t top_border_stride, + const void* LIBGAV1_RESTRICT const bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, + RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer, + void* LIBGAV1_RESTRICT const dest) { + const int16_t* const number_leading_zero_coefficients = + restoration_info.wiener_info.number_leading_zero_coefficients; + const int number_rows_to_skip = std::max( + static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]), + 1); + const ptrdiff_t wiener_stride = Align(width, 16); + int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer; + // The values are saturated to 13 bits before storing. + int16_t* wiener_buffer_horizontal = + wiener_buffer_vertical + number_rows_to_skip * wiener_stride; + int16_t filter_horizontal[(kWienerFilterTaps + 1) / 2]; + int16_t filter_vertical[(kWienerFilterTaps + 1) / 2]; + PopulateWienerCoefficients(restoration_info, WienerInfo::kHorizontal, + filter_horizontal); + PopulateWienerCoefficients(restoration_info, WienerInfo::kVertical, + filter_vertical); + + // horizontal filtering. + // Over-reads up to 15 - |kRestorationHorizontalBorder| values. + const int height_horizontal = + height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip; + const int height_extra = (height_horizontal - height) >> 1; + assert(height_extra <= 2); + const auto* const src = static_cast<const uint8_t*>(source); + const auto* const top = static_cast<const uint8_t*>(top_border); + const auto* const bottom = static_cast<const uint8_t*>(bottom_border); + if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) { + WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3, + top_border_stride, wiener_stride, height_extra, + filter_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap7(src - 3, stride, wiener_stride, height, + filter_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride, + height_extra, filter_horizontal, + &wiener_buffer_horizontal); + } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) { + WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2, + top_border_stride, wiener_stride, height_extra, + filter_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap5(src - 2, stride, wiener_stride, height, + filter_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride, + height_extra, filter_horizontal, + &wiener_buffer_horizontal); + } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) { + // The maximum over-reads happen here.
+ WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1, + top_border_stride, wiener_stride, height_extra, + filter_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap3(src - 1, stride, wiener_stride, height, + filter_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride, + height_extra, filter_horizontal, + &wiener_buffer_horizontal); + } else { + assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3); + WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride, + top_border_stride, wiener_stride, height_extra, + &wiener_buffer_horizontal); + WienerHorizontalTap1(src, stride, wiener_stride, height, + &wiener_buffer_horizontal); + WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride, + height_extra, &wiener_buffer_horizontal); + } + + // vertical filtering. + // Over-writes up to 15 values. + auto* dst = static_cast<uint8_t*>(dest); + if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) { + // Because the top row of |source| is a duplicate of the second row, and the + // bottom row of |source| is a duplicate of its above row, we can duplicate + // the top and bottom row of |wiener_buffer| accordingly. + memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride, + sizeof(*wiener_buffer_horizontal) * wiener_stride); + memcpy(restoration_buffer->wiener_buffer, + restoration_buffer->wiener_buffer + wiener_stride, + sizeof(*restoration_buffer->wiener_buffer) * wiener_stride); + WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height, + filter_vertical, dst, stride); + } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) { + WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride, + height, filter_vertical, dst, stride); + } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) { + WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride, + wiener_stride, height, filter_vertical, dst, stride); + } else { + assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3); + WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride, + wiener_stride, height, dst, stride); + } +} + +//------------------------------------------------------------------------------ +// SGR + +// SIMD overreads 8 - (width % 8) - 2 * padding pixels, where padding is 3 for +// Pass 1 and 2 for Pass 2. +constexpr int kOverreadInBytesPass1 = 2; +constexpr int kOverreadInBytesPass2 = 4; + +// SIMD overreads 16 - (width % 16) - 2 * padding pixels, where padding is 3 for +// Pass 1 and 2 for Pass 2.
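+// That is, for the 16-pixel-wide loops: 16 - 2 * 3 = 10 bytes for Pass 1 and +// 16 - 2 * 2 = 12 bytes for Pass 2, mirroring the 8-pixel constants above.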
+constexpr int kWideOverreadInBytesPass1 = 10; +constexpr int kWideOverreadInBytesPass2 = 12; + +inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x, + uint16x8_t dst[2]) { + dst[0] = vld1q_u16(src[0] + x); + dst[1] = vld1q_u16(src[1] + x); +} + +inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x, + uint16x8_t dst[3]) { + dst[0] = vld1q_u16(src[0] + x); + dst[1] = vld1q_u16(src[1] + x); + dst[2] = vld1q_u16(src[2] + x); +} + +inline void LoadAligned32U32(const uint32_t* const src, uint32x4x2_t* dst) { + (*dst).val[0] = vld1q_u32(src + 0); + (*dst).val[1] = vld1q_u32(src + 4); +} + +inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x, + uint32x4x2_t dst[2]) { + LoadAligned32U32(src[0] + x, &dst[0]); + LoadAligned32U32(src[1] + x, &dst[1]); +} + +inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x, + uint32x4x2_t dst[3]) { + LoadAligned32U32(src[0] + x, &dst[0]); + LoadAligned32U32(src[1] + x, &dst[1]); + LoadAligned32U32(src[2] + x, &dst[2]); +} + +inline void StoreAligned32U16(uint16_t* const dst, const uint16x8_t src[2]) { + vst1q_u16(dst + 0, src[0]); + vst1q_u16(dst + 8, src[1]); +} + +inline void StoreAligned32U32(uint32_t* const dst, const uint32x4x2_t src) { + vst1q_u32(dst + 0, src.val[0]); + vst1q_u32(dst + 4, src.val[1]); +} + +inline void StoreAligned64U32(uint32_t* const dst, const uint32x4x2_t src[2]) { + vst1q_u32(dst + 0, src[0].val[0]); + vst1q_u32(dst + 4, src[0].val[1]); + vst1q_u32(dst + 8, src[1].val[0]); + vst1q_u32(dst + 12, src[1].val[1]); +} + +inline uint16x8_t SquareLo8(const uint8x8_t src) { return vmull_u8(src, src); } + +inline uint16x8_t SquareLo8(const uint8x16_t src) { + return vmull_u8(vget_low_u8(src), vget_low_u8(src)); +} + +inline uint16x8_t SquareHi8(const uint8x16_t src) { + return vmull_u8(vget_high_u8(src), vget_high_u8(src)); +} + +inline void Prepare3_8(const uint8x8_t src[2], uint8x8_t dst[3]) { + dst[0] = VshrU128<0>(src); + dst[1] = VshrU128<1>(src); + dst[2] = VshrU128<2>(src); +} + +template <int offset> +inline void Prepare3_8(const uint8x16_t src[2], uint8x16_t dst[3]) { + dst[0] = VshrU128<offset + 0>(src); + dst[1] = VshrU128<offset + 1>(src); + dst[2] = VshrU128<offset + 2>(src); +} + +inline void Prepare3_16(const uint16x8_t src[2], uint16x4_t low[3], + uint16x4_t high[3]) { + uint16x8_t s[3]; + s[0] = VshrU128<0>(src); + s[1] = VshrU128<2>(src); + s[2] = VshrU128<4>(src); + low[0] = vget_low_u16(s[0]); + low[1] = vget_low_u16(s[1]); + low[2] = vget_low_u16(s[2]); + high[0] = vget_high_u16(s[0]); + high[1] = vget_high_u16(s[1]); + high[2] = vget_high_u16(s[2]); +} + +inline void Prepare5_8(const uint8x8_t src[2], uint8x8_t dst[5]) { + dst[0] = VshrU128<0>(src); + dst[1] = VshrU128<1>(src); + dst[2] = VshrU128<2>(src); + dst[3] = VshrU128<3>(src); + dst[4] = VshrU128<4>(src); +} + +template <int offset> +inline void Prepare5_8(const uint8x16_t src[2], uint8x16_t dst[5]) { + dst[0] = VshrU128<offset + 0>(src); + dst[1] = VshrU128<offset + 1>(src); + dst[2] = VshrU128<offset + 2>(src); + dst[3] = VshrU128<offset + 3>(src); + dst[4] = VshrU128<offset + 4>(src); +} + +inline void Prepare5_16(const uint16x8_t src[2], uint16x4_t low[5], + uint16x4_t high[5]) { + Prepare3_16(src, low, high); + const uint16x8_t s3 = VshrU128<6>(src); + const uint16x8_t s4 = VshrU128<8>(src); + low[3] = vget_low_u16(s3); + low[4] = vget_low_u16(s4); + high[3] = vget_high_u16(s3); + high[4] = vget_high_u16(s4); +} + +inline uint16x8_t Sum3_16(const uint16x8_t src0, const uint16x8_t src1, + const uint16x8_t src2) { + const uint16x8_t sum = vaddq_u16(src0, src1); + return
vaddq_u16(sum, src2); +} + +inline uint16x8_t Sum3_16(const uint16x8_t src[3]) { + return Sum3_16(src[0], src[1], src[2]); +} + +inline uint32x4_t Sum3_32(const uint32x4_t src0, const uint32x4_t src1, + const uint32x4_t src2) { + const uint32x4_t sum = vaddq_u32(src0, src1); + return vaddq_u32(sum, src2); +} + +inline uint32x4x2_t Sum3_32(const uint32x4x2_t src[3]) { + uint32x4x2_t d; + d.val[0] = Sum3_32(src[0].val[0], src[1].val[0], src[2].val[0]); + d.val[1] = Sum3_32(src[0].val[1], src[1].val[1], src[2].val[1]); + return d; +} + +inline uint16x8_t Sum3W_16(const uint8x8_t src[3]) { + const uint16x8_t sum = vaddl_u8(src[0], src[1]); + return vaddw_u8(sum, src[2]); +} + +inline uint16x8_t Sum3WLo16(const uint8x16_t src[3]) { + const uint16x8_t sum = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[1])); + return vaddw_u8(sum, vget_low_u8(src[2])); +} + +inline uint16x8_t Sum3WHi16(const uint8x16_t src[3]) { + const uint16x8_t sum = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[1])); + return vaddw_u8(sum, vget_high_u8(src[2])); +} + +inline uint16x8_t Sum5WLo16(const uint8x16_t src[5]) { + const uint16x8_t sum01 = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[1])); + const uint16x8_t sum23 = vaddl_u8(vget_low_u8(src[2]), vget_low_u8(src[3])); + const uint16x8_t sum = vaddq_u16(sum01, sum23); + return vaddw_u8(sum, vget_low_u8(src[4])); +} + +inline uint16x8_t Sum5WHi16(const uint8x16_t src[5]) { + const uint16x8_t sum01 = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[1])); + const uint16x8_t sum23 = vaddl_u8(vget_high_u8(src[2]), vget_high_u8(src[3])); + const uint16x8_t sum = vaddq_u16(sum01, sum23); + return vaddw_u8(sum, vget_high_u8(src[4])); +} + +inline uint32x4_t Sum3W_32(const uint16x4_t src[3]) { + const uint32x4_t sum = vaddl_u16(src[0], src[1]); + return vaddw_u16(sum, src[2]); +} + +inline uint16x8_t Sum5_16(const uint16x8_t src[5]) { + const uint16x8_t sum01 = vaddq_u16(src[0], src[1]); + const uint16x8_t sum23 = vaddq_u16(src[2], src[3]); + const uint16x8_t sum = vaddq_u16(sum01, sum23); + return vaddq_u16(sum, src[4]); +} + +inline uint32x4_t Sum5_32(const uint32x4_t src0, const uint32x4_t src1, + const uint32x4_t src2, const uint32x4_t src3, + const uint32x4_t src4) { + const uint32x4_t sum01 = vaddq_u32(src0, src1); + const uint32x4_t sum23 = vaddq_u32(src2, src3); + const uint32x4_t sum = vaddq_u32(sum01, sum23); + return vaddq_u32(sum, src4); +} + +inline uint32x4x2_t Sum5_32(const uint32x4x2_t src[5]) { + uint32x4x2_t d; + d.val[0] = Sum5_32(src[0].val[0], src[1].val[0], src[2].val[0], src[3].val[0], + src[4].val[0]); + d.val[1] = Sum5_32(src[0].val[1], src[1].val[1], src[2].val[1], src[3].val[1], + src[4].val[1]); + return d; +} + +inline uint32x4_t Sum5W_32(const uint16x4_t src[5]) { + const uint32x4_t sum01 = vaddl_u16(src[0], src[1]); + const uint32x4_t sum23 = vaddl_u16(src[2], src[3]); + const uint32x4_t sum0123 = vaddq_u32(sum01, sum23); + return vaddw_u16(sum0123, src[4]); +} + +inline uint16x8_t Sum3Horizontal(const uint8x8_t src[2]) { + uint8x8_t s[3]; + Prepare3_8(src, s); + return Sum3W_16(s); +} + +inline uint16x8_t Sum3Horizontal(const uint8x16_t src) { + uint8x8_t s[2]; + s[0] = vget_low_u8(src); + s[1] = vget_high_u8(src); + return Sum3Horizontal(s); +} + +template <int offset> +inline void Sum3Horizontal(const uint8x16_t src[2], uint16x8_t dst[2]) { + uint8x16_t s[3]; + Prepare3_8<offset>(src, s); + dst[0] = Sum3WLo16(s); + dst[1] = Sum3WHi16(s); +} + +inline uint32x4x2_t Sum3WHorizontal(const uint16x8_t src[2]) { + uint16x4_t low[3], high[3]; + uint32x4x2_t sum; +
Prepare3_16(src, low, high); + sum.val[0] = Sum3W_32(low); + sum.val[1] = Sum3W_32(high); + return sum; +} + +inline uint16x8_t Sum5Horizontal(const uint8x8_t src[2]) { + uint8x8_t s[5]; + Prepare5_8(src, s); + const uint16x8_t sum01 = vaddl_u8(s[0], s[1]); + const uint16x8_t sum23 = vaddl_u8(s[2], s[3]); + const uint16x8_t sum0123 = vaddq_u16(sum01, sum23); + return vaddw_u8(sum0123, s[4]); +} + +inline uint16x8_t Sum5Horizontal(const uint8x16_t src) { + uint8x8_t s[2]; + s[0] = vget_low_u8(src); + s[1] = vget_high_u8(src); + return Sum5Horizontal(s); +} + +template <int offset> +inline void Sum5Horizontal(const uint8x16_t src[2], uint16x8_t* const dst0, + uint16x8_t* const dst1) { + uint8x16_t s[5]; + Prepare5_8<offset>(src, s); + *dst0 = Sum5WLo16(s); + *dst1 = Sum5WHi16(s); +} + +inline uint32x4x2_t Sum5WHorizontal(const uint16x8_t src[2]) { + uint16x4_t low[5], high[5]; + Prepare5_16(src, low, high); + uint32x4x2_t sum; + sum.val[0] = Sum5W_32(low); + sum.val[1] = Sum5W_32(high); + return sum; +} + +template <int offset> +void SumHorizontal(const uint8x16_t src[2], uint16x8_t* const row3_0, + uint16x8_t* const row3_1, uint16x8_t* const row5_0, + uint16x8_t* const row5_1) { + uint8x16_t s[5]; + Prepare5_8<offset>(src, s); + const uint16x8_t sum04_lo = vaddl_u8(vget_low_u8(s[0]), vget_low_u8(s[4])); + const uint16x8_t sum04_hi = vaddl_u8(vget_high_u8(s[0]), vget_high_u8(s[4])); + *row3_0 = Sum3WLo16(s + 1); + *row3_1 = Sum3WHi16(s + 1); + *row5_0 = vaddq_u16(sum04_lo, *row3_0); + *row5_1 = vaddq_u16(sum04_hi, *row3_1); +} + +void SumHorizontal(const uint8x8_t src[2], uint16x8_t* const row3, + uint16x8_t* const row5) { + uint8x8_t s[5]; + Prepare5_8(src, s); + const uint16x8_t sum04 = vaddl_u8(s[0], s[4]); + const uint16x8_t sum12 = vaddl_u8(s[1], s[2]); + *row3 = vaddw_u8(sum12, s[3]); + *row5 = vaddq_u16(sum04, *row3); +} + +void SumHorizontal(const uint16x4_t src[5], uint32x4_t* const row_sq3, + uint32x4_t* const row_sq5) { + const uint32x4_t sum04 = vaddl_u16(src[0], src[4]); + const uint32x4_t sum12 = vaddl_u16(src[1], src[2]); + *row_sq3 = vaddw_u16(sum12, src[3]); + *row_sq5 = vaddq_u32(sum04, *row_sq3); +} + +void SumHorizontal(const uint16x8_t sq[2], uint32x4x2_t* const row_sq3, + uint32x4x2_t* const row_sq5) { + uint16x4_t low[5], high[5]; + Prepare5_16(sq, low, high); + SumHorizontal(low, &row_sq3->val[0], &row_sq5->val[0]); + SumHorizontal(high, &row_sq3->val[1], &row_sq5->val[1]); +} + +void SumHorizontal(const uint8x8_t src[2], const uint16x8_t sq[2], + uint16x8_t* const row3, uint16x8_t* const row5, + uint32x4x2_t* const row_sq3, uint32x4x2_t* const row_sq5) { + SumHorizontal(src, row3, row5); + SumHorizontal(sq, row_sq3, row_sq5); +} + +void SumHorizontal(const uint8x16_t src, const uint16x8_t sq[2], + uint16x8_t* const row3, uint16x8_t* const row5, + uint32x4x2_t* const row_sq3, uint32x4x2_t* const row_sq5) { + uint8x8_t s[2]; + s[0] = vget_low_u8(src); + s[1] = vget_high_u8(src); + return SumHorizontal(s, sq, row3, row5, row_sq3, row_sq5); +} + +template <int offset> +inline uint16x8_t Sum343(const uint8x16_t ma3[2]) { + const uint16x8_t sum = (offset == 0) ? Sum3WLo16(ma3) : Sum3WHi16(ma3); + const uint16x8_t sum3 = Sum3_16(sum, sum, sum); + return vaddw_u8(sum3, + (offset == 0) ?
vget_low_u8(ma3[1]) : vget_high_u8(ma3[1])); +} + +inline uint32x4_t Sum343W(const uint16x4_t src[3]) { + const uint32x4_t sum = Sum3W_32(src); + const uint32x4_t sum3 = Sum3_32(sum, sum, sum); + return vaddw_u16(sum3, src[1]); +} + +inline uint32x4x2_t Sum343W(const uint16x8_t src[2]) { + uint16x4_t low[3], high[3]; + uint32x4x2_t d; + Prepare3_16(src, low, high); + d.val[0] = Sum343W(low); + d.val[1] = Sum343W(high); + return d; +} + +template <int offset> +inline uint16x8_t Sum565(const uint8x16_t ma5[2]) { + const uint16x8_t sum = (offset == 0) ? Sum3WLo16(ma5) : Sum3WHi16(ma5); + const uint16x8_t sum4 = vshlq_n_u16(sum, 2); + const uint16x8_t sum5 = vaddq_u16(sum4, sum); + return vaddw_u8(sum5, + (offset == 0) ? vget_low_u8(ma5[1]) : vget_high_u8(ma5[1])); +} + +inline uint32x4_t Sum565W(const uint16x4_t src[3]) { + const uint32x4_t sum = Sum3W_32(src); + const uint32x4_t sum4 = vshlq_n_u32(sum, 2); + const uint32x4_t sum5 = vaddq_u32(sum4, sum); + return vaddw_u16(sum5, src[1]); +} + +inline uint32x4x2_t Sum565W(const uint16x8_t src[2]) { + uint16x4_t low[3], high[3]; + uint32x4x2_t d; + Prepare3_16(src, low, high); + d.val[0] = Sum565W(low); + d.val[1] = Sum565W(high); + return d; +} + +inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride, + const ptrdiff_t width, const ptrdiff_t sum_stride, + const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5, + uint32_t* square_sum3, uint32_t* square_sum5) { + const ptrdiff_t overread_in_bytes = kOverreadInBytesPass1 - width; + int y = 2; + // Don't change loop width to 16, which is even slower. + do { + uint8x8_t s[2]; + uint16x8_t sq[2]; + s[0] = Load1MsanU8(src, overread_in_bytes); + sq[0] = SquareLo8(s[0]); + ptrdiff_t x = sum_width; + do { + uint16x8_t row3, row5; + uint32x4x2_t row_sq3, row_sq5; + x -= 8; + src += 8; + s[1] = Load1MsanU8(src, sum_width - x + overread_in_bytes); + sq[1] = SquareLo8(s[1]); + SumHorizontal(s, sq, &row3, &row5, &row_sq3, &row_sq5); + vst1q_u16(sum3, row3); + vst1q_u16(sum5, row5); + StoreAligned32U32(square_sum3 + 0, row_sq3); + StoreAligned32U32(square_sum5 + 0, row_sq5); + s[0] = s[1]; + sq[0] = sq[1]; + sum3 += 8; + sum5 += 8; + square_sum3 += 8; + square_sum5 += 8; + } while (x != 0); + src += src_stride - sum_width; + sum3 += sum_stride - sum_width; + sum5 += sum_stride - sum_width; + square_sum3 += sum_stride - sum_width; + square_sum5 += sum_stride - sum_width; + } while (--y != 0); +} + +template <int size> +inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride, + const ptrdiff_t width, const ptrdiff_t sum_stride, + const ptrdiff_t sum_width, uint16_t* sums, + uint32_t* square_sums) { + static_assert(size == 3 || size == 5, ""); + const ptrdiff_t overread_in_bytes = + ((size == 5) ? kOverreadInBytesPass1 : kOverreadInBytesPass2) - + sizeof(*src) * width; + int y = 2; + // Don't change loop width to 16, which is even slower.
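+ // Sliding window (descriptive note, not an upstream comment): each iteration + // of the inner x loop below loads the next 8 pixels into s[1]/sq[1], emits + // one batch of sums, then shifts s[0] = s[1] and sq[0] = sq[1] so every pixel + // is loaded and squared only once.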
+ do { + uint8x8_t s[2]; + uint16x8_t sq[2]; + s[0] = Load1MsanU8(src, overread_in_bytes); + sq[0] = SquareLo8(s[0]); + ptrdiff_t x = sum_width; + do { + uint16x8_t row; + uint32x4x2_t row_sq; + x -= 8; + src += 8; + s[1] = Load1MsanU8(src, sum_width - x + overread_in_bytes); + sq[1] = SquareLo8(s[1]); + if (size == 3) { + row = Sum3Horizontal(s); + row_sq = Sum3WHorizontal(sq); + } else { + row = Sum5Horizontal(s); + row_sq = Sum5WHorizontal(sq); + } + vst1q_u16(sums, row); + StoreAligned32U32(square_sums, row_sq); + s[0] = s[1]; + sq[0] = sq[1]; + sums += 8; + square_sums += 8; + } while (x != 0); + src += src_stride - sum_width; + sums += sum_stride - sum_width; + square_sums += sum_stride - sum_width; + } while (--y != 0); +} + +template <int n> +inline uint16x4_t CalculateMa(const uint16x4_t sum, const uint32x4_t sum_sq, + const uint32_t scale) { + // a = |sum_sq| + // d = |sum| + // p = (a * n < d * d) ? 0 : a * n - d * d; + const uint32x4_t dxd = vmull_u16(sum, sum); + const uint32x4_t axn = vmulq_n_u32(sum_sq, n); + // Ensure |p| does not underflow by using saturating subtraction. + const uint32x4_t p = vqsubq_u32(axn, dxd); + const uint32x4_t pxs = vmulq_n_u32(p, scale); + // vrshrn_n_u32() (narrowing shift) can only shift by 16 and kSgrProjScaleBits + // is 20. + const uint32x4_t shifted = vrshrq_n_u32(pxs, kSgrProjScaleBits); + return vmovn_u32(shifted); +} + +inline uint8x8_t AdjustValue(const uint8x8_t value, const uint8x8_t index, + const int threshold) { + const uint8x8_t thresholds = vdup_n_u8(threshold); + const uint8x8_t offset = vcgt_u8(index, thresholds); + // Adding 255 is equivalent to subtracting 1 for 8-bit data. + return vadd_u8(value, offset); +} + +template <int n, int offset> +inline void CalculateIntermediate(const uint16x8_t sum, + const uint32x4x2_t sum_sq, + const uint32_t scale, uint8x16_t* const ma, + uint16x8_t* const b) { + constexpr uint32_t one_over_n = + ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n; + const uint16x4_t z0 = CalculateMa<n>(vget_low_u16(sum), sum_sq.val[0], scale); + const uint16x4_t z1 = + CalculateMa<n>(vget_high_u16(sum), sum_sq.val[1], scale); + const uint16x8_t z01 = vcombine_u16(z0, z1); + const uint8x8_t idx = vqmovn_u16(z01); + // Use table lookup to read elements whose indices are less than 48. + // Using one uint8x8x4_t vector and one uint8x8x2_t vector is faster than + // using two uint8x8x3_t vectors. + uint8x8x4_t table0; + uint8x8x2_t table1; + table0.val[0] = vld1_u8(kSgrMaLookup + 0 * 8); + table0.val[1] = vld1_u8(kSgrMaLookup + 1 * 8); + table0.val[2] = vld1_u8(kSgrMaLookup + 2 * 8); + table0.val[3] = vld1_u8(kSgrMaLookup + 3 * 8); + table1.val[0] = vld1_u8(kSgrMaLookup + 4 * 8); + table1.val[1] = vld1_u8(kSgrMaLookup + 5 * 8); + // All elements whose indices are out of range [0, 47] are set to 0. + uint8x8_t val = vtbl4_u8(table0, idx); // Range [0, 31]. + // Subtract 8 to shuffle the next index range. + const uint8x8_t index = vsub_u8(idx, vdup_n_u8(32)); + const uint8x8_t res = vtbl2_u8(table1, index); // Range [32, 47]. + // Use OR instruction to combine shuffle results together. + val = vorr_u8(val, res); + + // For elements whose indices are larger than 47, since they seldom change + // values with the increase of the index, we use comparison and arithmetic + // operations to calculate their values. + // Elements whose indices are larger than 47 (with value 0) are set to 5. + val = vmax_u8(val, vdup_n_u8(5)); + val = AdjustValue(val, idx, 55); // 55 is the last index which value is 5.
+ val = AdjustValue(val, idx, 72); // 72 is the last index which value is 4. + val = AdjustValue(val, idx, 101); // 101 is the last index which value is 3. + val = AdjustValue(val, idx, 169); // 169 is the last index which value is 2. + val = AdjustValue(val, idx, 254); // 254 is the last index which value is 1. + *ma = (offset == 0) ? vcombine_u8(val, vget_high_u8(*ma)) + : vcombine_u8(vget_low_u8(*ma), val); + + // b = ma * b * one_over_n + // |ma| = [0, 255] + // |sum| is a box sum with radius 1 or 2. + // For the first pass radius is 2. Maximum value is 5x5x255 = 6375. + // For the second pass radius is 1. Maximum value is 3x3x255 = 2295. + // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n + // When radius is 2 |n| is 25. |one_over_n| is 164. + // When radius is 1 |n| is 9. |one_over_n| is 455. + // |kSgrProjReciprocalBits| is 12. + // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits). + // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits). + const uint16x8_t maq = + vmovl_u8((offset == 0) ? vget_low_u8(*ma) : vget_high_u8(*ma)); + const uint32x4_t m0 = vmull_u16(vget_low_u16(maq), vget_low_u16(sum)); + const uint32x4_t m1 = vmull_u16(vget_high_u16(maq), vget_high_u16(sum)); + const uint32x4_t m2 = vmulq_n_u32(m0, one_over_n); + const uint32x4_t m3 = vmulq_n_u32(m1, one_over_n); + const uint16x4_t b_lo = vrshrn_n_u32(m2, kSgrProjReciprocalBits); + const uint16x4_t b_hi = vrshrn_n_u32(m3, kSgrProjReciprocalBits); + *b = vcombine_u16(b_lo, b_hi); +} + +template <int offset> +inline void CalculateIntermediate5(const uint16x8_t s5[5], + const uint32x4x2_t sq5[5], + const uint32_t scale, uint8x16_t* const ma, + uint16x8_t* const b) { + const uint16x8_t sum = Sum5_16(s5); + const uint32x4x2_t sum_sq = Sum5_32(sq5); + CalculateIntermediate<25, offset>(sum, sum_sq, scale, ma, b); +} + +template <int offset> +inline void CalculateIntermediate3(const uint16x8_t s3[3], + const uint32x4x2_t sq3[3], + const uint32_t scale, uint8x16_t* const ma, + uint16x8_t* const b) { + const uint16x8_t sum = Sum3_16(s3); + const uint32x4x2_t sum_sq = Sum3_32(sq3); + CalculateIntermediate<9, offset>(sum, sum_sq, scale, ma, b); +} + +template <int offset> +inline void Store343_444(const uint8x16_t ma3[3], const uint16x8_t b3[2], + const ptrdiff_t x, uint16x8_t* const sum_ma343, + uint16x8_t* const sum_ma444, + uint32x4x2_t* const sum_b343, + uint32x4x2_t* const sum_b444, uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + const uint16x8_t sum_ma111 = (offset == 0) ? Sum3WLo16(ma3) : Sum3WHi16(ma3); + *sum_ma444 = vshlq_n_u16(sum_ma111, 2); + const uint16x8_t sum333 = vsubq_u16(*sum_ma444, sum_ma111); + *sum_ma343 = vaddw_u8( + sum333, (offset == 0) ?
+  uint16x4_t low[3], high[3];
+  uint32x4x2_t sum_b111;
+  Prepare3_16(b3, low, high);
+  sum_b111.val[0] = Sum3W_32(low);
+  sum_b111.val[1] = Sum3W_32(high);
+  sum_b444->val[0] = vshlq_n_u32(sum_b111.val[0], 2);
+  sum_b444->val[1] = vshlq_n_u32(sum_b111.val[1], 2);
+  sum_b343->val[0] = vsubq_u32(sum_b444->val[0], sum_b111.val[0]);
+  sum_b343->val[1] = vsubq_u32(sum_b444->val[1], sum_b111.val[1]);
+  sum_b343->val[0] = vaddw_u16(sum_b343->val[0], low[1]);
+  sum_b343->val[1] = vaddw_u16(sum_b343->val[1], high[1]);
+  vst1q_u16(ma343 + x, *sum_ma343);
+  vst1q_u16(ma444 + x, *sum_ma444);
+  vst1q_u32(b343 + x + 0, sum_b343->val[0]);
+  vst1q_u32(b343 + x + 4, sum_b343->val[1]);
+  vst1q_u32(b444 + x + 0, sum_b444->val[0]);
+  vst1q_u32(b444 + x + 4, sum_b444->val[1]);
+}
+
+template <int offset>
+inline void Store343_444(const uint8x16_t ma3[3], const uint16x8_t b3[2],
+                         const ptrdiff_t x, uint16x8_t* const sum_ma343,
+                         uint32x4x2_t* const sum_b343, uint16_t* const ma343,
+                         uint16_t* const ma444, uint32_t* const b343,
+                         uint32_t* const b444) {
+  uint16x8_t sum_ma444;
+  uint32x4x2_t sum_b444;
+  Store343_444<offset>(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, &sum_b444,
+                       ma343, ma444, b343, b444);
+}
+
+template <int offset>
+inline void Store343_444(const uint8x16_t ma3[3], const uint16x8_t b3[2],
+                         const ptrdiff_t x, uint16_t* const ma343,
+                         uint16_t* const ma444, uint32_t* const b343,
+                         uint32_t* const b444) {
+  uint16x8_t sum_ma343;
+  uint32x4x2_t sum_b343;
+  Store343_444<offset>(ma3, b3, x, &sum_ma343, &sum_b343, ma343, ma444, b343,
+                       b444);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
+    uint8x16_t s[2][2], const uint32_t scale, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], uint16x8_t sq[2][4], uint8x16_t* const ma,
+    uint16x8_t* const b) {
+  uint16x8_t s5[5];
+  uint32x4x2_t sq5[5];
+  sq[0][0] = SquareLo8(s[0][0]);
+  sq[1][0] = SquareLo8(s[1][0]);
+  sq[0][1] = SquareHi8(s[0][0]);
+  sq[1][1] = SquareHi8(s[1][0]);
+  s5[3] = Sum5Horizontal(s[0][0]);
+  s5[4] = Sum5Horizontal(s[1][0]);
+  sq5[3] = Sum5WHorizontal(sq[0]);
+  sq5[4] = Sum5WHorizontal(sq[1]);
+  vst1q_u16(sum5[3], s5[3]);
+  vst1q_u16(sum5[4], s5[4]);
+  StoreAligned32U32(square_sum5[3], sq5[3]);
+  StoreAligned32U32(square_sum5[4], sq5[4]);
+  LoadAligned16x3U16(sum5, 0, s5);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  CalculateIntermediate5<0>(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+    uint8x16_t s[2][2], const ptrdiff_t x, const uint32_t scale,
+    uint16_t* const sum5[5], uint32_t* const square_sum5[5],
+    uint16x8_t sq[2][4], uint8x16_t ma[2], uint16x8_t b[2]) {
+  uint16x8_t s5[2][5];
+  uint32x4x2_t sq5[5];
+  sq[0][2] = SquareLo8(s[0][1]);
+  sq[1][2] = SquareLo8(s[1][1]);
+  Sum5Horizontal<8>(s[0], &s5[0][3], &s5[1][3]);
+  Sum5Horizontal<8>(s[1], &s5[0][4], &s5[1][4]);
+  sq5[3] = Sum5WHorizontal(sq[0] + 1);
+  sq5[4] = Sum5WHorizontal(sq[1] + 1);
+  vst1q_u16(sum5[3] + x, s5[0][3]);
+  vst1q_u16(sum5[4] + x, s5[0][4]);
+  StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+  StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+  LoadAligned16x3U16(sum5, x, s5[0]);
+  LoadAligned32x3U32(square_sum5, x, sq5);
+  CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[0]);
+
+  sq[0][3] = SquareHi8(s[0][1]);
+  sq[1][3] = SquareHi8(s[1][1]);
+  sq5[3] = Sum5WHorizontal(sq[0] + 2);
+  sq5[4] = Sum5WHorizontal(sq[1] + 2);
+  vst1q_u16(sum5[3] + x + 8, s5[1][3]);
+  vst1q_u16(sum5[4] + x + 8, s5[1][4]);
+  StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+  StoreAligned32U32(square_sum5[4] + x + 8,
sq5[4]); + LoadAligned16x3U16(sum5, x + 8, s5[1]); + LoadAligned32x3U32(square_sum5, x + 8, sq5); + CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[1]); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo( + uint8x16_t* const s, const uint32_t scale, const uint16_t* const sum5[5], + const uint32_t* const square_sum5[5], uint16x8_t sq[2], + uint8x16_t* const ma, uint16x8_t* const b) { + uint16x8_t s5[5]; + uint32x4x2_t sq5[5]; + sq[0] = SquareLo8(s[0]); + sq[1] = SquareHi8(s[0]); + s5[3] = s5[4] = Sum5Horizontal(*s); + sq5[3] = sq5[4] = Sum5WHorizontal(sq); + LoadAligned16x3U16(sum5, 0, s5); + LoadAligned32x3U32(square_sum5, 0, sq5); + CalculateIntermediate5<0>(s5, sq5, scale, ma, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow( + uint8x16_t s[2], const ptrdiff_t x, const uint32_t scale, + const uint16_t* const sum5[5], const uint32_t* const square_sum5[5], + uint16x8_t sq[3], uint8x16_t ma[2], uint16x8_t b[2]) { + uint16x8_t s5[2][5]; + uint32x4x2_t sq5[5]; + sq[1] = SquareLo8(s[1]); + Sum5Horizontal<8>(s, &s5[0][3], &s5[1][3]); + sq5[3] = sq5[4] = Sum5WHorizontal(sq); + LoadAligned16x3U16(sum5, x, s5[0]); + s5[0][4] = s5[0][3]; + LoadAligned32x3U32(square_sum5, x, sq5); + CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[0]); + + sq[2] = SquareHi8(s[1]); + sq5[3] = sq5[4] = Sum5WHorizontal(sq + 1); + LoadAligned16x3U16(sum5, x + 8, s5[1]); + s5[1][4] = s5[1][3]; + LoadAligned32x3U32(square_sum5, x + 8, sq5); + CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[1]); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo( + uint8x16_t* const s, const uint32_t scale, uint16_t* const sum3[3], + uint32_t* const square_sum3[3], uint16x8_t sq[2], uint8x16_t* const ma, + uint16x8_t* const b) { + uint16x8_t s3[3]; + uint32x4x2_t sq3[3]; + sq[0] = SquareLo8(*s); + sq[1] = SquareHi8(*s); + s3[2] = Sum3Horizontal(*s); + sq3[2] = Sum3WHorizontal(sq); + vst1q_u16(sum3[2], s3[2]); + StoreAligned32U32(square_sum3[2], sq3[2]); + LoadAligned16x2U16(sum3, 0, s3); + LoadAligned32x2U32(square_sum3, 0, sq3); + CalculateIntermediate3<0>(s3, sq3, scale, ma, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3( + uint8x16_t s[2], const ptrdiff_t x, const uint32_t scale, + uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint16x8_t sq[3], + uint8x16_t ma[2], uint16x8_t b[2]) { + uint16x8_t s3[4]; + uint32x4x2_t sq3[3]; + sq[1] = SquareLo8(s[1]); + Sum3Horizontal<8>(s, s3 + 2); + sq3[2] = Sum3WHorizontal(sq); + vst1q_u16(sum3[2] + x, s3[2]); + StoreAligned32U32(square_sum3[2] + x, sq3[2]); + LoadAligned16x2U16(sum3, x, s3); + LoadAligned32x2U32(square_sum3, x, sq3); + CalculateIntermediate3<8>(s3, sq3, scale, &ma[0], &b[0]); + + sq[2] = SquareHi8(s[1]); + sq3[2] = Sum3WHorizontal(sq + 1); + vst1q_u16(sum3[2] + x + 8, s3[3]); + StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]); + LoadAligned16x2U16(sum3, x + 8, s3 + 1); + LoadAligned32x2U32(square_sum3, x + 8, sq3); + CalculateIntermediate3<0>(s3 + 1, sq3, scale, &ma[1], &b[1]); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo( + uint8x16_t s[2][2], const uint16_t scales[2], uint16_t* const sum3[4], + uint16_t* const sum5[5], uint32_t* const square_sum3[4], + uint32_t* const square_sum5[5], uint16x8_t sq[2][4], uint8x16_t ma3[2][2], + uint16x8_t b3[2][3], uint8x16_t* const ma5, uint16x8_t* const b5) { + uint16x8_t s3[4], s5[5]; + uint32x4x2_t sq3[4], sq5[5]; + sq[0][0] = SquareLo8(s[0][0]); + sq[1][0] = SquareLo8(s[1][0]); + sq[0][1] = SquareHi8(s[0][0]); + sq[1][1] = SquareHi8(s[1][0]); + 
SumHorizontal(s[0][0], sq[0], &s3[2], &s5[3], &sq3[2], &sq5[3]); + SumHorizontal(s[1][0], sq[1], &s3[3], &s5[4], &sq3[3], &sq5[4]); + vst1q_u16(sum3[2], s3[2]); + vst1q_u16(sum3[3], s3[3]); + StoreAligned32U32(square_sum3[2], sq3[2]); + StoreAligned32U32(square_sum3[3], sq3[3]); + vst1q_u16(sum5[3], s5[3]); + vst1q_u16(sum5[4], s5[4]); + StoreAligned32U32(square_sum5[3], sq5[3]); + StoreAligned32U32(square_sum5[4], sq5[4]); + LoadAligned16x2U16(sum3, 0, s3); + LoadAligned32x2U32(square_sum3, 0, sq3); + LoadAligned16x3U16(sum5, 0, s5); + LoadAligned32x3U32(square_sum5, 0, sq5); + CalculateIntermediate3<0>(s3, sq3, scales[1], ma3[0], b3[0]); + CalculateIntermediate3<0>(s3 + 1, sq3 + 1, scales[1], ma3[1], b3[1]); + CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess( + const uint8x16_t s[2][2], const ptrdiff_t x, const uint16_t scales[2], + uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + uint16x8_t sq[2][4], uint8x16_t ma3[2][2], uint16x8_t b3[2][3], + uint8x16_t ma5[2], uint16x8_t b5[2]) { + uint16x8_t s3[2][4], s5[2][5]; + uint32x4x2_t sq3[4], sq5[5]; + sq[0][2] = SquareLo8(s[0][1]); + sq[1][2] = SquareLo8(s[1][1]); + SumHorizontal<8>(s[0], &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]); + SumHorizontal<8>(s[1], &s3[0][3], &s3[1][3], &s5[0][4], &s5[1][4]); + SumHorizontal(sq[0] + 1, &sq3[2], &sq5[3]); + SumHorizontal(sq[1] + 1, &sq3[3], &sq5[4]); + vst1q_u16(sum3[2] + x, s3[0][2]); + vst1q_u16(sum3[3] + x, s3[0][3]); + StoreAligned32U32(square_sum3[2] + x, sq3[2]); + StoreAligned32U32(square_sum3[3] + x, sq3[3]); + vst1q_u16(sum5[3] + x, s5[0][3]); + vst1q_u16(sum5[4] + x, s5[0][4]); + StoreAligned32U32(square_sum5[3] + x, sq5[3]); + StoreAligned32U32(square_sum5[4] + x, sq5[4]); + LoadAligned16x2U16(sum3, x, s3[0]); + LoadAligned32x2U32(square_sum3, x, sq3); + LoadAligned16x3U16(sum5, x, s5[0]); + LoadAligned32x3U32(square_sum5, x, sq5); + CalculateIntermediate3<8>(s3[0], sq3, scales[1], &ma3[0][0], &b3[0][1]); + CalculateIntermediate3<8>(s3[0] + 1, sq3 + 1, scales[1], &ma3[1][0], + &b3[1][1]); + CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], &b5[0]); + + sq[0][3] = SquareHi8(s[0][1]); + sq[1][3] = SquareHi8(s[1][1]); + SumHorizontal(sq[0] + 2, &sq3[2], &sq5[3]); + SumHorizontal(sq[1] + 2, &sq3[3], &sq5[4]); + vst1q_u16(sum3[2] + x + 8, s3[1][2]); + vst1q_u16(sum3[3] + x + 8, s3[1][3]); + StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]); + StoreAligned32U32(square_sum3[3] + x + 8, sq3[3]); + vst1q_u16(sum5[3] + x + 8, s5[1][3]); + vst1q_u16(sum5[4] + x + 8, s5[1][4]); + StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]); + StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]); + LoadAligned16x2U16(sum3, x + 8, s3[1]); + LoadAligned32x2U32(square_sum3, x + 8, sq3); + LoadAligned16x3U16(sum5, x + 8, s5[1]); + LoadAligned32x3U32(square_sum5, x + 8, sq5); + CalculateIntermediate3<0>(s3[1], sq3, scales[1], &ma3[0][1], &b3[0][2]); + CalculateIntermediate3<0>(s3[1] + 1, sq3 + 1, scales[1], &ma3[1][1], + &b3[1][2]); + CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], &b5[1]); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo( + uint8x16_t* const s, const uint16_t scales[2], + const uint16_t* const sum3[4], const uint16_t* const sum5[5], + const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5], + uint16x8_t sq[2], uint8x16_t* const ma3, uint8x16_t* const ma5, + uint16x8_t* const b3, uint16x8_t* const b5) { + uint16x8_t s3[3], 
s5[5]; + uint32x4x2_t sq3[3], sq5[5]; + sq[0] = SquareLo8(s[0]); + sq[1] = SquareHi8(s[0]); + SumHorizontal(*s, sq, &s3[2], &s5[3], &sq3[2], &sq5[3]); + LoadAligned16x3U16(sum5, 0, s5); + s5[4] = s5[3]; + LoadAligned32x3U32(square_sum5, 0, sq5); + sq5[4] = sq5[3]; + CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5); + LoadAligned16x2U16(sum3, 0, s3); + LoadAligned32x2U32(square_sum3, 0, sq3); + CalculateIntermediate3<0>(s3, sq3, scales[1], ma3, b3); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow( + uint8x16_t s[2], const ptrdiff_t x, const uint16_t scales[2], + const uint16_t* const sum3[4], const uint16_t* const sum5[5], + const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5], + uint16x8_t sq[3], uint8x16_t ma3[2], uint8x16_t ma5[2], uint16x8_t b3[2], + uint16x8_t b5[2]) { + uint16x8_t s3[2][3], s5[2][5]; + uint32x4x2_t sq3[3], sq5[5]; + sq[1] = SquareLo8(s[1]); + SumHorizontal<8>(s, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]); + SumHorizontal(sq, &sq3[2], &sq5[3]); + LoadAligned16x3U16(sum5, x, s5[0]); + s5[0][4] = s5[0][3]; + LoadAligned32x3U32(square_sum5, x, sq5); + sq5[4] = sq5[3]; + CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], &b5[0]); + LoadAligned16x2U16(sum3, x, s3[0]); + LoadAligned32x2U32(square_sum3, x, sq3); + CalculateIntermediate3<8>(s3[0], sq3, scales[1], &ma3[0], &b3[0]); + + sq[2] = SquareHi8(s[1]); + SumHorizontal(sq + 1, &sq3[2], &sq5[3]); + LoadAligned16x3U16(sum5, x + 8, s5[1]); + s5[1][4] = s5[1][3]; + LoadAligned32x3U32(square_sum5, x + 8, sq5); + sq5[4] = sq5[3]; + CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], &b5[1]); + LoadAligned16x2U16(sum3, x + 8, s3[1]); + LoadAligned32x2U32(square_sum3, x + 8, sq3); + CalculateIntermediate3<0>(s3[1], sq3, scales[1], &ma3[1], &b3[1]); +} + +inline void BoxSumFilterPreProcess5(const uint8_t* const src0, + const uint8_t* const src1, const int width, + const uint32_t scale, + uint16_t* const sum5[5], + uint32_t* const square_sum5[5], + uint16_t* ma565, uint32_t* b565) { + const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass1 - width; + uint8x16_t s[2][2], mas[2]; + uint16x8_t sq[2][4], bs[3]; + // TODO(b/194217060): Future msan load. 
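+  // The two vld1q_u8() loads below may read past the end of a short row;
+  // that is what the msan TODO above refers to. A scalar model of the 5-tap
+  // horizontal sums this helper stores into |sum5| / |square_sum5| before
+  // the first ma565/b565 row pair is produced (illustrative only; |row|
+  // stands for either source row):
+  //
+  //   for (int x = 0; x < width; ++x) {
+  //     uint16_t sum = 0;
+  //     uint32_t sum_sq = 0;
+  //     for (int dx = -2; dx <= 2; ++dx) {
+  //       const uint8_t px = row[x + dx];
+  //       sum += px;
+  //       sum_sq += px * px;
+  //     }
+  //     sum5[3][x] = sum;
+  //     square_sum5[3][x] = sum_sq;
+  //   }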
+  s[0][0] = vld1q_u8(src0);
+  s[1][0] = vld1q_u8(src1);
+
+  BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], &bs[0]);
+
+  int x = 0;
+  do {
+    uint16x8_t ma[2];
+    uint8x16_t masx[3];
+    uint32x4x2_t b[2];
+    s[0][1] = Load1QMsanU8(src0 + x + 16, x + 16 + overread_in_bytes);
+    s[1][1] = Load1QMsanU8(src1 + x + 16, x + 16 + overread_in_bytes);
+    BoxFilterPreProcess5(s, x + 8, scale, sum5, square_sum5, sq, mas, bs + 1);
+    Prepare3_8<0>(mas, masx);
+    ma[0] = Sum565<0>(masx);
+    b[0] = Sum565W(bs);
+    vst1q_u16(ma565, ma[0]);
+    vst1q_u32(b565 + 0, b[0].val[0]);
+    vst1q_u32(b565 + 4, b[0].val[1]);
+
+    ma[1] = Sum565<8>(masx);
+    b[1] = Sum565W(bs + 1);
+    vst1q_u16(ma565 + 8, ma[1]);
+    vst1q_u32(b565 + 8, b[1].val[0]);
+    vst1q_u32(b565 + 12, b[1].val[1]);
+    s[0][0] = s[0][1];
+    s[1][0] = s[1][1];
+    sq[0][1] = sq[0][3];
+    sq[1][1] = sq[1][3];
+    mas[0] = mas[1];
+    bs[0] = bs[2];
+    ma565 += 16;
+    b565 += 16;
+    x += 16;
+  } while (x < width);
+}
+
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+    const uint8_t* const src, const int width, const uint32_t scale,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint16_t* ma343,
+    uint16_t* ma444, uint32_t* b343, uint32_t* b444) {
+  const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass2 - width;
+  uint8x16_t s[2], mas[2];
+  uint16x8_t sq[4], bs[3];
+  s[0] = Load1QMsanU8(src, overread_in_bytes);
+  BoxFilterPreProcess3Lo(&s[0], scale, sum3, square_sum3, sq, &mas[0], &bs[0]);
+
+  int x = 0;
+  do {
+    uint8x16_t ma3x[3];
+    s[1] = Load1QMsanU8(src + x + 16, x + 16 + overread_in_bytes);
+    BoxFilterPreProcess3(s, x + 8, scale, sum3, square_sum3, sq + 1, mas,
+                         bs + 1);
+    Prepare3_8<0>(mas, ma3x);
+    if (calculate444) {
+      Store343_444<0>(ma3x, bs + 0, 0, ma343, ma444, b343, b444);
+      Store343_444<8>(ma3x, bs + 1, 0, ma343 + 8, ma444 + 8, b343 + 8,
+                      b444 + 8);
+      ma444 += 16;
+      b444 += 16;
+    } else {
+      uint16x8_t ma[2];
+      uint32x4x2_t b[2];
+      ma[0] = Sum343<0>(ma3x);
+      b[0] = Sum343W(bs);
+      vst1q_u16(ma343, ma[0]);
+      vst1q_u32(b343 + 0, b[0].val[0]);
+      vst1q_u32(b343 + 4, b[0].val[1]);
+      ma[1] = Sum343<8>(ma3x);
+      b[1] = Sum343W(bs + 1);
+      vst1q_u16(ma343 + 8, ma[1]);
+      vst1q_u32(b343 + 8, b[1].val[0]);
+      vst1q_u32(b343 + 12, b[1].val[1]);
+    }
+    s[0] = s[1];
+    sq[1] = sq[3];
+    mas[0] = mas[1];
+    bs[0] = bs[2];
+    ma343 += 16;
+    b343 += 16;
+    x += 16;
+  } while (x < width);
+}
+
+inline void BoxSumFilterPreProcess(
+    const uint8_t* const src0, const uint8_t* const src1, const int width,
+    const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    uint16_t* const ma343[4], uint16_t* const ma444, uint16_t* ma565,
+    uint32_t* const b343[4], uint32_t* const b444, uint32_t* b565) {
+  const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass1 - width;
+  uint8x16_t s[2][2], ma3[2][2], ma5[2];
+  uint16x8_t sq[2][4], b3[2][3], b5[3];
+  // TODO(b/194217060): Future msan load.
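+  // Here both SGR passes are active, so each 16-pixel load feeds the 3-tap
+  // and the 5-tap pipeline at once: SumHorizontal() produces both window
+  // widths from one set of registers. In scalar terms (a sketch, not built):
+  //
+  //   sum3 = row[x - 1] + row[x] + row[x + 1];
+  //   sum5 = sum3 + row[x - 2] + row[x + 2];  // 5-tap reuses the middle taps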
+  s[0][0] = vld1q_u8(src0);
+  s[1][0] = vld1q_u8(src1);
+
+  BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+                        ma3, b3, &ma5[0], &b5[0]);
+
+  int x = 0;
+  do {
+    uint16x8_t ma[2];
+    uint8x16_t ma3x[3], ma5x[3];
+    uint32x4x2_t b[2];
+
+    s[0][1] = Load1QMsanU8(src0 + x + 16, x + 16 + overread_in_bytes);
+    s[1][1] = Load1QMsanU8(src1 + x + 16, x + 16 + overread_in_bytes);
+    BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+                        sq, ma3, b3, ma5, b5 + 1);
+    Prepare3_8<0>(ma3[0], ma3x);
+    ma[0] = Sum343<0>(ma3x);
+    ma[1] = Sum343<8>(ma3x);
+    StoreAligned32U16(ma343[0] + x, ma);
+    b[0] = Sum343W(b3[0] + 0);
+    b[1] = Sum343W(b3[0] + 1);
+    StoreAligned64U32(b343[0] + x, b);
+    Prepare3_8<0>(ma3[1], ma3x);
+    Store343_444<0>(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
+    Store343_444<8>(ma3x, b3[1] + 1, x + 8, ma343[1], ma444, b343[1], b444);
+    Prepare3_8<0>(ma5, ma5x);
+    ma[0] = Sum565<0>(ma5x);
+    ma[1] = Sum565<8>(ma5x);
+    StoreAligned32U16(ma565, ma);
+    b[0] = Sum565W(b5);
+    b[1] = Sum565W(b5 + 1);
+    StoreAligned64U32(b565, b);
+    s[0][0] = s[0][1];
+    s[1][0] = s[1][1];
+    sq[0][1] = sq[0][3];
+    sq[1][1] = sq[1][3];
+    ma3[0][0] = ma3[0][1];
+    ma3[1][0] = ma3[1][1];
+    b3[0][0] = b3[0][2];
+    b3[1][0] = b3[1][2];
+    ma5[0] = ma5[1];
+    b5[0] = b5[2];
+    ma565 += 16;
+    b565 += 16;
+    x += 16;
+  } while (x < width);
+}
+
+template <int shift>
+inline int16x4_t FilterOutput(const uint16x4_t src, const uint16x4_t ma,
+                              const uint32x4_t b) {
+  // ma: 255 * 32 = 8160 (13 bits)
+  // b: 65088 * 32 = 2082816 (21 bits)
+  // v: b - ma * 255 (22 bits)
+  const int32x4_t v = vreinterpretq_s32_u32(vmlsl_u16(b, ma, src));
+  // kSgrProjSgrBits = 8
+  // kSgrProjRestoreBits = 4
+  // shift = 4 or 5
+  // v >> 8 or 9 (13 bits)
+  return vrshrn_n_s32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <int shift>
+inline int16x8_t CalculateFilteredOutput(const uint8x8_t src,
+                                         const uint16x8_t ma,
+                                         const uint32x4x2_t b) {
+  const uint16x8_t src_u16 = vmovl_u8(src);
+  const int16x4_t dst_lo =
+      FilterOutput<shift>(vget_low_u16(src_u16), vget_low_u16(ma), b.val[0]);
+  const int16x4_t dst_hi =
+      FilterOutput<shift>(vget_high_u16(src_u16), vget_high_u16(ma), b.val[1]);
+  return vcombine_s16(dst_lo, dst_hi);  // 13 bits
+}
+
+inline int16x8_t CalculateFilteredOutputPass1(const uint8x8_t s,
+                                              uint16x8_t ma[2],
+                                              uint32x4x2_t b[2]) {
+  const uint16x8_t ma_sum = vaddq_u16(ma[0], ma[1]);
+  uint32x4x2_t b_sum;
+  b_sum.val[0] = vaddq_u32(b[0].val[0], b[1].val[0]);
+  b_sum.val[1] = vaddq_u32(b[0].val[1], b[1].val[1]);
+  return CalculateFilteredOutput<5>(s, ma_sum, b_sum);
+}
+
+inline int16x8_t CalculateFilteredOutputPass2(const uint8x8_t s,
+                                              uint16x8_t ma[3],
+                                              uint32x4x2_t b[3]) {
+  const uint16x8_t ma_sum = Sum3_16(ma);
+  const uint32x4x2_t b_sum = Sum3_32(b);
+  return CalculateFilteredOutput<5>(s, ma_sum, b_sum);
+}
+
+inline uint8x8_t SelfGuidedFinal(const uint8x8_t src, const int32x4_t v[2]) {
+  const int16x4_t v_lo =
+      vrshrn_n_s32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  const int16x4_t v_hi =
+      vrshrn_n_s32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  const int16x8_t vv = vcombine_s16(v_lo, v_hi);
+  const int16x8_t d =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(vv), src));
+  return vqmovun_s16(d);
+}
+
+inline uint8x8_t SelfGuidedDoubleMultiplier(const uint8x8_t src,
+                                            const int16x8_t filter[2],
+                                            const int w0, const int w2) {
+  int32x4_t v[2];
+  v[0] = vmull_n_s16(vget_low_s16(filter[0]), w0);
+  v[1] = vmull_n_s16(vget_high_s16(filter[0]), w0);
+  v[0] =
vmlal_n_s16(v[0], vget_low_s16(filter[1]), w2); + v[1] = vmlal_n_s16(v[1], vget_high_s16(filter[1]), w2); + return SelfGuidedFinal(src, v); +} + +inline uint8x8_t SelfGuidedSingleMultiplier(const uint8x8_t src, + const int16x8_t filter, + const int w0) { + // weight: -96 to 96 (Sgrproj_Xqd_Min/Max) + int32x4_t v[2]; + v[0] = vmull_n_s16(vget_low_s16(filter), w0); + v[1] = vmull_n_s16(vget_high_s16(filter), w0); + return SelfGuidedFinal(src, v); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPass1( + const uint8_t* const src, const uint8_t* const src0, + const uint8_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5], + uint32_t* const square_sum5[5], const int width, const uint32_t scale, + const int16_t w0, uint16_t* const ma565[2], uint32_t* const b565[2], + uint8_t* const dst) { + const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass1 - width; + uint8x16_t s[2][2], mas[2]; + uint16x8_t sq[2][4], bs[3]; + s[0][0] = Load1QMsanU8(src0, overread_in_bytes); + s[1][0] = Load1QMsanU8(src1, overread_in_bytes); + + BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], &bs[0]); + + int x = 0; + do { + uint16x8_t ma[2]; + uint8x16_t masx[3]; + uint32x4x2_t b[2]; + int16x8_t p0, p1; + s[0][1] = Load1QMsanU8(src0 + x + 16, x + 16 + overread_in_bytes); + s[1][1] = Load1QMsanU8(src1 + x + 16, x + 16 + overread_in_bytes); + BoxFilterPreProcess5(s, x + 8, scale, sum5, square_sum5, sq, mas, bs + 1); + Prepare3_8<0>(mas, masx); + ma[1] = Sum565<0>(masx); + b[1] = Sum565W(bs); + vst1q_u16(ma565[1] + x, ma[1]); + vst1q_u32(b565[1] + x + 0, b[1].val[0]); + vst1q_u32(b565[1] + x + 4, b[1].val[1]); + const uint8x16_t sr0 = vld1q_u8(src + x); + const uint8x16_t sr1 = vld1q_u8(src + stride + x); + const uint8x8_t sr00 = vget_low_u8(sr0); + const uint8x8_t sr10 = vget_low_u8(sr1); + ma[0] = vld1q_u16(ma565[0] + x); + b[0].val[0] = vld1q_u32(b565[0] + x + 0); + b[0].val[1] = vld1q_u32(b565[0] + x + 4); + p0 = CalculateFilteredOutputPass1(sr00, ma, b); + p1 = CalculateFilteredOutput<4>(sr10, ma[1], b[1]); + const uint8x8_t d00 = SelfGuidedSingleMultiplier(sr00, p0, w0); + const uint8x8_t d10 = SelfGuidedSingleMultiplier(sr10, p1, w0); + + ma[1] = Sum565<8>(masx); + b[1] = Sum565W(bs + 1); + vst1q_u16(ma565[1] + x + 8, ma[1]); + vst1q_u32(b565[1] + x + 8, b[1].val[0]); + vst1q_u32(b565[1] + x + 12, b[1].val[1]); + const uint8x8_t sr01 = vget_high_u8(sr0); + const uint8x8_t sr11 = vget_high_u8(sr1); + ma[0] = vld1q_u16(ma565[0] + x + 8); + b[0].val[0] = vld1q_u32(b565[0] + x + 8); + b[0].val[1] = vld1q_u32(b565[0] + x + 12); + p0 = CalculateFilteredOutputPass1(sr01, ma, b); + p1 = CalculateFilteredOutput<4>(sr11, ma[1], b[1]); + const uint8x8_t d01 = SelfGuidedSingleMultiplier(sr01, p0, w0); + const uint8x8_t d11 = SelfGuidedSingleMultiplier(sr11, p1, w0); + vst1q_u8(dst + x, vcombine_u8(d00, d01)); + vst1q_u8(dst + stride + x, vcombine_u8(d10, d11)); + s[0][0] = s[0][1]; + s[1][0] = s[1][1]; + sq[0][1] = sq[0][3]; + sq[1][1] = sq[1][3]; + mas[0] = mas[1]; + bs[0] = bs[2]; + x += 16; + } while (x < width); +} + +inline void BoxFilterPass1LastRow(const uint8_t* const src, + const uint8_t* const src0, const int width, + const uint32_t scale, const int16_t w0, + uint16_t* const sum5[5], + uint32_t* const square_sum5[5], + uint16_t* ma565, uint32_t* b565, + uint8_t* const dst) { + uint8x16_t s[2], mas[2]; + uint16x8_t sq[4], bs[4]; + // TODO(b/194217060): Future msan load. 
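+  // On the last row of an odd-height unit there is no row below, so the
+  // LastRow preprocess helpers duplicate the bottom row of sums
+  // (s5[4] = s5[3], sq5[4] = sq5[3]) rather than loading a new one. The
+  // vertical 5-row sum then degenerates to (scalar sketch):
+  //
+  //   a = sum5[0][x] + sum5[1][x] + sum5[2][x] + 2 * sum5[3][x];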
+ s[0] = vld1q_u8(src0); + + BoxFilterPreProcess5LastRowLo(s, scale, sum5, square_sum5, sq, &mas[0], + &bs[0]); + + int x = 0; + do { + uint16x8_t ma[2]; + uint8x16_t masx[3]; + uint32x4x2_t b[2]; + // TODO(b/194217060): Future msan load. + s[1] = vld1q_u8(src0 + x + 16); + + BoxFilterPreProcess5LastRow(s, x + 8, scale, sum5, square_sum5, sq + 1, mas, + bs + 1); + Prepare3_8<0>(mas, masx); + ma[1] = Sum565<0>(masx); + b[1] = Sum565W(bs); + ma[0] = vld1q_u16(ma565); + b[0].val[0] = vld1q_u32(b565 + 0); + b[0].val[1] = vld1q_u32(b565 + 4); + const uint8x16_t sr = vld1q_u8(src + x); + const uint8x8_t sr0 = vget_low_u8(sr); + const int16x8_t p0 = CalculateFilteredOutputPass1(sr0, ma, b); + const uint8x8_t d0 = SelfGuidedSingleMultiplier(sr0, p0, w0); + + ma[1] = Sum565<8>(masx); + b[1] = Sum565W(bs + 1); + bs[0] = bs[2]; + const uint8x8_t sr1 = vget_high_u8(sr); + ma[0] = vld1q_u16(ma565 + 8); + b[0].val[0] = vld1q_u32(b565 + 8); + b[0].val[1] = vld1q_u32(b565 + 12); + const int16x8_t p1 = CalculateFilteredOutputPass1(sr1, ma, b); + const uint8x8_t d1 = SelfGuidedSingleMultiplier(sr1, p1, w0); + vst1q_u8(dst + x, vcombine_u8(d0, d1)); + s[0] = s[1]; + sq[1] = sq[3]; + mas[0] = mas[1]; + ma565 += 16; + b565 += 16; + x += 16; + } while (x < width); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPass2( + const uint8_t* const src, const uint8_t* const src0, const int width, + const uint32_t scale, const int16_t w0, uint16_t* const sum3[3], + uint32_t* const square_sum3[3], uint16_t* const ma343[3], + uint16_t* const ma444[2], uint32_t* const b343[3], uint32_t* const b444[2], + uint8_t* const dst) { + const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass2 - width; + uint8x16_t s[2], mas[2]; + uint16x8_t sq[4], bs[3]; + // TODO(b/194217060): Future msan load. 
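+  // Pass 2 blends three rows of 3x3 intermediates with 3-4-3 vertical
+  // weighting: two |ma343|/|b343| rows plus the middle |ma444|/|b444| row.
+  // Per pixel this reduces to (scalar sketch of CalculateFilteredOutputPass2
+  // and FilterOutput<5>, with kSgrProjSgrBits = 8, kSgrProjRestoreBits = 4):
+  //
+  //   ma_sum = ma343_prev[x] + ma444_prev[x] + ma343_new[x];
+  //   b_sum = b343_prev[x] + b444_prev[x] + b343_new[x];
+  //   p = RightShiftWithRounding(b_sum - ma_sum * src[x], 8 + 5 - 4);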
+ s[0] = vld1q_u8(src0); + + BoxFilterPreProcess3Lo(&s[0], scale, sum3, square_sum3, sq, &mas[0], &bs[0]); + + int x = 0; + do { + uint16x8_t ma[3]; + uint8x16_t ma3x[3]; + uint32x4x2_t b[3]; + s[1] = Load1QMsanU8(src0 + x + 16, x + 16 + overread_in_bytes); + BoxFilterPreProcess3(s, x + 8, scale, sum3, square_sum3, sq + 1, mas, + bs + 1); + Prepare3_8<0>(mas, ma3x); + Store343_444<0>(ma3x, bs, x, &ma[2], &b[2], ma343[2], ma444[1], b343[2], + b444[1]); + const uint8x16_t sr = vld1q_u8(src + x); + const uint8x8_t sr0 = vget_low_u8(sr); + ma[0] = vld1q_u16(ma343[0] + x); + ma[1] = vld1q_u16(ma444[0] + x); + b[0].val[0] = vld1q_u32(b343[0] + x + 0); + b[0].val[1] = vld1q_u32(b343[0] + x + 4); + b[1].val[0] = vld1q_u32(b444[0] + x + 0); + b[1].val[1] = vld1q_u32(b444[0] + x + 4); + const int16x8_t p0 = CalculateFilteredOutputPass2(sr0, ma, b); + const uint8x8_t d0 = SelfGuidedSingleMultiplier(sr0, p0, w0); + + Store343_444<8>(ma3x, bs + 1, x + 8, &ma[2], &b[2], ma343[2], ma444[1], + b343[2], b444[1]); + const uint8x8_t sr1 = vget_high_u8(sr); + ma[0] = vld1q_u16(ma343[0] + x + 8); + ma[1] = vld1q_u16(ma444[0] + x + 8); + b[0].val[0] = vld1q_u32(b343[0] + x + 8); + b[0].val[1] = vld1q_u32(b343[0] + x + 12); + b[1].val[0] = vld1q_u32(b444[0] + x + 8); + b[1].val[1] = vld1q_u32(b444[0] + x + 12); + const int16x8_t p1 = CalculateFilteredOutputPass2(sr1, ma, b); + const uint8x8_t d1 = SelfGuidedSingleMultiplier(sr1, p1, w0); + vst1q_u8(dst + x, vcombine_u8(d0, d1)); + s[0] = s[1]; + sq[1] = sq[3]; + mas[0] = mas[1]; + bs[0] = bs[2]; + x += 16; + } while (x < width); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilter( + const uint8_t* const src, const uint8_t* const src0, + const uint8_t* const src1, const ptrdiff_t stride, const int width, + const uint16_t scales[2], const int16_t w0, const int16_t w2, + uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + uint16_t* const ma343[4], uint16_t* const ma444[3], + uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3], + uint32_t* const b565[2], uint8_t* const dst) { + const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass1 - width; + uint8x16_t s[2][2], ma3[2][2], ma5[2]; + uint16x8_t sq[2][4], b3[2][3], b5[3]; + // TODO(b/194217060): Future msan load. 
+ s[0][0] = vld1q_u8(src0); + s[1][0] = vld1q_u8(src1); + + BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq, + ma3, b3, &ma5[0], &b5[0]); + + int x = 0; + do { + uint16x8_t ma[3][3]; + uint8x16_t ma3x[2][3], ma5x[3]; + uint32x4x2_t b[3][3]; + int16x8_t p[2][2]; + s[0][1] = Load1QMsanU8(src0 + x + 16, x + 16 + overread_in_bytes); + s[1][1] = Load1QMsanU8(src1 + x + 16, x + 16 + overread_in_bytes); + BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5, + sq, ma3, b3, ma5, b5 + 1); + Prepare3_8<0>(ma3[0], ma3x[0]); + Prepare3_8<0>(ma3[1], ma3x[1]); + Store343_444<0>(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], &b[1][2], &b[2][1], + ma343[2], ma444[1], b343[2], b444[1]); + Store343_444<0>(ma3x[1], b3[1], x, &ma[2][2], &b[2][2], ma343[3], ma444[2], + b343[3], b444[2]); + Prepare3_8<0>(ma5, ma5x); + ma[0][1] = Sum565<0>(ma5x); + b[0][1] = Sum565W(b5); + vst1q_u16(ma565[1] + x, ma[0][1]); + vst1q_u32(b565[1] + x, b[0][1].val[0]); + vst1q_u32(b565[1] + x + 4, b[0][1].val[1]); + const uint8x16_t sr0 = vld1q_u8(src + x); + const uint8x16_t sr1 = vld1q_u8(src + stride + x); + const uint8x8_t sr00 = vget_low_u8(sr0); + const uint8x8_t sr10 = vget_low_u8(sr1); + ma[0][0] = vld1q_u16(ma565[0] + x); + b[0][0].val[0] = vld1q_u32(b565[0] + x); + b[0][0].val[1] = vld1q_u32(b565[0] + x + 4); + p[0][0] = CalculateFilteredOutputPass1(sr00, ma[0], b[0]); + p[1][0] = CalculateFilteredOutput<4>(sr10, ma[0][1], b[0][1]); + ma[1][0] = vld1q_u16(ma343[0] + x); + ma[1][1] = vld1q_u16(ma444[0] + x); + b[1][0].val[0] = vld1q_u32(b343[0] + x); + b[1][0].val[1] = vld1q_u32(b343[0] + x + 4); + b[1][1].val[0] = vld1q_u32(b444[0] + x); + b[1][1].val[1] = vld1q_u32(b444[0] + x + 4); + p[0][1] = CalculateFilteredOutputPass2(sr00, ma[1], b[1]); + ma[2][0] = vld1q_u16(ma343[1] + x); + b[2][0].val[0] = vld1q_u32(b343[1] + x); + b[2][0].val[1] = vld1q_u32(b343[1] + x + 4); + p[1][1] = CalculateFilteredOutputPass2(sr10, ma[2], b[2]); + const uint8x8_t d00 = SelfGuidedDoubleMultiplier(sr00, p[0], w0, w2); + const uint8x8_t d10 = SelfGuidedDoubleMultiplier(sr10, p[1], w0, w2); + + Store343_444<8>(ma3x[0], b3[0] + 1, x + 8, &ma[1][2], &ma[2][1], &b[1][2], + &b[2][1], ma343[2], ma444[1], b343[2], b444[1]); + Store343_444<8>(ma3x[1], b3[1] + 1, x + 8, &ma[2][2], &b[2][2], ma343[3], + ma444[2], b343[3], b444[2]); + ma[0][1] = Sum565<8>(ma5x); + b[0][1] = Sum565W(b5 + 1); + vst1q_u16(ma565[1] + x + 8, ma[0][1]); + vst1q_u32(b565[1] + x + 8, b[0][1].val[0]); + vst1q_u32(b565[1] + x + 12, b[0][1].val[1]); + b3[0][0] = b3[0][2]; + b3[1][0] = b3[1][2]; + b5[0] = b5[2]; + const uint8x8_t sr01 = vget_high_u8(sr0); + const uint8x8_t sr11 = vget_high_u8(sr1); + ma[0][0] = vld1q_u16(ma565[0] + x + 8); + b[0][0].val[0] = vld1q_u32(b565[0] + x + 8); + b[0][0].val[1] = vld1q_u32(b565[0] + x + 12); + p[0][0] = CalculateFilteredOutputPass1(sr01, ma[0], b[0]); + p[1][0] = CalculateFilteredOutput<4>(sr11, ma[0][1], b[0][1]); + ma[1][0] = vld1q_u16(ma343[0] + x + 8); + ma[1][1] = vld1q_u16(ma444[0] + x + 8); + b[1][0].val[0] = vld1q_u32(b343[0] + x + 8); + b[1][0].val[1] = vld1q_u32(b343[0] + x + 12); + b[1][1].val[0] = vld1q_u32(b444[0] + x + 8); + b[1][1].val[1] = vld1q_u32(b444[0] + x + 12); + p[0][1] = CalculateFilteredOutputPass2(sr01, ma[1], b[1]); + ma[2][0] = vld1q_u16(ma343[1] + x + 8); + b[2][0].val[0] = vld1q_u32(b343[1] + x + 8); + b[2][0].val[1] = vld1q_u32(b343[1] + x + 12); + p[1][1] = CalculateFilteredOutputPass2(sr11, ma[2], b[2]); + const uint8x8_t d01 = SelfGuidedDoubleMultiplier(sr01, p[0], 
        w0, w2);
+    const uint8x8_t d11 = SelfGuidedDoubleMultiplier(sr11, p[1], w0, w2);
+    vst1q_u8(dst + x, vcombine_u8(d00, d01));
+    vst1q_u8(dst + stride + x, vcombine_u8(d10, d11));
+    s[0][0] = s[0][1];
+    s[1][0] = s[1][1];
+    sq[0][1] = sq[0][3];
+    sq[1][1] = sq[1][3];
+    ma3[0][0] = ma3[0][1];
+    ma3[1][0] = ma3[1][1];
+    ma5[0] = ma5[1];
+    x += 16;
+  } while (x < width);
+}
+
+inline void BoxFilterLastRow(
+    const uint8_t* const src, const uint8_t* const src0, const int width,
+    const uint16_t scales[2], const int16_t w0, const int16_t w2,
+    uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565,
+    uint32_t* const b343, uint32_t* const b444, uint32_t* const b565,
+    uint8_t* const dst) {
+  uint8x16_t s[2], ma3[2], ma5[2];
+  uint16x8_t sq[4], ma[3], b3[3], b5[3];
+  uint32x4x2_t b[3];
+  // TODO(b/194217060): Future msan load.
+  s[0] = vld1q_u8(src0);
+
+  BoxFilterPreProcessLastRowLo(s, scales, sum3, sum5, square_sum3, square_sum5,
+                               sq, &ma3[0], &ma5[0], &b3[0], &b5[0]);
+
+  int x = 0;
+  do {
+    uint8x16_t ma3x[3], ma5x[3];
+    int16x8_t p[2];
+    // TODO(b/194217060): Future msan load.
+    s[1] = vld1q_u8(src0 + x + 16);
+
+    BoxFilterPreProcessLastRow(s, x + 8, scales, sum3, sum5, square_sum3,
+                               square_sum5, sq + 1, ma3, ma5, &b3[1], &b5[1]);
+    Prepare3_8<0>(ma5, ma5x);
+    ma[1] = Sum565<0>(ma5x);
+    b[1] = Sum565W(b5);
+    Prepare3_8<0>(ma3, ma3x);
+    ma[2] = Sum343<0>(ma3x);
+    b[2] = Sum343W(b3);
+    const uint8x16_t sr = vld1q_u8(src + x);
+    const uint8x8_t sr0 = vget_low_u8(sr);
+    ma[0] = vld1q_u16(ma565 + x);
+    b[0].val[0] = vld1q_u32(b565 + x + 0);
+    b[0].val[1] = vld1q_u32(b565 + x + 4);
+    p[0] = CalculateFilteredOutputPass1(sr0, ma, b);
+    ma[0] = vld1q_u16(ma343 + x);
+    ma[1] = vld1q_u16(ma444 + x);
+    b[0].val[0] = vld1q_u32(b343 + x + 0);
+    b[0].val[1] = vld1q_u32(b343 + x + 4);
+    b[1].val[0] = vld1q_u32(b444 + x + 0);
+    b[1].val[1] = vld1q_u32(b444 + x + 4);
+    p[1] = CalculateFilteredOutputPass2(sr0, ma, b);
+    const uint8x8_t d0 = SelfGuidedDoubleMultiplier(sr0, p, w0, w2);
+
+    ma[1] = Sum565<8>(ma5x);
+    b[1] = Sum565W(b5 + 1);
+    b5[0] = b5[2];
+    ma[2] = Sum343<8>(ma3x);
+    b[2] = Sum343W(b3 + 1);
+    b3[0] = b3[2];
+    const uint8x8_t sr1 = vget_high_u8(sr);
+    ma[0] = vld1q_u16(ma565 + x + 8);
+    b[0].val[0] = vld1q_u32(b565 + x + 8);
+    b[0].val[1] = vld1q_u32(b565 + x + 12);
+    p[0] = CalculateFilteredOutputPass1(sr1, ma, b);
+    ma[0] = vld1q_u16(ma343 + x + 8);
+    ma[1] = vld1q_u16(ma444 + x + 8);
+    b[0].val[0] = vld1q_u32(b343 + x + 8);
+    b[0].val[1] = vld1q_u32(b343 + x + 12);
+    b[1].val[0] = vld1q_u32(b444 + x + 8);
+    b[1].val[1] = vld1q_u32(b444 + x + 12);
+    p[1] = CalculateFilteredOutputPass2(sr1, ma, b);
+    const uint8x8_t d1 = SelfGuidedDoubleMultiplier(sr1, p, w0, w2);
+    vst1q_u8(dst + x, vcombine_u8(d0, d1));
+    s[0] = s[1];
+    sq[1] = sq[3];
+    ma3[0] = ma3[1];
+    ma5[0] = ma5[1];
+    x += 16;
+  } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
+    const RestorationUnitInfo& restoration_info, const uint8_t* src,
+    const ptrdiff_t stride, const uint8_t* const top_border,
+    const ptrdiff_t top_border_stride, const uint8_t* bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    SgrBuffer* const sgr_buffer, uint8_t* dst) {
+  const auto temp_stride = Align<ptrdiff_t>(width, 16);
+  const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+  const ptrdiff_t sum_stride = temp_stride + 8;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index];  // < 2^12.
+  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+  const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+  uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+  uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
+  sum3[0] = sgr_buffer->sum3;
+  square_sum3[0] = sgr_buffer->square_sum3;
+  ma343[0] = sgr_buffer->ma343;
+  b343[0] = sgr_buffer->b343;
+  for (int i = 1; i <= 3; ++i) {
+    sum3[i] = sum3[i - 1] + sum_stride;
+    square_sum3[i] = square_sum3[i - 1] + sum_stride;
+    ma343[i] = ma343[i - 1] + temp_stride;
+    b343[i] = b343[i - 1] + temp_stride;
+  }
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+  for (int i = 1; i <= 4; ++i) {
+    sum5[i] = sum5[i - 1] + sum_stride;
+    square_sum5[i] = square_sum5[i - 1] + sum_stride;
+  }
+  ma444[0] = sgr_buffer->ma444;
+  b444[0] = sgr_buffer->b444;
+  for (int i = 1; i <= 2; ++i) {
+    ma444[i] = ma444[i - 1] + temp_stride;
+    b444[i] = b444[i - 1] + temp_stride;
+  }
+  ma565[0] = sgr_buffer->ma565;
+  ma565[1] = ma565[0] + temp_stride;
+  b565[0] = sgr_buffer->b565;
+  b565[1] = b565[0] + temp_stride;
+  assert(scales[0] != 0);
+  assert(scales[1] != 0);
+  BoxSum(top_border, top_border_stride, width, sum_stride, sum_width, sum3[0],
+         sum5[1], square_sum3[0], square_sum5[1]);
+  sum5[0] = sum5[1];
+  square_sum5[0] = square_sum5[1];
+  const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+  BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
+                         square_sum5, ma343, ma444[0], ma565[0], b343, b444[0],
+                         b565[0]);
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+
+  for (int y = (height >> 1) - 1; y > 0; --y) {
+    Circulate4PointersBy2<uint16_t>(sum3);
+    Circulate4PointersBy2<uint32_t>(square_sum3);
+    Circulate5PointersBy2<uint16_t>(sum5);
+    Circulate5PointersBy2<uint32_t>(square_sum5);
+    BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
+              scales, w0, w2, sum3, sum5, square_sum3, square_sum5, ma343,
+              ma444, ma565, b343, b444, b565, dst);
+    src += 2 * stride;
+    dst += 2 * stride;
+    Circulate4PointersBy2<uint16_t>(ma343);
+    Circulate4PointersBy2<uint32_t>(b343);
+    std::swap(ma444[0], ma444[2]);
+    std::swap(b444[0], b444[2]);
+    std::swap(ma565[0], ma565[1]);
+    std::swap(b565[0], b565[1]);
+  }
+
+  Circulate4PointersBy2<uint16_t>(sum3);
+  Circulate4PointersBy2<uint32_t>(square_sum3);
+  Circulate5PointersBy2<uint16_t>(sum5);
+  Circulate5PointersBy2<uint32_t>(square_sum5);
+  if ((height & 1) == 0 || height > 1) {
+    const uint8_t* sr[2];
+    if ((height & 1) == 0) {
+      sr[0] = bottom_border;
+      sr[1] = bottom_border + bottom_border_stride;
+    } else {
+      sr[0] = src + 2 * stride;
+      sr[1] = bottom_border;
+    }
+    BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
+              square_sum3, square_sum5, ma343, ma444, ma565, b343, b444, b565,
+              dst);
+  }
+  if ((height & 1) != 0) {
+    if (height > 1) {
+      src += 2 * stride;
+      dst += 2 * stride;
+      Circulate4PointersBy2<uint16_t>(sum3);
+      Circulate4PointersBy2<uint32_t>(square_sum3);
+      Circulate5PointersBy2<uint16_t>(sum5);
+      Circulate5PointersBy2<uint32_t>(square_sum5);
+      Circulate4PointersBy2<uint16_t>(ma343);
+      Circulate4PointersBy2<uint32_t>(b343);
+      std::swap(ma444[0], ma444[2]);
+      std::swap(b444[0], b444[2]);
+      std::swap(ma565[0], ma565[1]);
+      std::swap(b565[0], b565[1]);
+    }
+    BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width,
+                     scales, w0, w2, sum3, sum5, square_sum3, square_sum5,
+                     ma343[0], ma444[0], ma565[0], b343[0], b444[0], b565[0],
+                     dst);
+  }
+}
+
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+                                  const uint8_t* src, const ptrdiff_t stride,
+                                  const uint8_t* const top_border,
+                                  const ptrdiff_t top_border_stride,
+                                  const uint8_t* bottom_border,
+                                  const ptrdiff_t bottom_border_stride,
+                                  const int width, const int height,
+                                  SgrBuffer* const sgr_buffer, uint8_t* dst) {
+  const auto temp_stride = Align<ptrdiff_t>(width, 16);
+  const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+  const ptrdiff_t sum_stride = temp_stride + 8;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0];  // < 2^12.
+  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+  uint16_t *sum5[5], *ma565[2];
+  uint32_t *square_sum5[5], *b565[2];
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+  for (int i = 1; i <= 4; ++i) {
+    sum5[i] = sum5[i - 1] + sum_stride;
+    square_sum5[i] = square_sum5[i - 1] + sum_stride;
+  }
+  ma565[0] = sgr_buffer->ma565;
+  ma565[1] = ma565[0] + temp_stride;
+  b565[0] = sgr_buffer->b565;
+  b565[1] = b565[0] + temp_stride;
+  assert(scale != 0);
+  BoxSum<5>(top_border, top_border_stride, width, sum_stride, sum_width,
+            sum5[1], square_sum5[1]);
+  sum5[0] = sum5[1];
+  square_sum5[0] = square_sum5[1];
+  const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+  BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, ma565[0],
+                          b565[0]);
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+
+  for (int y = (height >> 1) - 1; y > 0; --y) {
+    Circulate5PointersBy2<uint16_t>(sum5);
+    Circulate5PointersBy2<uint32_t>(square_sum5);
+    BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
+                   square_sum5, width, scale, w0, ma565, b565, dst);
+    src += 2 * stride;
+    dst += 2 * stride;
+    std::swap(ma565[0], ma565[1]);
+    std::swap(b565[0], b565[1]);
+  }
+
+  Circulate5PointersBy2<uint16_t>(sum5);
+  Circulate5PointersBy2<uint32_t>(square_sum5);
+  if ((height & 1) == 0 || height > 1) {
+    const uint8_t* sr[2];
+    if ((height & 1) == 0) {
+      sr[0] = bottom_border;
+      sr[1] = bottom_border + bottom_border_stride;
+    } else {
+      sr[0] = src + 2 * stride;
+      sr[1] = bottom_border;
+    }
+    BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
+                   scale, w0, ma565, b565, dst);
+  }
+  if ((height & 1) != 0) {
+    if (height > 1) {
+      src += 2 * stride;
+      dst += 2 * stride;
+      std::swap(ma565[0], ma565[1]);
+      std::swap(b565[0], b565[1]);
+      Circulate5PointersBy2<uint16_t>(sum5);
+      Circulate5PointersBy2<uint32_t>(square_sum5);
+    }
+    BoxFilterPass1LastRow(src + 3, bottom_border + bottom_border_stride, width,
+                          scale, w0, sum5, square_sum5, ma565[0], b565[0],
+                          dst);
+  }
+}
+
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+                                  const uint8_t* src, const ptrdiff_t stride,
+                                  const uint8_t* const top_border,
+                                  const ptrdiff_t top_border_stride,
+                                  const uint8_t* bottom_border,
+                                  const ptrdiff_t bottom_border_stride,
+                                  const int width, const int height,
+                                  SgrBuffer* const sgr_buffer, uint8_t* dst) {
+  assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+  const auto temp_stride = Align<ptrdiff_t>(width, 16);
+  const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+  const ptrdiff_t sum_stride = temp_stride + 8;
+  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+  const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1];  // < 2^12.
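+  // |sgr_buffer| is carved into small ring buffers below: three rows each of
+  // 3-tap sums, squared sums and 343 intermediates, plus two rows of 444
+  // intermediates. Rows are never copied; the Circulate3PointersBy1() and
+  // std::swap() calls in the loops rotate the pointers instead, e.g.:
+  //
+  //   T* const p0 = p[0];
+  //   p[0] = p[1];
+  //   p[1] = p[2];
+  //   p[2] = p0;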
+  uint16_t *sum3[3], *ma343[3], *ma444[2];
+  uint32_t *square_sum3[3], *b343[3], *b444[2];
+  sum3[0] = sgr_buffer->sum3;
+  square_sum3[0] = sgr_buffer->square_sum3;
+  ma343[0] = sgr_buffer->ma343;
+  b343[0] = sgr_buffer->b343;
+  for (int i = 1; i <= 2; ++i) {
+    sum3[i] = sum3[i - 1] + sum_stride;
+    square_sum3[i] = square_sum3[i - 1] + sum_stride;
+    ma343[i] = ma343[i - 1] + temp_stride;
+    b343[i] = b343[i - 1] + temp_stride;
+  }
+  ma444[0] = sgr_buffer->ma444;
+  ma444[1] = ma444[0] + temp_stride;
+  b444[0] = sgr_buffer->b444;
+  b444[1] = b444[0] + temp_stride;
+  assert(scale != 0);
+  BoxSum<3>(top_border, top_border_stride, width, sum_stride, sum_width,
+            sum3[0], square_sum3[0]);
+  BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
+                                 ma343[0], nullptr, b343[0], nullptr);
+  Circulate3PointersBy1<uint16_t>(sum3);
+  Circulate3PointersBy1<uint32_t>(square_sum3);
+  const uint8_t* s;
+  if (height > 1) {
+    s = src + stride;
+  } else {
+    s = bottom_border;
+    bottom_border += bottom_border_stride;
+  }
+  BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, ma343[1],
+                                ma444[0], b343[1], b444[0]);
+
+  for (int y = height - 2; y > 0; --y) {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2(src + 2, src + 2 * stride, width, scale, w0, sum3,
+                   square_sum3, ma343, ma444, b343, b444, dst);
+    src += stride;
+    dst += stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  }
+
+  src += 2;
+  int y = std::min(height, 2);
+  do {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2(src, bottom_border, width, scale, w0, sum3, square_sum3,
+                   ma343, ma444, b343, b444, dst);
+    src += stride;
+    dst += stride;
+    bottom_border += bottom_border_stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  } while (--y != 0);
+}
+
+// If |width| is not a multiple of 8, up to 7 more pixels are written to |dest|
+// at the end of each row. It is safe to overwrite the output as it will not
+// be part of the visible frame.
+void SelfGuidedFilter_NEON(
+    const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+    const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_border,
+    const ptrdiff_t top_border_stride,
+    const void* LIBGAV1_RESTRICT const bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+    void* LIBGAV1_RESTRICT const dest) {
+  const int index = restoration_info.sgr_proj_info.index;
+  const int radius_pass_0 = kSgrProjParams[index][0];  // 2 or 0
+  const int radius_pass_1 = kSgrProjParams[index][2];  // 1 or 0
+  const auto* const src = static_cast<const uint8_t*>(source);
+  const auto* top = static_cast<const uint8_t*>(top_border);
+  const auto* bottom = static_cast<const uint8_t*>(bottom_border);
+  auto* const dst = static_cast<uint8_t*>(dest);
+  SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+
+#if LIBGAV1_MSAN
+  // Initialize to prevent msan warnings when intermediate overreads occur.
+  memset(sgr_buffer, 0, sizeof(SgrBuffer));
+#endif
+
+  if (radius_pass_1 == 0) {
+    // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+    // following assertion.
+ assert(radius_pass_0 != 0); + BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3, + top_border_stride, bottom - 3, bottom_border_stride, + width, height, sgr_buffer, dst); + } else if (radius_pass_0 == 0) { + BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2, + top_border_stride, bottom - 2, bottom_border_stride, + width, height, sgr_buffer, dst); + } else { + BoxFilterProcess(restoration_info, src - 3, stride, top - 3, + top_border_stride, bottom - 3, bottom_border_stride, width, + height, sgr_buffer, dst); + } +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + dsp->loop_restorations[0] = WienerFilter_NEON; + dsp->loop_restorations[1] = SelfGuidedFilter_NEON; +} + +} // namespace +} // namespace low_bitdepth + +void LoopRestorationInit_NEON() { low_bitdepth::Init8bpp(); } + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_ENABLE_NEON +namespace libgav1 { +namespace dsp { + +void LoopRestorationInit_NEON() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_ENABLE_NEON diff --git a/src/dsp/arm/loop_restoration_neon.h b/src/dsp/arm/loop_restoration_neon.h new file mode 100644 index 0000000..b9a4803 --- /dev/null +++ b/src/dsp/arm/loop_restoration_neon.h @@ -0,0 +1,44 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_ARM_LOOP_RESTORATION_NEON_H_ +#define LIBGAV1_SRC_DSP_ARM_LOOP_RESTORATION_NEON_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::loop_restorations, see the defines below for specifics. +// This function is not thread-safe. +void LoopRestorationInit_NEON(); +void LoopRestorationInit10bpp_NEON(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_ENABLE_NEON + +#define LIBGAV1_Dsp8bpp_WienerFilter LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_SelfGuidedFilter LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp10bpp_WienerFilter LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_SelfGuidedFilter LIBGAV1_CPU_NEON + +#endif // LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_SRC_DSP_ARM_LOOP_RESTORATION_NEON_H_ diff --git a/src/dsp/arm/mask_blend_neon.cc b/src/dsp/arm/mask_blend_neon.cc new file mode 100644 index 0000000..853f949 --- /dev/null +++ b/src/dsp/arm/mask_blend_neon.cc @@ -0,0 +1,734 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
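+//
+// Overview: a mask blend forms each output pixel as a weighted average of
+// two predictors, with a per-pixel weight m in [0, 64]:
+//
+//   dst[x] = RightShiftWithRounding(
+//       m * prediction_0[x] + (64 - m) * prediction_1[x], 6);
+//
+// The compound kernels below additionally apply the inter-post rounding
+// shift of 4 when narrowing the int16_t inputs back to 8-bit pixels.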
+
+#include "src/dsp/mask_blend.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// TODO(b/150461164): Consider combining with GetInterIntraMask4x2().
+// Compound predictors use int16_t values and need to multiply long because
+// the Convolve range * 64 is 20 bits. Unfortunately there is no instruction
+// to multiply int16_t by int8_t and accumulate into int32_t.
+template <int subsampling_x, int subsampling_y>
+inline int16x8_t GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride) {
+  if (subsampling_x == 1) {
+    const int16x4_t mask_val0 = vreinterpret_s16_u16(vpaddl_u8(vld1_u8(mask)));
+    const int16x4_t mask_val1 = vreinterpret_s16_u16(
+        vpaddl_u8(vld1_u8(mask + (mask_stride << subsampling_y))));
+    int16x8_t final_val;
+    if (subsampling_y == 1) {
+      const int16x4_t next_mask_val0 =
+          vreinterpret_s16_u16(vpaddl_u8(vld1_u8(mask + mask_stride)));
+      const int16x4_t next_mask_val1 =
+          vreinterpret_s16_u16(vpaddl_u8(vld1_u8(mask + mask_stride * 3)));
+      final_val = vaddq_s16(vcombine_s16(mask_val0, mask_val1),
+                            vcombine_s16(next_mask_val0, next_mask_val1));
+    } else {
+      final_val = vreinterpretq_s16_u16(vpaddlq_u8(
+          vreinterpretq_u8_s16(vcombine_s16(mask_val0, mask_val1))));
+    }
+    return vrshrq_n_s16(final_val, subsampling_y + 1);
+  }
+  assert(subsampling_y == 0 && subsampling_x == 0);
+  const uint8x8_t mask_val0 = Load4(mask);
+  const uint8x8_t mask_val = Load4<1>(mask + mask_stride, mask_val0);
+  return vreinterpretq_s16_u16(vmovl_u8(mask_val));
+}
+
+template <int subsampling_x, int subsampling_y>
+inline int16x8_t GetMask8(const uint8_t* mask, ptrdiff_t mask_stride) {
+  if (subsampling_x == 1) {
+    int16x8_t mask_val = vreinterpretq_s16_u16(vpaddlq_u8(vld1q_u8(mask)));
+    if (subsampling_y == 1) {
+      const int16x8_t next_mask_val =
+          vreinterpretq_s16_u16(vpaddlq_u8(vld1q_u8(mask + mask_stride)));
+      mask_val = vaddq_s16(mask_val, next_mask_val);
+    }
+    return vrshrq_n_s16(mask_val, 1 + subsampling_y);
+  }
+  assert(subsampling_y == 0 && subsampling_x == 0);
+  const uint8x8_t mask_val = vld1_u8(mask);
+  return vreinterpretq_s16_u16(vmovl_u8(mask_val));
+}
+
+inline void WriteMaskBlendLine4x2(const int16_t* LIBGAV1_RESTRICT const pred_0,
+                                  const int16_t* LIBGAV1_RESTRICT const pred_1,
+                                  const int16x8_t pred_mask_0,
+                                  const int16x8_t pred_mask_1,
+                                  uint8_t* LIBGAV1_RESTRICT dst,
+                                  const ptrdiff_t dst_stride) {
+  const int16x8_t pred_val_0 = vld1q_s16(pred_0);
+  const int16x8_t pred_val_1 = vld1q_s16(pred_1);
+  // int res = (mask_value * prediction_0[x] +
+  //            (64 - mask_value) * prediction_1[x]) >> 6;
+  const int32x4_t weighted_pred_0_lo =
+      vmull_s16(vget_low_s16(pred_mask_0), vget_low_s16(pred_val_0));
+  const int32x4_t weighted_pred_0_hi =
+      vmull_s16(vget_high_s16(pred_mask_0), vget_high_s16(pred_val_0));
+  const int32x4_t weighted_combo_lo = vmlal_s16(
+      weighted_pred_0_lo, vget_low_s16(pred_mask_1), vget_low_s16(pred_val_1));
+  const int32x4_t weighted_combo_hi =
+      vmlal_s16(weighted_pred_0_hi, vget_high_s16(pred_mask_1),
+                vget_high_s16(pred_val_1));
+  // dst[x] = static_cast<Pixel>(
+  //     Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+  //           (1 << kBitdepth8) - 1));
+  const uint8x8_t result =
+      vqrshrun_n_s16(vcombine_s16(vshrn_n_s32(weighted_combo_lo, 6),
+                                  vshrn_n_s32(weighted_combo_hi, 6)),
+                     4);
+  StoreLo4(dst, result);
+  StoreHi4(dst + dst_stride, result);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlending4x4_NEON(const int16_t* LIBGAV1_RESTRICT pred_0,
+                                 const int16_t* LIBGAV1_RESTRICT pred_1,
+                                 const uint8_t* LIBGAV1_RESTRICT mask,
+                                 const ptrdiff_t mask_stride,
+                                 uint8_t* LIBGAV1_RESTRICT dst,
+                                 const ptrdiff_t dst_stride) {
+  const int16x8_t mask_inverter = vdupq_n_s16(64);
+  int16x8_t pred_mask_0 =
+      GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+  int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+  WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+                        dst_stride);
+  // TODO(b/150461164): Arm tends to do better with load(val); val += stride.
+  // It may be possible to turn this into a loop with a templated height.
+  pred_0 += 4 << 1;
+  pred_1 += 4 << 1;
+  mask += mask_stride << (1 + subsampling_y);
+  dst += dst_stride << 1;
+
+  pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+  pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+  WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+                        dst_stride);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlending4xH_NEON(const int16_t* LIBGAV1_RESTRICT pred_0,
+                                 const int16_t* LIBGAV1_RESTRICT pred_1,
+                                 const uint8_t* LIBGAV1_RESTRICT const mask_ptr,
+                                 const ptrdiff_t mask_stride, const int height,
+                                 uint8_t* LIBGAV1_RESTRICT dst,
+                                 const ptrdiff_t dst_stride) {
+  const uint8_t* mask = mask_ptr;
+  if (height == 4) {
+    MaskBlending4x4_NEON<subsampling_x, subsampling_y>(
+        pred_0, pred_1, mask, mask_stride, dst, dst_stride);
+    return;
+  }
+  const int16x8_t mask_inverter = vdupq_n_s16(64);
+  int y = 0;
+  do {
+    int16x8_t pred_mask_0 =
+        GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+    int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+
+    WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+                          dst_stride);
+    pred_0 += 4 << 1;
+    pred_1 += 4 << 1;
+    mask += mask_stride << (1 + subsampling_y);
+    dst += dst_stride << 1;
+
+    pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+    pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+    WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+                          dst_stride);
+    pred_0 += 4 << 1;
+    pred_1 += 4 << 1;
+    mask += mask_stride << (1 + subsampling_y);
+    dst += dst_stride << 1;
+
+    pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+    pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+    WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+                          dst_stride);
+    pred_0 += 4 << 1;
+    pred_1 += 4 << 1;
+    mask += mask_stride << (1 + subsampling_y);
+    dst += dst_stride << 1;
+
+    pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+    pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+    WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+                          dst_stride);
+    pred_0 += 4 << 1;
+    pred_1 += 4 << 1;
+    mask += mask_stride << (1 + subsampling_y);
+    dst += dst_stride << 1;
+    y += 8;
+  } while (y < height);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0,
+                           const void* LIBGAV1_RESTRICT prediction_1,
+                           const ptrdiff_t /*prediction_stride_1*/,
+                           const uint8_t* LIBGAV1_RESTRICT const mask_ptr,
+                           const ptrdiff_t mask_stride, const int width,
+                           const int height, void* LIBGAV1_RESTRICT dest,
+                           const ptrdiff_t dst_stride) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  if (width == 4) {
+    MaskBlending4xH_NEON<subsampling_x, subsampling_y>(
+        pred_0, pred_1, mask_ptr, mask_stride, height, dst, dst_stride);
+    return;
+  }
+  const uint8_t* mask = mask_ptr;
+  const int16x8_t mask_inverter = vdupq_n_s16(64);
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      const int16x8_t pred_mask_0 = GetMask8<subsampling_x, subsampling_y>(
+          mask + (x << subsampling_x), mask_stride);
+      // 64 - mask
+      const int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+      const int16x8_t pred_val_0 = vld1q_s16(pred_0 + x);
+      const int16x8_t pred_val_1 = vld1q_s16(pred_1 + x);
+      uint8x8_t result;
+      // int res = (mask_value * prediction_0[x] +
+      //            (64 - mask_value) * prediction_1[x]) >> 6;
+      const int32x4_t weighted_pred_0_lo =
+          vmull_s16(vget_low_s16(pred_mask_0), vget_low_s16(pred_val_0));
+      const int32x4_t weighted_pred_0_hi =
+          vmull_s16(vget_high_s16(pred_mask_0), vget_high_s16(pred_val_0));
+      const int32x4_t weighted_combo_lo =
+          vmlal_s16(weighted_pred_0_lo, vget_low_s16(pred_mask_1),
+                    vget_low_s16(pred_val_1));
+      const int32x4_t weighted_combo_hi =
+          vmlal_s16(weighted_pred_0_hi, vget_high_s16(pred_mask_1),
+                    vget_high_s16(pred_val_1));
+
+      // dst[x] = static_cast<Pixel>(
+      //     Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+      //           (1 << kBitdepth8) - 1));
+      result = vqrshrun_n_s16(vcombine_s16(vshrn_n_s32(weighted_combo_lo, 6),
+                                           vshrn_n_s32(weighted_combo_hi, 6)),
+                              4);
+      vst1_u8(dst + x, result);
+
+      x += 8;
+    } while (x < width);
+    dst += dst_stride;
+    pred_0 += width;
+    pred_1 += width;
+    mask += mask_stride << subsampling_y;
+  } while (++y < height);
+}
+
+// TODO(b/150461164): This is much faster for inter_intra (input is Pixel
+// values) but regresses compound versions (input is int16_t). Try to
+// consolidate these.
+template <int subsampling_x, int subsampling_y>
+inline uint8x8_t GetInterIntraMask4x2(const uint8_t* mask,
+                                      ptrdiff_t mask_stride) {
+  if (subsampling_x == 1) {
+    const uint8x8_t mask_val =
+        vpadd_u8(vld1_u8(mask), vld1_u8(mask + (mask_stride << subsampling_y)));
+    if (subsampling_y == 1) {
+      const uint8x8_t next_mask_val = vpadd_u8(vld1_u8(mask + mask_stride),
+                                               vld1_u8(mask + mask_stride * 3));
+
+      // Use a saturating add to work around the case where all |mask| values
+      // are 64. Together with the rounding shift this ensures the correct
+      // result.
+      const uint8x8_t sum = vqadd_u8(mask_val, next_mask_val);
+      return vrshr_n_u8(sum, /*subsampling_x=*/1 + subsampling_y);
+    }
+
+    return vrshr_n_u8(mask_val, /*subsampling_x=*/1);
+  }
+
+  assert(subsampling_y == 0 && subsampling_x == 0);
+  const uint8x8_t mask_val0 = Load4(mask);
+  // TODO(b/150461164): Investigate the source of |mask| and see if the stride
+  // can be removed.
+  // TODO(b/150461164): The unit tests start at 8x8. Does this get run?
+  return Load4<1>(mask + mask_stride, mask_val0);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline uint8x8_t GetInterIntraMask8(const uint8_t* mask,
+                                    ptrdiff_t mask_stride) {
+  if (subsampling_x == 1) {
+    const uint8x16_t mask_val = vld1q_u8(mask);
+    const uint8x8_t mask_paired =
+        vpadd_u8(vget_low_u8(mask_val), vget_high_u8(mask_val));
+    if (subsampling_y == 1) {
+      const uint8x16_t next_mask_val = vld1q_u8(mask + mask_stride);
+      const uint8x8_t next_mask_paired =
+          vpadd_u8(vget_low_u8(next_mask_val), vget_high_u8(next_mask_val));
+
+      // Use a saturating add to work around the case where all |mask| values
+      // are 64. Together with the rounding shift this ensures the correct
+      // result.
+ const uint8x8_t sum = vqadd_u8(mask_paired, next_mask_paired); + return vrshr_n_u8(sum, /*subsampling_x=*/1 + subsampling_y); + } + + return vrshr_n_u8(mask_paired, /*subsampling_x=*/1); + } + + assert(subsampling_y == 0 && subsampling_x == 0); + return vld1_u8(mask); +} + +inline void InterIntraWriteMaskBlendLine8bpp4x2( + const uint8_t* LIBGAV1_RESTRICT const pred_0, + uint8_t* LIBGAV1_RESTRICT const pred_1, const ptrdiff_t pred_stride_1, + const uint8x8_t pred_mask_0, const uint8x8_t pred_mask_1) { + const uint8x8_t pred_val_0 = vld1_u8(pred_0); + uint8x8_t pred_val_1 = Load4(pred_1); + pred_val_1 = Load4<1>(pred_1 + pred_stride_1, pred_val_1); + + const uint16x8_t weighted_pred_0 = vmull_u8(pred_mask_0, pred_val_0); + const uint16x8_t weighted_combo = + vmlal_u8(weighted_pred_0, pred_mask_1, pred_val_1); + const uint8x8_t result = vrshrn_n_u16(weighted_combo, 6); + StoreLo4(pred_1, result); + StoreHi4(pred_1 + pred_stride_1, result); +} + +template +inline void InterIntraMaskBlending8bpp4x4_NEON( + const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1, + const ptrdiff_t pred_stride_1, const uint8_t* LIBGAV1_RESTRICT mask, + const ptrdiff_t mask_stride) { + const uint8x8_t mask_inverter = vdup_n_u8(64); + uint8x8_t pred_mask_1 = + GetInterIntraMask4x2(mask, mask_stride); + uint8x8_t pred_mask_0 = vsub_u8(mask_inverter, pred_mask_1); + InterIntraWriteMaskBlendLine8bpp4x2(pred_0, pred_1, pred_stride_1, + pred_mask_0, pred_mask_1); + pred_0 += 4 << 1; + pred_1 += pred_stride_1 << 1; + mask += mask_stride << (1 + subsampling_y); + + pred_mask_1 = + GetInterIntraMask4x2(mask, mask_stride); + pred_mask_0 = vsub_u8(mask_inverter, pred_mask_1); + InterIntraWriteMaskBlendLine8bpp4x2(pred_0, pred_1, pred_stride_1, + pred_mask_0, pred_mask_1); +} + +template +inline void InterIntraMaskBlending8bpp4xH_NEON( + const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1, + const ptrdiff_t pred_stride_1, const uint8_t* LIBGAV1_RESTRICT mask, + const ptrdiff_t mask_stride, const int height) { + if (height == 4) { + InterIntraMaskBlending8bpp4x4_NEON( + pred_0, pred_1, pred_stride_1, mask, mask_stride); + return; + } + int y = 0; + do { + InterIntraMaskBlending8bpp4x4_NEON( + pred_0, pred_1, pred_stride_1, mask, mask_stride); + pred_0 += 4 << 2; + pred_1 += pred_stride_1 << 2; + mask += mask_stride << (2 + subsampling_y); + + InterIntraMaskBlending8bpp4x4_NEON( + pred_0, pred_1, pred_stride_1, mask, mask_stride); + pred_0 += 4 << 2; + pred_1 += pred_stride_1 << 2; + mask += mask_stride << (2 + subsampling_y); + y += 8; + } while (y < height); +} + +template +inline void InterIntraMaskBlend8bpp_NEON( + const uint8_t* LIBGAV1_RESTRICT prediction_0, + uint8_t* LIBGAV1_RESTRICT prediction_1, const ptrdiff_t prediction_stride_1, + const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride, + const int width, const int height) { + if (width == 4) { + InterIntraMaskBlending8bpp4xH_NEON( + prediction_0, prediction_1, prediction_stride_1, mask_ptr, mask_stride, + height); + return; + } + const uint8_t* mask = mask_ptr; + const uint8x8_t mask_inverter = vdup_n_u8(64); + int y = 0; + do { + int x = 0; + do { + // TODO(b/150461164): Consider a 16 wide specialization (at least for the + // unsampled version) to take advantage of vld1q_u8(). 
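+      // The mask weights |prediction_1|, which also receives the result,
+      // while 64 - mask weights |prediction_0|:
+      //   dst[x] = (mask * pred_1[x] + (64 - mask) * pred_0[x] + 32) >> 6.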
+ const uint8x8_t pred_mask_1 = + GetInterIntraMask8( + mask + (x << subsampling_x), mask_stride); + // 64 - mask + const uint8x8_t pred_mask_0 = vsub_u8(mask_inverter, pred_mask_1); + const uint8x8_t pred_val_0 = vld1_u8(prediction_0); + prediction_0 += 8; + const uint8x8_t pred_val_1 = vld1_u8(prediction_1 + x); + const uint16x8_t weighted_pred_0 = vmull_u8(pred_mask_0, pred_val_0); + // weighted_pred0 + weighted_pred1 + const uint16x8_t weighted_combo = + vmlal_u8(weighted_pred_0, pred_mask_1, pred_val_1); + const uint8x8_t result = vrshrn_n_u16(weighted_combo, 6); + vst1_u8(prediction_1 + x, result); + + x += 8; + } while (x < width); + prediction_1 += prediction_stride_1; + mask += mask_stride << subsampling_y; + } while (++y < height); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + dsp->mask_blend[0][0] = MaskBlend_NEON<0, 0>; + dsp->mask_blend[1][0] = MaskBlend_NEON<1, 0>; + dsp->mask_blend[2][0] = MaskBlend_NEON<1, 1>; + // The is_inter_intra index of mask_blend[][] is replaced by + // inter_intra_mask_blend_8bpp[] in 8-bit. + dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_NEON<0, 0>; + dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_NEON<1, 0>; + dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_NEON<1, 1>; +} + +} // namespace +} // namespace low_bitdepth + +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +template +inline uint16x8_t GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride) { + if (subsampling_x == 1) { + const uint8x8_t mask_val0 = vld1_u8(mask); + const uint8x8_t mask_val1 = vld1_u8(mask + (mask_stride << subsampling_y)); + uint16x8_t final_val = vpaddlq_u8(vcombine_u8(mask_val0, mask_val1)); + if (subsampling_y == 1) { + const uint8x8_t next_mask_val0 = vld1_u8(mask + mask_stride); + const uint8x8_t next_mask_val1 = vld1_u8(mask + mask_stride * 3); + final_val = vaddq_u16( + final_val, vpaddlq_u8(vcombine_u8(next_mask_val0, next_mask_val1))); + } + return vrshrq_n_u16(final_val, subsampling_y + 1); + } + assert(subsampling_y == 0 && subsampling_x == 0); + const uint8x8_t mask_val0 = Load4(mask); + const uint8x8_t mask_val = Load4<1>(mask + mask_stride, mask_val0); + return vmovl_u8(mask_val); +} + +template +inline uint16x8_t GetMask8(const uint8_t* mask, ptrdiff_t mask_stride) { + if (subsampling_x == 1) { + uint16x8_t mask_val = vpaddlq_u8(vld1q_u8(mask)); + if (subsampling_y == 1) { + const uint16x8_t next_mask_val = vpaddlq_u8(vld1q_u8(mask + mask_stride)); + mask_val = vaddq_u16(mask_val, next_mask_val); + } + return vrshrq_n_u16(mask_val, 1 + subsampling_y); + } + assert(subsampling_y == 0 && subsampling_x == 0); + const uint8x8_t mask_val = vld1_u8(mask); + return vmovl_u8(mask_val); +} + +template +uint16x8_t SumWeightedPred(const uint16x8_t pred_mask_0, + const uint16x8_t pred_mask_1, + const uint16x8_t pred_val_0, + const uint16x8_t pred_val_1) { + if (is_inter_intra) { + // dst[x] = static_cast(RightShiftWithRounding( + // mask_value * pred_1[x] + (64 - mask_value) * pred_0[x], 6)); + uint16x8_t sum = vmulq_u16(pred_mask_1, pred_val_0); + sum = vmlaq_u16(sum, pred_mask_0, pred_val_1); + return vrshrq_n_u16(sum, 6); + } else { + // int res = (mask_value * prediction_0[x] + + // (64 - mask_value) * prediction_1[x]) >> 6; + const uint32x4_t weighted_pred_0_lo = + vmull_u16(vget_low_u16(pred_mask_0), vget_low_u16(pred_val_0)); + const uint32x4_t weighted_pred_0_hi = VMullHighU16(pred_mask_0, pred_val_0); + uint32x4x2_t 
sum; + sum.val[0] = vmlal_u16(weighted_pred_0_lo, vget_low_u16(pred_mask_1), + vget_low_u16(pred_val_1)); + sum.val[1] = VMlalHighU16(weighted_pred_0_hi, pred_mask_1, pred_val_1); + return vcombine_u16(vshrn_n_u32(sum.val[0], 6), vshrn_n_u32(sum.val[1], 6)); + } +} + +template +inline void StoreShiftedResult(uint8_t* dst, const uint16x8_t result, + const ptrdiff_t dst_stride = 0) { + if (is_inter_intra) { + if (width == 4) { + // Store 2 lines of width 4. + assert(dst_stride != 0); + vst1_u16(reinterpret_cast(dst), vget_low_u16(result)); + vst1_u16(reinterpret_cast(dst + dst_stride), + vget_high_u16(result)); + } else { + // Store 1 line of width 8. + vst1q_u16(reinterpret_cast(dst), result); + } + } else { + // res -= (bitdepth == 8) ? 0 : kCompoundOffset; + // dst[x] = static_cast( + // Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0, + // (1 << kBitdepth8) - 1)); + constexpr int inter_post_round_bits = (bitdepth == 12) ? 2 : 4; + const uint16x8_t compound_result = + vminq_u16(vrshrq_n_u16(vqsubq_u16(result, vdupq_n_u16(kCompoundOffset)), + inter_post_round_bits), + vdupq_n_u16((1 << bitdepth) - 1)); + if (width == 4) { + // Store 2 lines of width 4. + assert(dst_stride != 0); + vst1_u16(reinterpret_cast(dst), vget_low_u16(compound_result)); + vst1_u16(reinterpret_cast(dst + dst_stride), + vget_high_u16(compound_result)); + } else { + // Store 1 line of width 8. + vst1q_u16(reinterpret_cast(dst), compound_result); + } + } +} + +template +inline void MaskBlend4x2_NEON(const uint16_t* LIBGAV1_RESTRICT pred_0, + const uint16_t* LIBGAV1_RESTRICT pred_1, + const ptrdiff_t pred_stride_1, + const uint8_t* LIBGAV1_RESTRICT mask, + const uint16x8_t mask_inverter, + const ptrdiff_t mask_stride, + uint8_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t dst_stride) { + // This works because stride == width == 4. + const uint16x8_t pred_val_0 = vld1q_u16(pred_0); + const uint16x8_t pred_val_1 = + is_inter_intra + ? vcombine_u16(vld1_u16(pred_1), vld1_u16(pred_1 + pred_stride_1)) + : vld1q_u16(pred_1); + const uint16x8_t pred_mask_0 = + GetMask4x2(mask, mask_stride); + const uint16x8_t pred_mask_1 = vsubq_u16(mask_inverter, pred_mask_0); + const uint16x8_t weighted_pred_sum = SumWeightedPred( + pred_mask_0, pred_mask_1, pred_val_0, pred_val_1); + + StoreShiftedResult(dst, weighted_pred_sum, dst_stride); +} + +template +inline void MaskBlending4x4_NEON(const uint16_t* LIBGAV1_RESTRICT pred_0, + const uint16_t* LIBGAV1_RESTRICT pred_1, + const ptrdiff_t pred_stride_1, + const uint8_t* LIBGAV1_RESTRICT mask, + const ptrdiff_t mask_stride, + uint8_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t dst_stride) { + // Double stride because the function works on 2 lines at a time. 
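+  // With subsampling_y == 1 each output row consumes two mask rows, so the
+  // two output rows handled per call advance |mask| by four mask rows
+  // (mask_stride << 2).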
+ const ptrdiff_t mask_stride_y = mask_stride << (subsampling_y + 1); + const ptrdiff_t dst_stride_y = dst_stride << 1; + const uint16x8_t mask_inverter = vdupq_n_u16(64); + + MaskBlend4x2_NEON( + pred_0, pred_1, pred_stride_1, mask, mask_inverter, mask_stride, dst, + dst_stride); + + pred_0 += 4 << 1; + pred_1 += pred_stride_1 << 1; + mask += mask_stride_y; + dst += dst_stride_y; + + MaskBlend4x2_NEON( + pred_0, pred_1, pred_stride_1, mask, mask_inverter, mask_stride, dst, + dst_stride); +} + +template +inline void MaskBlending4xH_NEON(const uint16_t* LIBGAV1_RESTRICT pred_0, + const uint16_t* LIBGAV1_RESTRICT pred_1, + const ptrdiff_t pred_stride_1, + const uint8_t* LIBGAV1_RESTRICT const mask_ptr, + const ptrdiff_t mask_stride, const int height, + uint8_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t dst_stride) { + const uint8_t* mask = mask_ptr; + if (height == 4) { + MaskBlending4x4_NEON( + pred_0, pred_1, pred_stride_1, mask, mask_stride, dst, dst_stride); + return; + } + // Double stride because the function works on 2 lines at a time. + const ptrdiff_t mask_stride_y = mask_stride << (subsampling_y + 1); + const ptrdiff_t dst_stride_y = dst_stride << 1; + const uint16x8_t mask_inverter = vdupq_n_u16(64); + int y = 0; + do { + MaskBlend4x2_NEON( + pred_0, pred_1, pred_stride_1, mask, mask_inverter, mask_stride, dst, + dst_stride); + pred_0 += 4 << 1; + pred_1 += pred_stride_1 << 1; + mask += mask_stride_y; + dst += dst_stride_y; + + MaskBlend4x2_NEON( + pred_0, pred_1, pred_stride_1, mask, mask_inverter, mask_stride, dst, + dst_stride); + pred_0 += 4 << 1; + pred_1 += pred_stride_1 << 1; + mask += mask_stride_y; + dst += dst_stride_y; + + MaskBlend4x2_NEON( + pred_0, pred_1, pred_stride_1, mask, mask_inverter, mask_stride, dst, + dst_stride); + pred_0 += 4 << 1; + pred_1 += pred_stride_1 << 1; + mask += mask_stride_y; + dst += dst_stride_y; + + MaskBlend4x2_NEON( + pred_0, pred_1, pred_stride_1, mask, mask_inverter, mask_stride, dst, + dst_stride); + pred_0 += 4 << 1; + pred_1 += pred_stride_1 << 1; + mask += mask_stride_y; + dst += dst_stride_y; + y += 8; + } while (y < height); +} + +template +void MaskBlend8_NEON(const uint16_t* LIBGAV1_RESTRICT pred_0, + const uint16_t* LIBGAV1_RESTRICT pred_1, + const uint8_t* LIBGAV1_RESTRICT mask, + const uint16x8_t mask_inverter, + const ptrdiff_t mask_stride, + uint8_t* LIBGAV1_RESTRICT dst) { + const uint16x8_t pred_val_0 = vld1q_u16(pred_0); + const uint16x8_t pred_val_1 = vld1q_u16(pred_1); + const uint16x8_t pred_mask_0 = + GetMask8(mask, mask_stride); + const uint16x8_t pred_mask_1 = vsubq_u16(mask_inverter, pred_mask_0); + const uint16x8_t weighted_pred_sum = SumWeightedPred( + pred_mask_0, pred_mask_1, pred_val_0, pred_val_1); + + StoreShiftedResult(dst, weighted_pred_sum); +} + +template +inline void MaskBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + const ptrdiff_t prediction_stride_1, + const uint8_t* LIBGAV1_RESTRICT const mask_ptr, + const ptrdiff_t mask_stride, const int width, + const int height, void* LIBGAV1_RESTRICT dest, + const ptrdiff_t dst_stride) { + if (!is_inter_intra) { + assert(prediction_stride_1 == width); + } + auto* dst = static_cast(dest); + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + if (width == 4) { + MaskBlending4xH_NEON( + pred_0, pred_1, prediction_stride_1, mask_ptr, mask_stride, height, dst, + dst_stride); + return; + } + const ptrdiff_t mask_stride_y = mask_stride << subsampling_y; + const uint8_t* 
mask = mask_ptr; + const uint16x8_t mask_inverter = vdupq_n_u16(64); + int y = 0; + do { + int x = 0; + do { + MaskBlend8_NEON( + pred_0 + x, pred_1 + x, mask + (x << subsampling_x), mask_inverter, + mask_stride, + reinterpret_cast(reinterpret_cast(dst) + x)); + x += 8; + } while (x < width); + dst += dst_stride; + pred_0 += width; + pred_1 += prediction_stride_1; + mask += mask_stride_y; + } while (++y < height); +} + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + dsp->mask_blend[0][0] = MaskBlend_NEON<0, 0, false>; + dsp->mask_blend[1][0] = MaskBlend_NEON<1, 0, false>; + dsp->mask_blend[2][0] = MaskBlend_NEON<1, 1, false>; + + dsp->mask_blend[0][1] = MaskBlend_NEON<0, 0, true>; + dsp->mask_blend[1][1] = MaskBlend_NEON<1, 0, true>; + dsp->mask_blend[2][1] = MaskBlend_NEON<1, 1, true>; +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void MaskBlendInit_NEON() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_ENABLE_NEON + +namespace libgav1 { +namespace dsp { + +void MaskBlendInit_NEON() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_ENABLE_NEON diff --git a/src/dsp/arm/mask_blend_neon.h b/src/dsp/arm/mask_blend_neon.h new file mode 100644 index 0000000..c24f2f8 --- /dev/null +++ b/src/dsp/arm/mask_blend_neon.h @@ -0,0 +1,48 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_ARM_MASK_BLEND_NEON_H_ +#define LIBGAV1_SRC_DSP_ARM_MASK_BLEND_NEON_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::mask_blend. This function is not thread-safe. 
+void MaskBlendInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_MaskBlend444 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_MaskBlend422 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_MaskBlend420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp444 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp422 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp420 LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_MaskBlend444 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_MaskBlend422 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_MaskBlend420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_MaskBlendInterIntra444 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_MaskBlendInterIntra422 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_MaskBlendInterIntra420 LIBGAV1_CPU_NEON
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_MASK_BLEND_NEON_H_
diff --git a/src/dsp/arm/motion_field_projection_neon.cc b/src/dsp/arm/motion_field_projection_neon.cc
new file mode 100644
index 0000000..144adf7
--- /dev/null
+++ b/src/dsp/arm/motion_field_projection_neon.cc
@@ -0,0 +1,378 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_field_projection.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+inline int16x8_t LoadDivision(const int8x8x2_t division_table,
+                              const int8x8_t reference_offset) {
+  const int8x8_t kOne = vcreate_s8(0x0100010001000100);
+  const int8x16_t kOneQ = vcombine_s8(kOne, kOne);
+  const int8x8_t t = vadd_s8(reference_offset, reference_offset);
+  const int8x8x2_t tt = vzip_s8(t, t);
+  const int8x16_t t1 = vcombine_s8(tt.val[0], tt.val[1]);
+  const int8x16_t idx = vaddq_s8(t1, kOneQ);
+  const int8x8_t idx_low = vget_low_s8(idx);
+  const int8x8_t idx_high = vget_high_s8(idx);
+  const int16x4_t d0 = vreinterpret_s16_s8(vtbl2_s8(division_table, idx_low));
+  const int16x4_t d1 = vreinterpret_s16_s8(vtbl2_s8(division_table, idx_high));
+  return vcombine_s16(d0, d1);
+}
+
+inline int16x4_t MvProjection(const int16x4_t mv, const int16x4_t denominator,
+                              const int numerator) {
+  const int32x4_t m0 = vmull_s16(mv, denominator);
+  const int32x4_t m = vmulq_n_s32(m0, numerator);
+  // Add the sign (0 or -1) to round towards zero.
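+  // vsraq_n_s32(m, m, 31) evaluates m + (m >> 31), i.e. it adds -1 to
+  // negative products only, so the rounding narrowing shift below matches
+  // the signed rounding of the scalar GetMvProjection() path.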
+ const int32x4_t add_sign = vsraq_n_s32(m, m, 31); + return vqrshrn_n_s32(add_sign, 14); +} + +inline int16x8_t MvProjectionClip(const int16x8_t mv, + const int16x8_t denominator, + const int numerator) { + const int16x4_t mv0 = vget_low_s16(mv); + const int16x4_t mv1 = vget_high_s16(mv); + const int16x4_t s0 = MvProjection(mv0, vget_low_s16(denominator), numerator); + const int16x4_t s1 = MvProjection(mv1, vget_high_s16(denominator), numerator); + const int16x8_t projection = vcombine_s16(s0, s1); + const int16x8_t projection_mv_clamp = vdupq_n_s16(kProjectionMvClamp); + const int16x8_t clamp = vminq_s16(projection, projection_mv_clamp); + return vmaxq_s16(clamp, vnegq_s16(projection_mv_clamp)); +} + +inline int8x8_t Project_NEON(const int16x8_t delta, const int16x8_t dst_sign) { + // Add 63 to negative delta so that it shifts towards zero. + const int16x8_t delta_sign = vshrq_n_s16(delta, 15); + const uint16x8_t delta_u = vreinterpretq_u16_s16(delta); + const uint16x8_t delta_sign_u = vreinterpretq_u16_s16(delta_sign); + const uint16x8_t delta_adjust_u = vsraq_n_u16(delta_u, delta_sign_u, 10); + const int16x8_t delta_adjust = vreinterpretq_s16_u16(delta_adjust_u); + const int16x8_t offset0 = vshrq_n_s16(delta_adjust, 6); + const int16x8_t offset1 = veorq_s16(offset0, dst_sign); + const int16x8_t offset2 = vsubq_s16(offset1, dst_sign); + return vqmovn_s16(offset2); +} + +inline void GetPosition( + const int8x8x2_t division_table, const MotionVector* const mv, + const int numerator, const int x8_start, const int x8_end, const int x8, + const int8x8_t r_offsets, const int8x8_t source_reference_type8, + const int8x8_t skip_r, const int8x8_t y8_floor8, const int8x8_t y8_ceiling8, + const int16x8_t d_sign, const int delta, int8x8_t* const r, + int8x8_t* const position_y8, int8x8_t* const position_x8, + int64_t* const skip_64, int32x4_t mvs[2]) { + const auto* const mv_int = reinterpret_cast(mv + x8); + *r = vtbl1_s8(r_offsets, source_reference_type8); + const int16x8_t denorm = LoadDivision(division_table, source_reference_type8); + int16x8_t projection_mv[2]; + mvs[0] = vld1q_s32(mv_int + 0); + mvs[1] = vld1q_s32(mv_int + 4); + // Deinterlace x and y components + const int16x8_t mv0 = vreinterpretq_s16_s32(mvs[0]); + const int16x8_t mv1 = vreinterpretq_s16_s32(mvs[1]); + const int16x8x2_t mv_yx = vuzpq_s16(mv0, mv1); + // numerator could be 0. + projection_mv[0] = MvProjectionClip(mv_yx.val[0], denorm, numerator); + projection_mv[1] = MvProjectionClip(mv_yx.val[1], denorm, numerator); + // Do not update the motion vector if the block position is not valid or + // if position_x8 is outside the current range of x8_start and x8_end. + // Note that position_y8 will always be within the range of y8_start and + // y8_end. + // After subtracting the base, valid projections are within 8-bit. 
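+  // Packing the x and y offsets into the two halves of one int8x16_t lets a
+  // single vcltq_s8()/vcgeq_s8() pair below range-check all 16 coordinates
+  // at once.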
+  *position_y8 = Project_NEON(projection_mv[0], d_sign);
+  const int8x8_t position_x = Project_NEON(projection_mv[1], d_sign);
+  const int8x8_t k01234567 = vcreate_s8(uint64_t{0x0706050403020100});
+  *position_x8 = vqadd_s8(position_x, k01234567);
+  const int8x16_t position_xy = vcombine_s8(*position_x8, *position_y8);
+  const int x8_floor = std::max(
+      x8_start - x8, delta - kProjectionMvMaxHorizontalOffset);  // [-8, 8]
+  const int x8_ceiling = std::min(
+      x8_end - x8, delta + 8 + kProjectionMvMaxHorizontalOffset);  // [0, 16]
+  const int8x8_t x8_floor8 = vdup_n_s8(x8_floor);
+  const int8x8_t x8_ceiling8 = vdup_n_s8(x8_ceiling);
+  const int8x16_t floor_xy = vcombine_s8(x8_floor8, y8_floor8);
+  const int8x16_t ceiling_xy = vcombine_s8(x8_ceiling8, y8_ceiling8);
+  const uint8x16_t underflow = vcltq_s8(position_xy, floor_xy);
+  const uint8x16_t overflow = vcgeq_s8(position_xy, ceiling_xy);
+  const int8x16_t out = vreinterpretq_s8_u8(vorrq_u8(underflow, overflow));
+  const int8x8_t skip_low = vorr_s8(skip_r, vget_low_s8(out));
+  const int8x8_t skip = vorr_s8(skip_low, vget_high_s8(out));
+  *skip_64 = vget_lane_s64(vreinterpret_s64_s8(skip), 0);
+}
+
+template <int idx>
+inline void Store(const int16x8_t position, const int8x8_t reference_offset,
+                  const int32x4_t mv, int8_t* dst_reference_offset,
+                  MotionVector* dst_mv) {
+  const ptrdiff_t offset = vgetq_lane_s16(position, idx);
+  auto* const d_mv = reinterpret_cast<int32_t*>(&dst_mv[offset]);
+  vst1q_lane_s32(d_mv, mv, idx & 3);
+  vst1_lane_s8(&dst_reference_offset[offset], reference_offset, idx);
+}
+
+template <int idx>
+inline void CheckStore(const int8_t* skips, const int16x8_t position,
+                       const int8x8_t reference_offset, const int32x4_t mv,
+                       int8_t* dst_reference_offset, MotionVector* dst_mv) {
+  if (skips[idx] == 0) {
+    Store<idx>(position, reference_offset, mv, dst_reference_offset, dst_mv);
+  }
+}
+
+// 7.9.2.
+void MotionFieldProjectionKernel_NEON(const ReferenceInfo& reference_info,
+                                      const int reference_to_current_with_sign,
+                                      const int dst_sign, const int y8_start,
+                                      const int y8_end, const int x8_start,
+                                      const int x8_end,
+                                      TemporalMotionField* const motion_field) {
+  const ptrdiff_t stride = motion_field->mv.columns();
+  // The column range has to be offset by kProjectionMvMaxHorizontalOffset
+  // since coordinates in that range could end up being position_x8 because of
+  // projection.
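+  // Growing the column range by kProjectionMvMaxHorizontalOffset on both
+  // sides (clamped to [0, stride] below) keeps every reachable position_x8
+  // inside the columns this kernel visits.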
+ const int adjusted_x8_start = + std::max(x8_start - kProjectionMvMaxHorizontalOffset, 0); + const int adjusted_x8_end = std::min( + x8_end + kProjectionMvMaxHorizontalOffset, static_cast(stride)); + const int adjusted_x8_end8 = adjusted_x8_end & ~7; + const int leftover = adjusted_x8_end - adjusted_x8_end8; + const int8_t* const reference_offsets = + reference_info.relative_distance_to.data(); + const bool* const skip_references = reference_info.skip_references.data(); + const int16_t* const projection_divisions = + reference_info.projection_divisions.data(); + const ReferenceFrameType* source_reference_types = + &reference_info.motion_field_reference_frame[y8_start][0]; + const MotionVector* mv = &reference_info.motion_field_mv[y8_start][0]; + int8_t* dst_reference_offset = motion_field->reference_offset[y8_start]; + MotionVector* dst_mv = motion_field->mv[y8_start]; + const int16x8_t d_sign = vdupq_n_s16(dst_sign); + + static_assert(sizeof(int8_t) == sizeof(bool), ""); + static_assert(sizeof(int8_t) == sizeof(ReferenceFrameType), ""); + static_assert(sizeof(int32_t) == sizeof(MotionVector), ""); + assert(dst_sign == 0 || dst_sign == -1); + assert(stride == motion_field->reference_offset.columns()); + assert((y8_start & 7) == 0); + assert((adjusted_x8_start & 7) == 0); + // The final position calculation is represented with int16_t. Valid + // position_y8 from its base is at most 7. After considering the horizontal + // offset which is at most |stride - 1|, we have the following assertion, + // which means this optimization works for frame width up to 32K (each + // position is a 8x8 block). + assert(8 * stride <= 32768); + const int8x8_t skip_reference = + vld1_s8(reinterpret_cast(skip_references)); + const int8x8_t r_offsets = vld1_s8(reference_offsets); + const int8x16_t table = vreinterpretq_s8_s16(vld1q_s16(projection_divisions)); + int8x8x2_t division_table; + division_table.val[0] = vget_low_s8(table); + division_table.val[1] = vget_high_s8(table); + + int y8 = y8_start; + do { + const int y8_floor = (y8 & ~7) - y8; // [-7, 0] + const int y8_ceiling = std::min(y8_end - y8, y8_floor + 8); // [1, 8] + const int8x8_t y8_floor8 = vdup_n_s8(y8_floor); + const int8x8_t y8_ceiling8 = vdup_n_s8(y8_ceiling); + int x8; + + for (x8 = adjusted_x8_start; x8 < adjusted_x8_end8; x8 += 8) { + const int8x8_t source_reference_type8 = + vld1_s8(reinterpret_cast(source_reference_types + x8)); + const int8x8_t skip_r = vtbl1_s8(skip_reference, source_reference_type8); + const int64_t early_skip = vget_lane_s64(vreinterpret_s64_s8(skip_r), 0); + // Early termination #1 if all are skips. Chance is typically ~30-40%. + if (early_skip == -1) continue; + int64_t skip_64; + int8x8_t r, position_x8, position_y8; + int32x4_t mvs[2]; + GetPosition(division_table, mv, reference_to_current_with_sign, x8_start, + x8_end, x8, r_offsets, source_reference_type8, skip_r, + y8_floor8, y8_ceiling8, d_sign, 0, &r, &position_y8, + &position_x8, &skip_64, mvs); + // Early termination #2 if all are skips. + // Chance is typically ~15-25% after Early termination #1. + if (skip_64 == -1) continue; + const int16x8_t p_y = vmovl_s8(position_y8); + const int16x8_t p_x = vmovl_s8(position_x8); + const int16x8_t pos = vmlaq_n_s16(p_x, p_y, stride); + const int16x8_t position = vaddq_s16(pos, vdupq_n_s16(x8)); + if (skip_64 == 0) { + // Store all. Chance is typically ~70-85% after Early termination #2. 
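+        // mvs[0] carries the motion vectors for lanes 0-3 and mvs[1] those
+        // for lanes 4-7; Store<idx>() selects the 32-bit lane via idx & 3.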
+ Store<0>(position, r, mvs[0], dst_reference_offset, dst_mv); + Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv); + Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv); + Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv); + Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv); + Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv); + Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv); + Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv); + } else { + // Check and store each. + // Chance is typically ~15-30% after Early termination #2. + // The compiler is smart enough to not create the local buffer skips[]. + int8_t skips[8]; + memcpy(skips, &skip_64, sizeof(skips)); + CheckStore<0>(skips, position, r, mvs[0], dst_reference_offset, dst_mv); + CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset, dst_mv); + CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset, dst_mv); + CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset, dst_mv); + CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset, dst_mv); + CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset, dst_mv); + CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset, dst_mv); + CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset, dst_mv); + } + } + + // The following leftover processing cannot be moved out of the do...while + // loop. Doing so may change the result storing orders of the same position. + if (leftover > 0) { + // Use SIMD only when leftover is at least 4, and there are at least 8 + // elements in a row. + if (leftover >= 4 && adjusted_x8_start < adjusted_x8_end8) { + // Process the last 8 elements to avoid loading invalid memory. Some + // elements may have been processed in the above loop, which is OK. + const int delta = 8 - leftover; + x8 = adjusted_x8_end - 8; + const int8x8_t source_reference_type8 = vld1_s8( + reinterpret_cast(source_reference_types + x8)); + const int8x8_t skip_r = + vtbl1_s8(skip_reference, source_reference_type8); + const int64_t early_skip = + vget_lane_s64(vreinterpret_s64_s8(skip_r), 0); + // Early termination #1 if all are skips. + if (early_skip != -1) { + int64_t skip_64; + int8x8_t r, position_x8, position_y8; + int32x4_t mvs[2]; + GetPosition(division_table, mv, reference_to_current_with_sign, + x8_start, x8_end, x8, r_offsets, source_reference_type8, + skip_r, y8_floor8, y8_ceiling8, d_sign, delta, &r, + &position_y8, &position_x8, &skip_64, mvs); + // Early termination #2 if all are skips. + if (skip_64 != -1) { + const int16x8_t p_y = vmovl_s8(position_y8); + const int16x8_t p_x = vmovl_s8(position_x8); + const int16x8_t pos = vmlaq_n_s16(p_x, p_y, stride); + const int16x8_t position = vaddq_s16(pos, vdupq_n_s16(x8)); + // Store up to 7 elements since leftover is at most 7. + if (skip_64 == 0) { + // Store all. + Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv); + Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv); + Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv); + Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv); + Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv); + Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv); + Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv); + } else { + // Check and store each. + // The compiler is smart enough to not create the local buffer + // skips[]. 
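+            // Each byte of |skip_64| is 0 or -1 (the OR of the comparison
+            // masks), so the copy below yields one ready-made flag per lane.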
+ int8_t skips[8]; + memcpy(skips, &skip_64, sizeof(skips)); + CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset, + dst_mv); + CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset, + dst_mv); + CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset, + dst_mv); + CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset, + dst_mv); + CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset, + dst_mv); + CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset, + dst_mv); + CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset, + dst_mv); + } + } + } + } else { + for (; x8 < adjusted_x8_end; ++x8) { + const int source_reference_type = source_reference_types[x8]; + if (skip_references[source_reference_type]) continue; + MotionVector projection_mv; + // reference_to_current_with_sign could be 0. + GetMvProjection(mv[x8], reference_to_current_with_sign, + projection_divisions[source_reference_type], + &projection_mv); + // Do not update the motion vector if the block position is not valid + // or if position_x8 is outside the current range of x8_start and + // x8_end. Note that position_y8 will always be within the range of + // y8_start and y8_end. + const int position_y8 = Project(0, projection_mv.mv[0], dst_sign); + if (position_y8 < y8_floor || position_y8 >= y8_ceiling) continue; + const int x8_base = x8 & ~7; + const int x8_floor = + std::max(x8_start, x8_base - kProjectionMvMaxHorizontalOffset); + const int x8_ceiling = + std::min(x8_end, x8_base + 8 + kProjectionMvMaxHorizontalOffset); + const int position_x8 = Project(x8, projection_mv.mv[1], dst_sign); + if (position_x8 < x8_floor || position_x8 >= x8_ceiling) continue; + dst_mv[position_y8 * stride + position_x8] = mv[x8]; + dst_reference_offset[position_y8 * stride + position_x8] = + reference_offsets[source_reference_type]; + } + } + } + + source_reference_types += stride; + mv += stride; + dst_reference_offset += stride; + dst_mv += stride; + } while (++y8 < y8_end); +} + +} // namespace + +void MotionFieldProjectionInit_NEON() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_NEON; +} + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_ENABLE_NEON +namespace libgav1 { +namespace dsp { + +void MotionFieldProjectionInit_NEON() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_ENABLE_NEON diff --git a/src/dsp/arm/motion_field_projection_neon.h b/src/dsp/arm/motion_field_projection_neon.h new file mode 100644 index 0000000..41ab6a6 --- /dev/null +++ b/src/dsp/arm/motion_field_projection_neon.h @@ -0,0 +1,39 @@ +/* + * Copyright 2020 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_MOTION_FIELD_PROJECTION_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_MOTION_FIELD_PROJECTION_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::motion_field_projection_kernel. This function is not
+// thread-safe.
+void MotionFieldProjectionInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+
+#define LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel LIBGAV1_CPU_NEON
+
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_MOTION_FIELD_PROJECTION_NEON_H_
diff --git a/src/dsp/arm/motion_vector_search_neon.cc b/src/dsp/arm/motion_vector_search_neon.cc
new file mode 100644
index 0000000..4720879
--- /dev/null
+++ b/src/dsp/arm/motion_vector_search_neon.cc
@@ -0,0 +1,256 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_vector_search.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+inline int16x4_t MvProjection(const int16x4_t mv, const int16x4_t denominator,
+                              const int32x4_t numerator) {
+  const int32x4_t m0 = vmull_s16(mv, denominator);
+  const int32x4_t m = vmulq_s32(m0, numerator);
+  // Add the sign (0 or -1) to round towards zero.
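+  // Unlike the motion field projection variant above, |numerator| is a full
+  // vector here so compound blocks can scale each lane by its own reference
+  // offset.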
+  const int32x4_t add_sign = vsraq_n_s32(m, m, 31);
+  return vqrshrn_n_s32(add_sign, 14);
+}
+
+inline int16x4_t MvProjectionCompound(const int16x4_t mv,
+                                      const int temporal_reference_offsets,
+                                      const int reference_offsets[2]) {
+  const int16x4_t denominator =
+      vdup_n_s16(kProjectionMvDivisionLookup[temporal_reference_offsets]);
+  const int32x2_t offset = vld1_s32(reference_offsets);
+  const int32x2x2_t offsets = vzip_s32(offset, offset);
+  const int32x4_t numerator = vcombine_s32(offsets.val[0], offsets.val[1]);
+  return MvProjection(mv, denominator, numerator);
+}
+
+inline int16x8_t ProjectionClip(const int16x4_t mv0, const int16x4_t mv1) {
+  const int16x8_t projection_mv_clamp = vdupq_n_s16(kProjectionMvClamp);
+  const int16x8_t mv = vcombine_s16(mv0, mv1);
+  const int16x8_t clamp = vminq_s16(mv, projection_mv_clamp);
+  return vmaxq_s16(clamp, vnegq_s16(projection_mv_clamp));
+}
+
+inline int16x8_t MvProjectionCompoundClip(
+    const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets,
+    const int reference_offsets[2]) {
+  const auto* const tmvs = reinterpret_cast<const int32_t*>(temporal_mvs);
+  const int32x2_t temporal_mv = vld1_s32(tmvs);
+  const int16x4_t tmv0 = vreinterpret_s16_s32(vdup_lane_s32(temporal_mv, 0));
+  const int16x4_t tmv1 = vreinterpret_s16_s32(vdup_lane_s32(temporal_mv, 1));
+  const int16x4_t mv0 = MvProjectionCompound(
+      tmv0, temporal_reference_offsets[0], reference_offsets);
+  const int16x4_t mv1 = MvProjectionCompound(
+      tmv1, temporal_reference_offsets[1], reference_offsets);
+  return ProjectionClip(mv0, mv1);
+}
+
+inline int16x8_t MvProjectionSingleClip(
+    const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets,
+    const int reference_offset, int16x4_t* const lookup) {
+  const auto* const tmvs = reinterpret_cast<const int16_t*>(temporal_mvs);
+  const int16x8_t temporal_mv = vld1q_s16(tmvs);
+  *lookup = vld1_lane_s16(
+      &kProjectionMvDivisionLookup[temporal_reference_offsets[0]], *lookup, 0);
+  *lookup = vld1_lane_s16(
+      &kProjectionMvDivisionLookup[temporal_reference_offsets[1]], *lookup, 1);
+  *lookup = vld1_lane_s16(
+      &kProjectionMvDivisionLookup[temporal_reference_offsets[2]], *lookup, 2);
+  *lookup = vld1_lane_s16(
+      &kProjectionMvDivisionLookup[temporal_reference_offsets[3]], *lookup, 3);
+  const int16x4x2_t denominator = vzip_s16(*lookup, *lookup);
+  const int16x4_t tmv0 = vget_low_s16(temporal_mv);
+  const int16x4_t tmv1 = vget_high_s16(temporal_mv);
+  const int32x4_t numerator = vdupq_n_s32(reference_offset);
+  const int16x4_t mv0 = MvProjection(tmv0, denominator.val[0], numerator);
+  const int16x4_t mv1 = MvProjection(tmv1, denominator.val[1], numerator);
+  return ProjectionClip(mv0, mv1);
+}
+
+inline void LowPrecision(const int16x8_t mv, void* const candidate_mvs) {
+  const int16x8_t kRoundDownMask = vdupq_n_s16(1);
+  const uint16x8_t mvu = vreinterpretq_u16_s16(mv);
+  const int16x8_t mv0 = vreinterpretq_s16_u16(vsraq_n_u16(mvu, mvu, 15));
+  const int16x8_t mv1 = vbicq_s16(mv0, kRoundDownMask);
+  vst1q_s16(static_cast<int16_t*>(candidate_mvs), mv1);
+}
+
+inline void ForceInteger(const int16x8_t mv, void* const candidate_mvs) {
+  const int16x8_t kRoundDownMask = vdupq_n_s16(7);
+  const uint16x8_t mvu = vreinterpretq_u16_s16(mv);
+  const int16x8_t mv0 = vreinterpretq_s16_u16(vsraq_n_u16(mvu, mvu, 15));
+  const int16x8_t mv1 = vaddq_s16(mv0, vdupq_n_s16(3));
+  const int16x8_t mv2 = vbicq_s16(mv1, kRoundDownMask);
+  vst1q_s16(static_cast<int16_t*>(candidate_mvs), mv2);
+}
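+
+// Worked example of the two precision clamps above, with motion vectors in
+// 1/8-pel units: for mv = -5, vsraq_n_u16() adds the sign bit, giving -4, so
+// LowPrecision() clears bit 0 and stores -4, while ForceInteger() adds 3 and
+// clears the low three bits, storing -8 (a full pel). For mv = +4 (half a
+// pel), ForceInteger() computes (4 + 3) & ~7 = 0: halves round towards zero.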
+ +void MvProjectionCompoundLowPrecision_NEON( + const MotionVector* LIBGAV1_RESTRICT temporal_mvs, + const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets, + const int reference_offsets[2], const int count, + CompoundMotionVector* LIBGAV1_RESTRICT candidate_mvs) { + // |reference_offsets| non-zero check usually equals true and is ignored. + // To facilitate the compilers, make a local copy of |reference_offsets|. + const int offsets[2] = {reference_offsets[0], reference_offsets[1]}; + // One more element could be calculated. + int loop_count = (count + 1) >> 1; + do { + const int16x8_t mv = MvProjectionCompoundClip( + temporal_mvs, temporal_reference_offsets, offsets); + LowPrecision(mv, candidate_mvs); + temporal_mvs += 2; + temporal_reference_offsets += 2; + candidate_mvs += 2; + } while (--loop_count != 0); +} + +void MvProjectionCompoundForceInteger_NEON( + const MotionVector* LIBGAV1_RESTRICT temporal_mvs, + const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets, + const int reference_offsets[2], const int count, + CompoundMotionVector* LIBGAV1_RESTRICT candidate_mvs) { + // |reference_offsets| non-zero check usually equals true and is ignored. + // To facilitate the compilers, make a local copy of |reference_offsets|. + const int offsets[2] = {reference_offsets[0], reference_offsets[1]}; + // One more element could be calculated. + int loop_count = (count + 1) >> 1; + do { + const int16x8_t mv = MvProjectionCompoundClip( + temporal_mvs, temporal_reference_offsets, offsets); + ForceInteger(mv, candidate_mvs); + temporal_mvs += 2; + temporal_reference_offsets += 2; + candidate_mvs += 2; + } while (--loop_count != 0); +} + +void MvProjectionCompoundHighPrecision_NEON( + const MotionVector* LIBGAV1_RESTRICT temporal_mvs, + const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets, + const int reference_offsets[2], const int count, + CompoundMotionVector* LIBGAV1_RESTRICT candidate_mvs) { + // |reference_offsets| non-zero check usually equals true and is ignored. + // To facilitate the compilers, make a local copy of |reference_offsets|. + const int offsets[2] = {reference_offsets[0], reference_offsets[1]}; + // One more element could be calculated. + int loop_count = (count + 1) >> 1; + do { + const int16x8_t mv = MvProjectionCompoundClip( + temporal_mvs, temporal_reference_offsets, offsets); + vst1q_s16(reinterpret_cast(candidate_mvs), mv); + temporal_mvs += 2; + temporal_reference_offsets += 2; + candidate_mvs += 2; + } while (--loop_count != 0); +} + +void MvProjectionSingleLowPrecision_NEON( + const MotionVector* LIBGAV1_RESTRICT temporal_mvs, + const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets, + const int reference_offset, const int count, + MotionVector* LIBGAV1_RESTRICT candidate_mvs) { + // Up to three more elements could be calculated. + int loop_count = (count + 3) >> 2; + int16x4_t lookup = vdup_n_s16(0); + do { + const int16x8_t mv = MvProjectionSingleClip( + temporal_mvs, temporal_reference_offsets, reference_offset, &lookup); + LowPrecision(mv, candidate_mvs); + temporal_mvs += 4; + temporal_reference_offsets += 4; + candidate_mvs += 4; + } while (--loop_count != 0); +} + +void MvProjectionSingleForceInteger_NEON( + const MotionVector* LIBGAV1_RESTRICT temporal_mvs, + const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets, + const int reference_offset, const int count, + MotionVector* LIBGAV1_RESTRICT candidate_mvs) { + // Up to three more elements could be calculated. 
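+  // Rounding |count| up to a multiple of 4 is safe only because the caller's
+  // |candidate_mvs| buffer is assumed to have capacity for the extra lanes.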
+ int loop_count = (count + 3) >> 2; + int16x4_t lookup = vdup_n_s16(0); + do { + const int16x8_t mv = MvProjectionSingleClip( + temporal_mvs, temporal_reference_offsets, reference_offset, &lookup); + ForceInteger(mv, candidate_mvs); + temporal_mvs += 4; + temporal_reference_offsets += 4; + candidate_mvs += 4; + } while (--loop_count != 0); +} + +void MvProjectionSingleHighPrecision_NEON( + const MotionVector* LIBGAV1_RESTRICT temporal_mvs, + const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets, + const int reference_offset, const int count, + MotionVector* LIBGAV1_RESTRICT candidate_mvs) { + // Up to three more elements could be calculated. + int loop_count = (count + 3) >> 2; + int16x4_t lookup = vdup_n_s16(0); + do { + const int16x8_t mv = MvProjectionSingleClip( + temporal_mvs, temporal_reference_offsets, reference_offset, &lookup); + vst1q_s16(reinterpret_cast(candidate_mvs), mv); + temporal_mvs += 4; + temporal_reference_offsets += 4; + candidate_mvs += 4; + } while (--loop_count != 0); +} + +} // namespace + +void MotionVectorSearchInit_NEON() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_NEON; + dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_NEON; + dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_NEON; + dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_NEON; + dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_NEON; + dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_NEON; +} + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_ENABLE_NEON +namespace libgav1 { +namespace dsp { + +void MotionVectorSearchInit_NEON() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_ENABLE_NEON diff --git a/src/dsp/arm/motion_vector_search_neon.h b/src/dsp/arm/motion_vector_search_neon.h new file mode 100644 index 0000000..19b4519 --- /dev/null +++ b/src/dsp/arm/motion_vector_search_neon.h @@ -0,0 +1,39 @@ +/* + * Copyright 2020 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_ARM_MOTION_VECTOR_SEARCH_NEON_H_ +#define LIBGAV1_SRC_DSP_ARM_MOTION_VECTOR_SEARCH_NEON_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::mv_projection_compound and Dsp::mv_projection_single. This +// function is not thread-safe. 
+void MotionVectorSearchInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+
+#define LIBGAV1_Dsp8bpp_MotionVectorSearch LIBGAV1_CPU_NEON
+
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_MOTION_VECTOR_SEARCH_NEON_H_
diff --git a/src/dsp/arm/obmc_neon.cc b/src/dsp/arm/obmc_neon.cc
new file mode 100644
index 0000000..659ed8e
--- /dev/null
+++ b/src/dsp/arm/obmc_neon.cc
@@ -0,0 +1,940 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/obmc.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+#include "src/dsp/obmc.inc"
+
+}  // namespace
+
+namespace low_bitdepth {
+namespace {
+
+inline void WriteObmcLine4(uint8_t* LIBGAV1_RESTRICT const pred,
+                           const uint8_t* LIBGAV1_RESTRICT const obmc_pred,
+                           const uint8x8_t pred_mask,
+                           const uint8x8_t obmc_pred_mask) {
+  const uint8x8_t pred_val = Load4(pred);
+  const uint8x8_t obmc_pred_val = Load4(obmc_pred);
+  const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
+  const uint8x8_t result =
+      vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
+  StoreLo4(pred, result);
+}
+
+inline void OverlapBlendFromLeft2xH_NEON(
+    uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+    const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
+    const ptrdiff_t obmc_prediction_stride) {
+  const uint8x8_t mask_inverter = vdup_n_u8(64);
+  const uint8x8_t pred_mask = Load2(kObmcMask);
+  const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+  uint8x8_t pred_val = vdup_n_u8(0);
+  uint8x8_t obmc_pred_val = vdup_n_u8(0);
+  int y = 0;
+  do {
+    pred_val = Load2<0>(pred, pred_val);
+    const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
+    obmc_pred_val = Load2<0>(obmc_pred, obmc_pred_val);
+    const uint8x8_t result =
+        vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
+    Store2<0>(pred, result);
+
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+  } while (++y != height);
+}
+
+inline void OverlapBlendFromLeft4xH_NEON(
+    uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+    const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
+    const ptrdiff_t obmc_prediction_stride) {
+  const uint8x8_t mask_inverter = vdup_n_u8(64);
+  const uint8x8_t pred_mask = Load4(kObmcMask + 2);
+  // 64 - mask
+  const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+  int y = 0;
+  do {
+    WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+
+    WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+
+    y += 2;
+  } while (y != height);
+}
+
+inline
void OverlapBlendFromLeft8xH_NEON( + uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, + const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred, + const ptrdiff_t obmc_prediction_stride) { + const uint8x8_t mask_inverter = vdup_n_u8(64); + const uint8x8_t pred_mask = vld1_u8(kObmcMask + 6); + // 64 - mask + const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask); + int y = 0; + do { + const uint8x8_t pred_val = vld1_u8(pred); + const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val); + const uint8x8_t obmc_pred_val = vld1_u8(obmc_pred); + const uint8x8_t result = + vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6); + + vst1_u8(pred, result); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + } while (++y != height); +} + +void OverlapBlendFromLeft_NEON( + void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride, + const int width, const int height, + const void* LIBGAV1_RESTRICT const obmc_prediction, + const ptrdiff_t obmc_prediction_stride) { + auto* pred = static_cast(prediction); + const auto* obmc_pred = static_cast(obmc_prediction); + assert(width >= 2); + assert(height >= 4); + + if (width == 2) { + OverlapBlendFromLeft2xH_NEON(pred, prediction_stride, height, obmc_pred, + obmc_prediction_stride); + return; + } + if (width == 4) { + OverlapBlendFromLeft4xH_NEON(pred, prediction_stride, height, obmc_pred, + obmc_prediction_stride); + return; + } + if (width == 8) { + OverlapBlendFromLeft8xH_NEON(pred, prediction_stride, height, obmc_pred, + obmc_prediction_stride); + return; + } + const uint8x16_t mask_inverter = vdupq_n_u8(64); + const uint8_t* mask = kObmcMask + width - 2; + int x = 0; + do { + pred = static_cast(prediction) + x; + obmc_pred = static_cast(obmc_prediction) + x; + const uint8x16_t pred_mask = vld1q_u8(mask + x); + // 64 - mask + const uint8x16_t obmc_pred_mask = vsubq_u8(mask_inverter, pred_mask); + int y = 0; + do { + const uint8x16_t pred_val = vld1q_u8(pred); + const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred); + const uint16x8_t weighted_pred_lo = + vmull_u8(vget_low_u8(pred_mask), vget_low_u8(pred_val)); + const uint8x8_t result_lo = + vrshrn_n_u16(vmlal_u8(weighted_pred_lo, vget_low_u8(obmc_pred_mask), + vget_low_u8(obmc_pred_val)), + 6); + const uint16x8_t weighted_pred_hi = + vmull_u8(vget_high_u8(pred_mask), vget_high_u8(pred_val)); + const uint8x8_t result_hi = + vrshrn_n_u16(vmlal_u8(weighted_pred_hi, vget_high_u8(obmc_pred_mask), + vget_high_u8(obmc_pred_val)), + 6); + vst1q_u8(pred, vcombine_u8(result_lo, result_hi)); + + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + } while (++y < height); + x += 16; + } while (x < width); +} + +inline void OverlapBlendFromTop4x4_NEON( + uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, + const uint8_t* LIBGAV1_RESTRICT obmc_pred, + const ptrdiff_t obmc_prediction_stride, const int height) { + uint8x8_t pred_mask = vdup_n_u8(kObmcMask[height - 2]); + const uint8x8_t mask_inverter = vdup_n_u8(64); + uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask); + WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + if (height == 2) { + return; + } + + pred_mask = vdup_n_u8(kObmcMask[3]); + obmc_pred_mask = vsub_u8(mask_inverter, pred_mask); + WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + pred_mask = 
vdup_n_u8(kObmcMask[4]); + obmc_pred_mask = vsub_u8(mask_inverter, pred_mask); + WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask); +} + +inline void OverlapBlendFromTop4xH_NEON( + uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, + const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred, + const ptrdiff_t obmc_prediction_stride) { + if (height < 8) { + OverlapBlendFromTop4x4_NEON(pred, prediction_stride, obmc_pred, + obmc_prediction_stride, height); + return; + } + const uint8_t* mask = kObmcMask + height - 2; + const uint8x8_t mask_inverter = vdup_n_u8(64); + int y = 0; + // Compute 6 lines for height 8, or 12 lines for height 16. The remaining + // lines are unchanged as the corresponding mask value is 64. + do { + uint8x8_t pred_mask = vdup_n_u8(mask[y]); + uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask); + WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + pred_mask = vdup_n_u8(mask[y + 1]); + obmc_pred_mask = vsub_u8(mask_inverter, pred_mask); + WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + pred_mask = vdup_n_u8(mask[y + 2]); + obmc_pred_mask = vsub_u8(mask_inverter, pred_mask); + WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + pred_mask = vdup_n_u8(mask[y + 3]); + obmc_pred_mask = vsub_u8(mask_inverter, pred_mask); + WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + pred_mask = vdup_n_u8(mask[y + 4]); + obmc_pred_mask = vsub_u8(mask_inverter, pred_mask); + WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + pred_mask = vdup_n_u8(mask[y + 5]); + obmc_pred_mask = vsub_u8(mask_inverter, pred_mask); + WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + + // Increment for the right mask index. 
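+    // Six rows per pass lines up with the counts noted above: one pass
+    // blends the 6 rows needed for height 8, two passes the 12 rows for
+    // height 16.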
+ y += 6; + } while (y < height - 4); +} + +inline void OverlapBlendFromTop8xH_NEON( + uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride, + const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred, + const ptrdiff_t obmc_prediction_stride) { + const uint8x8_t mask_inverter = vdup_n_u8(64); + const uint8_t* mask = kObmcMask + height - 2; + const int compute_height = height - (height >> 2); + int y = 0; + do { + const uint8x8_t pred_mask = vdup_n_u8(mask[y]); + // 64 - mask + const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask); + const uint8x8_t pred_val = vld1_u8(pred); + const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val); + const uint8x8_t obmc_pred_val = vld1_u8(obmc_pred); + const uint8x8_t result = + vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6); + + vst1_u8(pred, result); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + } while (++y != compute_height); +} + +void OverlapBlendFromTop_NEON( + void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride, + const int width, const int height, + const void* LIBGAV1_RESTRICT const obmc_prediction, + const ptrdiff_t obmc_prediction_stride) { + auto* pred = static_cast(prediction); + const auto* obmc_pred = static_cast(obmc_prediction); + assert(width >= 4); + assert(height >= 2); + + if (width == 4) { + OverlapBlendFromTop4xH_NEON(pred, prediction_stride, height, obmc_pred, + obmc_prediction_stride); + return; + } + + if (width == 8) { + OverlapBlendFromTop8xH_NEON(pred, prediction_stride, height, obmc_pred, + obmc_prediction_stride); + return; + } + + const uint8_t* mask = kObmcMask + height - 2; + const uint8x8_t mask_inverter = vdup_n_u8(64); + // Stop when mask value becomes 64. This is inferred for 4xH. + const int compute_height = height - (height >> 2); + int y = 0; + do { + const uint8x8_t pred_mask = vdup_n_u8(mask[y]); + // 64 - mask + const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask); + int x = 0; + do { + const uint8x16_t pred_val = vld1q_u8(pred + x); + const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred + x); + const uint16x8_t weighted_pred_lo = + vmull_u8(pred_mask, vget_low_u8(pred_val)); + const uint8x8_t result_lo = + vrshrn_n_u16(vmlal_u8(weighted_pred_lo, obmc_pred_mask, + vget_low_u8(obmc_pred_val)), + 6); + const uint16x8_t weighted_pred_hi = + vmull_u8(pred_mask, vget_high_u8(pred_val)); + const uint8x8_t result_hi = + vrshrn_n_u16(vmlal_u8(weighted_pred_hi, obmc_pred_mask, + vget_high_u8(obmc_pred_val)), + 6); + vst1q_u8(pred + x, vcombine_u8(result_lo, result_hi)); + + x += 16; + } while (x < width); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + } while (++y < compute_height); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop_NEON; + dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft_NEON; +} + +} // namespace +} // namespace low_bitdepth + +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +// This is a flat array of masks for each block dimension from 2 to 32. The +// starting index for each length is length-2. The value 64 leaves the result +// equal to |pred| and may be ignored if convenient. Vector loads may overrread +// values meant for larger sizes, but these values will be unused. 
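+// For example, a blend of length 4 reads kObmcMask + 2, i.e. {39, 50, 59,
+// 64}, and the final 64 leaves the corresponding pixels unchanged.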
+
+// This is a flat array of masks for each block dimension from 2 to 32. The
+// starting index for each length is length-2. The value 64 leaves the result
+// equal to |pred| and may be ignored if convenient. Vector loads may overread
+// values meant for larger sizes, but these values will be unused.
+constexpr uint16_t kObmcMask[62] = {
+    // Obmc Mask 2
+    45, 64,
+    // Obmc Mask 4
+    39, 50, 59, 64,
+    // Obmc Mask 8
+    36, 42, 48, 53, 57, 61, 64, 64,
+    // Obmc Mask 16
+    34, 37, 40, 43, 46, 49, 52, 54, 56, 58, 60, 61, 64, 64, 64, 64,
+    // Obmc Mask 32
+    33, 35, 36, 38, 40, 41, 43, 44, 45, 47, 48, 50, 51, 52, 53, 55, 56, 57, 58,
+    59, 60, 60, 61, 62, 64, 64, 64, 64, 64, 64, 64, 64};
+
+inline uint16x4_t BlendObmc2Or4(uint8_t* LIBGAV1_RESTRICT const pred,
+                                const uint8_t* LIBGAV1_RESTRICT const obmc_pred,
+                                const uint16x4_t pred_mask,
+                                const uint16x4_t obmc_pred_mask) {
+  const uint16x4_t pred_val = vld1_u16(reinterpret_cast<uint16_t*>(pred));
+  const uint16x4_t obmc_pred_val =
+      vld1_u16(reinterpret_cast<const uint16_t*>(obmc_pred));
+  const uint16x4_t weighted_pred = vmul_u16(pred_mask, pred_val);
+  const uint16x4_t result =
+      vrshr_n_u16(vmla_u16(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
+  return result;
+}
+
+inline uint16x8_t BlendObmc8(uint8_t* LIBGAV1_RESTRICT const pred,
+                             const uint8_t* LIBGAV1_RESTRICT const obmc_pred,
+                             const uint16x8_t pred_mask,
+                             const uint16x8_t obmc_pred_mask) {
+  const uint16x8_t pred_val = vld1q_u16(reinterpret_cast<uint16_t*>(pred));
+  const uint16x8_t obmc_pred_val =
+      vld1q_u16(reinterpret_cast<const uint16_t*>(obmc_pred));
+  const uint16x8_t weighted_pred = vmulq_u16(pred_mask, pred_val);
+  const uint16x8_t result =
+      vrshrq_n_u16(vmlaq_u16(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
+  return result;
+}
+
+inline void OverlapBlendFromLeft2xH_NEON(
+    uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+    const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
+    const ptrdiff_t obmc_prediction_stride) {
+  const uint16x4_t mask_inverter = vdup_n_u16(64);
+  // Second two lanes unused.
+  const uint16x4_t pred_mask = vld1_u16(kObmcMask);
+  const uint16x4_t obmc_pred_mask = vsub_u16(mask_inverter, pred_mask);
+  int y = 0;
+  do {
+    const uint16x4_t result_0 =
+        BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+    Store2<0>(reinterpret_cast<uint16_t*>(pred), result_0);
+
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+
+    const uint16x4_t result_1 =
+        BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+    Store2<0>(reinterpret_cast<uint16_t*>(pred), result_1);
+
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+
+    y += 2;
+  } while (y != height);
+}
+
+inline void OverlapBlendFromLeft4xH_NEON(
+    uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+    const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
+    const ptrdiff_t obmc_prediction_stride) {
+  const uint16x4_t mask_inverter = vdup_n_u16(64);
+  const uint16x4_t pred_mask = vld1_u16(kObmcMask + 2);
+  // 64 - mask
+  const uint16x4_t obmc_pred_mask = vsub_u16(mask_inverter, pred_mask);
+  int y = 0;
+  do {
+    const uint16x4_t result_0 =
+        BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+    vst1_u16(reinterpret_cast<uint16_t*>(pred), result_0);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+
+    const uint16x4_t result_1 =
+        BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+    vst1_u16(reinterpret_cast<uint16_t*>(pred), result_1);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+
+    y += 2;
+  } while (y != height);
+}
+
+void OverlapBlendFromLeft_NEON(
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
+    const int width, const int height,
+    const void* LIBGAV1_RESTRICT const obmc_prediction,
+    const ptrdiff_t obmc_prediction_stride) {
+  auto* pred = static_cast<uint8_t*>(prediction);
+  const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
+  assert(width >= 2);
+  assert(height >= 4);
+
+  if (width == 2) {
+    OverlapBlendFromLeft2xH_NEON(pred, prediction_stride, height, obmc_pred,
+                                 obmc_prediction_stride);
+    return;
+  }
+  if (width == 4) {
+    OverlapBlendFromLeft4xH_NEON(pred, prediction_stride, height, obmc_pred,
+                                 obmc_prediction_stride);
+    return;
+  }
+  const uint16x8_t mask_inverter = vdupq_n_u16(64);
+  const uint16_t* mask = kObmcMask + width - 2;
+  int x = 0;
+  do {
+    pred = reinterpret_cast<uint8_t*>(static_cast<uint16_t*>(prediction) + x);
+    obmc_pred = reinterpret_cast<const uint8_t*>(
+        static_cast<const uint16_t*>(obmc_prediction) + x);
+    const uint16x8_t pred_mask = vld1q_u16(mask + x);
+    // 64 - mask
+    const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
+    int y = 0;
+    do {
+      const uint16x8_t result =
+          BlendObmc8(pred, obmc_pred, pred_mask, obmc_pred_mask);
+      vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+
+      pred += prediction_stride;
+      obmc_pred += obmc_prediction_stride;
+    } while (++y < height);
+    x += 8;
+  } while (x < width);
+}
+
+template <int lane>
+inline uint16x4_t BlendObmcFromTop4(
+    uint8_t* LIBGAV1_RESTRICT const pred,
+    const uint8_t* LIBGAV1_RESTRICT const obmc_pred, const uint16x8_t pred_mask,
+    const uint16x8_t obmc_pred_mask) {
+  const uint16x4_t pred_val = vld1_u16(reinterpret_cast<uint16_t*>(pred));
+  const uint16x4_t obmc_pred_val =
+      vld1_u16(reinterpret_cast<const uint16_t*>(obmc_pred));
+  const uint16x4_t weighted_pred = VMulLaneQU16<lane>(pred_val, pred_mask);
+  const uint16x4_t result = vrshr_n_u16(
+      VMlaLaneQU16<lane>(weighted_pred, obmc_pred_val, obmc_pred_mask), 6);
+  return result;
+}
+
+template <int lane>
+inline uint16x8_t BlendObmcFromTop8(
+    uint8_t* LIBGAV1_RESTRICT const pred,
+    const uint8_t* LIBGAV1_RESTRICT const obmc_pred, const uint16x8_t pred_mask,
+    const uint16x8_t obmc_pred_mask) {
+  const uint16x8_t pred_val = vld1q_u16(reinterpret_cast<uint16_t*>(pred));
+  const uint16x8_t obmc_pred_val =
+      vld1q_u16(reinterpret_cast<const uint16_t*>(obmc_pred));
+  const uint16x8_t weighted_pred = VMulQLaneQU16<lane>(pred_val, pred_mask);
+  const uint16x8_t result = vrshrq_n_u16(
+      VMlaQLaneQU16<lane>(weighted_pred, obmc_pred_val, obmc_pred_mask), 6);
+  return result;
+}
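+
+// Editorial note, not upstream code: in the from-top blends below every
+// pixel of row |y| shares one weight, so the lane-templated helpers above
+// broadcast lane |y| of the mask vector instead of reloading it. Scalar view
+// of one row:
+//   dst[x] = (mask[y] * pred[x] + (64 - mask[y]) * obmc_pred[x] + 32) >> 6;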
+
+inline void OverlapBlendFromTop4x2Or4_NEON(
+    uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+    const uint8_t* LIBGAV1_RESTRICT obmc_pred,
+    const ptrdiff_t obmc_prediction_stride, const int height) {
+  const uint16x8_t pred_mask = vld1q_u16(&kObmcMask[height - 2]);
+  const uint16x8_t mask_inverter = vdupq_n_u16(64);
+  const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
+  uint16x4_t result =
+      BlendObmcFromTop4<0>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
+  pred += prediction_stride;
+  obmc_pred += obmc_prediction_stride;
+
+  if (height == 2) {
+    // Mask value is 64, meaning |pred| is unchanged.
+    return;
+  }
+
+  result = BlendObmcFromTop4<1>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
+  pred += prediction_stride;
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop4<2>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
+}
+
+inline void OverlapBlendFromTop4xH_NEON(
+    uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+    const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
+    const ptrdiff_t obmc_prediction_stride) {
+  if (height < 8) {
+    OverlapBlendFromTop4x2Or4_NEON(pred, prediction_stride, obmc_pred,
+                                   obmc_prediction_stride, height);
+    return;
+  }
+  const uint16_t* mask = kObmcMask + height - 2;
+  const uint16x8_t mask_inverter = vdupq_n_u16(64);
+  int y = 0;
+  // Compute 6 lines for height 8, or 12 lines for height 16. The remaining
+  // lines are unchanged as the corresponding mask value is 64.
+  do {
+    const uint16x8_t pred_mask = vld1q_u16(&mask[y]);
+    const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
+    uint16x4_t result =
+        BlendObmcFromTop4<0>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+    vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+
+    result = BlendObmcFromTop4<1>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+    vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+
+    result = BlendObmcFromTop4<2>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+    vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+
+    result = BlendObmcFromTop4<3>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+    vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+
+    result = BlendObmcFromTop4<4>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+    vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+
+    result = BlendObmcFromTop4<5>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+    vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+
+    // Increment for the right mask index.
+    y += 6;
+  } while (y < height - 4);
+}
+
+inline void OverlapBlendFromTop8xH_NEON(
+    uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+    const uint8_t* LIBGAV1_RESTRICT obmc_pred,
+    const ptrdiff_t obmc_prediction_stride, const int height) {
+  const uint16_t* mask = kObmcMask + height - 2;
+  const uint16x8_t mask_inverter = vdupq_n_u16(64);
+  uint16x8_t pred_mask = vld1q_u16(mask);
+  uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
+  uint16x8_t result =
+      BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+  if (height == 2) return;
+
+  pred += prediction_stride;
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+  pred += prediction_stride;
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+  pred += prediction_stride;
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+  if (height == 4) return;
+
+  pred += prediction_stride;
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+  pred += prediction_stride;
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+
+  if (height == 8) return;
+
+  pred += prediction_stride;
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+  pred += prediction_stride;
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+  pred += prediction_stride;
+  obmc_pred += obmc_prediction_stride;
+
+  pred_mask = vld1q_u16(&mask[8]);
+  obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
+
+  result = BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+  pred += prediction_stride;
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+  pred += prediction_stride;
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+  pred += prediction_stride;
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+
+  if (height == 16) return;
+
+  pred += prediction_stride;
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+  pred += prediction_stride;
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+  pred += prediction_stride;
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+  pred += prediction_stride;
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+  pred += prediction_stride;
+  obmc_pred += obmc_prediction_stride;
+
+  pred_mask = vld1q_u16(&mask[16]);
+  obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
+
+  result = BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+  pred += prediction_stride;
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+  pred += prediction_stride;
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+  pred += prediction_stride;
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+  pred += prediction_stride;
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+  pred += prediction_stride;
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+  pred += prediction_stride;
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+  pred += prediction_stride;
+  obmc_pred += obmc_prediction_stride;
+
+  result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+}
+
+void OverlapBlendFromTop_NEON(
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
+    const int width, const int height,
+    const void* LIBGAV1_RESTRICT const obmc_prediction,
+    const ptrdiff_t obmc_prediction_stride) {
+  auto* pred = static_cast<uint8_t*>(prediction);
+  const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
+  assert(width >= 4);
+  assert(height >= 2);
+
+  if (width == 4) {
+    OverlapBlendFromTop4xH_NEON(pred, prediction_stride, height, obmc_pred,
+                                obmc_prediction_stride);
+    return;
+  }
+
+  if (width == 8) {
+    OverlapBlendFromTop8xH_NEON(pred, prediction_stride, obmc_pred,
+                                obmc_prediction_stride, height);
+    return;
+  }
+
+  const uint16_t* mask = kObmcMask + height - 2;
+  const uint16x8_t mask_inverter = vdupq_n_u16(64);
+  const uint16x8_t pred_mask = vld1q_u16(mask);
+  // 64 - mask
+  const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
+#define OBMC_ROW_FROM_TOP(n)                                              \
+  do {                                                                    \
+    int x = 0;                                                            \
+    do {                                                                  \
+      const uint16x8_t result = BlendObmcFromTop8<n>(                     \
+          reinterpret_cast<uint8_t*>(reinterpret_cast<uint16_t*>(pred) +  \
+                                     x),                                  \
+          reinterpret_cast<const uint8_t*>(                               \
+              reinterpret_cast<const uint16_t*>(obmc_pred) + x),          \
+          pred_mask, obmc_pred_mask);                                     \
+      vst1q_u16(reinterpret_cast<uint16_t*>(pred) + x, result);           \
+                                                                          \
+      x += 8;                                                             \
+    } while (x < width);                                                  \
+  } while (false)
+
+  // Compute 1 row.
+  if (height == 2) {
+    OBMC_ROW_FROM_TOP(0);
+    return;
+  }
+
+  // Compute 3 rows.
+  if (height == 4) {
+    OBMC_ROW_FROM_TOP(0);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+    OBMC_ROW_FROM_TOP(1);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+    OBMC_ROW_FROM_TOP(2);
+    return;
+  }
+
+  // Compute 6 rows.
+  if (height == 8) {
+    OBMC_ROW_FROM_TOP(0);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+    OBMC_ROW_FROM_TOP(1);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+    OBMC_ROW_FROM_TOP(2);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+    OBMC_ROW_FROM_TOP(3);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+    OBMC_ROW_FROM_TOP(4);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+    OBMC_ROW_FROM_TOP(5);
+    return;
+  }
+
+  // Compute 12 rows.
+  if (height == 16) {
+    OBMC_ROW_FROM_TOP(0);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+    OBMC_ROW_FROM_TOP(1);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+    OBMC_ROW_FROM_TOP(2);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+    OBMC_ROW_FROM_TOP(3);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+    OBMC_ROW_FROM_TOP(4);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+    OBMC_ROW_FROM_TOP(5);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+    OBMC_ROW_FROM_TOP(6);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+    OBMC_ROW_FROM_TOP(7);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+
+    const uint16x8_t pred_mask = vld1q_u16(&mask[8]);
+    // 64 - mask
+    const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
+    OBMC_ROW_FROM_TOP(0);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+    OBMC_ROW_FROM_TOP(1);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+    OBMC_ROW_FROM_TOP(2);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+    OBMC_ROW_FROM_TOP(3);
+    return;
+  }
+
+  // Stop when mask value becomes 64. This is a multiple of 8 for height 32
+  // and 64.
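+  // Editorial check, not upstream: compute_height = height - height / 4
+  // gives 24 rows for height 32 and 48 rows for height 64, both multiples
+  // of 8, so the 8-row unrolled loop below covers the blended region
+  // exactly.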
+  const int compute_height = height - (height >> 2);
+  int y = 0;
+  do {
+    const uint16x8_t pred_mask = vld1q_u16(&mask[y]);
+    // 64 - mask
+    const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
+    OBMC_ROW_FROM_TOP(0);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+    OBMC_ROW_FROM_TOP(1);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+    OBMC_ROW_FROM_TOP(2);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+    OBMC_ROW_FROM_TOP(3);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+    OBMC_ROW_FROM_TOP(4);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+    OBMC_ROW_FROM_TOP(5);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+    OBMC_ROW_FROM_TOP(6);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+    OBMC_ROW_FROM_TOP(7);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+
+    y += 8;
+  } while (y < compute_height);
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop_NEON;
+  dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft_NEON;
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void ObmcInit_NEON() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else  // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void ObmcInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/obmc_neon.h b/src/dsp/arm/obmc_neon.h
new file mode 100644
index 0000000..788017e
--- /dev/null
+++ b/src/dsp/arm/obmc_neon.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_OBMC_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_OBMC_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::obmc_blend. This function is not thread-safe.
+void ObmcInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+// If NEON is enabled, signal the NEON implementation should be used.
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_ObmcVertical LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ObmcHorizontal LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_ObmcVertical LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_ObmcHorizontal LIBGAV1_CPU_NEON
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_OBMC_NEON_H_
diff --git a/src/dsp/arm/super_res_neon.cc b/src/dsp/arm/super_res_neon.cc
new file mode 100644
index 0000000..2f8dde6
--- /dev/null
+++ b/src/dsp/arm/super_res_neon.cc
@@ -0,0 +1,318 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/super_res.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+
+namespace low_bitdepth {
+namespace {
+
+void SuperResCoefficients_NEON(const int upscaled_width,
+                               const int initial_subpixel_x, const int step,
+                               void* const coefficients) {
+  auto* dst = static_cast<uint8_t*>(coefficients);
+  int subpixel_x = initial_subpixel_x;
+  int x = RightShiftWithCeiling(upscaled_width, 3);
+  do {
+    uint8x8_t filter[8];
+    uint8x16_t d[kSuperResFilterTaps / 2];
+    for (int i = 0; i < 8; ++i, subpixel_x += step) {
+      filter[i] =
+          vld1_u8(kUpscaleFilterUnsigned[(subpixel_x & kSuperResScaleMask) >>
+                                         kSuperResExtraBits]);
+    }
+    Transpose8x8(filter, d);
+    vst1q_u8(dst, d[0]);
+    dst += 16;
+    vst1q_u8(dst, d[1]);
+    dst += 16;
+    vst1q_u8(dst, d[2]);
+    dst += 16;
+    vst1q_u8(dst, d[3]);
+    dst += 16;
+  } while (--x != 0);
+}
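+
+// Editorial reference sketch, not part of upstream libgav1: scalar form of
+// SuperRes() below. Taps 0, 2, 5 and 7 are negative, so the positive and
+// negative halves are accumulated separately and combined with a clamped
+// subtract, loosely mirroring vqsubq_u16 followed by vqrshrn_n_u16.
+inline uint8_t SuperResPixelSketch(const uint8_t s[8], const uint8_t f[8]) {
+  const uint32_t positive = s[1] * f[1] + s[3] * f[3] + s[4] * f[4] + s[6] * f[6];
+  const uint32_t negative = s[0] * f[0] + s[2] * f[2] + s[5] * f[5] + s[7] * f[7];
+  const uint32_t difference = (positive > negative) ? positive - negative : 0;
+  const uint32_t rounded = (difference + (1 << (kFilterBits - 1))) >> kFilterBits;
+  return static_cast<uint8_t>((rounded > 255) ? 255 : rounded);
+}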
+
+// Maximum sum of positive taps: 171 = 7 + 86 + 71 + 7
+// Maximum sum: 255*171 == 0xAA55
+// The sum is clipped to [0, 255], so adding all positive and then
+// subtracting all negative with saturation is sufficient.
+//           0 1 2 3 4 5 6 7
+// tap sign: - + - + + - + -
+inline uint8x8_t SuperRes(const uint8x8_t src[kSuperResFilterTaps],
+                          const uint8_t** coefficients) {
+  uint8x16_t f[kSuperResFilterTaps / 2];
+  for (int i = 0; i < kSuperResFilterTaps / 2; ++i, *coefficients += 16) {
+    f[i] = vld1q_u8(*coefficients);
+  }
+  uint16x8_t res = vmull_u8(src[1], vget_high_u8(f[0]));
+  res = vmlal_u8(res, src[3], vget_high_u8(f[1]));
+  res = vmlal_u8(res, src[4], vget_low_u8(f[2]));
+  res = vmlal_u8(res, src[6], vget_low_u8(f[3]));
+  uint16x8_t temp = vmull_u8(src[0], vget_low_u8(f[0]));
+  temp = vmlal_u8(temp, src[2], vget_low_u8(f[1]));
+  temp = vmlal_u8(temp, src[5], vget_high_u8(f[2]));
+  temp = vmlal_u8(temp, src[7], vget_high_u8(f[3]));
+  res = vqsubq_u16(res, temp);
+  return vqrshrn_n_u16(res, kFilterBits);
+}
+
+void SuperRes_NEON(const void* LIBGAV1_RESTRICT const coefficients,
+                   void* LIBGAV1_RESTRICT const source,
+                   const ptrdiff_t source_stride, const int height,
+                   const int downscaled_width, const int upscaled_width,
+                   const int initial_subpixel_x, const int step,
+                   void* LIBGAV1_RESTRICT const dest,
+                   const ptrdiff_t dest_stride) {
+  auto* src = static_cast<uint8_t*>(source) - DivideBy2(kSuperResFilterTaps);
+  auto* dst = static_cast<uint8_t*>(dest);
+  int y = height;
+  do {
+    const auto* filter = static_cast<const uint8_t*>(coefficients);
+    uint8_t* dst_ptr = dst;
+#if LIBGAV1_MSAN
+    // Initialize the padding area to prevent msan warnings.
+    const int super_res_right_border = kSuperResHorizontalPadding;
+#else
+    const int super_res_right_border = kSuperResHorizontalBorder;
+#endif
+    ExtendLine<uint8_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+                        kSuperResHorizontalBorder, super_res_right_border);
+    int subpixel_x = initial_subpixel_x;
+    uint8x8_t sr[8];
+    uint8x16_t s[8];
+    int x = RightShiftWithCeiling(upscaled_width, 4);
+    // The below code calculates up to 15 extra upscaled
+    // pixels which will over-read up to 15 downscaled pixels at the end of
+    // each row. kSuperResHorizontalPadding accounts for this.
+    do {
+      for (int i = 0; i < 8; ++i, subpixel_x += step) {
+        sr[i] = vld1_u8(&src[subpixel_x >> kSuperResScaleBits]);
+      }
+      for (int i = 0; i < 8; ++i, subpixel_x += step) {
+        const uint8x8_t s_hi = vld1_u8(&src[subpixel_x >> kSuperResScaleBits]);
+        s[i] = vcombine_u8(sr[i], s_hi);
+      }
+      Transpose8x16(s);
+      // Do not use loop for the following 8 instructions, since the compiler
+      // will generate redundant code.
+      sr[0] = vget_low_u8(s[0]);
+      sr[1] = vget_low_u8(s[1]);
+      sr[2] = vget_low_u8(s[2]);
+      sr[3] = vget_low_u8(s[3]);
+      sr[4] = vget_low_u8(s[4]);
+      sr[5] = vget_low_u8(s[5]);
+      sr[6] = vget_low_u8(s[6]);
+      sr[7] = vget_low_u8(s[7]);
+      const uint8x8_t d0 = SuperRes(sr, &filter);
+      // Do not use loop for the following 8 instructions, since the compiler
+      // will generate redundant code.
+      sr[0] = vget_high_u8(s[0]);
+      sr[1] = vget_high_u8(s[1]);
+      sr[2] = vget_high_u8(s[2]);
+      sr[3] = vget_high_u8(s[3]);
+      sr[4] = vget_high_u8(s[4]);
+      sr[5] = vget_high_u8(s[5]);
+      sr[6] = vget_high_u8(s[6]);
+      sr[7] = vget_high_u8(s[7]);
+      const uint8x8_t d1 = SuperRes(sr, &filter);
+      vst1q_u8(dst_ptr, vcombine_u8(d0, d1));
+      dst_ptr += 16;
+    } while (--x != 0);
+    src += source_stride;
+    dst += dest_stride;
+  } while (--y != 0);
+}
+
+void Init8bpp() {
+  Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  dsp->super_res_coefficients = SuperResCoefficients_NEON;
+  dsp->super_res = SuperRes_NEON;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+void SuperResCoefficients_NEON(const int upscaled_width,
+                               const int initial_subpixel_x, const int step,
+                               void* const coefficients) {
+  auto* dst = static_cast<uint16_t*>(coefficients);
+  int subpixel_x = initial_subpixel_x;
+  int x = RightShiftWithCeiling(upscaled_width, 3);
+  do {
+    uint16x8_t filter[8];
+    for (int i = 0; i < 8; ++i, subpixel_x += step) {
+      const uint8x8_t filter_8 =
+          vld1_u8(kUpscaleFilterUnsigned[(subpixel_x & kSuperResScaleMask) >>
+                                         kSuperResExtraBits]);
+      // uint8_t -> uint16_t
+      filter[i] = vmovl_u8(filter_8);
+    }
+
+    Transpose8x8(filter);
+
+    vst1q_u16(dst, filter[0]);
+    dst += 8;
+    vst1q_u16(dst, filter[1]);
+    dst += 8;
+    vst1q_u16(dst, filter[2]);
+    dst += 8;
+    vst1q_u16(dst, filter[3]);
+    dst += 8;
+    vst1q_u16(dst, filter[4]);
+    dst += 8;
+    vst1q_u16(dst, filter[5]);
+    dst += 8;
+    vst1q_u16(dst, filter[6]);
+    dst += 8;
+    vst1q_u16(dst, filter[7]);
+    dst += 8;
+  } while (--x != 0);
+}
+
+// The sum is clipped to [0, ((1 << bitdepth) - 1)]. Adding all positive and
+// then subtracting all negative with saturation will clip to zero.
+//           0 1 2 3 4 5 6 7
+// tap sign: - + - + + - + -
+inline uint16x8_t SuperRes(const uint16x8_t src[kSuperResFilterTaps],
+                           const uint16_t** coefficients, int bitdepth) {
+  uint16x8_t f[kSuperResFilterTaps];
+  for (int i = 0; i < kSuperResFilterTaps; ++i, *coefficients += 8) {
+    f[i] = vld1q_u16(*coefficients);
+  }
+
+  uint32x4_t res_lo = vmull_u16(vget_low_u16(src[1]), vget_low_u16(f[1]));
+  res_lo = vmlal_u16(res_lo, vget_low_u16(src[3]), vget_low_u16(f[3]));
+  res_lo = vmlal_u16(res_lo, vget_low_u16(src[4]), vget_low_u16(f[4]));
+  res_lo = vmlal_u16(res_lo, vget_low_u16(src[6]), vget_low_u16(f[6]));
+
+  uint32x4_t temp_lo = vmull_u16(vget_low_u16(src[0]), vget_low_u16(f[0]));
+  temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[2]), vget_low_u16(f[2]));
+  temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[5]), vget_low_u16(f[5]));
+  temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[7]), vget_low_u16(f[7]));
+
+  res_lo = vqsubq_u32(res_lo, temp_lo);
+
+  uint32x4_t res_hi = vmull_u16(vget_high_u16(src[1]), vget_high_u16(f[1]));
+  res_hi = vmlal_u16(res_hi, vget_high_u16(src[3]), vget_high_u16(f[3]));
+  res_hi = vmlal_u16(res_hi, vget_high_u16(src[4]), vget_high_u16(f[4]));
+  res_hi = vmlal_u16(res_hi, vget_high_u16(src[6]), vget_high_u16(f[6]));
+
+  uint32x4_t temp_hi = vmull_u16(vget_high_u16(src[0]), vget_high_u16(f[0]));
+  temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[2]), vget_high_u16(f[2]));
+  temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[5]), vget_high_u16(f[5]));
+  temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[7]), vget_high_u16(f[7]));
+
+  res_hi = vqsubq_u32(res_hi, temp_hi);
+
+  const uint16x8_t res = vcombine_u16(vqrshrn_n_u32(res_lo, kFilterBits),
+                                      vqrshrn_n_u32(res_hi, kFilterBits));
+
+  // Clip the result at (1 << bd) - 1.
+  return vminq_u16(res, vdupq_n_u16((1 << bitdepth) - 1));
+}
+
+template <int bitdepth>
+void SuperRes_NEON(const void* LIBGAV1_RESTRICT const coefficients,
+                   void* LIBGAV1_RESTRICT const source,
+                   const ptrdiff_t source_stride, const int height,
+                   const int downscaled_width, const int upscaled_width,
+                   const int initial_subpixel_x, const int step,
+                   void* LIBGAV1_RESTRICT const dest,
+                   const ptrdiff_t dest_stride) {
+  auto* src = static_cast<uint16_t*>(source) - DivideBy2(kSuperResFilterTaps);
+  auto* dst = static_cast<uint16_t*>(dest);
+  int y = height;
+  do {
+    const auto* filter = static_cast<const uint16_t*>(coefficients);
+    uint16_t* dst_ptr = dst;
+#if LIBGAV1_MSAN
+    // Initialize the padding area to prevent msan warnings.
+    const int super_res_right_border = kSuperResHorizontalPadding;
+#else
+    const int super_res_right_border = kSuperResHorizontalBorder;
+#endif
+    ExtendLine<uint16_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+                         kSuperResHorizontalBorder, super_res_right_border);
+    int subpixel_x = initial_subpixel_x;
+    uint16x8_t sr[8];
+    int x = RightShiftWithCeiling(upscaled_width, 3);
+    // The below code calculates up to 7 extra upscaled
+    // pixels which will over-read up to 7 downscaled pixels at the end of
+    // each row. kSuperResHorizontalBorder accounts for this.
+    do {
+      for (int i = 0; i < 8; ++i, subpixel_x += step) {
+        sr[i] = vld1q_u16(&src[subpixel_x >> kSuperResScaleBits]);
+      }
+
+      Transpose8x8(sr);
+
+      const uint16x8_t d0 = SuperRes(sr, &filter, bitdepth);
+      vst1q_u16(dst_ptr, d0);
+      dst_ptr += 8;
+    } while (--x != 0);
+    src += source_stride;
+    dst += dest_stride;
+  } while (--y != 0);
+}
+
+void Init10bpp() {
+  Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  dsp->super_res_coefficients = SuperResCoefficients_NEON;
+  dsp->super_res = SuperRes_NEON<10>;
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void SuperResInit_NEON() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif
+}
+}  // namespace dsp
+}  // namespace libgav1
+
+#else  // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void SuperResInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/super_res_neon.h b/src/dsp/arm/super_res_neon.h
new file mode 100644
index 0000000..65e48c5
--- /dev/null
+++ b/src/dsp/arm/super_res_neon.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_SUPER_RES_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_SUPER_RES_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::super_res. This function is not thread-safe.
+void SuperResInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_SuperRes LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_SuperResCoefficients LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_SuperResCoefficients LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_SuperRes LIBGAV1_CPU_NEON
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_SUPER_RES_NEON_H_
diff --git a/src/dsp/arm/warp_neon.cc b/src/dsp/arm/warp_neon.cc
new file mode 100644
index 0000000..71e0a43
--- /dev/null
+++ b/src/dsp/arm/warp_neon.cc
@@ -0,0 +1,906 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/warp.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <type_traits>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Number of extra bits of precision in warped filtering.
+constexpr int kWarpedDiffPrecisionBits = 10;
+
+}  // namespace
+
+namespace low_bitdepth {
+namespace {
+
+constexpr int kFirstPassOffset = 1 << 14;
+constexpr int kOffsetRemoval =
+    (kFirstPassOffset >> kInterRoundBitsHorizontal) * 128;
+
+// Applies the horizontal filter to one source row and stores the result in
+// |intermediate_result_row|. |intermediate_result_row| is a row in the 15x8
+// |intermediate_result| two-dimensional array.
+//
+// src_row_centered contains 16 "centered" samples of a source row. (We center
+// the samples by subtracting 128 from the samples.)
+void HorizontalFilter(const int sx4, const int16_t alpha,
+                      const int8x16_t src_row_centered,
+                      int16_t intermediate_result_row[8]) {
+  int sx = sx4 - MultiplyBy4(alpha);
+  int8x8_t filter[8];
+  for (auto& f : filter) {
+    const int offset = RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) +
+                       kWarpedPixelPrecisionShifts;
+    f = vld1_s8(kWarpedFilters8[offset]);
+    sx += alpha;
+  }
+  Transpose8x8(filter);
+  // Add kFirstPassOffset to ensure |sum| stays within uint16_t.
+  // Add 128 (offset) * 128 (filter sum) (also 1 << 14) to account for the
+  // centering of the source samples. These combined are 1 << 15 or -32768.
+  int16x8_t sum =
+      vdupq_n_s16(static_cast<int16_t>(kFirstPassOffset + 128 * 128));
+  // Unrolled k = 0..7 loop. We need to manually unroll the loop because the
+  // third argument (an index value) to vextq_s8() must be a constant
+  // (immediate). src_row_window is a sliding window of length 8 into
+  // src_row_centered.
+  // k = 0.
+  int8x8_t src_row_window = vget_low_s8(src_row_centered);
+  sum = vmlal_s8(sum, filter[0], src_row_window);
+  // k = 1.
+  src_row_window =
+      vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 1));
+  sum = vmlal_s8(sum, filter[1], src_row_window);
+  // k = 2.
+  src_row_window =
+      vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 2));
+  sum = vmlal_s8(sum, filter[2], src_row_window);
+  // k = 3.
+  src_row_window =
+      vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 3));
+  sum = vmlal_s8(sum, filter[3], src_row_window);
+  // k = 4.
+  src_row_window =
+      vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 4));
+  sum = vmlal_s8(sum, filter[4], src_row_window);
+  // k = 5.
+  src_row_window =
+      vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 5));
+  sum = vmlal_s8(sum, filter[5], src_row_window);
+  // k = 6.
+  src_row_window =
+      vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 6));
+  sum = vmlal_s8(sum, filter[6], src_row_window);
+  // k = 7.
+  src_row_window =
+      vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 7));
+  sum = vmlal_s8(sum, filter[7], src_row_window);
+  // End of unrolled k = 0..7 loop.
+  // Due to the offset |sum| is guaranteed to be unsigned.
+  uint16x8_t sum_unsigned = vreinterpretq_u16_s16(sum);
+  sum_unsigned = vrshrq_n_u16(sum_unsigned, kInterRoundBitsHorizontal);
+  // After the shift |sum_unsigned| will fit into int16_t.
+  vst1q_s16(intermediate_result_row, vreinterpretq_s16_u16(sum_unsigned));
+}
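+
+// Editorial scalar sketch of HorizontalFilter() above, not upstream code.
+// After the transpose, filter[k] holds tap k for all eight outputs, so the
+// vector loop computes, per output x (offset(x) derived from sx4 + x * alpha):
+//   int sum = kFirstPassOffset + 128 * 128;  // re-centering bias
+//   for (int k = 0; k < 8; ++k) {
+//     sum += kWarpedFilters8[offset(x)][k] * (src_row[ix4 - 7 + x + k] - 128);
+//   }
+//   intermediate[x] = RightShiftWithRounding(sum, kInterRoundBitsHorizontal);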
+
+template <bool is_compound>
+void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
+               const ptrdiff_t source_stride, const int source_width,
+               const int source_height,
+               const int* LIBGAV1_RESTRICT const warp_params,
+               const int subsampling_x, const int subsampling_y,
+               const int block_start_x, const int block_start_y,
+               const int block_width, const int block_height,
+               const int16_t alpha, const int16_t beta, const int16_t gamma,
+               const int16_t delta, void* LIBGAV1_RESTRICT dest,
+               const ptrdiff_t dest_stride) {
+  constexpr int kRoundBitsVertical =
+      is_compound ? kInterRoundBitsCompoundVertical : kInterRoundBitsVertical;
+  union {
+    // |intermediate_result| is the output of the horizontal filtering and
+    // rounding. The range is within 13 (= bitdepth + kFilterBits + 1 -
+    // kInterRoundBitsHorizontal) bits (unsigned). We use the signed int16_t
+    // type so that we can multiply it by kWarpedFilters (which has signed
+    // values) using vmlal_s16().
+    int16_t intermediate_result[15][8];  // 15 rows, 8 columns.
+    // In the simple special cases where the samples in each row are all the
+    // same, store one sample per row in a column vector.
+    int16_t intermediate_result_column[15];
+  };
+
+  const auto* const src = static_cast<const uint8_t*>(source);
+  using DestType =
+      typename std::conditional<is_compound, int16_t, uint8_t>::type;
+  auto* dst = static_cast<DestType*>(dest);
+
+  assert(block_width >= 8);
+  assert(block_height >= 8);
+
+  // Warp process applies for each 8x8 block.
+  int start_y = block_start_y;
+  do {
+    int start_x = block_start_x;
+    do {
+      const int src_x = (start_x + 4) << subsampling_x;
+      const int src_y = (start_y + 4) << subsampling_y;
+      const int dst_x =
+          src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0];
+      const int dst_y =
+          src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1];
+      const int x4 = dst_x >> subsampling_x;
+      const int y4 = dst_y >> subsampling_y;
+      const int ix4 = x4 >> kWarpedModelPrecisionBits;
+      const int iy4 = y4 >> kWarpedModelPrecisionBits;
+      // A prediction block may fall outside the frame's boundaries. If a
+      // prediction block is calculated using only samples outside the frame's
+      // boundary, the filtering can be simplified. We can divide the plane
+      // into several regions and handle them differently.
+      //
+      //                |           |
+      //         1      |     3     |   1
+      //                |           |
+      //         -------+-----------+-------
+      //                |***********|
+      //         2      |*****4*****|   2
+      //                |***********|
+      //         -------+-----------+-------
+      //                |           |
+      //         1      |     3     |   1
+      //                |           |
+      //
+      // At the center, region 4 represents the frame and is the general case.
+      //
+      // In regions 1 and 2, the prediction block is outside the frame's
+      // boundary horizontally. Therefore the horizontal filtering can be
+      // simplified. Furthermore, in the region 1 (at the four corners), the
+      // prediction is outside the frame's boundary both horizontally and
+      // vertically, so we get a constant prediction block.
+      //
+      // In region 3, the prediction block is outside the frame's boundary
+      // vertically. Unfortunately because we apply the horizontal filters
+      // first, by the time we apply the vertical filters, they no longer see
+      // simple inputs. So the only simplification is that all the rows are
+      // the same, but we still need to apply all the horizontal and vertical
+      // filters.
+
+      // Check for two simple special cases, where the horizontal filter can
+      // be significantly simplified.
+      //
+      // In general, for each row, the horizontal filter is calculated as
+      // follows:
+      //   for (int x = -4; x < 4; ++x) {
+      //     const int offset = ...;
+      //     int sum = first_pass_offset;
+      //     for (int k = 0; k < 8; ++k) {
+      //       const int column = Clip3(ix4 + x + k - 3, 0, source_width - 1);
+      //       sum += kWarpedFilters[offset][k] * src_row[column];
+      //     }
+      //     ...
+      //   }
+      // The column index before clipping, ix4 + x + k - 3, varies in the range
+      // ix4 - 7 <= ix4 + x + k - 3 <= ix4 + 7. If ix4 - 7 >= source_width - 1
+      // or ix4 + 7 <= 0, then all the column indexes are clipped to the same
+      // border index (source_width - 1 or 0, respectively). Then for each x,
+      // the inner for loop of the horizontal filter is reduced to multiplying
+      // the border pixel by the sum of the filter coefficients.
+      if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) {
+        // Regions 1 and 2.
+        // Points to the left or right border of the first row of |src|.
+        const uint8_t* first_row_border =
+            (ix4 + 7 <= 0) ? src : src + source_width - 1;
+        // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+        //   const int row = Clip3(iy4 + y, 0, source_height - 1);
+        // In two special cases, iy4 + y is clipped to either 0 or
+        // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+        // bounded and we can avoid clipping iy4 + y by relying on a reference
+        // frame's boundary extension on the top and bottom.
+        if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) {
+          // Region 1.
+          // Every sample used to calculate the prediction block has the same
+          // value. So the whole prediction block has the same value.
+          const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+          const uint8_t row_border_pixel =
+              first_row_border[row * source_stride];
+
+          DestType* dst_row = dst + start_x - block_start_x;
+          for (int y = 0; y < 8; ++y) {
+            if (is_compound) {
+              const int16x8_t sum =
+                  vdupq_n_s16(row_border_pixel << (kInterRoundBitsVertical -
+                                                   kRoundBitsVertical));
+              vst1q_s16(reinterpret_cast<int16_t*>(dst_row), sum);
+            } else {
+              memset(dst_row, row_border_pixel, 8);
+            }
+            dst_row += dest_stride;
+          }
+          // End of region 1. Continue the |start_x| do-while loop.
+          start_x += 8;
+          continue;
+        }
+
+        // Region 2.
+        // Horizontal filter.
+        // The input values in this region are generated by extending the
+        // border which makes them identical in the horizontal direction. This
+        // computation could be inlined in the vertical pass but most
+        // implementations will need a transpose of some sort.
+        // It is not necessary to use the offset values here because the
+        // horizontal pass is a simple shift and the vertical pass will always
+        // require using 32 bits.
+        for (int y = -7; y < 8; ++y) {
+          // We may over-read up to 13 pixels above the top source row, or up
+          // to 13 pixels below the bottom source row. This is proved in
+          // warp.cc.
+          const int row = iy4 + y;
+          int sum = first_row_border[row * source_stride];
+          sum <<= (kFilterBits - kInterRoundBitsHorizontal);
+          intermediate_result_column[y + 7] = sum;
+        }
+        // Vertical filter.
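+        // Editorial note, not upstream: every entry of
+        // intermediate_result_column is one border sample scaled by
+        // 1 << (kFilterBits - kInterRoundBitsHorizontal), so each output
+        // pixel below is a single dot product:
+        //   dst[x] = RightShiftWithRounding(
+        //       sum_k(filter[offset(x)][k] * column[y + k]),
+        //       kRoundBitsVertical);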
+        DestType* dst_row = dst + start_x - block_start_x;
+        int sy4 =
+            (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+        for (int y = 0; y < 8; ++y) {
+          int sy = sy4 - MultiplyBy4(gamma);
+#if defined(__aarch64__)
+          const int16x8_t intermediate =
+              vld1q_s16(&intermediate_result_column[y]);
+          int16_t tmp[8];
+          for (int x = 0; x < 8; ++x) {
+            const int offset =
+                RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+                kWarpedPixelPrecisionShifts;
+            const int16x8_t filter = vld1q_s16(kWarpedFilters[offset]);
+            const int32x4_t product_low =
+                vmull_s16(vget_low_s16(filter), vget_low_s16(intermediate));
+            const int32x4_t product_high =
+                vmull_s16(vget_high_s16(filter), vget_high_s16(intermediate));
+            // vaddvq_s32 is only available on __aarch64__.
+            const int32_t sum =
+                vaddvq_s32(product_low) + vaddvq_s32(product_high);
+            const int16_t sum_descale =
+                RightShiftWithRounding(sum, kRoundBitsVertical);
+            if (is_compound) {
+              dst_row[x] = sum_descale;
+            } else {
+              tmp[x] = sum_descale;
+            }
+            sy += gamma;
+          }
+          if (!is_compound) {
+            const int16x8_t sum = vld1q_s16(tmp);
+            vst1_u8(reinterpret_cast<uint8_t*>(dst_row), vqmovun_s16(sum));
+          }
+#else   // !defined(__aarch64__)
+          int16x8_t filter[8];
+          for (int x = 0; x < 8; ++x) {
+            const int offset =
+                RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+                kWarpedPixelPrecisionShifts;
+            filter[x] = vld1q_s16(kWarpedFilters[offset]);
+            sy += gamma;
+          }
+          Transpose8x8(filter);
+          int32x4_t sum_low = vdupq_n_s32(0);
+          int32x4_t sum_high = sum_low;
+          for (int k = 0; k < 8; ++k) {
+            const int16_t intermediate = intermediate_result_column[y + k];
+            sum_low =
+                vmlal_n_s16(sum_low, vget_low_s16(filter[k]), intermediate);
+            sum_high =
+                vmlal_n_s16(sum_high, vget_high_s16(filter[k]), intermediate);
+          }
+          const int16x8_t sum =
+              vcombine_s16(vrshrn_n_s32(sum_low, kRoundBitsVertical),
+                           vrshrn_n_s32(sum_high, kRoundBitsVertical));
+          if (is_compound) {
+            vst1q_s16(reinterpret_cast<int16_t*>(dst_row), sum);
+          } else {
+            vst1_u8(reinterpret_cast<uint8_t*>(dst_row), vqmovun_s16(sum));
+          }
+#endif  // defined(__aarch64__)
+          dst_row += dest_stride;
+          sy4 += delta;
+        }
+        // End of region 2. Continue the |start_x| do-while loop.
+        start_x += 8;
+        continue;
+      }
+
+      // Regions 3 and 4.
+      // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
+
+      // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+      //   const int row = Clip3(iy4 + y, 0, source_height - 1);
+      // In two special cases, iy4 + y is clipped to either 0 or
+      // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+      // bounded and we can avoid clipping iy4 + y by relying on a reference
+      // frame's boundary extension on the top and bottom.
+      if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) {
+        // Region 3.
+        // Horizontal filter.
+        const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+        const uint8_t* const src_row = src + row * source_stride;
+        // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
+        // read but is ignored.
+        //
+        // NOTE: This may read up to 13 bytes before src_row[0] or up to 14
+        // bytes after src_row[source_width - 1]. We assume the source frame
+        // has left and right borders of at least 13 bytes that extend the
+        // frame boundary pixels. We also assume there is at least one extra
+        // padding byte after the right border of the last source row.
+        const uint8x16_t src_row_v = vld1q_u8(&src_row[ix4 - 7]);
+        // Convert src_row_v to int8 (subtract 128).
+        const int8x16_t src_row_centered =
+            vreinterpretq_s8_u8(vsubq_u8(src_row_v, vdupq_n_u8(128)));
+        int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+        for (int y = -7; y < 8; ++y) {
+          HorizontalFilter(sx4, alpha, src_row_centered,
+                           intermediate_result[y + 7]);
+          sx4 += beta;
+        }
+      } else {
+        // Region 4.
+        // Horizontal filter.
+        int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+        for (int y = -7; y < 8; ++y) {
+          // We may over-read up to 13 pixels above the top source row, or up
+          // to 13 pixels below the bottom source row. This is proved in
+          // warp.cc.
+          const int row = iy4 + y;
+          const uint8_t* const src_row = src + row * source_stride;
+          // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
+          // read but is ignored.
+          //
+          // NOTE: This may read up to 13 bytes before src_row[0] or up to 14
+          // bytes after src_row[source_width - 1]. We assume the source frame
+          // has left and right borders of at least 13 bytes that extend the
+          // frame boundary pixels. We also assume there is at least one extra
+          // padding byte after the right border of the last source row.
+          const uint8x16_t src_row_v = vld1q_u8(&src_row[ix4 - 7]);
+          // Convert src_row_v to int8 (subtract 128).
+          const int8x16_t src_row_centered =
+              vreinterpretq_s8_u8(vsubq_u8(src_row_v, vdupq_n_u8(128)));
+          HorizontalFilter(sx4, alpha, src_row_centered,
+                           intermediate_result[y + 7]);
+          sx4 += beta;
+        }
+      }
+
+      // Regions 3 and 4.
+      // Vertical filter.
+      DestType* dst_row = dst + start_x - block_start_x;
+      int sy4 =
+          (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+      for (int y = 0; y < 8; ++y) {
+        int sy = sy4 - MultiplyBy4(gamma);
+        int16x8_t filter[8];
+        for (auto& f : filter) {
+          const int offset =
+              RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+              kWarpedPixelPrecisionShifts;
+          f = vld1q_s16(kWarpedFilters[offset]);
+          sy += gamma;
+        }
+        Transpose8x8(filter);
+        int32x4_t sum_low = vdupq_n_s32(-kOffsetRemoval);
+        int32x4_t sum_high = sum_low;
+        for (int k = 0; k < 8; ++k) {
+          const int16x8_t intermediate = vld1q_s16(intermediate_result[y + k]);
+          sum_low = vmlal_s16(sum_low, vget_low_s16(filter[k]),
+                              vget_low_s16(intermediate));
+          sum_high = vmlal_s16(sum_high, vget_high_s16(filter[k]),
+                               vget_high_s16(intermediate));
+        }
+        const int16x8_t sum =
+            vcombine_s16(vrshrn_n_s32(sum_low, kRoundBitsVertical),
+                         vrshrn_n_s32(sum_high, kRoundBitsVertical));
+        if (is_compound) {
+          vst1q_s16(reinterpret_cast<int16_t*>(dst_row), sum);
+        } else {
+          vst1_u8(reinterpret_cast<uint8_t*>(dst_row), vqmovun_s16(sum));
+        }
+        dst_row += dest_stride;
+        sy4 += delta;
+      }
+      start_x += 8;
+    } while (start_x < block_start_x + block_width);
+    dst += 8 * dest_stride;
+    start_y += 8;
+  } while (start_y < block_start_y + block_height);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  dsp->warp = Warp_NEON<false>;
+  dsp->warp_compound = Warp_NEON<true>;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
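+
+// Editorial sketch, not upstream code: the boundary tests in Warp_NEON()
+// below reduce to these predicates on the integer anchor (ix4, iy4); the
+// constant 7 covers the 8-tap filter footprint around the sample.
+inline bool WarpOutsideHorizontalSketch(const int ix4,
+                                        const int source_width) {
+  return ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0;
+}
+inline bool WarpOutsideVerticalSketch(const int iy4,
+                                      const int source_height) {
+  return iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0;
+}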
+
+LIBGAV1_ALWAYS_INLINE uint16x8x2_t LoadSrcRow(uint16_t const* ptr) {
+  uint16x8x2_t x;
+  // Clang/gcc uses ldp here.
+  x.val[0] = vld1q_u16(ptr);
+  x.val[1] = vld1q_u16(ptr + 8);
+  return x;
+}
+
+LIBGAV1_ALWAYS_INLINE void HorizontalFilter(
+    const int sx4, const int16_t alpha, const uint16x8x2_t src_row,
+    int16_t intermediate_result_row[8]) {
+  int sx = sx4 - MultiplyBy4(alpha);
+  int8x8_t filter8[8];
+  for (auto& f : filter8) {
+    const int offset = RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) +
+                       kWarpedPixelPrecisionShifts;
+    f = vld1_s8(kWarpedFilters8[offset]);
+    sx += alpha;
+  }
+
+  Transpose8x8(filter8);
+
+  int16x8_t filter[8];
+  for (int i = 0; i < 8; ++i) {
+    filter[i] = vmovl_s8(filter8[i]);
+  }
+
+  int32x4x2_t sum;
+  int16x8_t src_row_window;
+  // k = 0.
+  src_row_window = vreinterpretq_s16_u16(src_row.val[0]);
+  sum.val[0] =
+      vmull_s16(vget_low_s16(filter[0]), vget_low_s16(src_row_window));
+  sum.val[1] = VMullHighS16(filter[0], src_row_window);
+  // k = 1.
+  src_row_window =
+      vreinterpretq_s16_u16(vextq_u16(src_row.val[0], src_row.val[1], 1));
+  sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(filter[1]),
+                         vget_low_s16(src_row_window));
+  sum.val[1] = VMlalHighS16(sum.val[1], filter[1], src_row_window);
+  // k = 2.
+  src_row_window =
+      vreinterpretq_s16_u16(vextq_u16(src_row.val[0], src_row.val[1], 2));
+  sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(filter[2]),
+                         vget_low_s16(src_row_window));
+  sum.val[1] = VMlalHighS16(sum.val[1], filter[2], src_row_window);
+  // k = 3.
+  src_row_window =
+      vreinterpretq_s16_u16(vextq_u16(src_row.val[0], src_row.val[1], 3));
+  sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(filter[3]),
+                         vget_low_s16(src_row_window));
+  sum.val[1] = VMlalHighS16(sum.val[1], filter[3], src_row_window);
+  // k = 4.
+  src_row_window =
+      vreinterpretq_s16_u16(vextq_u16(src_row.val[0], src_row.val[1], 4));
+  sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(filter[4]),
+                         vget_low_s16(src_row_window));
+  sum.val[1] = VMlalHighS16(sum.val[1], filter[4], src_row_window);
+  // k = 5.
+  src_row_window =
+      vreinterpretq_s16_u16(vextq_u16(src_row.val[0], src_row.val[1], 5));
+  sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(filter[5]),
+                         vget_low_s16(src_row_window));
+  sum.val[1] = VMlalHighS16(sum.val[1], filter[5], src_row_window);
+  // k = 6.
+  src_row_window =
+      vreinterpretq_s16_u16(vextq_u16(src_row.val[0], src_row.val[1], 6));
+  sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(filter[6]),
+                         vget_low_s16(src_row_window));
+  sum.val[1] = VMlalHighS16(sum.val[1], filter[6], src_row_window);
+  // k = 7.
+  src_row_window =
+      vreinterpretq_s16_u16(vextq_u16(src_row.val[0], src_row.val[1], 7));
+  sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(filter[7]),
+                         vget_low_s16(src_row_window));
+  sum.val[1] = VMlalHighS16(sum.val[1], filter[7], src_row_window);
+  // End of unrolled k = 0..7 loop.
+
+  vst1_s16(intermediate_result_row,
+           vrshrn_n_s32(sum.val[0], kInterRoundBitsHorizontal));
+  vst1_s16(intermediate_result_row + 4,
+           vrshrn_n_s32(sum.val[1], kInterRoundBitsHorizontal));
+}
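+
+// Editorial note, not upstream code: the unrolled k = 0..7 sequence above is
+// the vector form of this scalar window sum over the 16 loaded samples:
+//   for (int x = 0; x < 8; ++x)
+//     for (int k = 0; k < 8; ++k) sum[x] += filter[k][x] * src_row[x + k];
+// vextq_u16(lo, hi, k) supplies the shifted window src_row[x + k] without
+// reloading from memory.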
+
+template <bool is_compound>
+void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
+               const ptrdiff_t source_stride, const int source_width,
+               const int source_height,
+               const int* LIBGAV1_RESTRICT const warp_params,
+               const int subsampling_x, const int subsampling_y,
+               const int block_start_x, const int block_start_y,
+               const int block_width, const int block_height,
+               const int16_t alpha, const int16_t beta, const int16_t gamma,
+               const int16_t delta, void* LIBGAV1_RESTRICT dest,
+               const ptrdiff_t dest_stride) {
+  constexpr int kRoundBitsVertical =
+      is_compound ? kInterRoundBitsCompoundVertical : kInterRoundBitsVertical;
+  union {
+    // |intermediate_result| is the output of the horizontal filtering and
+    // rounding. The range is within 13 (= bitdepth + kFilterBits + 1 -
+    // kInterRoundBitsHorizontal) bits (unsigned). We use the signed int16_t
+    // type so that we can multiply it by kWarpedFilters (which has signed
+    // values) using vmlal_s16().
+    int16_t intermediate_result[15][8];  // 15 rows, 8 columns.
+    // In the simple special cases where the samples in each row are all the
+    // same, store one sample per row in a column vector.
+    int16_t intermediate_result_column[15];
+  };
+
+  const auto* const src = static_cast<const uint16_t*>(source);
+  const ptrdiff_t src_stride = source_stride >> 1;
+  using DestType =
+      typename std::conditional<is_compound, int16_t, uint16_t>::type;
+  auto* dst = static_cast<DestType*>(dest);
+  const ptrdiff_t dst_stride = is_compound ? dest_stride : dest_stride >> 1;
+  assert(block_width >= 8);
+  assert(block_height >= 8);
+
+  // Warp process applies for each 8x8 block.
+  int start_y = block_start_y;
+  do {
+    int start_x = block_start_x;
+    do {
+      const int src_x = (start_x + 4) << subsampling_x;
+      const int src_y = (start_y + 4) << subsampling_y;
+      const int dst_x =
+          src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0];
+      const int dst_y =
+          src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1];
+      const int x4 = dst_x >> subsampling_x;
+      const int y4 = dst_y >> subsampling_y;
+      const int ix4 = x4 >> kWarpedModelPrecisionBits;
+      const int iy4 = y4 >> kWarpedModelPrecisionBits;
+      // A prediction block may fall outside the frame's boundaries. If a
+      // prediction block is calculated using only samples outside the frame's
+      // boundary, the filtering can be simplified. We can divide the plane
+      // into several regions and handle them differently.
+      //
+      //                |           |
+      //         1      |     3     |   1
+      //                |           |
+      //         -------+-----------+-------
+      //                |***********|
+      //         2      |*****4*****|   2
+      //                |***********|
+      //         -------+-----------+-------
+      //                |           |
+      //         1      |     3     |   1
+      //                |           |
+      //
+      // At the center, region 4 represents the frame and is the general case.
+      //
+      // In regions 1 and 2, the prediction block is outside the frame's
+      // boundary horizontally. Therefore the horizontal filtering can be
+      // simplified. Furthermore, in the region 1 (at the four corners), the
+      // prediction is outside the frame's boundary both horizontally and
+      // vertically, so we get a constant prediction block.
+      //
+      // In region 3, the prediction block is outside the frame's boundary
+      // vertically. Unfortunately because we apply the horizontal filters
+      // first, by the time we apply the vertical filters, they no longer see
+      // simple inputs. So the only simplification is that all the rows are
+      // the same, but we still need to apply all the horizontal and vertical
+      // filters.
+
+      // Check for two simple special cases, where the horizontal filter can
+      // be significantly simplified.
+      //
+      // In general, for each row, the horizontal filter is calculated as
+      // follows:
+      //   for (int x = -4; x < 4; ++x) {
+      //     const int offset = ...;
+      //     int sum = first_pass_offset;
+      //     for (int k = 0; k < 8; ++k) {
+      //       const int column = Clip3(ix4 + x + k - 3, 0, source_width - 1);
+      //       sum += kWarpedFilters[offset][k] * src_row[column];
+      //     }
+      //     ...
+      //   }
+      // The column index before clipping, ix4 + x + k - 3, varies in the range
+      // ix4 - 7 <= ix4 + x + k - 3 <= ix4 + 7. If ix4 - 7 >= source_width - 1
If ix4 - 7 >= source_width - 1 + // or ix4 + 7 <= 0, then all the column indexes are clipped to the same + // border index (source_width - 1 or 0, respectively). Then for each x, + // the inner for loop of the horizontal filter is reduced to multiplying + // the border pixel by the sum of the filter coefficients. + if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) { + // Regions 1 and 2. + // Points to the left or right border of the first row of |src|. + const uint16_t* first_row_border = + (ix4 + 7 <= 0) ? src : src + source_width - 1; + // In general, for y in [-7, 8), the row number iy4 + y is clipped: + // const int row = Clip3(iy4 + y, 0, source_height - 1); + // In two special cases, iy4 + y is clipped to either 0 or + // source_height - 1 for all y. In the rest of the cases, iy4 + y is + // bounded and we can avoid clipping iy4 + y by relying on a reference + // frame's boundary extension on the top and bottom. + if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) { + // Region 1. + // Every sample used to calculate the prediction block has the same + // value. So the whole prediction block has the same value. + const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1; + const uint16_t row_border_pixel = first_row_border[row * src_stride]; + + DestType* dst_row = dst + start_x - block_start_x; + for (int y = 0; y < 8; ++y) { + if (is_compound) { + const int16x8_t sum = + vdupq_n_s16(row_border_pixel << (kInterRoundBitsVertical - + kRoundBitsVertical)); + vst1q_s16(reinterpret_cast(dst_row), + vaddq_s16(sum, vdupq_n_s16(kCompoundOffset))); + } else { + vst1q_u16(reinterpret_cast(dst_row), + vdupq_n_u16(row_border_pixel)); + } + dst_row += dst_stride; + } + // End of region 1. Continue the |start_x| do-while loop. + start_x += 8; + continue; + } + + // Region 2. + // Horizontal filter. + // The input values in this region are generated by extending the border + // which makes them identical in the horizontal direction. This + // computation could be inlined in the vertical pass but most + // implementations will need a transpose of some sort. + // It is not necessary to use the offset values here because the + // horizontal pass is a simple shift and the vertical pass will always + // require using 32 bits. + for (int y = -7; y < 8; ++y) { + // We may over-read up to 13 pixels above the top source row, or up + // to 13 pixels below the bottom source row. This is proved in + // warp.cc. + const int row = iy4 + y; + int sum = first_row_border[row * src_stride]; + sum <<= (kFilterBits - kInterRoundBitsHorizontal); + intermediate_result_column[y + 7] = sum; + } + // Vertical filter. + DestType* dst_row = dst + start_x - block_start_x; + int sy4 = + (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); + for (int y = 0; y < 8; ++y) { + int sy = sy4 - MultiplyBy4(gamma); +#if defined(__aarch64__) + const int16x8_t intermediate = + vld1q_s16(&intermediate_result_column[y]); + int16_t tmp[8]; + for (int x = 0; x < 8; ++x) { + const int offset = + RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) + + kWarpedPixelPrecisionShifts; + const int16x8_t filter = vld1q_s16(kWarpedFilters[offset]); + const int32x4_t product_low = + vmull_s16(vget_low_s16(filter), vget_low_s16(intermediate)); + const int32x4_t product_high = + vmull_s16(vget_high_s16(filter), vget_high_s16(intermediate)); + // vaddvq_s32 is only available on __aarch64__. 
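+            // It adds the four 32-bit lanes of a vector across, e.g.
+            // vaddvq_s32({1, 2, 3, 4}) == 10, so the two calls below reduce
+            // all eight tap products to a single scalar sum.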
+ const int32_t sum = + vaddvq_s32(product_low) + vaddvq_s32(product_high); + const int16_t sum_descale = + RightShiftWithRounding(sum, kRoundBitsVertical); + if (is_compound) { + dst_row[x] = sum_descale + kCompoundOffset; + } else { + tmp[x] = sum_descale; + } + sy += gamma; + } + if (!is_compound) { + const uint16x8_t v_max_bitdepth = + vdupq_n_u16((1 << kBitdepth10) - 1); + const int16x8_t sum = vld1q_s16(tmp); + const uint16x8_t d0 = + vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(sum, vdupq_n_s16(0))), + v_max_bitdepth); + vst1q_u16(reinterpret_cast(dst_row), d0); + } +#else // !defined(__aarch64__) + int16x8_t filter[8]; + for (int x = 0; x < 8; ++x) { + const int offset = + RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) + + kWarpedPixelPrecisionShifts; + filter[x] = vld1q_s16(kWarpedFilters[offset]); + sy += gamma; + } + Transpose8x8(filter); + int32x4_t sum_low = vdupq_n_s32(0); + int32x4_t sum_high = sum_low; + for (int k = 0; k < 8; ++k) { + const int16_t intermediate = intermediate_result_column[y + k]; + sum_low = + vmlal_n_s16(sum_low, vget_low_s16(filter[k]), intermediate); + sum_high = + vmlal_n_s16(sum_high, vget_high_s16(filter[k]), intermediate); + } + if (is_compound) { + const int16x8_t sum = + vcombine_s16(vrshrn_n_s32(sum_low, kRoundBitsVertical), + vrshrn_n_s32(sum_high, kRoundBitsVertical)); + vst1q_s16(reinterpret_cast(dst_row), + vaddq_s16(sum, vdupq_n_s16(kCompoundOffset))); + } else { + const uint16x4_t v_max_bitdepth = + vdup_n_u16((1 << kBitdepth10) - 1); + const uint16x4_t d0 = vmin_u16( + vqrshrun_n_s32(sum_low, kRoundBitsVertical), v_max_bitdepth); + const uint16x4_t d1 = vmin_u16( + vqrshrun_n_s32(sum_high, kRoundBitsVertical), v_max_bitdepth); + vst1_u16(reinterpret_cast(dst_row), d0); + vst1_u16(reinterpret_cast(dst_row + 4), d1); + } +#endif // defined(__aarch64__) + dst_row += dst_stride; + sy4 += delta; + } + // End of region 2. Continue the |start_x| do-while loop. + start_x += 8; + continue; + } + + // Regions 3 and 4. + // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0. + + // In general, for y in [-7, 8), the row number iy4 + y is clipped: + // const int row = Clip3(iy4 + y, 0, source_height - 1); + // In two special cases, iy4 + y is clipped to either 0 or + // source_height - 1 for all y. In the rest of the cases, iy4 + y is + // bounded and we can avoid clipping iy4 + y by relying on a reference + // frame's boundary extension on the top and bottom. + if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) { + // Region 3. + // Horizontal filter. + const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1; + const uint16_t* const src_row = src + row * src_stride; + // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also + // read but is ignored. + // + // NOTE: This may read up to 13 pixels before src_row[0] or up to 14 + // pixels after src_row[source_width - 1]. We assume the source frame + // has left and right borders of at least 13 pixels that extend the + // frame boundary pixels. We also assume there is at least one extra + // padding pixel after the right border of the last source row. + const uint16x8x2_t src_row_v = LoadSrcRow(&src_row[ix4 - 7]); + int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7; + for (int y = -7; y < 8; ++y) { + HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]); + sx4 += beta; + } + } else { + // Region 4. + // Horizontal filter. 
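+        // |sx4| starts at the fractional part of x4 (its low
+        // kWarpedModelPrecisionBits bits) offset by -beta * 7 so that row
+        // y = -7 uses the first filter phase; each of the 15 rows then
+        // advances the phase by beta.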
+        int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+        for (int y = -7; y < 8; ++y) {
+          // We may over-read up to 13 pixels above the top source row, or up
+          // to 13 pixels below the bottom source row. This is proved in
+          // warp.cc.
+          const int row = iy4 + y;
+          const uint16_t* const src_row = src + row * src_stride;
+          // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
+          // read but is ignored.
+          //
+          // NOTE: This may read up to 13 pixels before src_row[0] or up to
+          // 14 pixels after src_row[source_width - 1]. We assume the source
+          // frame has left and right borders of at least 13 pixels that extend
+          // the frame boundary pixels. We also assume there is at least one
+          // extra padding pixel after the right border of the last source row.
+          const uint16x8x2_t src_row_v = LoadSrcRow(&src_row[ix4 - 7]);
+          HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]);
+          sx4 += beta;
+        }
+      }
+
+      // Regions 3 and 4.
+      // Vertical filter.
+      DestType* dst_row = dst + start_x - block_start_x;
+      int sy4 =
+          (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+      for (int y = 0; y < 8; ++y) {
+        int sy = sy4 - MultiplyBy4(gamma);
+        int16x8_t filter[8];
+        for (auto& f : filter) {
+          const int offset =
+              RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+              kWarpedPixelPrecisionShifts;
+          f = vld1q_s16(kWarpedFilters[offset]);
+          sy += gamma;
+        }
+        Transpose8x8(filter);
+        int32x4_t sum_low = vdupq_n_s32(0);
+        int32x4_t sum_high = sum_low;
+        for (int k = 0; k < 8; ++k) {
+          const int16x8_t intermediate = vld1q_s16(intermediate_result[y + k]);
+          sum_low = vmlal_s16(sum_low, vget_low_s16(filter[k]),
+                              vget_low_s16(intermediate));
+          sum_high = vmlal_s16(sum_high, vget_high_s16(filter[k]),
+                               vget_high_s16(intermediate));
+        }
+        if (is_compound) {
+          const int16x8_t sum =
+              vcombine_s16(vrshrn_n_s32(sum_low, kRoundBitsVertical),
+                           vrshrn_n_s32(sum_high, kRoundBitsVertical));
+          vst1q_s16(reinterpret_cast<int16_t*>(dst_row),
+                    vaddq_s16(sum, vdupq_n_s16(kCompoundOffset)));
+        } else {
+          const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+          const uint16x4_t d0 = vmin_u16(
+              vqrshrun_n_s32(sum_low, kRoundBitsVertical), v_max_bitdepth);
+          const uint16x4_t d1 = vmin_u16(
+              vqrshrun_n_s32(sum_high, kRoundBitsVertical), v_max_bitdepth);
+          vst1_u16(reinterpret_cast<uint16_t*>(dst_row), d0);
+          vst1_u16(reinterpret_cast<uint16_t*>(dst_row + 4), d1);
+        }
+        dst_row += dst_stride;
+        sy4 += delta;
+      }
+      start_x += 8;
+    } while (start_x < block_start_x + block_width);
+    dst += 8 * dst_stride;
+    start_y += 8;
+  } while (start_y < block_start_y + block_height);
+}
+
+void Init10bpp() {
+  Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  dsp->warp = Warp_NEON</*is_compound=*/false>;
+  dsp->warp_compound = Warp_NEON</*is_compound=*/true>;
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void WarpInit_NEON() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+#else   // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void WarpInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/warp_neon.h b/src/dsp/arm/warp_neon.h
new file mode 100644
index 0000000..cd60602
--- /dev/null
+++ b/src/dsp/arm/warp_neon.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance
with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_ARM_WARP_NEON_H_ +#define LIBGAV1_SRC_DSP_ARM_WARP_NEON_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::warp. This function is not thread-safe. +void WarpInit_NEON(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_ENABLE_NEON +#define LIBGAV1_Dsp8bpp_Warp LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_WarpCompound LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp10bpp_Warp LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_WarpCompound LIBGAV1_CPU_NEON +#endif // LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_SRC_DSP_ARM_WARP_NEON_H_ diff --git a/src/dsp/arm/weight_mask_neon.cc b/src/dsp/arm/weight_mask_neon.cc new file mode 100644 index 0000000..5ad6b97 --- /dev/null +++ b/src/dsp/arm/weight_mask_neon.cc @@ -0,0 +1,588 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/arm/weight_mask_neon.h" + +#include "src/dsp/weight_mask.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_ENABLE_NEON + +#include + +#include +#include +#include + +#include "src/dsp/arm/common_neon.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" + +namespace libgav1 { +namespace dsp { +namespace { + +inline int16x8x2_t LoadPred(const int16_t* LIBGAV1_RESTRICT prediction_0, + const int16_t* LIBGAV1_RESTRICT prediction_1) { + const int16x8x2_t pred = {vld1q_s16(prediction_0), vld1q_s16(prediction_1)}; + return pred; +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +inline uint16x8x2_t LoadPred(const uint16_t* LIBGAV1_RESTRICT prediction_0, + const uint16_t* LIBGAV1_RESTRICT prediction_1) { + const uint16x8x2_t pred = {vld1q_u16(prediction_0), vld1q_u16(prediction_1)}; + return pred; +} +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +template +inline uint16x8_t AbsolutePredDifference(const int16x8x2_t pred) { + static_assert(bitdepth == 8, ""); + constexpr int rounding_bits = bitdepth - 8 + ((bitdepth == 12) ? 2 : 4); + return vrshrq_n_u16( + vreinterpretq_u16_s16(vabdq_s16(pred.val[0], pred.val[1])), + rounding_bits); +} + +template +inline uint16x8_t AbsolutePredDifference(const uint16x8x2_t pred) { + constexpr int rounding_bits = bitdepth - 8 + ((bitdepth == 12) ? 
2 : 4); + return vrshrq_n_u16(vabdq_u16(pred.val[0], pred.val[1]), rounding_bits); +} + +template +inline void WeightMask8_NEON(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask) { + using PredType = + typename std::conditional::type; + using PredTypeVecx2 = + typename std::conditional::type; + const PredTypeVecx2 pred = + LoadPred(static_cast(prediction_0), + static_cast(prediction_1)); + const uint16x8_t difference = AbsolutePredDifference(pred); + const uint8x8_t difference_offset = vdup_n_u8(38); + const uint8x8_t mask_ceiling = vdup_n_u8(64); + const uint8x8_t adjusted_difference = + vqadd_u8(vqshrn_n_u16(difference, 4), difference_offset); + const uint8x8_t mask_value = vmin_u8(adjusted_difference, mask_ceiling); + if (mask_is_inverse) { + const uint8x8_t inverted_mask_value = vsub_u8(mask_ceiling, mask_value); + vst1_u8(mask, inverted_mask_value); + } else { + vst1_u8(mask, mask_value); + } +} + +#define WEIGHT8_WITHOUT_STRIDE \ + WeightMask8_NEON(pred_0, pred_1, mask) + +#define WEIGHT8_AND_STRIDE \ + WEIGHT8_WITHOUT_STRIDE; \ + pred_0 += 8; \ + pred_1 += 8; \ + mask += mask_stride + +// |pred_0| and |pred_1| are cast as int16_t* for the sake of pointer math. They +// are uint16_t* for 10bpp and 12bpp, and this is handled in WeightMask8_NEON. +template +void WeightMask8x8_NEON(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y = 0; + do { + WEIGHT8_AND_STRIDE; + } while (++y < 7); + WEIGHT8_WITHOUT_STRIDE; +} + +template +void WeightMask8x16_NEON(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y3 = 0; + do { + WEIGHT8_AND_STRIDE; + WEIGHT8_AND_STRIDE; + WEIGHT8_AND_STRIDE; + } while (++y3 < 5); + WEIGHT8_WITHOUT_STRIDE; +} + +template +void WeightMask8x32_NEON(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y5 = 0; + do { + WEIGHT8_AND_STRIDE; + WEIGHT8_AND_STRIDE; + WEIGHT8_AND_STRIDE; + WEIGHT8_AND_STRIDE; + WEIGHT8_AND_STRIDE; + } while (++y5 < 6); + WEIGHT8_AND_STRIDE; + WEIGHT8_WITHOUT_STRIDE; +} + +#define WEIGHT16_WITHOUT_STRIDE \ + WeightMask8_NEON(pred_0, pred_1, mask); \ + WeightMask8_NEON(pred_0 + 8, pred_1 + 8, mask + 8) + +#define WEIGHT16_AND_STRIDE \ + WEIGHT16_WITHOUT_STRIDE; \ + pred_0 += 16; \ + pred_1 += 16; \ + mask += mask_stride + +template +void WeightMask16x8_NEON(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y = 0; + do { + WEIGHT16_AND_STRIDE; + } while (++y < 7); + WEIGHT16_WITHOUT_STRIDE; +} + +template +void WeightMask16x16_NEON(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); 
+ int y3 = 0; + do { + WEIGHT16_AND_STRIDE; + WEIGHT16_AND_STRIDE; + WEIGHT16_AND_STRIDE; + } while (++y3 < 5); + WEIGHT16_WITHOUT_STRIDE; +} + +template +void WeightMask16x32_NEON(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y5 = 0; + do { + WEIGHT16_AND_STRIDE; + WEIGHT16_AND_STRIDE; + WEIGHT16_AND_STRIDE; + WEIGHT16_AND_STRIDE; + WEIGHT16_AND_STRIDE; + } while (++y5 < 6); + WEIGHT16_AND_STRIDE; + WEIGHT16_WITHOUT_STRIDE; +} + +template +void WeightMask16x64_NEON(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y3 = 0; + do { + WEIGHT16_AND_STRIDE; + WEIGHT16_AND_STRIDE; + WEIGHT16_AND_STRIDE; + } while (++y3 < 21); + WEIGHT16_WITHOUT_STRIDE; +} + +#define WEIGHT32_WITHOUT_STRIDE \ + WeightMask8_NEON(pred_0, pred_1, mask); \ + WeightMask8_NEON(pred_0 + 8, pred_1 + 8, \ + mask + 8); \ + WeightMask8_NEON(pred_0 + 16, pred_1 + 16, \ + mask + 16); \ + WeightMask8_NEON(pred_0 + 24, pred_1 + 24, \ + mask + 24) + +#define WEIGHT32_AND_STRIDE \ + WEIGHT32_WITHOUT_STRIDE; \ + pred_0 += 32; \ + pred_1 += 32; \ + mask += mask_stride + +template +void WeightMask32x8_NEON(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + WEIGHT32_AND_STRIDE; + WEIGHT32_AND_STRIDE; + WEIGHT32_AND_STRIDE; + WEIGHT32_AND_STRIDE; + WEIGHT32_AND_STRIDE; + WEIGHT32_AND_STRIDE; + WEIGHT32_AND_STRIDE; + WEIGHT32_WITHOUT_STRIDE; +} + +template +void WeightMask32x16_NEON(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y3 = 0; + do { + WEIGHT32_AND_STRIDE; + WEIGHT32_AND_STRIDE; + WEIGHT32_AND_STRIDE; + } while (++y3 < 5); + WEIGHT32_WITHOUT_STRIDE; +} + +template +void WeightMask32x32_NEON(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y5 = 0; + do { + WEIGHT32_AND_STRIDE; + WEIGHT32_AND_STRIDE; + WEIGHT32_AND_STRIDE; + WEIGHT32_AND_STRIDE; + WEIGHT32_AND_STRIDE; + } while (++y5 < 6); + WEIGHT32_AND_STRIDE; + WEIGHT32_WITHOUT_STRIDE; +} + +template +void WeightMask32x64_NEON(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y3 = 0; + do { + WEIGHT32_AND_STRIDE; + WEIGHT32_AND_STRIDE; + WEIGHT32_AND_STRIDE; + } while (++y3 < 21); + WEIGHT32_WITHOUT_STRIDE; +} + +#define WEIGHT64_WITHOUT_STRIDE \ + WeightMask8_NEON(pred_0, pred_1, mask); \ + WeightMask8_NEON(pred_0 + 8, pred_1 + 8, \ + mask + 8); \ + WeightMask8_NEON(pred_0 + 16, pred_1 + 16, \ + mask + 16); \ + WeightMask8_NEON(pred_0 + 24, pred_1 + 24, \ + mask + 
24); \ + WeightMask8_NEON(pred_0 + 32, pred_1 + 32, \ + mask + 32); \ + WeightMask8_NEON(pred_0 + 40, pred_1 + 40, \ + mask + 40); \ + WeightMask8_NEON(pred_0 + 48, pred_1 + 48, \ + mask + 48); \ + WeightMask8_NEON(pred_0 + 56, pred_1 + 56, \ + mask + 56) + +#define WEIGHT64_AND_STRIDE \ + WEIGHT64_WITHOUT_STRIDE; \ + pred_0 += 64; \ + pred_1 += 64; \ + mask += mask_stride + +template +void WeightMask64x16_NEON(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y3 = 0; + do { + WEIGHT64_AND_STRIDE; + WEIGHT64_AND_STRIDE; + WEIGHT64_AND_STRIDE; + } while (++y3 < 5); + WEIGHT64_WITHOUT_STRIDE; +} + +template +void WeightMask64x32_NEON(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y5 = 0; + do { + WEIGHT64_AND_STRIDE; + WEIGHT64_AND_STRIDE; + WEIGHT64_AND_STRIDE; + WEIGHT64_AND_STRIDE; + WEIGHT64_AND_STRIDE; + } while (++y5 < 6); + WEIGHT64_AND_STRIDE; + WEIGHT64_WITHOUT_STRIDE; +} + +template +void WeightMask64x64_NEON(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y3 = 0; + do { + WEIGHT64_AND_STRIDE; + WEIGHT64_AND_STRIDE; + WEIGHT64_AND_STRIDE; + } while (++y3 < 21); + WEIGHT64_WITHOUT_STRIDE; +} + +template +void WeightMask64x128_NEON(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y3 = 0; + do { + WEIGHT64_AND_STRIDE; + WEIGHT64_AND_STRIDE; + WEIGHT64_AND_STRIDE; + } while (++y3 < 42); + WEIGHT64_AND_STRIDE; + WEIGHT64_WITHOUT_STRIDE; +} + +template +void WeightMask128x64_NEON(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y3 = 0; + const ptrdiff_t adjusted_mask_stride = mask_stride - 64; + do { + WEIGHT64_WITHOUT_STRIDE; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + + WEIGHT64_WITHOUT_STRIDE; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + + WEIGHT64_WITHOUT_STRIDE; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + } while (++y3 < 21); + WEIGHT64_WITHOUT_STRIDE; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE; +} + +template +void WeightMask128x128_NEON(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y3 = 0; + const ptrdiff_t adjusted_mask_stride = mask_stride - 64; + do { + 
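+    // Each iteration filters three 128-wide rows; a row is two 64-pixel
+    // halves, so |mask| advances by 64 after the left half and by
+    // |adjusted_mask_stride| (mask_stride - 64) after the right half.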
WEIGHT64_WITHOUT_STRIDE; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + + WEIGHT64_WITHOUT_STRIDE; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + + WEIGHT64_WITHOUT_STRIDE; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + } while (++y3 < 42); + WEIGHT64_WITHOUT_STRIDE; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + + WEIGHT64_WITHOUT_STRIDE; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE; +} +#undef WEIGHT8_WITHOUT_STRIDE +#undef WEIGHT8_AND_STRIDE +#undef WEIGHT16_WITHOUT_STRIDE +#undef WEIGHT16_AND_STRIDE +#undef WEIGHT32_WITHOUT_STRIDE +#undef WEIGHT32_AND_STRIDE +#undef WEIGHT64_WITHOUT_STRIDE +#undef WEIGHT64_AND_STRIDE + +#define INIT_WEIGHT_MASK_8BPP(width, height, w_index, h_index) \ + dsp->weight_mask[w_index][h_index][0] = \ + WeightMask##width##x##height##_NEON<0, 8>; \ + dsp->weight_mask[w_index][h_index][1] = \ + WeightMask##width##x##height##_NEON<1, 8> +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + INIT_WEIGHT_MASK_8BPP(8, 8, 0, 0); + INIT_WEIGHT_MASK_8BPP(8, 16, 0, 1); + INIT_WEIGHT_MASK_8BPP(8, 32, 0, 2); + INIT_WEIGHT_MASK_8BPP(16, 8, 1, 0); + INIT_WEIGHT_MASK_8BPP(16, 16, 1, 1); + INIT_WEIGHT_MASK_8BPP(16, 32, 1, 2); + INIT_WEIGHT_MASK_8BPP(16, 64, 1, 3); + INIT_WEIGHT_MASK_8BPP(32, 8, 2, 0); + INIT_WEIGHT_MASK_8BPP(32, 16, 2, 1); + INIT_WEIGHT_MASK_8BPP(32, 32, 2, 2); + INIT_WEIGHT_MASK_8BPP(32, 64, 2, 3); + INIT_WEIGHT_MASK_8BPP(64, 16, 3, 1); + INIT_WEIGHT_MASK_8BPP(64, 32, 3, 2); + INIT_WEIGHT_MASK_8BPP(64, 64, 3, 3); + INIT_WEIGHT_MASK_8BPP(64, 128, 3, 4); + INIT_WEIGHT_MASK_8BPP(128, 64, 4, 3); + INIT_WEIGHT_MASK_8BPP(128, 128, 4, 4); +} +#undef INIT_WEIGHT_MASK_8BPP + +} // namespace + +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +#define INIT_WEIGHT_MASK_10BPP(width, height, w_index, h_index) \ + dsp->weight_mask[w_index][h_index][0] = \ + WeightMask##width##x##height##_NEON<0, 10>; \ + dsp->weight_mask[w_index][h_index][1] = \ + WeightMask##width##x##height##_NEON<1, 10> +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + INIT_WEIGHT_MASK_10BPP(8, 8, 0, 0); + INIT_WEIGHT_MASK_10BPP(8, 16, 0, 1); + INIT_WEIGHT_MASK_10BPP(8, 32, 0, 2); + INIT_WEIGHT_MASK_10BPP(16, 8, 1, 0); + INIT_WEIGHT_MASK_10BPP(16, 16, 1, 1); + INIT_WEIGHT_MASK_10BPP(16, 32, 1, 2); + INIT_WEIGHT_MASK_10BPP(16, 64, 1, 3); + INIT_WEIGHT_MASK_10BPP(32, 8, 2, 0); + INIT_WEIGHT_MASK_10BPP(32, 16, 2, 1); + INIT_WEIGHT_MASK_10BPP(32, 32, 2, 2); + INIT_WEIGHT_MASK_10BPP(32, 64, 2, 3); + INIT_WEIGHT_MASK_10BPP(64, 16, 3, 1); + INIT_WEIGHT_MASK_10BPP(64, 32, 3, 2); + INIT_WEIGHT_MASK_10BPP(64, 64, 3, 3); + INIT_WEIGHT_MASK_10BPP(64, 128, 3, 4); + INIT_WEIGHT_MASK_10BPP(128, 64, 4, 3); + INIT_WEIGHT_MASK_10BPP(128, 128, 4, 4); +} +#undef INIT_WEIGHT_MASK_10BPP + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 +void WeightMaskInit_NEON() { + Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 +} + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_ENABLE_NEON 
+ +namespace libgav1 { +namespace dsp { + +void WeightMaskInit_NEON() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_ENABLE_NEON diff --git a/src/dsp/arm/weight_mask_neon.h b/src/dsp/arm/weight_mask_neon.h new file mode 100644 index 0000000..573f7de --- /dev/null +++ b/src/dsp/arm/weight_mask_neon.h @@ -0,0 +1,70 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_ARM_WEIGHT_MASK_NEON_H_ +#define LIBGAV1_SRC_DSP_ARM_WEIGHT_MASK_NEON_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::weight_mask. This function is not thread-safe. +void WeightMaskInit_NEON(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_ENABLE_NEON +#define LIBGAV1_Dsp8bpp_WeightMask_8x8 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_WeightMask_8x16 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_WeightMask_8x32 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_WeightMask_16x8 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_WeightMask_16x16 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_WeightMask_16x32 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_WeightMask_16x64 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_WeightMask_32x8 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_WeightMask_32x16 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_WeightMask_32x32 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_WeightMask_32x64 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_WeightMask_64x16 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_WeightMask_64x32 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_WeightMask_64x64 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_WeightMask_64x128 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_WeightMask_128x64 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_WeightMask_128x128 LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp10bpp_WeightMask_8x8 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_WeightMask_8x16 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_WeightMask_8x32 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_WeightMask_16x8 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_WeightMask_16x16 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_WeightMask_16x32 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_WeightMask_16x64 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_WeightMask_32x8 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_WeightMask_32x16 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_WeightMask_32x32 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_WeightMask_32x64 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_WeightMask_64x16 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_WeightMask_64x32 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_WeightMask_64x64 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_WeightMask_64x128 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_WeightMask_128x64 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_WeightMask_128x128 LIBGAV1_CPU_NEON +#endif // LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_SRC_DSP_ARM_WEIGHT_MASK_NEON_H_ diff --git a/src/dsp/average_blend.cc b/src/dsp/average_blend.cc new file mode 100644 index 0000000..273b355 --- /dev/null +++ 
b/src/dsp/average_blend.cc @@ -0,0 +1,100 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/average_blend.h" + +#include +#include +#include +#include + +#include "src/dsp/dsp.h" +#include "src/utils/common.h" + +namespace libgav1 { +namespace dsp { +namespace { + +template +void AverageBlend_C(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, const int width, + const int height, void* const dest, + const ptrdiff_t dest_stride) { + // 7.11.3.2 Rounding variables derivation process + // 2 * FILTER_BITS(7) - (InterRound0(3|5) + InterRound1(7)) + constexpr int inter_post_round_bits = (bitdepth == 12) ? 2 : 4; + using PredType = + typename std::conditional::type; + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + auto* dst = static_cast(dest); + const ptrdiff_t dst_stride = dest_stride / sizeof(Pixel); + + int y = 0; + do { + int x = 0; + do { + // See warp.cc and convolve.cc for detailed prediction ranges. + int res = pred_0[x] + pred_1[x]; + res -= (bitdepth == 8) ? 0 : kCompoundOffset + kCompoundOffset; + dst[x] = static_cast( + Clip3(RightShiftWithRounding(res, inter_post_round_bits + 1), 0, + (1 << bitdepth) - 1)); + } while (++x < width); + + dst += dst_stride; + pred_0 += width; + pred_1 += width; + } while (++y < height); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(8); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->average_blend = AverageBlend_C<8, uint8_t>; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast(dsp); +#ifndef LIBGAV1_Dsp8bpp_AverageBlend + dsp->average_blend = AverageBlend_C<8, uint8_t>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(10); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->average_blend = AverageBlend_C<10, uint16_t>; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast(dsp); +#ifndef LIBGAV1_Dsp10bpp_AverageBlend + dsp->average_blend = AverageBlend_C<10, uint16_t>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif + +} // namespace + +void AverageBlendInit_C() { + Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/average_blend.h b/src/dsp/average_blend.h new file mode 100644 index 0000000..02ecd09 --- /dev/null +++ b/src/dsp/average_blend.h @@ -0,0 +1,47 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_AVERAGE_BLEND_H_ +#define LIBGAV1_SRC_DSP_AVERAGE_BLEND_H_ + +// Pull in LIBGAV1_DspXXX defines representing the implementation status +// of each function. The resulting value of each can be used by each module to +// determine whether an implementation is needed at compile time. +// IWYU pragma: begin_exports + +// ARM: +#include "src/dsp/arm/average_blend_neon.h" + +// x86: +// Note includes should be sorted in logical order avx2/avx/sse4, etc. +// The order of includes is important as each tests for a superior version +// before setting the base. +// clang-format off +#include "src/dsp/x86/average_blend_sse4.h" +// clang-format on + +// IWYU pragma: end_exports + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::average_blend. This function is not thread-safe. +void AverageBlendInit_C(); + +} // namespace dsp +} // namespace libgav1 + +#endif // LIBGAV1_SRC_DSP_AVERAGE_BLEND_H_ diff --git a/src/dsp/average_blend_test.cc b/src/dsp/average_blend_test.cc new file mode 100644 index 0000000..04e24e5 --- /dev/null +++ b/src/dsp/average_blend_test.cc @@ -0,0 +1,292 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/average_blend.h" + +#include +#include +#include +#include +#include + +#include "absl/strings/match.h" +#include "absl/strings/string_view.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/constants.h" +#include "src/dsp/distance_weighted_blend.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "tests/block_utils.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr int kNumSpeedTests = 5e8; +constexpr char kAverageBlend[] = "AverageBlend"; +// average_blend is applied to compound prediction values. This implies a range +// far exceeding that of pixel values. +// The ranges include kCompoundOffset in 10bpp and 12bpp. +// see: src/dsp/convolve.cc & src/dsp/warp.cc. 
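+// Rows are indexed by (bitdepth - 8) >> 1 (8, 10, then 12 bpp); each row
+// holds the {min, max} compound prediction values used to seed the inputs.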
+constexpr int kCompoundPredictionRange[3][2] = { + // 8bpp + {-5132, 9212}, + // 10bpp + {3988, 61532}, + // 12bpp + {3974, 61559}, +}; + +template +class AverageBlendTest : public testing::TestWithParam, + public test_utils::MaxAlignedAllocable { + public: + AverageBlendTest() = default; + ~AverageBlendTest() override = default; + + void SetUp() override { + test_utils::ResetDspTable(bitdepth); + AverageBlendInit_C(); + DistanceWeightedBlendInit_C(); + const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + base_func_ = dsp->average_blend; + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const absl::string_view test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + base_func_ = nullptr; + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + AverageBlendInit_SSE4_1(); + } + } else if (absl::StartsWith(test_case, "NEON/")) { + AverageBlendInit_NEON(); + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + func_ = dsp->average_blend; + dist_blend_func_ = dsp->distance_weighted_blend; + } + + protected: + void Test(const char* digest, int num_tests, bool debug); + + private: + using PredType = + typename std::conditional::type; + static constexpr int kDestStride = kMaxSuperBlockSizeInPixels; + const int width_ = kBlockWidthPixels[GetParam()]; + const int height_ = kBlockHeightPixels[GetParam()]; + alignas(kMaxAlignment) PredType + source1_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels]; + alignas(kMaxAlignment) PredType + source2_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels]; + Pixel dest_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels] = {}; + Pixel reference_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels] = + {}; + dsp::AverageBlendFunc base_func_; + dsp::AverageBlendFunc func_; + dsp::DistanceWeightedBlendFunc dist_blend_func_; +}; + +template +void AverageBlendTest::Test(const char* digest, int num_tests, + bool debug) { + if (func_ == nullptr) return; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + PredType* src_1 = source1_; + PredType* src_2 = source2_; + for (int y = 0; y < height_; ++y) { + for (int x = 0; x < width_; ++x) { + constexpr int bitdepth_index = (bitdepth - 8) >> 1; + const int min_val = kCompoundPredictionRange[bitdepth_index][0]; + const int max_val = kCompoundPredictionRange[bitdepth_index][1]; + src_1[x] = static_cast(rnd(max_val - min_val) + min_val); + src_2[x] = static_cast(rnd(max_val - min_val) + min_val); + } + src_1 += width_; + src_2 += width_; + } + absl::Duration elapsed_time; + for (int i = 0; i < num_tests; ++i) { + const absl::Time start = absl::Now(); + func_(source1_, source2_, width_, height_, dest_, + sizeof(dest_[0]) * kDestStride); + elapsed_time += absl::Now() - start; + } + if (debug) { + if (base_func_ != nullptr) { + base_func_(source1_, source2_, width_, height_, reference_, + sizeof(reference_[0]) * kDestStride); + } else { + // Use dist_blend_func_ as the base for C tests. 
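+      // With equal weights the distance weighted blend reduces to the same
+      // rounded average that average_blend computes, so it doubles as an
+      // independent reference here.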
+ const int8_t weight = 8; + dist_blend_func_(source1_, source2_, weight, weight, width_, height_, + reference_, sizeof(reference_[0]) * kDestStride); + } + EXPECT_TRUE(test_utils::CompareBlocks(dest_, reference_, width_, height_, + kDestStride, kDestStride, false)); + } + + test_utils::CheckMd5Digest(kAverageBlend, ToString(GetParam()), digest, dest_, + sizeof(dest_[0]) * kDestStride * height_, + elapsed_time); +} + +const BlockSize kTestParam[] = { + kBlock4x4, kBlock4x8, kBlock4x16, kBlock8x4, kBlock8x8, + kBlock8x16, kBlock8x32, kBlock16x4, kBlock16x8, kBlock16x16, + kBlock16x32, kBlock16x64, kBlock32x8, kBlock32x16, kBlock32x32, + kBlock32x64, kBlock64x16, kBlock64x32, kBlock64x64, kBlock64x128, + kBlock128x64, kBlock128x128, +}; + +using AverageBlendTest8bpp = AverageBlendTest<8, uint8_t>; + +const char* GetAverageBlendDigest8bpp(const BlockSize block_size) { + static const char* const kDigests[kMaxBlockSizes] = { + // 4xN + "152bcc35946900b1ed16369b3e7a81b7", + "c23e9b5698f7384eaae30a3908118b77", + "f2da31d940f62490c368c03d32d3ede8", + // 8xN + "73c95485ef956e1d9ab914e88e6a202b", + "d90d3abd368e58c513070a88b34649ba", + "77f7d53d0edeffb3537afffd9ff33a4a", + "460b9b1e6b83f65f013cfcaf67ec0122", + // 16xN + "96454a56de940174ff92e9bb686d6d38", + "a50e268e93b48ae39cc2a47d377410e2", + "65c8502ff6d78065d466f9911ed6bb3e", + "bc2c873b9f5d74b396e1df705e87f699", + "b4dae656484b2d255d1e09b7f34e12c1", + // 32xN + "7e1e5db92b22a96e5226a23de883d766", + "ca40d46d89773e7f858b15fcecd43cc0", + "bfdc894707323f4dc43d1326309f8368", + "f4733417621719b7feba3166ec0da5b9", + // 64xN + "378fa0594d22f01c8e8931c2a908d7c4", + "db38fe2e082bd4a09acb3bb1d52ee11e", + "3ad44401cc731215c46c9b7d96f7e4ae", + "6c43267be5ed03d204a05fe36090f870", + // 128xN + "c8cfe46ebf166c1cbf08e8804206aadb", + "b0557b5156d2334c8ce4a7ee12f9d6b4", + }; + assert(block_size < kMaxBlockSizes); + return kDigests[block_size]; +} + +TEST_P(AverageBlendTest8bpp, Blending) { + Test(GetAverageBlendDigest8bpp(GetParam()), 1, false); +} + +TEST_P(AverageBlendTest8bpp, DISABLED_Speed) { + Test(GetAverageBlendDigest8bpp(GetParam()), + kNumSpeedTests / + (kBlockHeightPixels[GetParam()] * kBlockWidthPixels[GetParam()]), + false); +} + +INSTANTIATE_TEST_SUITE_P(C, AverageBlendTest8bpp, + testing::ValuesIn(kTestParam)); +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, AverageBlendTest8bpp, + testing::ValuesIn(kTestParam)); +#endif +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, AverageBlendTest8bpp, + testing::ValuesIn(kTestParam)); +#endif + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using AverageBlendTest10bpp = AverageBlendTest<10, uint16_t>; + +const char* GetAverageBlendDigest10bpp(const BlockSize block_size) { + static const char* const kDigests[kMaxBlockSizes] = { + // 4xN + "98c0671c092b4288adcaaa17362cc4a3", + "7083f3def8bfb63ab3a985ef5616a923", + "a7211ee2eaa6f88e08875b377d17b0f1", + // 8xN + "11f9ab881700f2ef0f82d8d4662868c6", + "3bee144b9ea6f4288b860c24f88a22f3", + "27113bd17bf95034f100e9046c7b59d2", + "c42886a5e16e23a81e43833d34467558", + // 16xN + "b0ac2eb0a7a6596d6d1339074c7f8771", + "24c9e079b9a8647a6ee03f5441f2cdd9", + "dd05777751ccdb4356856c90e1176e53", + "27b1d69d035b1525c013b7373cfe3875", + "08c46403afe19e6b008ccc8f56633da9", + // 32xN + "36d434db11298aba76166df06e9b8125", + "efd24dd7b555786bff1a482e51170ea3", + "3b37ddac87de443cd18784f02c2d1dd5", + "80d8070939a743a20689a65bf5dc0a68", + // 64xN + "88e747246237c6408d0bd4cc3ecc8396", + "af1fe8c52487c9f2951c3ea516828abb", + "ea6f18ff56b053748c18032b7e048e83", + 
"af0cb87fe27d24c2e0afd2c90a8533a6", + // 128xN + "16a83b19911d6dc7278a694b8baa9901", + "bd22e77ce6fa727267ff63eeb4dcb19c", + }; + assert(block_size < kMaxBlockSizes); + return kDigests[block_size]; +} + +TEST_P(AverageBlendTest10bpp, Blending) { + Test(GetAverageBlendDigest10bpp(GetParam()), 1, false); +} + +TEST_P(AverageBlendTest10bpp, DISABLED_Speed) { + Test(GetAverageBlendDigest10bpp(GetParam()), + kNumSpeedTests / + (kBlockHeightPixels[GetParam()] * kBlockHeightPixels[GetParam()]) / + 2, + false); +} + +INSTANTIATE_TEST_SUITE_P(C, AverageBlendTest10bpp, + testing::ValuesIn(kTestParam)); +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, AverageBlendTest10bpp, + testing::ValuesIn(kTestParam)); +#endif +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, AverageBlendTest10bpp, + testing::ValuesIn(kTestParam)); +#endif +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace +} // namespace dsp + +static std::ostream& operator<<(std::ostream& os, const BlockSize param) { + return os << ToString(param); +} + +} // namespace libgav1 diff --git a/src/dsp/cdef.cc b/src/dsp/cdef.cc new file mode 100644 index 0000000..ca2adfd --- /dev/null +++ b/src/dsp/cdef.cc @@ -0,0 +1,309 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/cdef.h" + +#include +#include +#include +#include +#include + +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" + +namespace libgav1 { +namespace dsp { +namespace { + +#include "src/dsp/cdef.inc" + +// Silence unused function warnings when CdefDirection_C is obviated. +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \ + !defined(LIBGAV1_Dsp8bpp_CdefDirection) || \ + (LIBGAV1_MAX_BITDEPTH >= 10 && !defined(LIBGAV1_Dsp10bpp_CdefDirection)) +constexpr int16_t kDivisionTable[] = {840, 420, 280, 210, 168, 140, 120, 105}; + +int32_t Square(int32_t x) { return x * x; } + +template +void CdefDirection_C(const void* LIBGAV1_RESTRICT const source, + ptrdiff_t stride, + uint8_t* LIBGAV1_RESTRICT const direction, + int* LIBGAV1_RESTRICT const variance) { + assert(direction != nullptr); + assert(variance != nullptr); + const auto* src = static_cast(source); + stride /= sizeof(Pixel); + int32_t cost[8] = {}; + // |partial| does not have to be int32_t for 8bpp. int16_t will suffice. We + // use int32_t to keep it simple since |cost| will have to be int32_t. 
+ int32_t partial[8][15] = {}; + for (int i = 0; i < 8; ++i) { + for (int j = 0; j < 8; ++j) { + const int x = (src[j] >> (bitdepth - 8)) - 128; + partial[0][i + j] += x; + partial[1][i + j / 2] += x; + partial[2][i] += x; + partial[3][3 + i - j / 2] += x; + partial[4][7 + i - j] += x; + partial[5][3 - i / 2 + j] += x; + partial[6][j] += x; + partial[7][i / 2 + j] += x; + } + src += stride; + } + for (int i = 0; i < 8; ++i) { + cost[2] += Square(partial[2][i]); + cost[6] += Square(partial[6][i]); + } + cost[2] *= kDivisionTable[7]; + cost[6] *= kDivisionTable[7]; + for (int i = 0; i < 7; ++i) { + cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) * + kDivisionTable[i]; + cost[4] += (Square(partial[4][i]) + Square(partial[4][14 - i])) * + kDivisionTable[i]; + } + cost[0] += Square(partial[0][7]) * kDivisionTable[7]; + cost[4] += Square(partial[4][7]) * kDivisionTable[7]; + for (int i = 1; i < 8; i += 2) { + for (int j = 0; j < 5; ++j) { + cost[i] += Square(partial[i][3 + j]); + } + cost[i] *= kDivisionTable[7]; + for (int j = 0; j < 3; ++j) { + cost[i] += (Square(partial[i][j]) + Square(partial[i][10 - j])) * + kDivisionTable[2 * j + 1]; + } + } + int32_t best_cost = 0; + *direction = 0; + for (int i = 0; i < 8; ++i) { + if (cost[i] > best_cost) { + best_cost = cost[i]; + *direction = i; + } + } + *variance = (best_cost - cost[(*direction + 4) & 7]) >> 10; +} +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || + // !defined(LIBGAV1_Dsp8bpp_CdefDirection) || + // (LIBGAV1_MAX_BITDEPTH >= 10 && + // !defined(LIBGAV1_Dsp10bpp_CdefDirection)) + +// Silence unused function warnings when CdefFilter_C is obviated. +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \ + !defined(LIBGAV1_Dsp8bpp_CdefFilters) || \ + (LIBGAV1_MAX_BITDEPTH >= 10 && !defined(LIBGAV1_Dsp10bpp_CdefFilters)) + +int Constrain(int diff, int threshold, int damping) { + assert(threshold != 0); + damping = std::max(0, damping - FloorLog2(threshold)); + const int sign = (diff < 0) ? -1 : 1; + return sign * + Clip3(threshold - (std::abs(diff) >> damping), 0, std::abs(diff)); +} + +// Filters the source block. It doesn't check whether the candidate pixel is +// inside the frame. However it requires the source input to be padded with a +// constant large value (kCdefLargeValue) if at the boundary. +template +void CdefFilter_C(const uint16_t* LIBGAV1_RESTRICT src, + const ptrdiff_t src_stride, const int block_height, + const int primary_strength, const int secondary_strength, + const int damping, const int direction, + void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t dest_stride) { + static_assert(block_width == 4 || block_width == 8, "Invalid CDEF width."); + static_assert(enable_primary || enable_secondary, ""); + assert(block_height == 4 || block_height == 8); + assert(direction >= 0 && direction <= 7); + constexpr int coeff_shift = bitdepth - 8; + // Section 5.9.19. CDEF params syntax. + assert(primary_strength >= 0 && primary_strength <= 15 << coeff_shift); + assert(secondary_strength >= 0 && secondary_strength <= 4 << coeff_shift && + secondary_strength != 3 << coeff_shift); + assert(primary_strength != 0 || secondary_strength != 0); + // damping is decreased by 1 for chroma. 
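+  // For example, Constrain(6, 4, 3) first lowers damping by
+  // FloorLog2(4) = 2 and returns Clip3(4 - (6 >> 1), 0, 6) = 1: the
+  // correction tapers off as the difference grows past the threshold.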
+ assert((damping >= 3 && damping <= 6 + coeff_shift) || + (damping >= 2 && damping <= 5 + coeff_shift)); + // When only primary_strength or secondary_strength are non-zero the number + // of pixels inspected (4 for primary_strength, 8 for secondary_strength) and + // the taps used don't exceed the amount the sum is + // descaled by (16) so we can skip tracking and clipping to the minimum and + // maximum value observed. + constexpr bool clipping_required = enable_primary && enable_secondary; + static constexpr int kCdefSecondaryTaps[2] = {kCdefSecondaryTap0, + kCdefSecondaryTap1}; + auto* dst = static_cast(dest); + const ptrdiff_t dst_stride = dest_stride / sizeof(Pixel); + int y = block_height; + do { + int x = 0; + do { + int16_t sum = 0; + const uint16_t pixel_value = src[x]; + uint16_t max_value = pixel_value; + uint16_t min_value = pixel_value; + for (int k = 0; k < 2; ++k) { + static constexpr int signs[] = {-1, 1}; + for (const int& sign : signs) { + if (enable_primary) { + const int dy = sign * kCdefDirections[direction][k][0]; + const int dx = sign * kCdefDirections[direction][k][1]; + const uint16_t value = src[dy * src_stride + dx + x]; + // Note: the summation can ignore the condition check in SIMD + // implementation, because Constrain() will return 0 when + // value == kCdefLargeValue. + if (value != kCdefLargeValue) { + sum += Constrain(value - pixel_value, primary_strength, damping) * + kCdefPrimaryTaps[(primary_strength >> coeff_shift) & 1][k]; + if (clipping_required) { + max_value = std::max(value, max_value); + min_value = std::min(value, min_value); + } + } + } + + if (enable_secondary) { + static constexpr int offsets[] = {-2, 2}; + for (const int& offset : offsets) { + const int dy = sign * kCdefDirections[direction + offset][k][0]; + const int dx = sign * kCdefDirections[direction + offset][k][1]; + const uint16_t value = src[dy * src_stride + dx + x]; + // Note: the summation can ignore the condition check in SIMD + // implementation. 
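+            // (Constrain() likewise returns 0 for kCdefLargeValue here: the
+            // padded value makes the difference huge, so the tapered
+            // threshold term clips to zero.)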
+ if (value != kCdefLargeValue) { + sum += Constrain(value - pixel_value, secondary_strength, + damping) * + kCdefSecondaryTaps[k]; + if (clipping_required) { + max_value = std::max(value, max_value); + min_value = std::min(value, min_value); + } + } + } + } + } + } + + const int offset = (8 + sum - (sum < 0)) >> 4; + if (clipping_required) { + dst[x] = static_cast( + Clip3(pixel_value + offset, min_value, max_value)); + } else { + dst[x] = static_cast(pixel_value + offset); + } + } while (++x < block_width); + + src += src_stride; + dst += dst_stride; + } while (--y != 0); +} +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || + // !defined(LIBGAV1_Dsp8bpp_CdefFilters) || + // (LIBGAV1_MAX_BITDEPTH >= 10 && + // !defined(LIBGAV1_Dsp10bpp_CdefFilters)) + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(8); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->cdef_direction = CdefDirection_C<8, uint8_t>; + dsp->cdef_filters[0][0] = CdefFilter_C<4, 8, uint8_t>; + dsp->cdef_filters[0][1] = CdefFilter_C<4, 8, uint8_t, /*enable_primary=*/true, + /*enable_secondary=*/false>; + dsp->cdef_filters[0][2] = + CdefFilter_C<4, 8, uint8_t, /*enable_primary=*/false>; + dsp->cdef_filters[1][0] = CdefFilter_C<8, 8, uint8_t>; + dsp->cdef_filters[1][1] = CdefFilter_C<8, 8, uint8_t, /*enable_primary=*/true, + /*enable_secondary=*/false>; + dsp->cdef_filters[1][2] = + CdefFilter_C<8, 8, uint8_t, /*enable_primary=*/false>; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast(dsp); +#ifndef LIBGAV1_Dsp8bpp_CdefDirection + dsp->cdef_direction = CdefDirection_C<8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_CdefFilters + dsp->cdef_filters[0][0] = CdefFilter_C<4, 8, uint8_t>; + dsp->cdef_filters[0][1] = CdefFilter_C<4, 8, uint8_t, /*enable_primary=*/true, + /*enable_secondary=*/false>; + dsp->cdef_filters[0][2] = + CdefFilter_C<4, 8, uint8_t, /*enable_primary=*/false>; + dsp->cdef_filters[1][0] = CdefFilter_C<8, 8, uint8_t>; + dsp->cdef_filters[1][1] = CdefFilter_C<8, 8, uint8_t, /*enable_primary=*/true, + /*enable_secondary=*/false>; + dsp->cdef_filters[1][2] = + CdefFilter_C<8, 8, uint8_t, /*enable_primary=*/false>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(10); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->cdef_direction = CdefDirection_C<10, uint16_t>; + dsp->cdef_filters[0][0] = CdefFilter_C<4, 10, uint16_t>; + dsp->cdef_filters[0][1] = + CdefFilter_C<4, 10, uint16_t, /*enable_primary=*/true, + /*enable_secondary=*/false>; + dsp->cdef_filters[0][2] = + CdefFilter_C<4, 10, uint16_t, /*enable_primary=*/false>; + dsp->cdef_filters[1][0] = CdefFilter_C<8, 10, uint16_t>; + dsp->cdef_filters[1][1] = + CdefFilter_C<8, 10, uint16_t, /*enable_primary=*/true, + /*enable_secondary=*/false>; + dsp->cdef_filters[1][2] = + CdefFilter_C<8, 10, uint16_t, /*enable_primary=*/false>; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast(dsp); +#ifndef LIBGAV1_Dsp10bpp_CdefDirection + dsp->cdef_direction = CdefDirection_C<10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_CdefFilters + dsp->cdef_filters[0][0] = CdefFilter_C<4, 10, uint16_t>; + dsp->cdef_filters[0][1] = + CdefFilter_C<4, 10, uint16_t, /*enable_primary=*/true, + /*enable_secondary=*/false>; + dsp->cdef_filters[0][2] = + CdefFilter_C<4, 10, uint16_t, /*enable_primary=*/false>; + dsp->cdef_filters[1][0] = CdefFilter_C<8, 10, uint16_t>; + dsp->cdef_filters[1][1] = + CdefFilter_C<8, 
10, uint16_t, /*enable_primary=*/true, + /*enable_secondary=*/false>; + dsp->cdef_filters[1][2] = + CdefFilter_C<8, 10, uint16_t, /*enable_primary=*/false>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif + +} // namespace + +void CdefInit_C() { + Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/cdef.h b/src/dsp/cdef.h new file mode 100644 index 0000000..b820b77 --- /dev/null +++ b/src/dsp/cdef.h @@ -0,0 +1,48 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_CDEF_H_ +#define LIBGAV1_SRC_DSP_CDEF_H_ + +// Pull in LIBGAV1_DspXXX defines representing the implementation status +// of each function. The resulting value of each can be used by each module to +// determine whether an implementation is needed at compile time. +// IWYU pragma: begin_exports + +// ARM: +#include "src/dsp/arm/cdef_neon.h" + +// x86: +// Note includes should be sorted in logical order avx2/avx/sse4, etc. +// The order of includes is important as each tests for a superior version +// before setting the base. +// clang-format off +#include "src/dsp/x86/cdef_avx2.h" +#include "src/dsp/x86/cdef_sse4.h" +// clang-format on +// IWYU pragma: end_exports + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::cdef_direction and Dsp::cdef_filters. This function is not +// thread-safe. +void CdefInit_C(); + +} // namespace dsp +} // namespace libgav1 + +#endif // LIBGAV1_SRC_DSP_CDEF_H_ diff --git a/src/dsp/cdef.inc b/src/dsp/cdef.inc new file mode 100644 index 0000000..c1a3136 --- /dev/null +++ b/src/dsp/cdef.inc @@ -0,0 +1,29 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Constants used for cdef implementations. +// This will be included inside an anonymous namespace on files where these are +// necessary. + +const int8_t (*const kCdefDirections)[2][2] = kCdefDirectionsPadded + 2; + +// Mirror values and pad to 16 elements. +alignas(16) constexpr uint32_t kCdefDivisionTable[] = { + 840, 420, 280, 210, 168, 140, 120, 105, + 120, 140, 168, 210, 280, 420, 840, 0}; + +// Used when calculating odd |cost[x]| values to mask off unwanted elements. 
+// Holds elements 1 3 5 X 5 3 1 X
+alignas(16) constexpr uint32_t kCdefDivisionTableOdd[] = {420, 210, 140, 0,
+                                                          140, 210, 420, 0};
diff --git a/src/dsp/cdef_test.cc b/src/dsp/cdef_test.cc
new file mode 100644
index 0000000..c10a8d7
--- /dev/null
+++ b/src/dsp/cdef_test.cc
@@ -0,0 +1,401 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/cdef.h"
+
+#include <cstdint>
+#include <cstring>
+#include <ostream>
+
+#include "absl/strings/match.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/third_party/libvpx/md5_helper.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr char kCdef[] = "Cdef";
+constexpr char kCdefDirectionName[] = "Cdef Direction";
+constexpr char kCdefFilterName[] = "Cdef Filtering";
+constexpr int kTestBufferStride = 8;
+constexpr int kTestBufferSize = 64;
+constexpr int kSourceStride = kMaxSuperBlockSizeInPixels + 2 * 8;
+constexpr int kSourceBufferSize =
+    (kMaxSuperBlockSizeInPixels + 2 * 3) * kSourceStride;
+constexpr int kNumSpeedTests = 5000;
+
+const char* GetDirectionDigest(const int bitdepth, const int num_runs) {
+  static const char* const kDigest[2][2] = {
+      {"de78c820a1fec7e81385aa0a615dbf8c", "7bfc543244f932a542691480dc4541b2"},
+      {"b54236de5d25e16c0f8678d9784cb85e", "559144cf183f3c69cb0e5d98cbf532ff"}};
+  const int bitdepth_index = (bitdepth == 8) ? 0 : 1;
+  const int run_index = (num_runs == 1) ? 0 : 1;
+  return kDigest[bitdepth_index][run_index];
+}
+
+// The 'int' parameter is unused but required to allow for instantiations of C,
+// NEON, etc.
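+// For orientation, a minimal sketch of calling one of these functions
+// directly, assuming the CdefDirectionFunc signature declared in
+// src/dsp/dsp.h (source pixels, stride in bytes, direction out, variance
+// out); the buffer contents here are hypothetical:
+//
+//   uint8_t block[8 * 8] = {/* pixel data */};
+//   uint8_t direction;  // receives a value in [0, 7]
+//   int variance;       // receives a variance measure of the block
+//   dsp->cdef_direction(block, /*stride=*/8, &direction, &variance);
+//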
+template <int bitdepth, typename Pixel>
+class CdefDirectionTest : public testing::TestWithParam<int> {
+ public:
+  CdefDirectionTest() = default;
+  CdefDirectionTest(const CdefDirectionTest&) = delete;
+  CdefDirectionTest& operator=(const CdefDirectionTest&) = delete;
+  ~CdefDirectionTest() override = default;
+
+ protected:
+  void SetUp() override {
+    test_utils::ResetDspTable(bitdepth);
+    CdefInit_C();
+
+    const Dsp* const dsp = GetDspTable(bitdepth);
+    ASSERT_NE(dsp, nullptr);
+    base_cdef_direction_ = nullptr;
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const char* const test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      CdefInit_SSE4_1();
+    } else if (absl::StartsWith(test_case, "AVX2/")) {
+      if ((GetCpuInfo() & kAVX2) != 0) {
+        CdefInit_AVX2();
+      }
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+      CdefInit_NEON();
+    } else {
+      FAIL() << "Unrecognized architecture prefix in test case name: "
+             << test_case;
+    }
+    cur_cdef_direction_ = dsp->cdef_direction;
+  }
+
+  void TestRandomValues(int num_runs);
+
+  Pixel buffer_[kTestBufferSize];
+  int strength_;
+  int size_;
+
+  CdefDirectionFunc base_cdef_direction_;
+  CdefDirectionFunc cur_cdef_direction_;
+};
+
+template <int bitdepth, typename Pixel>
+void CdefDirectionTest<bitdepth, Pixel>::TestRandomValues(int num_runs) {
+  if (cur_cdef_direction_ == nullptr) return;
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+  absl::Duration elapsed_time;
+  libvpx_test::MD5 actual_digest;
+  for (int num_tests = 0; num_tests < num_runs; ++num_tests) {
+    for (int level = 0; level < (1 << bitdepth); level += 1 + (bitdepth - 8)) {
+      for (int bits = 0; bits <= bitdepth; ++bits) {
+        for (auto& pixel : buffer_) {
+          pixel = Clip3((rnd.Rand16() & ((1 << bits) - 1)) + level, 0,
+                        (1 << bitdepth) - 1);
+        }
+        int output[2] = {};
+        const absl::Time start = absl::Now();
+        cur_cdef_direction_(buffer_, kTestBufferStride * sizeof(Pixel),
+                            reinterpret_cast<uint8_t*>(&output[0]),
+                            &output[1]);
+        elapsed_time += absl::Now() - start;
+        actual_digest.Add(reinterpret_cast<const uint8_t*>(output),
+                          sizeof(output));
+      }
+    }
+  }
+  test_utils::CheckMd5Digest(kCdef, kCdefDirectionName,
+                             GetDirectionDigest(bitdepth, num_runs),
+                             actual_digest.Get(), elapsed_time);
+}
+
+using CdefDirectionTest8bpp = CdefDirectionTest<8, uint8_t>;
+
+TEST_P(CdefDirectionTest8bpp, Correctness) { TestRandomValues(1); }
+
+TEST_P(CdefDirectionTest8bpp, DISABLED_Speed) {
+  TestRandomValues(kNumSpeedTests / 100);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, CdefDirectionTest8bpp, testing::Values(0));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, CdefDirectionTest8bpp, testing::Values(0));
+#endif
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, CdefDirectionTest8bpp, testing::Values(0));
+#endif
+
+#if LIBGAV1_ENABLE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, CdefDirectionTest8bpp, testing::Values(0));
+#endif  // LIBGAV1_ENABLE_AVX2
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using CdefDirectionTest10bpp = CdefDirectionTest<10, uint16_t>;
+
+TEST_P(CdefDirectionTest10bpp, Correctness) { TestRandomValues(1); }
+
+TEST_P(CdefDirectionTest10bpp, DISABLED_Speed) {
+  TestRandomValues(kNumSpeedTests / 100);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, CdefDirectionTest10bpp, testing::Values(0));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, CdefDirectionTest10bpp, testing::Values(0));
+#endif
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+const char* GetDigest8bpp(int id) {
+  static const char* const kDigest[] = {
"b6fe1a1f5bbb23e35197160ce57d90bd", "8aed39871b19184f1d381b145779bc33", + "82653dd66072e8ebd967083a0413ab03", "421c048396bc66ffaa6aafa016c7bc54", + "1f70ba51091e8c6034c3f0974af241c3", "8f700997452a24091136ca58890a5be4", + "9e3dea21ee4246172121f0420eccd899", "0848bdeffa74145758ef47992e1035c4", + "0bb55818de986e9d988b0c1cc6883887", "9b558a7eefc934f90cd09ca26b998bfd", + "3a38670f8c5f0c61cc47c9c79da728d2", "ed18fe91180e78008ccb98e9019bed69", + "2aa4bbcb6fb088ad42bde76be014dff0", "88f746f0d6c079ab8e9ecc7ff67524c7", + "7cffa948f5ddbccc7c6b07d15ca9eb69", "5e22c1c89735965dda935d1249129548", + "e765133d133b94e1578c8c5616248a96", "da95d47cad74eb4a075893ca98e658ab", + }; + return kDigest[id]; +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +const char* GetDigest10bpp(int id) { + static const char* const kDigest[] = { + "0a9630b39974850998db653b07e09ab4", "97a924661d931b23ee57893da617ae70", + "0d79516b9a491ce5112eb00bbae5eb80", "d5801fd96029a7509cf66dde61e8e2d8", + "5bf5c0ea5a85e9b6c1e6991619c34ebc", "e2f1c08a8b3cd93b3a85511493a0ee31", + "45c047d2be5e2dcf6094937780a3f88a", "346caf437c1ad85862de72a622e29845", + "0e9cb69d24d9badbe956da779d912b05", "81803dcb00971237b3fe6372564a842f", + "17681ad2ed4a2456d70760852af6c6fd", "5312f8049a08a5f9b1708fda936f7a55", + "3f0f522f3a33e4ff2a97bdc1e614c5c4", "3818a50be7fe16aa0c636a7392d1eceb", + "c6849b8cd77a076dc7e3c26e8cd55b9e", "223c0dd685bbc74aec1d088356708433", + "90992957cb8103222aa2fb43c6cd2fc4", "a4ba6edcefe4130851c4c2607b147f95", + }; + return kDigest[id]; +} +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +struct CdefTestParam { + CdefTestParam(int subsampling_x, int subsampling_y, int rows4x4, + int columns4x4) + : subsampling_x(subsampling_x), + subsampling_y(subsampling_y), + rows4x4(rows4x4), + columns4x4(columns4x4) {} + int subsampling_x; + int subsampling_y; + int rows4x4; + int columns4x4; +}; + +std::ostream& operator<<(std::ostream& os, const CdefTestParam& param) { + return os << "subsampling(x/y): " << param.subsampling_x << "/" + << param.subsampling_y << ", (rows,columns)4x4: " << param.rows4x4 + << ", " << param.columns4x4; +} + +// TODO(b/154245961): rework the parameters for this test to match +// CdefFilteringFuncs. It should cover 4x4, 8x4, 8x8 blocks and +// primary/secondary strength combinations for both Y and UV. 
+template <int bitdepth, typename Pixel>
+class CdefFilteringTest : public testing::TestWithParam<CdefTestParam> {
+ public:
+  CdefFilteringTest() = default;
+  CdefFilteringTest(const CdefFilteringTest&) = delete;
+  CdefFilteringTest& operator=(const CdefFilteringTest&) = delete;
+  ~CdefFilteringTest() override = default;
+
+ protected:
+  void SetUp() override {
+    test_utils::ResetDspTable(bitdepth);
+    CdefInit_C();
+
+    const Dsp* const dsp = GetDspTable(bitdepth);
+    ASSERT_NE(dsp, nullptr);
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const char* const test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+      CdefInit_NEON();
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      CdefInit_SSE4_1();
+    } else if (absl::StartsWith(test_case, "AVX2/")) {
+      if ((GetCpuInfo() & kAVX2) != 0) {
+        CdefInit_AVX2();
+      }
+    } else {
+      FAIL() << "Unrecognized architecture prefix in test case name: "
+             << test_case;
+    }
+    memcpy(cur_cdef_filter_, dsp->cdef_filters, sizeof(cur_cdef_filter_));
+  }
+
+  void TestRandomValues(int num_runs);
+
+  uint16_t source_[kSourceBufferSize];
+  Pixel dest_[kMaxPlanes][kTestBufferSize];
+  int primary_strength_;
+  int secondary_strength_;
+  int damping_;
+  int direction_;
+  CdefTestParam param_ = GetParam();
+
+  CdefFilteringFuncs cur_cdef_filter_;
+};
+
+template <int bitdepth, typename Pixel>
+void CdefFilteringTest<bitdepth, Pixel>::TestRandomValues(int num_runs) {
+  const int id = static_cast<int>(param_.rows4x4 < 4) * 3 +
+                 (param_.subsampling_x + param_.subsampling_y) * 6;
+  absl::Duration elapsed_time;
+  for (int num_tests = 0; num_tests < num_runs; ++num_tests) {
+    for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
+      const int subsampling_x = (plane == kPlaneY) ? 0 : param_.subsampling_x;
+      const int subsampling_y = (plane == kPlaneY) ? 0 : param_.subsampling_y;
+      const int block_width = 8 >> subsampling_x;
+      const int block_height = 8 >> subsampling_y;
+      libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed() +
+                                 id + plane);
+      const int offset = 2 * kSourceStride + 2;
+      // Fill boundaries with a large value such that cdef does not take them
+      // into calculation.
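+      // The resulting |source_| layout, with L denoting kCdefLargeValue and
+      // p denoting random plane pixels, is roughly:
+      //   L L L L ... L L    <- two full rows of L above the plane
+      //   L L L L ... L L
+      //   L L p p ... L L    <- plane rows start at |offset|; each row has a
+      //   L L p p ... L L       2-wide column of L on both sides
+      //   L L L L ... L L    <- two full rows of L below the plane
+      //   L L L L ... L L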
+      const int plane_width = MultiplyBy4(param_.columns4x4) >> subsampling_x;
+      const int plane_height = MultiplyBy4(param_.rows4x4) >> subsampling_y;
+      for (int y = 0; y < plane_height; ++y) {
+        for (int x = 0; x < plane_width; ++x) {
+          source_[y * kSourceStride + x + offset] =
+              rnd.Rand16() & ((1 << bitdepth) - 1);
+        }
+      }
+      for (int y = 0; y < 2; ++y) {
+        Memset(&source_[y * kSourceStride], kCdefLargeValue, kSourceStride);
+        Memset(&source_[(y + plane_height + 2) * kSourceStride],
+               kCdefLargeValue, kSourceStride);
+      }
+      for (int y = 0; y < plane_height; ++y) {
+        Memset(&source_[y * kSourceStride + offset - 2], kCdefLargeValue, 2);
+        Memset(&source_[y * kSourceStride + offset + plane_width],
+               kCdefLargeValue, 2);
+      }
+      do {
+        int strength = rnd.Rand16() & 15;
+        if (strength == 3) ++strength;
+        primary_strength_ = strength << (bitdepth - 8);
+      } while (primary_strength_ == 0);
+      do {
+        int strength = rnd.Rand16() & 3;
+        if (strength == 3) ++strength;
+        secondary_strength_ = strength << (bitdepth - 8);
+      } while (secondary_strength_ == 0);
+      damping_ = (rnd.Rand16() & 3) + 3;
+      direction_ = (rnd.Rand16() & 7);
+
+      memset(dest_[plane], 0, sizeof(dest_[plane]));
+      const absl::Time start = absl::Now();
+      const int width_index = block_width >> 3;
+      if (cur_cdef_filter_[width_index][0] == nullptr) return;
+      cur_cdef_filter_[width_index][0](
+          source_ + offset, kSourceStride, block_height, primary_strength_,
+          secondary_strength_, damping_, direction_, dest_[plane],
+          kTestBufferStride * sizeof(dest_[0][0]));
+      elapsed_time += absl::Now() - start;
+    }
+  }
+
+  for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
+    if (bitdepth == 8) {
+      test_utils::CheckMd5Digest(kCdef, kCdefFilterName,
+                                 GetDigest8bpp(id + plane),
+                                 reinterpret_cast<uint8_t*>(dest_[plane]),
+                                 sizeof(dest_[plane]), elapsed_time);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    } else {
+      test_utils::CheckMd5Digest(kCdef, kCdefFilterName,
+                                 GetDigest10bpp(id + plane),
+                                 reinterpret_cast<uint8_t*>(dest_[plane]),
+                                 sizeof(dest_[plane]), elapsed_time);
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+    }
+  }
+}
+
+// Do not test single blocks with any subsampling. 2xH and Wx2 blocks are not
+// supported.
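+// Each parameter set below maps to a digest index via the |id| computation in
+// TestRandomValues(): (0,0,4,4) -> 0, (0,0,2,2) -> 3, (1,0,4,4) -> 6,
+// (1,0,2,2) -> 9, (1,1,4,4) -> 12 and (1,1,2,2) -> 15. With one digest per
+// plane, that covers all 18 entries in GetDigest8bpp()/GetDigest10bpp().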
+const CdefTestParam cdef_test_param[] = {
+    CdefTestParam(0, 0, 4, 4), CdefTestParam(0, 0, 2, 2),
+    CdefTestParam(1, 0, 4, 4), CdefTestParam(1, 0, 2, 2),
+    CdefTestParam(1, 1, 4, 4), CdefTestParam(1, 1, 2, 2),
+};
+
+using CdefFilteringTest8bpp = CdefFilteringTest<8, uint8_t>;
+
+TEST_P(CdefFilteringTest8bpp, Correctness) { TestRandomValues(1); }
+
+TEST_P(CdefFilteringTest8bpp, DISABLED_Speed) {
+  TestRandomValues(kNumSpeedTests);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, CdefFilteringTest8bpp,
+                         testing::ValuesIn(cdef_test_param));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, CdefFilteringTest8bpp,
+                         testing::ValuesIn(cdef_test_param));
+#endif
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, CdefFilteringTest8bpp,
+                         testing::ValuesIn(cdef_test_param));
+#endif
+
+#if LIBGAV1_ENABLE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, CdefFilteringTest8bpp,
+                         testing::ValuesIn(cdef_test_param));
+#endif  // LIBGAV1_ENABLE_AVX2
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using CdefFilteringTest10bpp = CdefFilteringTest<10, uint16_t>;
+
+TEST_P(CdefFilteringTest10bpp, Correctness) { TestRandomValues(1); }
+
+TEST_P(CdefFilteringTest10bpp, DISABLED_Speed) {
+  TestRandomValues(kNumSpeedTests);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, CdefFilteringTest10bpp,
+                         testing::ValuesIn(cdef_test_param));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, CdefFilteringTest10bpp,
+                         testing::ValuesIn(cdef_test_param));
+#endif
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+}  // namespace
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/common.h b/src/dsp/common.h
new file mode 100644
index 0000000..d614a81
--- /dev/null
+++ b/src/dsp/common.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_COMMON_H_
+#define LIBGAV1_SRC_DSP_COMMON_H_
+
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+
+enum { kSgrStride = kRestorationUnitWidth + 32 };  // anonymous enum
+
+// Self guided projection filter.
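+// For illustration, a rough sketch of how the fields below are used (see
+// Section 7.17.3; this is not the exact library code): the restored pixel is
+// the source plus a weighted correction from the two box filter passes,
+//   dst ~= src + (multiplier[0] * (flt0 - src) +
+//                 multiplier[1] * (flt1 - src)) >> 7
+// where flt0/flt1 are the outputs of the two self-guided filter passes
+// selected by |index|.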
+struct SgrProjInfo {
+  int index;
+  int multiplier[2];
+};
+
+struct WienerInfo {
+  static const int kVertical = 0;
+  static const int kHorizontal = 1;
+  int16_t number_leading_zero_coefficients[2];
+  alignas(kMaxAlignment) int16_t filter[2][(kWienerFilterTaps + 1) / 2];
+};
+
+struct RestorationUnitInfo : public MaxAlignedAllocable {
+  LoopRestorationType type;
+  SgrProjInfo sgr_proj_info;
+  WienerInfo wiener_info;
+};
+
+struct SgrBuffer {
+  alignas(kMaxAlignment) uint16_t sum3[4 * kSgrStride];
+  alignas(kMaxAlignment) uint16_t sum5[5 * kSgrStride];
+  alignas(kMaxAlignment) uint32_t square_sum3[4 * kSgrStride];
+  alignas(kMaxAlignment) uint32_t square_sum5[5 * kSgrStride];
+  alignas(kMaxAlignment) uint16_t ma343[4 * kRestorationUnitWidth];
+  alignas(kMaxAlignment) uint16_t ma444[3 * kRestorationUnitWidth];
+  alignas(kMaxAlignment) uint16_t ma565[2 * kRestorationUnitWidth];
+  alignas(kMaxAlignment) uint32_t b343[4 * kRestorationUnitWidth];
+  alignas(kMaxAlignment) uint32_t b444[3 * kRestorationUnitWidth];
+  alignas(kMaxAlignment) uint32_t b565[2 * kRestorationUnitWidth];
+  // The following 2 buffers are only used by the C functions. Since SgrBuffer
+  // is smaller than |wiener_buffer| in RestorationBuffer, which is a union,
+  // it's OK to always keep the following 2 buffers.
+  alignas(kMaxAlignment) uint8_t ma[kSgrStride];  // [0, 255]
+  // b is less than 2^16 for 8-bit. However, making it a template slows down
+  // the C function by 5%. So b is fixed to 32-bit.
+  alignas(kMaxAlignment) uint32_t b[kSgrStride];
+};
+
+union RestorationBuffer {
+  // For self-guided filter.
+  SgrBuffer sgr_buffer;
+  // For wiener filter.
+  // The array |intermediate| in Section 7.17.4, the intermediate results
+  // between the horizontal and vertical filters.
+  alignas(kMaxAlignment) int16_t
+      wiener_buffer[(kRestorationUnitHeight + kWienerFilterTaps - 1) *
+                    kRestorationUnitWidth];
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_COMMON_H_
diff --git a/src/dsp/constants.cc b/src/dsp/constants.cc
new file mode 100644
index 0000000..1b85795
--- /dev/null
+++ b/src/dsp/constants.cc
@@ -0,0 +1,103 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/constants.h"
+
+#include <cstdint>
+
+namespace libgav1 {
+
+// Each set of 7 taps is padded with a 0 to easily align and pack into the high
+// and low 8 bytes. This way, we can load 16 at a time to fit mulhi and mullo.
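+// For illustration: the 7 taps in each row below sum to 16 (ignoring the zero
+// pad), so a predicted pixel is computed roughly as
+//   p = Clip3(RightShiftWithRounding(sum_i(taps[i] * neighbor[i]), 4), 0, max)
+// over the 7 neighboring pixels above and to the left of each 4x2 sub-block.
+// A sketch of the general scheme; the exact use is in intrapred_filter.cc.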
+alignas(16) const int8_t kFilterIntraTaps[kNumFilterIntraPredictors][8][8] = { + {{-6, 10, 0, 0, 0, 12, 0, 0}, + {-5, 2, 10, 0, 0, 9, 0, 0}, + {-3, 1, 1, 10, 0, 7, 0, 0}, + {-3, 1, 1, 2, 10, 5, 0, 0}, + {-4, 6, 0, 0, 0, 2, 12, 0}, + {-3, 2, 6, 0, 0, 2, 9, 0}, + {-3, 2, 2, 6, 0, 2, 7, 0}, + {-3, 1, 2, 2, 6, 3, 5, 0}}, + {{-10, 16, 0, 0, 0, 10, 0, 0}, + {-6, 0, 16, 0, 0, 6, 0, 0}, + {-4, 0, 0, 16, 0, 4, 0, 0}, + {-2, 0, 0, 0, 16, 2, 0, 0}, + {-10, 16, 0, 0, 0, 0, 10, 0}, + {-6, 0, 16, 0, 0, 0, 6, 0}, + {-4, 0, 0, 16, 0, 0, 4, 0}, + {-2, 0, 0, 0, 16, 0, 2, 0}}, + {{-8, 8, 0, 0, 0, 16, 0, 0}, + {-8, 0, 8, 0, 0, 16, 0, 0}, + {-8, 0, 0, 8, 0, 16, 0, 0}, + {-8, 0, 0, 0, 8, 16, 0, 0}, + {-4, 4, 0, 0, 0, 0, 16, 0}, + {-4, 0, 4, 0, 0, 0, 16, 0}, + {-4, 0, 0, 4, 0, 0, 16, 0}, + {-4, 0, 0, 0, 4, 0, 16, 0}}, + {{-2, 8, 0, 0, 0, 10, 0, 0}, + {-1, 3, 8, 0, 0, 6, 0, 0}, + {-1, 2, 3, 8, 0, 4, 0, 0}, + {0, 1, 2, 3, 8, 2, 0, 0}, + {-1, 4, 0, 0, 0, 3, 10, 0}, + {-1, 3, 4, 0, 0, 4, 6, 0}, + {-1, 2, 3, 4, 0, 4, 4, 0}, + {-1, 2, 2, 3, 4, 3, 3, 0}}, + {{-12, 14, 0, 0, 0, 14, 0, 0}, + {-10, 0, 14, 0, 0, 12, 0, 0}, + {-9, 0, 0, 14, 0, 11, 0, 0}, + {-8, 0, 0, 0, 14, 10, 0, 0}, + {-10, 12, 0, 0, 0, 0, 14, 0}, + {-9, 1, 12, 0, 0, 0, 12, 0}, + {-8, 0, 0, 12, 0, 1, 11, 0}, + {-7, 0, 0, 1, 12, 1, 9, 0}}}; + +// A lookup table replacing the calculation of the variable s in Section 7.17.3 +// (Box filter process). The first index is sgr_proj_index (the lr_sgr_set +// syntax element in the Spec, saved in the sgr_proj_info.index field of a +// RestorationUnitInfo struct). The second index is pass (0 or 1). +// +// const uint8_t scale = kSgrProjParams[sgr_proj_index][pass * 2 + 1]; +// const uint32_t n2_with_scale = n * n * scale; +// const uint32_t s = +// ((1 << kSgrProjScaleBits) + (n2_with_scale >> 1)) / n2_with_scale; +// 0 is an invalid value, corresponding to radius = 0, where the filter is +// skipped. +const uint16_t kSgrScaleParameter[16][2] = { + {140, 3236}, {112, 2158}, {93, 1618}, {80, 1438}, {70, 1295}, {58, 1177}, + {47, 1079}, {37, 996}, {30, 925}, {25, 863}, {0, 2589}, {0, 1618}, + {0, 1177}, {0, 925}, {56, 0}, {22, 0}, +}; + +const uint8_t kCdefPrimaryTaps[2][2] = {{4, 2}, {3, 3}}; + +// This is Cdef_Directions (section 7.15.3) with 2 padding entries at the +// beginning and end of the table. The cdef direction range is [0, 7] and the +// first index is offset +/-2. This removes the need to constrain the first +// index to the same range using e.g., & 7. +const int8_t kCdefDirectionsPadded[12][2][2] = { + {{1, 0}, {2, 0}}, // Padding: Cdef_Directions[6] + {{1, 0}, {2, -1}}, // Padding: Cdef_Directions[7] + {{-1, 1}, {-2, 2}}, // Begin Cdef_Directions + {{0, 1}, {-1, 2}}, // + {{0, 1}, {0, 2}}, // + {{0, 1}, {1, 2}}, // + {{1, 1}, {2, 2}}, // + {{1, 0}, {2, 1}}, // + {{1, 0}, {2, 0}}, // + {{1, 0}, {2, -1}}, // End Cdef_Directions + {{-1, 1}, {-2, 2}}, // Padding: Cdef_Directions[0] + {{0, 1}, {-1, 2}}, // Padding: Cdef_Directions[1] +}; + +} // namespace libgav1 diff --git a/src/dsp/constants.h b/src/dsp/constants.h new file mode 100644 index 0000000..7c1b62c --- /dev/null +++ b/src/dsp/constants.h @@ -0,0 +1,71 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_CONSTANTS_H_
+#define LIBGAV1_SRC_DSP_CONSTANTS_H_
+
+// This file contains DSP related constants that have a direct relationship
+// with a DSP component.
+
+#include <cstdint>
+
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+
+enum {
+  // Documentation variables.
+  kBitdepth8 = 8,
+  kBitdepth10 = 10,
+  kBitdepth12 = 12,
+  // Weights are quadratic from '1' to '1 / block_size', scaled by
+  // 2^kSmoothWeightScale.
+  kSmoothWeightScale = 8,
+  kCflLumaBufferStride = 32,
+  // InterRound0, Section 7.11.3.2.
+  kInterRoundBitsHorizontal = 3,  // 8 & 10-bit.
+  kInterRoundBitsHorizontal12bpp = 5,
+  kInterRoundBitsCompoundVertical = 7,  // 8, 10 & 12-bit compound prediction.
+  kInterRoundBitsVertical = 11,  // 8 & 10-bit, single prediction.
+  kInterRoundBitsVertical12bpp = 9,
+  // Offset applied to 10bpp and 12bpp predictors to allow storing them in
+  // uint16_t. Removed before blending.
+  kCompoundOffset = (1 << 14) + (1 << 13),
+  kCdefSecondaryTap0 = 2,
+  kCdefSecondaryTap1 = 1,
+};  // anonymous enum
+
+extern const int8_t kFilterIntraTaps[kNumFilterIntraPredictors][8][8];
+
+// Values in this enum can be derived as the sum of subsampling_x and
+// subsampling_y (since subsampling_x == 0 && subsampling_y == 1 case is never
+// allowed by the bitstream).
+enum SubsamplingType : uint8_t {
+  kSubsamplingType444,  // subsampling_x = 0, subsampling_y = 0.
+  kSubsamplingType422,  // subsampling_x = 1, subsampling_y = 0.
+  kSubsamplingType420,  // subsampling_x = 1, subsampling_y = 1.
+  kNumSubsamplingTypes
+};
+
+extern const uint16_t kSgrScaleParameter[16][2];
+
+extern const uint8_t kCdefPrimaryTaps[2][2];
+
+extern const int8_t kCdefDirectionsPadded[12][2][2];
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_CONSTANTS_H_
diff --git a/src/dsp/convolve.cc b/src/dsp/convolve.cc
new file mode 100644
index 0000000..f11b45e
--- /dev/null
+++ b/src/dsp/convolve.cc
@@ -0,0 +1,879 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/convolve.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kHorizontalOffset = 3;
+constexpr int kVerticalOffset = 3;
+
+// Compound prediction output ranges from ConvolveTest.ShowRange.
+// In some cases, the horizontal or vertical filter will be omitted. This table
+// shows the general case, where the downscaled horizontal output is input to
+// the vertical filter via the |intermediate_result| array. The final output is
+// either Pixel or compound values, depending on the |is_compound| variable.
+// Bitdepth: 8  Input range:        [        0,      255]
+//   Horizontal upscaled range:     [    -7140,    23460]
+//   Horizontal downscaled range:   [    -1785,     5865]
+//   Vertical upscaled range:       [  -328440,   589560]
+//   Pixel output range:            [        0,      255]
+//   Compound output range:         [    -5132,     9212]
+//
+// Bitdepth: 10 Input range:        [        0,     1023]
+//   Horizontal upscaled range:     [   -28644,    94116]
+//   Horizontal downscaled range:   [    -7161,    23529]
+//   Vertical upscaled range:       [ -1317624,  2365176]
+//   Pixel output range:            [        0,     1023]
+//   Compound output range:         [     3988,    61532]
+//
+// Bitdepth: 12 Input range:        [        0,     4095]
+//   Horizontal upscaled range:     [  -114660,   376740]
+//   Horizontal downscaled range:   [    -7166,    23546]
+//   Vertical upscaled range:       [ -1318560,  2366880]
+//   Pixel output range:            [        0,     4095]
+//   Compound output range:         [     3974,    61559]
+
+template <int bitdepth, typename Pixel>
+void ConvolveScale2D_C(const void* LIBGAV1_RESTRICT const reference,
+                       const ptrdiff_t reference_stride,
+                       const int horizontal_filter_index,
+                       const int vertical_filter_index, const int subpixel_x,
+                       const int subpixel_y, const int step_x, const int step_y,
+                       const int width, const int height,
+                       void* LIBGAV1_RESTRICT prediction,
+                       const ptrdiff_t pred_stride) {
+  constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+                                           ? kInterRoundBitsHorizontal12bpp
+                                           : kInterRoundBitsHorizontal;
+  constexpr int kRoundBitsVertical =
+      (bitdepth == 12) ? kInterRoundBitsVertical12bpp : kInterRoundBitsVertical;
+  const int intermediate_height =
+      (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+       kScaleSubPixelBits) +
+      kSubPixelTaps;
+  // The output of the horizontal filter, i.e. the intermediate_result, is
+  // guaranteed to fit in int16_t.
+  int16_t intermediate_result[kMaxSuperBlockSizeInPixels *
+                              (2 * kMaxSuperBlockSizeInPixels + 8)];
+  const int intermediate_stride = kMaxSuperBlockSizeInPixels;
+  const int max_pixel_value = (1 << bitdepth) - 1;
+
+  // Horizontal filter.
+  // Filter types used for width <= 4 are different from those for width > 4.
+  // When width > 4, the valid filter index range is always [0, 3].
+  // When width <= 4, the valid filter index range is always [4, 5].
+  // Similarly for height.
+  int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  int16_t* intermediate = intermediate_result;
+  const auto* src = static_cast<const Pixel*>(reference);
+  const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+  auto* dest = static_cast<Pixel*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  // Note: assume the input src is already aligned to the correct start
+  // position.
+  int y = 0;
+  do {
+    int p = subpixel_x;
+    int x = 0;
+    do {
+      int sum = 0;
+      const Pixel* src_x = &src[(p >> kScaleSubPixelBits) - ref_x];
+      const int filter_id = (p >> 6) & kSubPixelMask;
+      for (int k = 0; k < kSubPixelTaps; ++k) {
+        sum += kHalfSubPixelFilters[filter_index][filter_id][k] * src_x[k];
+      }
+      intermediate[x] = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+      p += step_x;
+    } while (++x < width);
+
+    src += src_stride;
+    intermediate += intermediate_stride;
+  } while (++y < intermediate_height);
+
+  // Vertical filter.
+  filter_index = GetFilterIndex(vertical_filter_index, height);
+  intermediate = intermediate_result;
+  int p = subpixel_y & 1023;
+  y = 0;
+  do {
+    const int filter_id = (p >> 6) & kSubPixelMask;
+    int x = 0;
+    do {
+      int sum = 0;
+      for (int k = 0; k < kSubPixelTaps; ++k) {
+        sum +=
+            kHalfSubPixelFilters[filter_index][filter_id][k] *
+            intermediate[((p >> kScaleSubPixelBits) + k) * intermediate_stride +
+                         x];
+      }
+      dest[x] = Clip3(RightShiftWithRounding(sum, kRoundBitsVertical - 1), 0,
+                      max_pixel_value);
+    } while (++x < width);
+
+    dest += dest_stride;
+    p += step_y;
+  } while (++y < height);
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveCompoundScale2D_C(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int vertical_filter_index, const int subpixel_x, const int subpixel_y,
+    const int step_x, const int step_y, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
+  // All compound functions output to the predictor buffer with |pred_stride|
+  // equal to |width|.
+  assert(pred_stride == width);
+  // Compound functions start at 4x4.
+  assert(width >= 4 && height >= 4);
+  constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+                                           ? kInterRoundBitsHorizontal12bpp
+                                           : kInterRoundBitsHorizontal;
+  constexpr int kRoundBitsVertical = kInterRoundBitsCompoundVertical;
+  const int intermediate_height =
+      (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+       kScaleSubPixelBits) +
+      kSubPixelTaps;
+  // The output of the horizontal filter, i.e. the intermediate_result, is
+  // guaranteed to fit in int16_t.
+  int16_t intermediate_result[kMaxSuperBlockSizeInPixels *
+                              (2 * kMaxSuperBlockSizeInPixels + 8)];
+  const int intermediate_stride = kMaxSuperBlockSizeInPixels;
+
+  // Horizontal filter.
+  // Filter types used for width <= 4 are different from those for width > 4.
+  // When width > 4, the valid filter index range is always [0, 3].
+  // When width <= 4, the valid filter index range is always [4, 5].
+  // Similarly for height.
+  int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  int16_t* intermediate = intermediate_result;
+  const auto* src = static_cast<const Pixel*>(reference);
+  const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+  auto* dest = static_cast<uint16_t*>(prediction);
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  // Note: assume the input src is already aligned to the correct start
+  // position.
+  int y = 0;
+  do {
+    int p = subpixel_x;
+    int x = 0;
+    do {
+      int sum = 0;
+      const Pixel* src_x = &src[(p >> kScaleSubPixelBits) - ref_x];
+      const int filter_id = (p >> 6) & kSubPixelMask;
+      for (int k = 0; k < kSubPixelTaps; ++k) {
+        sum += kHalfSubPixelFilters[filter_index][filter_id][k] * src_x[k];
+      }
+      intermediate[x] = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+      p += step_x;
+    } while (++x < width);
+
+    src += src_stride;
+    intermediate += intermediate_stride;
+  } while (++y < intermediate_height);
+
+  // Vertical filter.
+  filter_index = GetFilterIndex(vertical_filter_index, height);
+  intermediate = intermediate_result;
+  int p = subpixel_y & 1023;
+  y = 0;
+  do {
+    const int filter_id = (p >> 6) & kSubPixelMask;
+    int x = 0;
+    do {
+      int sum = 0;
+      for (int k = 0; k < kSubPixelTaps; ++k) {
+        sum +=
+            kHalfSubPixelFilters[filter_index][filter_id][k] *
+            intermediate[((p >> kScaleSubPixelBits) + k) * intermediate_stride +
+                         x];
+      }
+      sum = RightShiftWithRounding(sum, kRoundBitsVertical - 1);
+      sum += (bitdepth == 8) ? 0 : kCompoundOffset;
+      dest[x] = sum;
+    } while (++x < width);
+
+    dest += pred_stride;
+    p += step_y;
+  } while (++y < height);
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveCompound2D_C(const void* LIBGAV1_RESTRICT const reference,
+                          const ptrdiff_t reference_stride,
+                          const int horizontal_filter_index,
+                          const int vertical_filter_index,
+                          const int horizontal_filter_id,
+                          const int vertical_filter_id, const int width,
+                          const int height, void* LIBGAV1_RESTRICT prediction,
+                          const ptrdiff_t pred_stride) {
+  // All compound functions output to the predictor buffer with |pred_stride|
+  // equal to |width|.
+  assert(pred_stride == width);
+  // Compound functions start at 4x4.
+  assert(width >= 4 && height >= 4);
+  constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+                                           ? kInterRoundBitsHorizontal12bpp
+                                           : kInterRoundBitsHorizontal;
+  constexpr int kRoundBitsVertical = kInterRoundBitsCompoundVertical;
+  const int intermediate_height = height + kSubPixelTaps - 1;
+  // The output of the horizontal filter, i.e. the intermediate_result, is
+  // guaranteed to fit in int16_t.
+  int16_t intermediate_result[kMaxSuperBlockSizeInPixels *
+                              (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+  const int intermediate_stride = kMaxSuperBlockSizeInPixels;
+
+  // Horizontal filter.
+  // Filter types used for width <= 4 are different from those for width > 4.
+  // When width > 4, the valid filter index range is always [0, 3].
+  // When width <= 4, the valid filter index range is always [4, 5].
+  // Similarly for height.
+  int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  int16_t* intermediate = intermediate_result;
+  const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+  const auto* src = static_cast<const Pixel*>(reference) -
+                    kVerticalOffset * src_stride - kHorizontalOffset;
+  auto* dest = static_cast<uint16_t*>(prediction);
+
+  // If |horizontal_filter_id| == 0 then ConvolveVertical() should be called.
+  assert(horizontal_filter_id != 0);
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      int sum = 0;
+      for (int k = 0; k < kSubPixelTaps; ++k) {
+        sum += kHalfSubPixelFilters[filter_index][horizontal_filter_id][k] *
+               src[x + k];
+      }
+      intermediate[x] = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+    } while (++x < width);
+
+    src += src_stride;
+    intermediate += intermediate_stride;
+  } while (++y < intermediate_height);
+
+  // Vertical filter.
+  filter_index = GetFilterIndex(vertical_filter_index, height);
+  intermediate = intermediate_result;
+  // If |vertical_filter_id| == 0 then ConvolveHorizontal() should be called.
+  assert(vertical_filter_id != 0);
+  y = 0;
+  do {
+    int x = 0;
+    do {
+      int sum = 0;
+      for (int k = 0; k < kSubPixelTaps; ++k) {
+        sum += kHalfSubPixelFilters[filter_index][vertical_filter_id][k] *
+               intermediate[k * intermediate_stride + x];
+      }
+      sum = RightShiftWithRounding(sum, kRoundBitsVertical - 1);
+      sum += (bitdepth == 8) ? 0 : kCompoundOffset;
+      dest[x] = sum;
+    } while (++x < width);
+
+    dest += pred_stride;
+    intermediate += intermediate_stride;
+  } while (++y < height);
+}
+
+// This function is a simplified version of ConvolveCompound2D_C.
+// It is called when it is single prediction mode, where both horizontal and
+// vertical filtering are required.
+// The output is the single prediction of the block, clipped to valid pixel
+// range.
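+// A quick sanity check of the rounding pipeline, for 8bpp: the half filters
+// in |kHalfSubPixelFilters| sum to 64, so a flat region of value v produces a
+// horizontal intermediate of
+//   RightShiftWithRounding(64 * v, kInterRoundBitsHorizontal - 1) = 16 * v
+// and a final vertical output of
+//   RightShiftWithRounding(64 * 16 * v, kInterRoundBitsVertical - 1) = v,
+// i.e. a flat input passes through the 2D filter unchanged before clipping.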
+template <int bitdepth, typename Pixel>
+void Convolve2D_C(const void* LIBGAV1_RESTRICT const reference,
+                  const ptrdiff_t reference_stride,
+                  const int horizontal_filter_index,
+                  const int vertical_filter_index,
+                  const int horizontal_filter_id, const int vertical_filter_id,
+                  const int width, const int height,
+                  void* LIBGAV1_RESTRICT prediction,
+                  const ptrdiff_t pred_stride) {
+  constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+                                           ? kInterRoundBitsHorizontal12bpp
+                                           : kInterRoundBitsHorizontal;
+  constexpr int kRoundBitsVertical =
+      (bitdepth == 12) ? kInterRoundBitsVertical12bpp : kInterRoundBitsVertical;
+  const int intermediate_height = height + kSubPixelTaps - 1;
+  // The output of the horizontal filter, i.e. the intermediate_result, is
+  // guaranteed to fit in int16_t.
+  int16_t intermediate_result[kMaxSuperBlockSizeInPixels *
+                              (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+  const int intermediate_stride = kMaxSuperBlockSizeInPixels;
+  const int max_pixel_value = (1 << bitdepth) - 1;
+
+  // Horizontal filter.
+  // Filter types used for width <= 4 are different from those for width > 4.
+  // When width > 4, the valid filter index range is always [0, 3].
+  // When width <= 4, the valid filter index range is always [4, 5].
+  // Similarly for height.
+  int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  int16_t* intermediate = intermediate_result;
+  const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+  const auto* src = static_cast<const Pixel*>(reference) -
+                    kVerticalOffset * src_stride - kHorizontalOffset;
+  auto* dest = static_cast<Pixel*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
+  // If |horizontal_filter_id| == 0 then ConvolveVertical() should be called.
+  assert(horizontal_filter_id != 0);
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      int sum = 0;
+      for (int k = 0; k < kSubPixelTaps; ++k) {
+        sum += kHalfSubPixelFilters[filter_index][horizontal_filter_id][k] *
+               src[x + k];
+      }
+      intermediate[x] = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+    } while (++x < width);
+
+    src += src_stride;
+    intermediate += intermediate_stride;
+  } while (++y < intermediate_height);
+
+  // Vertical filter.
+  filter_index = GetFilterIndex(vertical_filter_index, height);
+  intermediate = intermediate_result;
+  // If |vertical_filter_id| == 0 then ConvolveHorizontal() should be called.
+  assert(vertical_filter_id != 0);
+  y = 0;
+  do {
+    int x = 0;
+    do {
+      int sum = 0;
+      for (int k = 0; k < kSubPixelTaps; ++k) {
+        sum += kHalfSubPixelFilters[filter_index][vertical_filter_id][k] *
+               intermediate[k * intermediate_stride + x];
+      }
+      dest[x] = Clip3(RightShiftWithRounding(sum, kRoundBitsVertical - 1), 0,
+                      max_pixel_value);
+    } while (++x < width);
+
+    dest += dest_stride;
+    intermediate += intermediate_stride;
+  } while (++y < height);
+}
+
+// This function is a simplified version of Convolve2D_C.
+// It is called when it is single prediction mode, where only horizontal
+// filtering is required.
+// The output is the single prediction of the block, clipped to valid pixel
+// range.
+template <int bitdepth, typename Pixel>
+void ConvolveHorizontal_C(const void* LIBGAV1_RESTRICT const reference,
+                          const ptrdiff_t reference_stride,
+                          const int horizontal_filter_index,
+                          const int /*vertical_filter_index*/,
+                          const int horizontal_filter_id,
+                          const int /*vertical_filter_id*/, const int width,
+                          const int height, void* LIBGAV1_RESTRICT prediction,
+                          const ptrdiff_t pred_stride) {
+  constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+                                           ? kInterRoundBitsHorizontal12bpp
+                                           : kInterRoundBitsHorizontal;
+  const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const int bits = kFilterBits - kRoundBitsHorizontal;
+  const auto* src = static_cast<const Pixel*>(reference) - kHorizontalOffset;
+  const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+  auto* dest = static_cast<Pixel*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
+  const int max_pixel_value = (1 << bitdepth) - 1;
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      int sum = 0;
+      for (int k = 0; k < kSubPixelTaps; ++k) {
+        sum += kHalfSubPixelFilters[filter_index][horizontal_filter_id][k] *
+               src[x + k];
+      }
+      sum = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+      dest[x] = Clip3(RightShiftWithRounding(sum, bits), 0, max_pixel_value);
+    } while (++x < width);
+
+    src += src_stride;
+    dest += dest_stride;
+  } while (++y < height);
+}
+
+// This function is a simplified version of Convolve2D_C.
+// It is called when it is single prediction mode, where only vertical
+// filtering is required.
+// The output is the single prediction of the block, clipped to valid pixel
+// range.
+template <int bitdepth, typename Pixel>
+void ConvolveVertical_C(const void* LIBGAV1_RESTRICT const reference,
+                        const ptrdiff_t reference_stride,
+                        const int /*horizontal_filter_index*/,
+                        const int vertical_filter_index,
+                        const int /*horizontal_filter_id*/,
+                        const int vertical_filter_id, const int width,
+                        const int height, void* LIBGAV1_RESTRICT prediction,
+                        const ptrdiff_t pred_stride) {
+  const int filter_index = GetFilterIndex(vertical_filter_index, height);
+  const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+  const auto* src =
+      static_cast<const Pixel*>(reference) - kVerticalOffset * src_stride;
+  auto* dest = static_cast<Pixel*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
+  // Copy filters must call ConvolveCopy().
+  assert(vertical_filter_id != 0);
+
+  const int max_pixel_value = (1 << bitdepth) - 1;
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      int sum = 0;
+      for (int k = 0; k < kSubPixelTaps; ++k) {
+        sum += kHalfSubPixelFilters[filter_index][vertical_filter_id][k] *
+               src[k * src_stride + x];
+      }
+      dest[x] = Clip3(RightShiftWithRounding(sum, kFilterBits - 1), 0,
+                      max_pixel_value);
+    } while (++x < width);
+
+    src += src_stride;
+    dest += dest_stride;
+  } while (++y < height);
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveCopy_C(const void* LIBGAV1_RESTRICT const reference,
+                    const ptrdiff_t reference_stride,
+                    const int /*horizontal_filter_index*/,
+                    const int /*vertical_filter_index*/,
+                    const int /*horizontal_filter_id*/,
+                    const int /*vertical_filter_id*/, const int width,
+                    const int height, void* LIBGAV1_RESTRICT prediction,
+                    const ptrdiff_t pred_stride) {
+  const auto* src = static_cast<const uint8_t*>(reference);
+  auto* dest = static_cast<uint8_t*>(prediction);
+  int y = 0;
+  do {
+    memcpy(dest, src, width * sizeof(Pixel));
+    src += reference_stride;
+    dest += pred_stride;
+  } while (++y < height);
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveCompoundCopy_C(const void* LIBGAV1_RESTRICT const reference,
+                            const ptrdiff_t reference_stride,
+                            const int /*horizontal_filter_index*/,
+                            const int /*vertical_filter_index*/,
+                            const int /*horizontal_filter_id*/,
+                            const int /*vertical_filter_id*/, const int width,
+                            const int height,
+                            void* LIBGAV1_RESTRICT prediction,
+                            const ptrdiff_t pred_stride) {
+  // All compound functions output to the predictor buffer with |pred_stride|
+  // equal to |width|.
+  assert(pred_stride == width);
+  // Compound functions start at 4x4.
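+  // For 10bpp, the arithmetic below scales the pixel into the compound
+  // domain: kRoundBitsVertical is 11 - 7 = 4, and the added bias satisfies
+  // ((1 << 10) + (1 << 9)) << 4 == (1 << 14) + (1 << 13) == kCompoundOffset,
+  // i.e. the offset that is removed before blending (see src/dsp/constants.h).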
+  assert(width >= 4 && height >= 4);
+  constexpr int kRoundBitsVertical =
+      ((bitdepth == 12) ? kInterRoundBitsVertical12bpp
+                        : kInterRoundBitsVertical) -
+      kInterRoundBitsCompoundVertical;
+  const auto* src = static_cast<const Pixel*>(reference);
+  const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+  auto* dest = static_cast<uint16_t*>(prediction);
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      int sum = (bitdepth == 8) ? 0 : ((1 << bitdepth) + (1 << (bitdepth - 1)));
+      sum += src[x];
+      dest[x] = sum << kRoundBitsVertical;
+    } while (++x < width);
+    src += src_stride;
+    dest += pred_stride;
+  } while (++y < height);
+}
+
+// This function is a simplified version of ConvolveCompound2D_C.
+// It is called when it is compound prediction mode, where only horizontal
+// filtering is required.
+// The output is not clipped to valid pixel range. Its output will be blended
+// with another predictor to generate the final prediction of the block.
+template <int bitdepth, typename Pixel>
+void ConvolveCompoundHorizontal_C(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int /*vertical_filter_index*/, const int horizontal_filter_id,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
+  // All compound functions output to the predictor buffer with |pred_stride|
+  // equal to |width|.
+  assert(pred_stride == width);
+  // Compound functions start at 4x4.
+  assert(width >= 4 && height >= 4);
+  constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+                                           ? kInterRoundBitsHorizontal12bpp
+                                           : kInterRoundBitsHorizontal;
+  const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const auto* src = static_cast<const Pixel*>(reference) - kHorizontalOffset;
+  const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+  auto* dest = static_cast<uint16_t*>(prediction);
+  // Copy filters must call ConvolveCopy().
+  assert(horizontal_filter_id != 0);
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      int sum = 0;
+      for (int k = 0; k < kSubPixelTaps; ++k) {
+        sum += kHalfSubPixelFilters[filter_index][horizontal_filter_id][k] *
+               src[x + k];
+      }
+      sum = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+      sum += (bitdepth == 8) ? 0 : kCompoundOffset;
+      dest[x] = sum;
+    } while (++x < width);
+
+    src += src_stride;
+    dest += pred_stride;
+  } while (++y < height);
+}
+
+// This function is a simplified version of ConvolveCompound2D_C.
+// It is called when it is compound prediction mode, where only vertical
+// filtering is required.
+// The output is not clipped to valid pixel range. Its output will be blended
+// with another predictor to generate the final prediction of the block.
+template <int bitdepth, typename Pixel>
+void ConvolveCompoundVertical_C(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int vertical_filter_index, const int /*horizontal_filter_id*/,
+    const int vertical_filter_id, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
+  // All compound functions output to the predictor buffer with |pred_stride|
+  // equal to |width|.
+  assert(pred_stride == width);
+  // Compound functions start at 4x4.
+  assert(width >= 4 && height >= 4);
+  constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+                                           ? kInterRoundBitsHorizontal12bpp
+                                           : kInterRoundBitsHorizontal;
+  const int filter_index = GetFilterIndex(vertical_filter_index, height);
+  const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+  const auto* src =
+      static_cast<const Pixel*>(reference) - kVerticalOffset * src_stride;
+  auto* dest = static_cast<uint16_t*>(prediction);
+  // Copy filters must call ConvolveCopy().
+  assert(vertical_filter_id != 0);
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      int sum = 0;
+      for (int k = 0; k < kSubPixelTaps; ++k) {
+        sum += kHalfSubPixelFilters[filter_index][vertical_filter_id][k] *
+               src[k * src_stride + x];
+      }
+      sum = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+      sum += (bitdepth == 8) ? 0 : kCompoundOffset;
+      dest[x] = sum;
+    } while (++x < width);
+    src += src_stride;
+    dest += pred_stride;
+  } while (++y < height);
+}
+
+// This function is used when intra block copy is present.
+// It is called when it is single prediction mode for U/V plane, where the
+// reference block is from current frame and both horizontal and vertical
+// filtering are required.
+// The output is the single prediction of the block, clipped to valid pixel
+// range.
+template <int bitdepth, typename Pixel>
+void ConvolveIntraBlockCopy2D_C(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
+  assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels);
+  assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels);
+  const auto* src = static_cast<const Pixel*>(reference);
+  const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+  auto* dest = static_cast<Pixel*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
+  const int intermediate_height = height + 1;
+  uint16_t intermediate_result[kMaxSuperBlockSizeInPixels *
+                               (kMaxSuperBlockSizeInPixels + 1)];
+  uint16_t* intermediate = intermediate_result;
+  // Note: allow vertical access to height + 1. Because this function is only
+  // for u/v plane of intra block copy, such access is guaranteed to be within
+  // the prediction block.
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      intermediate[x] = src[x] + src[x + 1];
+    } while (++x < width);
+
+    src += src_stride;
+    intermediate += width;
+  } while (++y < intermediate_height);
+
+  intermediate = intermediate_result;
+  y = 0;
+  do {
+    int x = 0;
+    do {
+      dest[x] =
+          RightShiftWithRounding(intermediate[x] + intermediate[x + width], 2);
+    } while (++x < width);
+
+    intermediate += width;
+    dest += dest_stride;
+  } while (++y < height);
+}
+
+// This function is used when intra block copy is present.
+// It is called when it is single prediction mode for U/V plane, where the
+// reference block is from the current frame and only horizontal or vertical
+// filtering is required.
+// The output is the single prediction of the block, clipped to valid pixel
+// range.
+// The filtering of intra block copy is simply the average of current and
+// the next pixel.
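+// For example, the horizontal variant below computes
+//   dest[x] = (src[x] + src[x + 1] + 1) >> 1
+// and the 2D variant above is the separable 4-way average
+//   dest[x] = (src[x] + src[x + 1] + below[x] + below[x + 1] + 2) >> 2,
+// where |below| is the next source row.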
+template <int bitdepth, typename Pixel, bool is_horizontal>
+void ConvolveIntraBlockCopy1D_C(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
+  assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels);
+  assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels);
+  const auto* src = static_cast<const Pixel*>(reference);
+  const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+  auto* dest = static_cast<Pixel*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
+  const ptrdiff_t offset = is_horizontal ? 1 : src_stride;
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      dest[x] = RightShiftWithRounding(src[x] + src[x + offset], 1);
+    } while (++x < width);
+
+    src += src_stride;
+    dest += dest_stride;
+  } while (++y < height);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->convolve[0][0][0][0] = ConvolveCopy_C<8, uint8_t>;
+  dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<8, uint8_t>;
+  dsp->convolve[0][0][1][0] = ConvolveVertical_C<8, uint8_t>;
+  dsp->convolve[0][0][1][1] = Convolve2D_C<8, uint8_t>;
+
+  dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<8, uint8_t>;
+  dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<8, uint8_t>;
+  dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<8, uint8_t>;
+  dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<8, uint8_t>;
+
+  dsp->convolve[1][0][0][0] = ConvolveCopy_C<8, uint8_t>;
+  dsp->convolve[1][0][0][1] =
+      ConvolveIntraBlockCopy1D_C<8, uint8_t, /*is_horizontal=*/true>;
+  dsp->convolve[1][0][1][0] =
+      ConvolveIntraBlockCopy1D_C<8, uint8_t, /*is_horizontal=*/false>;
+  dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<8, uint8_t>;
+
+  dsp->convolve[1][1][0][0] = nullptr;
+  dsp->convolve[1][1][0][1] = nullptr;
+  dsp->convolve[1][1][1][0] = nullptr;
+  dsp->convolve[1][1][1][1] = nullptr;
+
+  dsp->convolve_scale[0] = ConvolveScale2D_C<8, uint8_t>;
+  dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<8, uint8_t>;
+#else  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCopy
+  dsp->convolve[0][0][0][0] = ConvolveCopy_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveHorizontal
+  dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveVertical
+  dsp->convolve[0][0][1][0] = ConvolveVertical_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_Convolve2D
+  dsp->convolve[0][0][1][1] = Convolve2D_C<8, uint8_t>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundCopy
+  dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal
+  dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundVertical
+  dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompound2D
+  dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<8, uint8_t>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopy
+  dsp->convolve[1][0][0][0] = ConvolveCopy_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopyHorizontal
+  dsp->convolve[1][0][0][1] =
+      ConvolveIntraBlockCopy1D_C<8, uint8_t, /*is_horizontal=*/true>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopyVertical
dsp->convolve[1][0][1][0] = + ConvolveIntraBlockCopy1D_C<8, uint8_t, /*is_horizontal=*/false>; +#endif +#ifndef LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopy2D + dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<8, uint8_t>; +#endif + + dsp->convolve[1][1][0][0] = nullptr; + dsp->convolve[1][1][0][1] = nullptr; + dsp->convolve[1][1][1][0] = nullptr; + dsp->convolve[1][1][1][1] = nullptr; + +#ifndef LIBGAV1_Dsp8bpp_ConvolveScale2D + dsp->convolve_scale[0] = ConvolveScale2D_C<8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D + dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<8, uint8_t>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(10); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->convolve[0][0][0][0] = ConvolveCopy_C<10, uint16_t>; + dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<10, uint16_t>; + dsp->convolve[0][0][1][0] = ConvolveVertical_C<10, uint16_t>; + dsp->convolve[0][0][1][1] = Convolve2D_C<10, uint16_t>; + + dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<10, uint16_t>; + dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<10, uint16_t>; + dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<10, uint16_t>; + dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<10, uint16_t>; + + dsp->convolve[1][0][0][0] = ConvolveCopy_C<10, uint16_t>; + dsp->convolve[1][0][0][1] = + ConvolveIntraBlockCopy1D_C<10, uint16_t, /*is_horizontal=*/true>; + dsp->convolve[1][0][1][0] = + ConvolveIntraBlockCopy1D_C<10, uint16_t, /*is_horizontal=*/false>; + dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<10, uint16_t>; + + dsp->convolve[1][1][0][0] = nullptr; + dsp->convolve[1][1][0][1] = nullptr; + dsp->convolve[1][1][1][0] = nullptr; + dsp->convolve[1][1][1][1] = nullptr; + + dsp->convolve_scale[0] = ConvolveScale2D_C<10, uint16_t>; + dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<10, uint16_t>; +#else // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +#ifndef LIBGAV1_Dsp10bpp_ConvolveCopy + dsp->convolve[0][0][0][0] = ConvolveCopy_C<10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_ConvolveHorizontal + dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_ConvolveVertical + dsp->convolve[0][0][1][0] = ConvolveVertical_C<10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_Convolve2D + dsp->convolve[0][0][1][1] = Convolve2D_C<10, uint16_t>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_ConvolveCompoundCopy + dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_ConvolveCompoundHorizontal + dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_ConvolveCompoundVertical + dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_ConvolveCompound2D + dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<10, uint16_t>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_ConvolveIntraBlockCopy + dsp->convolve[1][0][0][0] = ConvolveCopy_C<10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_ConvolveIntraBlockHorizontal + dsp->convolve[1][0][0][1] = + ConvolveIntraBlockCopy1D_C<10, uint16_t, /*is_horizontal=*/true>; +#endif +#ifndef LIBGAV1_Dsp10bpp_ConvolveIntraBlockVertical + dsp->convolve[1][0][1][0] = + ConvolveIntraBlockCopy1D_C<10, uint16_t, /*is_horizontal=*/false>; +#endif +#ifndef LIBGAV1_Dsp10bpp_ConvolveIntraBlock2D + dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<10, uint16_t>; 
+#endif + + dsp->convolve[1][1][0][0] = nullptr; + dsp->convolve[1][1][0][1] = nullptr; + dsp->convolve[1][1][1][0] = nullptr; + dsp->convolve[1][1][1][1] = nullptr; + +#ifndef LIBGAV1_Dsp10bpp_ConvolveScale2D + dsp->convolve_scale[0] = ConvolveScale2D_C<10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_ConvolveCompoundScale2D + dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<10, uint16_t>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif + +} // namespace + +void ConvolveInit_C() { + Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/convolve.h b/src/dsp/convolve.h new file mode 100644 index 0000000..5bc0bad --- /dev/null +++ b/src/dsp/convolve.h @@ -0,0 +1,49 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_CONVOLVE_H_ +#define LIBGAV1_SRC_DSP_CONVOLVE_H_ + +// Pull in LIBGAV1_DspXXX defines representing the implementation status +// of each function. The resulting value of each can be used by each module to +// determine whether an implementation is needed at compile time. +// IWYU pragma: begin_exports + +// ARM: +#include "src/dsp/arm/convolve_neon.h" + +// x86: +// Note includes should be sorted in logical order avx2/avx/sse4, etc. +// The order of includes is important as each tests for a superior version +// before setting the base. +// clang-format off +#include "src/dsp/x86/convolve_avx2.h" +#include "src/dsp/x86/convolve_sse4.h" +// clang-format on + +// IWYU pragma: end_exports + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::convolve and Dsp::convolve_scale. This function is not +// thread-safe. +void ConvolveInit_C(); + +} // namespace dsp +} // namespace libgav1 + +#endif // LIBGAV1_SRC_DSP_CONVOLVE_H_ diff --git a/src/dsp/convolve.inc b/src/dsp/convolve.inc new file mode 100644 index 0000000..e0f755e --- /dev/null +++ b/src/dsp/convolve.inc @@ -0,0 +1,51 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Constants and utility functions used for convolve implementations. +// This will be included inside an anonymous namespace on files where these are +// necessary. + +int GetNumTapsInFilter(const int filter_index) { + if (filter_index < 2) { + // Despite the names these only use 6 taps. 
+    // kInterpolationFilterEightTap
+    // kInterpolationFilterEightTapSmooth
+    return 6;
+  }
+
+  if (filter_index == 2) {
+    // kInterpolationFilterEightTapSharp
+    return 8;
+  }
+
+  if (filter_index == 3) {
+    // kInterpolationFilterBilinear
+    return 2;
+  }
+
+  assert(filter_index > 3);
+  // For small sizes (width/height <= 4) the large filters are replaced with 4
+  // tap options.
+  // If the original filters were |kInterpolationFilterEightTap| or
+  // |kInterpolationFilterEightTapSharp| then it becomes
+  // |kInterpolationFilterSwitchable|.
+  // If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4
+  // tap filter.
+  return 4;
+}
+
+constexpr int kIntermediateAllocWidth = kMaxSuperBlockSizeInPixels;
+constexpr int kIntermediateStride = 8;
+constexpr int kHorizontalOffset = 3;
+constexpr int kFilterIndexShift = 6;
diff --git a/src/dsp/convolve_test.cc b/src/dsp/convolve_test.cc
new file mode 100644
index 0000000..295c814
--- /dev/null
+++ b/src/dsp/convolve_test.cc
@@ -0,0 +1,1327 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/convolve.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <ostream>
+#include <string>
+#include <tuple>
+
+#include "absl/strings/match.h"
+#include "absl/strings/str_format.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/third_party/libvpx/md5_helper.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// The convolve function will access at most (block_height + 7) rows/columns
+// from the beginning.
+constexpr int kMaxBlockWidth = kMaxSuperBlockSizeInPixels + kSubPixelTaps;
+constexpr int kMaxBlockHeight = kMaxSuperBlockSizeInPixels + kSubPixelTaps;
+
+// Test all the filters in |kSubPixelFilters|. There are 6 different filters
+// but filters [4] and [5] are only reached through GetFilterIndex().
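+// A quick reading of the constant below: the test loop cycles the filter
+// category index through 4 values (advancing once every 16 iterations) and
+// the sub-pixel filter id through 16 values, so 4 * 16 = 64 runs are the
+// minimum needed to touch every reachable (category, id) pair at least once.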
+constexpr int kMinimumViableRuns = 4 * 16; + +struct ConvolveTestParam { + enum BlockSize { + kBlockSize2x2, + kBlockSize2x4, + kBlockSize4x2, + kBlockSize4x4, + kBlockSize4x8, + kBlockSize8x2, + kBlockSize8x4, + kBlockSize8x8, + kBlockSize8x16, + kBlockSize16x8, + kBlockSize16x16, + kBlockSize16x32, + kBlockSize32x16, + kBlockSize32x32, + kBlockSize32x64, + kBlockSize64x32, + kBlockSize64x64, + kBlockSize64x128, + kBlockSize128x64, + kBlockSize128x128, + kNumBlockSizes + }; + + static constexpr int kBlockWidth[kNumBlockSizes] = { + 2, 2, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 32, 32, 32, 64, 64, 64, 128, 128}; + static constexpr int kBlockHeight[kNumBlockSizes] = { + 2, 4, 2, 4, 8, 2, 4, 8, 16, 8, 16, 32, 16, 32, 64, 32, 64, 128, 64, 128}; + + explicit ConvolveTestParam(BlockSize block_size) + : block_size(block_size), + width(kBlockWidth[block_size]), + height(kBlockHeight[block_size]) {} + + BlockSize block_size; + int width; + int height; +}; + +#if !LIBGAV1_CXX17 +constexpr int ConvolveTestParam::kBlockWidth[kNumBlockSizes]; // static. +constexpr int ConvolveTestParam::kBlockHeight[kNumBlockSizes]; // static. +#endif + +const char* GetConvolveDigest8bpp(int id) { + // Entries containing 'XXXXX...' are skipped. See the test for details. + static const char* const kDigest[ConvolveTestParam::kNumBlockSizes * 16] = { + "ae5977a4ceffbac0cde72a04a43a9d57", "6cf5f791fe0d8dcd3526be3c6b814035", + "d905dfcad930aded7718587c05b48aaf", "6baf153feff04cc5b7e87c0bb60a905d", + "871ed5a69ca31e6444faa720895949bf", "c9cf1deba08dac5972b3b0a43eff8f98", + "68e2f90eaa0ab5da7e6f5776993f7eea", "f1f8282fb33c30eb68c0c315b7a4bc01", + "9412064b0eebf8123f23d74147d04dff", "cc08936effe309ab9a4fa1bf7e28e24e", + "36cbef36fa21b98df03536c918bf752a", "9d0da6321cf5311ea0bdd41271763030", + "55a10165ee8a660d7dddacf7de558cdd", "ac7fc9f9ea7213743fae5a023faaaf08", + "077e1b7b355c7ab3ca40230ee8efd8ea", "7a3e8de2a1caae206cf3e51a86dfd15a", + "1ddf9020f18fa7883355cf8c0881186a", "2377dd167ef2707978bed6f10ffd4e76", + "f918e0e4422967c6a7e47298135c7ae9", "b2264e129636368b5496760b39e64b7a", + "1168251e6261e2ff1fa69a93226dbd76", "4821befdf63f8c6da6440afeb57f320f", + "c30fc44d83821141e84cc4793e127301", "a8293b933d9f2e5d7f922ea40111d643", + "354a54861a94e8b027afd9931e61f997", "b384e9e3d81f9f4f9024028fbe451d8b", + "eeeb8589c1b31cbb565154736ca939ec", "f49dab626ddd977ed171f79295c24935", + "78d2f27e0d4708cb16856d7d40dc16fb", "9d2393ea156a1c2083f5b4207793064b", + "a9c62745b95c66fa497a524886af57e2", "2c614ec4463386ec075a0f1dbb587933", + "7a8856480d752153370240b066b90f6a", "beaef1dbffadc701fccb7c18a03e3a41", + "72b1e700c949d06eaf62d664dafdb5b6", "684f5c3a25a080edaf79add6e9137a8e", + "3be970f49e4288988818b087201d54da", "d2b9dba2968894a414756bb510ac389a", + "9a3215eb97aedbbddd76c7440837d040", "4e317feac6da46addf0e8b9d8d54304b", + "d2f5ca2b7958c332a3fb771f66da01f0", "7aec92c3b65e456b64ae285c12b03b0d", + "f72a99ad63f6a88c23724e898b705d21", "07a1f07f114c4a38ba08d2f44e1e1132", + "26b9de95edb45b31ac5aa19825831c7a", "4e4677a0623d44237eb8d6a622cdc526", + "c1b836a6ce023663b90db0e320389414", "5befcf222152ebc8d779fcc10b95320a", + "62adf407fc27d8682ced4dd7b55af14e", "35be0786a072bf2f1286989261bf6580", + "90562fc42dc5d879ae74c4909c1dec30", "a1427352f9e413975a0949e2b300c657", + "bcbc418bc2beb243e463851cd95335a9", "cb8fedcbecee3947358dc61f95e56530", + "0d0154a7d573685285a83a4cf201ac57", "b14bd8068f108905682b83cc15778065", + "c96c867d998473197dde9b587be14e3a", "f596c63c7b14cada0174e17124c83942", + "eb2822ad8204ed4ecbf0f30fcb210498", "538ce869ffd23b6963e61badfab7712b", 
+ "6bbcc075f8b768a02cdc9149f150326d", "4ae70d9db2ec36885394db7d59bdd4f7", + "5fee162fe52c11c823db4d5ede370654", "9365186c59ef66d9def40f437022ad93", + "0f95fb0276c9c7910937fbdf75f2811d", "356d4003477283e157c8d2b5a79d913c", + "b355dab2dbb6f5869018563eece22862", "cf6ff8c43d8059cea6090a23ab66a0ef", + "a336f8b7bcf188840ca65c0d0e66518a", "de953f03895923359c6a719e6a537b89", + "8463ade9347ed602663e2cec5c4c3fe6", "392de11ffcd5c2ecf3db3480ee135340", + "bddd31e3e852712e6244b616622af83d", "30a36245c40d978fc8976b442a8600c3", + "93aa662b988b8502e5ea95659eafde59", "70440ba9ee7f9d16d297dbb49e54a56e", + "1eb2be4c05b50e427e29c72fa566bff5", "52c0980bae63e8459e82eee7d8af2334", + "75e57104d6058cd2bce1d3d8142d273d", "b4c735269ade44419169adbd852d5ddc", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "a7305087fae23de53d21a6909009ff69", + "8dcce009395264379c1a51239f4bb22c", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "8dcce009395264379c1a51239f4bb22c", "d90a69e7bae8aa46ed0e1e5f911d7a07", + "6ab4dc87be03be1dcc5d956ca819d938", "6ab4dc87be03be1dcc5d956ca819d938", + "8f2afdb2f03cd04ffacd421b958caaa0", "710ccecc103033088d898a2b924551fb", + "710ccecc103033088d898a2b924551fb", "a4093e3e5902dd659407ce6471635a4e", + "375d7f5358d7a088a498b8b3aaecc0d5", "375d7f5358d7a088a498b8b3aaecc0d5", + "08867ea5cc38c705ec52af821bc4736a", "2afb540e8063f58d1b03896486c5e89b", + "2afb540e8063f58d1b03896486c5e89b", "6ce47b11d2e60c5d183c84ce9f2e46cc", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "a5a1ac658d7ce4a846a32b9fcfaa3475", + "2370f4e4a83edf91b7f504bbe4b00e90", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "ae5464066a049622a7a264cdf9394b55", "45368b6db3d1fee739a64b0bc823ea9c", + "8dff0f28192d9f8c0bf7fb5405719dd8", "632738ef3ff3021cff45045c41978849", + "f7ec43384037e8d6c618e0df826ec029", "a6bc648197781a2dc99c487e66464320", + "1112ebd509007154c72c5a485b220b62", "9714c4ce636b6fb0ad05cba246d48c76", + "2c93dde8884f09fb5bb5ad6d95cde86d", "a49e6160b5d1b56bc2046963101cd606", + "7f084953976111e9f65b57876e7552b1", "0846ec82555b66197c5c45b08240fbcc", + "ca7471c126ccd22189e874f0a6e41960", "0802b6318fbd0969a33de8fdfcd07f10", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "3b1ceebf0579fcbbfd6136938c595b91", + "ecafabcad1045f15d31ce2f3b13132f2", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "5f211eba020e256a5781b203c5aa1d2e", "3b04497634364dd2cd3f2482b5d4b32f", + "a8ac7b5dc65ffb758b0643508a0e744e", "561ed8be43c221a561f8885a0d74c7ef", + "8159619fc234598c8c75154d80021fd4", "8f43645dce92cf7594aa4822aa53b17d", + "b6ccddb7dfa4eddc87b4eff08b5a3195", "b4e605327b28db573d88844a1a09db8d", + "15b00a15d1cc6cc96ca85d00b167e4dd", "7bf911888c11a9fefd604b8b9c82e9a1", + "bfb69b4d7d4aed73cfa75a0f55b66440", "034d1d62581bd0d840c4cf1e28227931", + "8cba849640e9e2859d509bc81ca94acd", "bc79acf2a0fe419194cdb4529bc7dcc8", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "3bfad931bce82335219e0e29c15f2b21", + "68a701313d2247d2b32636ebc1f2a008", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "05afe1f40d37a45a97a5e0aadd5066fb", "9e1f0e0bddb58d15d0925eeaede9b84c", + "03313cdaa593a1a7b4869010dcc7b241", "88a50d2b4107ee5b5074b2520183f8ac", + "ac50ea9f7306da95a5092709442989cf", "739b17591437edffd36799237b962658", + "b8a7eb7dd9c216e240517edfc6489397", "75b755f199dbf4a0e5ebbb86c2bd871d", + "31b0017ba1110e3d70b020901bc15564", "0a1aa8f5ecfd11ddba080af0051c576a", + 
"536181ee90de883cc383787aec089221", "29f82b0f3e4113944bd28aacd9b8489a", + "ee3e76371240d1f1ff811cea6a7d4f63", "17a20dbbf09feae557d40aa5818fbe76", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "6baf153feff04cc5b7e87c0bb60a905d", + "871ed5a69ca31e6444faa720895949bf", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "68e2f90eaa0ab5da7e6f5776993f7eea", "f1f8282fb33c30eb68c0c315b7a4bc01", + "9412064b0eebf8123f23d74147d04dff", "cc08936effe309ab9a4fa1bf7e28e24e", + "36cbef36fa21b98df03536c918bf752a", "9d0da6321cf5311ea0bdd41271763030", + "55a10165ee8a660d7dddacf7de558cdd", "ac7fc9f9ea7213743fae5a023faaaf08", + "077e1b7b355c7ab3ca40230ee8efd8ea", "7a3e8de2a1caae206cf3e51a86dfd15a", + "1ddf9020f18fa7883355cf8c0881186a", "2377dd167ef2707978bed6f10ffd4e76", + "f918e0e4422967c6a7e47298135c7ae9", "b2264e129636368b5496760b39e64b7a", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "4cfad2c437084a93ea76913e21c2dd89", + "d372f0c17bce98855d6d59fbee814c3d", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "d99ffd2579eb781c30bc0df7b76ad61e", "4e139e57cbb049a0f4ef816adc48d026", + "be53b2507048e7ff50226d15c0b28865", "b73f3c1a10405de89d1f9e812ff73b5a", + "c7d51b1f2df49ab83962257e8a5934e5", "159e443d79cc59b11ca4a80aa7aa09be", + "6ef14b14882e1465b0482b0e0b16d8ce", "22a8d287b425c870f40c64a50f91ce54", + "f1d96db5a2e0a2160df38bd96d28d19b", "637d1e5221422dfe9a6dbcfd7f62ebdd", + "f275af4f1f350ffaaf650310cb5dddec", "f81c4d6b001a14584528880fa6988a87", + "a5a2f9c2e7759d8a3dec1bc4b56be587", "2317c57ab69a36eb3bf278cf8a8795a3", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "1a0bdfc96a3b9fd904e658f238ab1076", + "56d16e54afe205e97527902770e71c71", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "1f7b5b8282ff3cf4d8e8c52d80ef5b4d", "79e9e260a2028c5fe320005c272064b9", + "2418ebcdf85551b9ae6e3725f04aae6d", "98bdf907ebacacb734c9eef1ee727c6e", + "4dd5672d53c8f359e8f80badaa843dfc", "a1bef519bbf07138e2eec5a91694de46", + "df1cb51fe1a937cd7834e973dc5cb814", "317fe65abf81ef3ea07976ef8667baeb", + "2da29da97806ae0ee300c5e69c35a4aa", "555475f5d1685638169ab904447e4f13", + "b3e3a6234e8045e6182cf90a09f767b2", "849dfeca59074525dea59681a7f88ab4", + "39a68af80be11e1682b6f3c4ede33530", "b22d765af176d87e7d3048b4b89b86ad", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "b8a710baa6a9fc784909671d450ecd99", + "f9e6a56382d8d12da676d6631bb6ef75", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "3bf8e11e18527b16f0d7c0361d74a52d", "b9ff54c6f1e3b41fc7fc0f3fa0e75cf2", + "06ef1504f31af5f173d3317866ca57cb", "635e8ee11cf04d73598549234ad732a0", + "fab693410d59ee88aa2895527efc31ac", "3041eb26c23a63a587fbec623919e2d2", + "c61d99d5daf575664fb7ad64976f4b03", "822f6c4eb5db760468d822b21f48d94d", + "3f6fcb9fae3666e085b9e29002a802fc", "d9b9fecd195736a6049c528d4cb886b5", + "fed17fc391e6c3db4aa14ea1d6596c87", "d0d3482d981989e117cbb32fc4550267", + "39561688bf6680054edbfae6035316ce", "087c5992ca6f829e1ba4ba5332d67947", + }; + assert(id >= 0); + assert(id < sizeof(kDigest) / sizeof(kDigest[0])); + return kDigest[id]; +} + +const char* GetConvolveScaleDigest8bpp(int id) { + // Entries containing 'XXXXX...' are skipped. See the test for details. 
+ static const char* const kDigest[ConvolveTestParam::kNumBlockSizes * 2] = { + "0291a23f2ac4c40b5d8e957e63769904", "1d48447857472d6455af10d5526f6827", + "409b2278d6d372248f1891ca0dd12760", "9e416606a3f82fe5bb3f7182e4f42c2d", + "e126563f859ddd5c5ffde6f641168fad", "9bad4f1b7e1865f814b6fd5620816ebd", + "50e5e5a57185477cb2af83490c33b47c", "3d2fb301c61d7fbd0e21ac263f7ac552", + "5920032c6432c80c6e5e61b684018d13", "07ada64d24339488cdce492e6e0c6b0d", + "aaf1589aff6d062a87c627ab9ba20e3e", "91adf91bb24d2c4ea3f882bdf7396e33", + "1d17a932a68bb1f199f709e7725fe44b", "07716c63afda034cb386511ea25a63b5", + "cca17ef3324c41d189e674a059ef1255", "37d17e70619823a606c0b5f74bf2e33b", + "ba8ed5474c187c8e8d7f82a6a29ee860", "27663f037973ebe82ec10252a4d91299", + "24c27e187e8d5a2bbfa0fef9046d3eb0", "9854fdc91a48e3bd4639edcc940e5c09", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "a71907c60a9f1f81972a2859ae54a805", + "817bc3bf0c77abc4186eac39f2320184", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "4e7182a8b226982e2678abcf5f83325d", "50cef7c6e57544135f102226bb95bed9", + "225e054dbcfff05b1c8b0792c731449e", "16eb63f03839159f3af0e08be857170f", + "c8e5d111a2e3f4487330a8bd893cb894", "4fd99eaf9c160442aab35b9bdc5d275b", + "8b0f61bfb30747d4c9215618ac42557c", "1df78022da202cefb9a8100b114152d9", + "378466e1eda63dbc03565b78af8e723f", "28ea721411fbf5fc805035be9a384140", + "4fed5d4163a3bfcc6726a42f20410b0a", "55abfca0c820771bd926e4b94f66a499", + "6c8b8ef0a78859c768e629e1decc0019", "d0ead286b5ba3841d24dd114efbfef0a", + }; + assert(id >= 0); + assert(id < sizeof(kDigest) / sizeof(kDigest[0])); + return kDigest[id]; +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +const char* GetConvolveDigest10bpp(int id) { + // Entries containing 'XXXXX...' are skipped. See the test for details. 
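+  // Table layout (a reading of GetDigestId() in ConvolveTest below): the
+  // index mirrors the dsp->convolve[intra_block_copy][compound][vertical]
+  // [horizontal] lookup, flattened as
+  //   id = block_size + kNumBlockSizes * (horizontal + 2 * vertical +
+  //                                       4 * compound + 8 * intra_block_copy)
+  // for kNumBlockSizes * 16 = 320 entries per bitdepth.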
+ static const char* const kDigest[ConvolveTestParam::kNumBlockSizes * 16] = { + "b1b6903d60501c7bc11e5285beb26a52", "a7855ed75772d7fa815978a202bbcd9f", + "bde291a4e8087c085fe8b3632f4d7351", "238980eebc9e63ae3eea2771c7a70f12", + "0eac13431bd7d8a573318408a72246d5", "d05a237ed7a9ca877256b71555b1b8e4", + "73438155feb62595e3e406921102d748", "5871e0e88a776840d619670fbf107858", + "1c6376ce55c9ee9e35d432edb1ffb3b7", "d675e0195c9feca956e637f3f1959f40", + "b5681673903ade13d69e295f82fdd009", "3c43020105ae93a301404b4cd6238654", + "dd2c5880a94ed3758bfea0b0e8c78286", "4ebb1a7b25a39d8b9868ec8a1243103f", + "d34ec07845cd8523651e5f5112984a14", "2ce55308d873f4cd244f16da2b06e06e", + "a4bb5d5ff4b25f391265b5231049a09a", "c9106e0c820b03bcdde3aa94efc11a3e", + "7ec2eae9e118506da8b33440b399511a", "78de867c8ee947ed6d29055747f26949", + "a693b4bd0334a3b98d45e67d3985bb63", "156de3172d9acf3c7f251cd7a18ad461", + "e545b8a3ff958f8363c7968cbae96732", "7842b2047356c1417d9d88219707f1a1", + "1a487c658d684314d91bb6d961a94672", "94b3e5bcd6b849b66a4571ec3d23f9be", + "0635a296be01b7e641de98ee27c33cd2", "82dc120bf8c2043bc5eee81007309ebf", + "58c826cad3c14cdf26a649265758c58b", "f166254037c0dfb140f54cd7b08bddfe", + "74ab206f14ac5f62653cd3dd71a7916d", "5621caef7cc1d6522903290ccc5c2cb8", + "78ec6cf42cce4b1feb65e076c78ca241", "42188e2dbb4e02cd353552ea147ad03f", + "f9813870fc27941a7c00a0443d7c2fe7", "20b14a6b5af7aa356963bcaaf23d230d", + "9c9c41435697f75fa118b6d6464ee7cb", "38816245ed832ba313fefafcbed1e5c8", + "5d34137cc8ddba75347b0fa1d0a91791", "465dcb046a0449b9dfb3e0b297aa3863", + "3e787534dff83c22b3033750e448865a", "4c91f676a054d582bcae1ca9adb87a31", + "eab5894046a99ad0a1a12c91b0f37bd7", "765b4cfbfc1a4988878c412d53bcb597", + "bc63b29ec78c1efec5543885a45bb822", "91d6bdbc62d4bb80c9b371d9704e3c9e", + "cecd57396a0033456408f3f3554c6912", "5b37f94ef136c1eb9a6181c19491459c", + "716ba3a25b454e44b46caa42622c128c", "9076f58c4ab20f2f06d701a6b53b1c4f", + "d3212ab3922f147c3cf126c3b1aa17f6", "b55fea77f0e14a8bf8b6562b766fe91f", + "59b578268ff26a1e21c5b4273f73f852", "16761e7c8ba2645718153bed83ae78f6", + "a9e9805769fe1baf5c7933793ccca0d8", "553a2c24939dff18ec5833c77f556cfb", + "5c1ec75a160c444fa90abf106fa1140e", "2266840f11ac4c066d941ec473b1a54f", + "9e194755b2a37b615a517d5f8746dfbb", "bbf86f8174334f0b8d869fd8d58bf92d", + "fd1da8d197cb385f7917cd296d67afb9", "a984202c527b757337c605443f376915", + "c347f4a58fd784c5e88c1a23e4ff15d2", "29cbaadbff9adf4a3d49bd9900a9dd0b", + "c5997b802a6ba1cf5ba1057ddc5baa7e", "4f750f6375524311d260306deb233861", + "59f33727e5beeb783a057770bec7b4cd", "0654d72f22306b28d9ae42515845240c", + "6c9d7d9e6ef81d76e775a85c53abe209", "a35f435ccc67717a49251a07e62ae204", + "c5325015cb0b7c42839ac4aa21803fa0", "f81f31f1585c0f70438c09e829416f20", + "ab10b22fb8dd8199040745565b28595d", "0d928d6111f86c60ccefc6c6604d5659", + "4ed1a6200912995d4f571bdb7822aa83", "92e31a45513582f386dc9c22a57bbbbd", + "6dbf310a9c8d85f76306d6a35545f8af", "80fce29dc82d5857c1ed5ef2aea16835", + "14f2c5b9d2cd621c178a39f1ec0c38eb", "da54cfb4530841bda29966cfa05f4879", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "7e3fa9c03bc3dfbdeb67f24c5d9a49cd", + "f3454ca93cbb0c8c09b0695d90a0df3d", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "f3454ca93cbb0c8c09b0695d90a0df3d", "1a77d2af4d2b6cf8737cfbcacacdc4e4", + "89bec831efea2f88129dedcad06bb3fa", "89bec831efea2f88129dedcad06bb3fa", + "dead0fe4030085c22e92d16bb110de9d", "306a2f5dfd675df4ed9af44fd5cac8c0", + "306a2f5dfd675df4ed9af44fd5cac8c0", 
"9d01c946a12f5ef9d9cebd9816e06014", + "768f63912e43148c13688d7f23281531", "768f63912e43148c13688d7f23281531", + "2e7927158e7b8e40e7269fc909fb584b", "123028e18c2bfb334e34adb5a4f67de4", + "123028e18c2bfb334e34adb5a4f67de4", "2c979c2bddef79a760e72a802f83cc76", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "da1a6ff2be03ec8acde4cb1cd519a6f0", + "a4ca37cb869a0dbd1c4a2dcc449a8f31", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "1b5d1d4c7be8d5ec00a42a49eecf918f", "98b77e88b0784baaea64c98c8707fe46", + "8148788044522edc3c497e1017efe2ce", "acf60abeda98bbea161139b915317423", + "262c96b1f2c4f85c86c0e9c77fedff1e", "f35a3d13516440f9168076d9b07c9e98", + "13782526fc2726100cb3cf375b3150ed", "13c07441b47b0c1ed80f015ac302d220", + "02880fde51ac991ad18d8986f4e5145c", "aa25073115bad49432953254e7dce0bc", + "69e3361b7199e10e75685b90fb0df623", "2f8ab35f6e7030e82ca922a68b29af4a", + "452f91b01833c57db4e909575a029ff6", "1fabf0655bedb671e4d7287fec8119ba", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "d54206c34785cc3d8a06c2ceac46378c", + "85a11892ed884e3e74968435f6b16e64", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "16434230d24b9522ae2680e8c37e1b95", "963dea92f3efbb99137d1de9c56728d3", + "b72fb6a9a073c2fe65013af1842dc9b0", "86fa0c299737eb499cbcdce94abe2d33", + "6b80af04470b83673d98f46925e678a5", "65baca6167fe5249f7a839ce5b2fd591", + "e47ded6c0eec1d5baadd02aff172f2b1", "c0950e609f278efb7050d319a9756bb3", + "9051290279237f9fb1389989b142d2dd", "34cdc1be291c95981c98812c5c343a15", + "5b64a6911cb7c3d60bb8f961ed9782a2", "7133de9d03a4b07716a12226b5e493e8", + "3594eff52d5ed875bd9655ddbf106fae", "90d7e13aa2f9a064493ff2b3b5b12109", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "b1f26ee13df2e14a757416ba8a682278", + "996b6c166f9ed25bd07ea6acdf7597ff", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "34895d4c69a6c3303693e6f431bcd5d8", "c9497b00cb1bc3363dd126ffdddadc8e", + "1e461869bb2ee9b6069c5e52cf817291", "8d7f1d7ea6a0dcc922ad5d2e77bc74dd", + "138855d9bf0ccd0c62ac14c7bff4fd37", "64035142864914d05a48ef8e013631d0", + "205904fa3c644433b46e01c11dd2fe40", "291425aaf8206b20e88db8ebf3cf7e7f", + "cb6238b8eb6b72980958e6fcceb2f2eb", "626321a6dfac542d0fc70321fac13ff3", + "1c6fda7501e0f8bdad972f7857cd9354", "4fd485dadcb570e5a0a5addaf9ba84da", + "d3f140aea9e8eabf4e1e5190e0148288", "e4938219593bbed5ae638a93f2f4a580", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "238980eebc9e63ae3eea2771c7a70f12", + "0eac13431bd7d8a573318408a72246d5", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "73438155feb62595e3e406921102d748", "5871e0e88a776840d619670fbf107858", + "1c6376ce55c9ee9e35d432edb1ffb3b7", "d675e0195c9feca956e637f3f1959f40", + "b5681673903ade13d69e295f82fdd009", "3c43020105ae93a301404b4cd6238654", + "dd2c5880a94ed3758bfea0b0e8c78286", "4ebb1a7b25a39d8b9868ec8a1243103f", + "d34ec07845cd8523651e5f5112984a14", "2ce55308d873f4cd244f16da2b06e06e", + "a4bb5d5ff4b25f391265b5231049a09a", "c9106e0c820b03bcdde3aa94efc11a3e", + "7ec2eae9e118506da8b33440b399511a", "78de867c8ee947ed6d29055747f26949", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "e552466a4e7ff187251b8914b084d404", + "981b7c44b6f7b7ac2acf0cc4096e6bf4", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "a4c75372af36162831cb872e24e1088c", "497271227a70a72f9ad25b415d41563f", + "c48bd7e11ec44ba7b2bc8b6a04592439", 
"0960a9af91250e9faa1eaac32227bf6f", + "746c2e0f96ae2246d534d67102be068c", "d6f6db079da9b8909a153c07cc9d0e63", + "7c8928a0d769f4264d195f39cb68a772", "db645c96fc8be04015e0eb538afec9ae", + "946af3a8f5362def5f4e27cb0fd4e754", "7ad78dfe7bbedf696dd58d9ad01bcfba", + "f0fd9c09d454e4ce918faa97e9ac10be", "af6ae5c0eb28417bd251184baf2eaba7", + "866f8df540dd3b58ab1339314d139cbd", "72803589b453a29501540aeddc23e6f4", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "aba5d5ef5e96fe418e65d20e506ea834", + "d70bf16e2a31e90b7b3cdeaef1494cf9", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "6df80bb7f264f4f285d09a4d61533fae", "c8831118d1004a7cca015a4fca140018", + "b7f82c140369067c105c7967c75b6f9e", "130f47aae365aabfec4360fa5b5ff554", + "92483ed631de21b685ffe6ccadbbec8f", "cbb6ab31547df6b91cfb48630fdffb48", + "1eea5e8a24d6aa11778eb3e5e5e9c9f2", "9e193b6b28ce798c44c744efde19eee9", + "885c384d90aaa34acd8303958033c252", "8110ed10e7234851dff3c7e4a51108a2", + "6fb9383302eb7e7a13387464d2634e03", "864d51fcc737bc73a3f588b67515039a", + "2ecb7890f00234bcb28c1d969f489012", "c4793d431dbf2d88826bb440bf027512", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "972aeba65e8a6d20dd0f95279be2aa75", + "34165457282e2af2e9b3f5840e4dec5d", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", + "b8c5582b9bbb789c45471f93be83b41f", "257bf5467db570974d7cf2356bacf116", + "5255dded79f56b0078543b5a1814a668", "ef745100f5f34c8ff841b2b0b57eb33f", + "edae8ed67286ca6a31573a541b3deb6f", "01adcd8bf15fbf70df47fbf3a953aa14", + "ba539808a8501609ce052a1562a62b25", "ac8e6391200cec2abdebb00744a2ba82", + "54b17120f7d71ddb4d70590ecd231cc1", "f6e36446a97611a4db4425df926974b2", + "a82f4080699300b659bbe1b5c4463147", "ecedb178f7cad3dc1b921eca67f9efb6", + "0609ca0ff3ca90069e8b48829b4b0891", "839e86c681e97359f7819c766000dd1c", + }; + assert(id >= 0); + assert(id < sizeof(kDigest) / sizeof(kDigest[0])); + return kDigest[id]; +} + +const char* GetConvolveScaleDigest10bpp(int id) { + // Entries containing 'XXXXX...' are skipped. See the test for details. 
+  static const char* const kDigest[ConvolveTestParam::kNumBlockSizes * 2] = {
+      "27e21eb31687f9fbd0a66865fa8d7c8a", "9bff726c8e1d0998451a3b9cf2b3d8c8",
+      "661d74cfef36f12ed8d9b4c3ccb7fe0d", "5fc365fd1fcc9599dd97a885ba0c2eec",
+      "acdba2c82a6268e3c0ae8fc32be1b41f", "a5db60bbeaf56ab030ed21c42d553cf3",
+      "1228bb633f9fd63fdb998b775ca79e98", "07812c97f9f43a2a8ae07329dc488699",
+      "903525fb782119c4dfaf61b98a310c9f", "f38b51cef38b929e317861ccbc73ecd8",
+      "b78b05138e1d5fbf089144c42ce03058", "f2e227664cbf2d821b242a34fcbc9835",
+      "cb992dac70591e7d3663588ae13b9adc", "f2292d33657d939fa85ea5bacdfe39a3",
+      "7049dc742d6d8ad6f5d4309968ff281c", "e4beebde1ac335a4d92e4af94653a2ce",
+      "cc77875f98f54b9b26b5f7d9fcbc828d", "fb623f7b9e1ffcf2ae361599728a5589",
+      "c33847e47a7eda214734084640818df9", "ab3e1aec3d720c0c89c46a8d5b161b44",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "efe4de861dcf0f7458b6208cae7e3584",
+      "814751c55fa84f0fed94ff15fc30fc24", "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+      "31a63fe47297102937acbe7a328588b7", "b804a0a24633243f7da48d7a5f51c0bf",
+      "cb492672b005fc378cccc8c03003cd4a", "1d18732bcf2ea487e84579489cc59a22",
+      "457c4b3ec38a8d6c210584ade1a9fae2", "a3afdd468e6a5238a3dbd2cc21c11c9e",
+      "6ff8a16f21d6e8a9741dacf0734ae563", "3ffa29ef7e54e51f6849c9a3d3c79d03",
+      "af89899b083cf269ac1bd988aeb15b15", "3365d8411c11081fb228436238b9a671",
+      "3ba56d30f5f81d7098f356635a58b9af", "b3013776900c6520bd30f868e8c963b6",
+      "81febaa7342692483040f500ba2e5e2b", "4a51ff1d9a4a68687d590b41aa7835a3",
+  };
+  assert(id >= 0);
+  assert(id < sizeof(kDigest) / sizeof(kDigest[0]));
+  return kDigest[id];
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+struct ConvolveTypeParam {
+  ConvolveTypeParam(bool is_intra_block_copy, bool is_compound,
+                    bool has_vertical_filter, bool has_horizontal_filter)
+      : is_intra_block_copy(is_intra_block_copy),
+        is_compound(is_compound),
+        has_vertical_filter(has_vertical_filter),
+        has_horizontal_filter(has_horizontal_filter) {}
+  bool is_intra_block_copy;
+  bool is_compound;
+  bool has_vertical_filter;
+  bool has_horizontal_filter;
+};
+
+std::ostream& operator<<(std::ostream& os, const ConvolveTestParam& param) {
+  return os << "BlockSize" << param.width << "x" << param.height;
+}
+
+std::ostream& operator<<(std::ostream& os, const ConvolveTypeParam& param) {
+  return os << "is_intra_block_copy: " << param.is_intra_block_copy
+            << ", is_compound: " << param.is_compound
+            << ", has_(vertical/horizontal)_filter: "
+            << param.has_vertical_filter << "/" << param.has_horizontal_filter;
+}
+
+//------------------------------------------------------------------------------
+template <int bitdepth, typename Pixel>
+class ConvolveTest : public testing::TestWithParam<
+                         std::tuple<ConvolveTypeParam, ConvolveTestParam>> {
+ public:
+  ConvolveTest() = default;
+  ~ConvolveTest() override = default;
+
+  void SetUp() override {
+    ConvolveInit_C();
+
+    const Dsp* const dsp = GetDspTable(bitdepth);
+    ASSERT_NE(dsp, nullptr);
+    GetConvolveFunc(dsp, &base_convolve_func_);
+
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const absl::string_view test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+      base_convolve_func_ = nullptr;
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      if ((GetCpuInfo() & kSSE4_1) != 0) {
+        ConvolveInit_SSE4_1();
+      }
+    } else if (absl::StartsWith(test_case, "AVX2/")) {
+      if ((GetCpuInfo() & kAVX2) != 0) {
+        ConvolveInit_AVX2();
+      }
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+      ConvolveInit_NEON();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      ConvolveInit10bpp_NEON();
+#endif
+    } else {
+      FAIL() << "Unrecognized architecture prefix in test case name: "
+             << test_case;
+    }
+
+    GetConvolveFunc(dsp, &cur_convolve_func_);
+
+    // Skip functions that have not been specialized for this particular
+    // architecture.
+    if (cur_convolve_func_ == base_convolve_func_) {
+      cur_convolve_func_ = nullptr;
+    }
+  }
+
+ protected:
+  int GetDigestId() const {
+    int id = param_.block_size;
+    id += param_.kNumBlockSizes *
+          static_cast<int>(type_param_.has_horizontal_filter);
+    id += 2 * param_.kNumBlockSizes *
+          static_cast<int>(type_param_.has_vertical_filter);
+    id += 4 * param_.kNumBlockSizes *
+          static_cast<int>(type_param_.is_compound);
+    id += 8 * param_.kNumBlockSizes *
+          static_cast<int>(type_param_.is_intra_block_copy);
+    return id;
+  }
+
+  void GetConvolveFunc(const Dsp* dsp, ConvolveFunc* func);
+  void SetInputData(bool use_fixed_values, int value);
+  void Check(bool use_fixed_values, const Pixel* src, const Pixel* dest,
+             libvpx_test::MD5* md5_digest);
+  void Check16Bit(bool use_fixed_values, const uint16_t* src,
+                  const uint16_t* dest, libvpx_test::MD5* md5_digest);
+  // |num_runs| covers the categories of filters (6) and the number of filters
+  // under each category (16).
+  void Test(bool use_fixed_values, int value,
+            int num_runs = kMinimumViableRuns);
+
+  const ConvolveTypeParam type_param_ = std::get<0>(GetParam());
+  const ConvolveTestParam param_ = std::get<1>(GetParam());
+
+ private:
+  ConvolveFunc base_convolve_func_;
+  ConvolveFunc cur_convolve_func_;
+  // Convolve filters are 7-tap, which need 3 pixels
+  // (kRestorationHorizontalBorder) padding.
+  Pixel source_[kMaxBlockHeight * kMaxBlockWidth] = {};
+  uint16_t source_16bit_[kMaxBlockHeight * kMaxBlockWidth] = {};
+  uint16_t dest_16bit_[kMaxBlockHeight * kMaxBlockWidth] = {};
+  Pixel dest_clipped_[kMaxBlockHeight * kMaxBlockWidth] = {};
+
+  const int source_stride_ = kMaxBlockWidth;
+  const int source_height_ = kMaxBlockHeight;
+};
+
+template <int bitdepth, typename Pixel>
+void ConvolveTest<bitdepth, Pixel>::GetConvolveFunc(const Dsp* const dsp,
+                                                    ConvolveFunc* func) {
+  *func =
+      dsp->convolve[type_param_.is_intra_block_copy][type_param_.is_compound]
+                   [type_param_.has_vertical_filter]
+                   [type_param_.has_horizontal_filter];
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveTest<bitdepth, Pixel>::SetInputData(bool use_fixed_values,
+                                                 int value) {
+  if (use_fixed_values) {
+    std::fill(source_, source_ + source_height_ * source_stride_, value);
+  } else {
+    const int offset =
+        kConvolveBorderLeftTop * source_stride_ + kConvolveBorderLeftTop;
+    const int mask = (1 << bitdepth) - 1;
+    libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+    const int height = param_.height;
+    const int width = param_.width;
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        source_[y * source_stride_ + x + offset] = rnd.Rand16() & mask;
+      }
+    }
+    // Copy border pixels to the left and right borders.
+    for (int y = 0; y < height; ++y) {
+      Memset(&source_[(y + kConvolveBorderLeftTop) * source_stride_],
+             source_[y * source_stride_ + offset], kConvolveBorderLeftTop);
+      Memset(&source_[y * source_stride_ + offset + width],
+             source_[y * source_stride_ + offset + width - 1],
+             kConvolveBorderLeftTop);
+    }
+    // Copy border pixels to the top and bottom borders.
+    for (int y = 0; y < kConvolveBorderLeftTop; ++y) {
+      memcpy(&source_[y * source_stride_],
+             &source_[kConvolveBorderLeftTop * source_stride_],
+             source_stride_ * sizeof(Pixel));
+      memcpy(&source_[(y + kConvolveBorderLeftTop + height) * source_stride_],
+             &source_[(kConvolveBorderLeftTop + height - 1) * source_stride_],
+             source_stride_ * sizeof(Pixel));
+    }
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveTest<bitdepth, Pixel>::Check(bool use_fixed_values,
+                                          const Pixel* src, const Pixel* dest,
+                                          libvpx_test::MD5* md5_digest) {
+  if (use_fixed_values) {
+    // For fixed values, input and output are identical.
+    const bool success =
+        test_utils::CompareBlocks(src, dest, param_.width, param_.height,
+                                  kMaxBlockWidth, kMaxBlockWidth, false, false);
+    EXPECT_TRUE(success);
+  } else {
+    // For random input, compare md5.
+    const int offset =
+        kConvolveBorderLeftTop * kMaxBlockWidth + kConvolveBorderLeftTop;
+    const size_t size = sizeof(dest_clipped_) - offset * sizeof(Pixel);
+    md5_digest->Add(reinterpret_cast<const uint8_t*>(dest), size);
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveTest<bitdepth, Pixel>::Check16Bit(bool use_fixed_values,
+                                               const uint16_t* src,
+                                               const uint16_t* dest,
+                                               libvpx_test::MD5* md5_digest) {
+  if (use_fixed_values) {
+    // For fixed values, input and output are identical.
+    const bool success =
+        test_utils::CompareBlocks(src, dest, param_.width, param_.height,
+                                  kMaxBlockWidth, kMaxBlockWidth, false);
+    EXPECT_TRUE(success);
+  } else {
+    // For random input, compare md5.
+    const int offset =
+        kConvolveBorderLeftTop * kMaxBlockWidth + kConvolveBorderLeftTop;
+    const size_t size = sizeof(dest_16bit_) - offset * sizeof(uint16_t);
+    md5_digest->Add(reinterpret_cast<const uint8_t*>(dest), size);
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveTest<bitdepth, Pixel>::Test(
+    bool use_fixed_values, int value, int num_runs /*= kMinimumViableRuns*/) {
+  // Testing with fixed input is not meaningful for compound convolve.
+  if (type_param_.is_compound && use_fixed_values) return;
+
+  // There should not be any function set for this combination.
+  if (type_param_.is_intra_block_copy && type_param_.is_compound) {
+    ASSERT_EQ(cur_convolve_func_, nullptr);
+    return;
+  }
+
+  // Compound and intra block copy functions are only used for blocks 4x4 or
+  // greater.
+  if (type_param_.is_compound || type_param_.is_intra_block_copy) {
+    if (param_.width < 4 || param_.height < 4) {
+      GTEST_SKIP();
+    }
+  }
+
+  // Skip unspecialized functions.
+  if (cur_convolve_func_ == nullptr) {
+    GTEST_SKIP();
+  }
+
+  SetInputData(use_fixed_values, value);
+  int subpixel_x = 0;
+  int subpixel_y = 0;
+  int vertical_index = 0;
+  int horizontal_index = 0;
+  const int offset =
+      kConvolveBorderLeftTop * kMaxBlockWidth + kConvolveBorderLeftTop;
+  const Pixel* const src = source_ + offset;
+  const ptrdiff_t src_stride = source_stride_ * sizeof(Pixel);
+  const ptrdiff_t src_stride_16 = source_stride_;
+  const ptrdiff_t dst_stride = kMaxBlockWidth * sizeof(Pixel);
+  // Pack Compound output since we control the predictor buffer.
+  const ptrdiff_t dst_stride_compound = param_.width;
+
+  // Output is always 16 bits regardless of |bitdepth|.
+  uint16_t* dst_16 = dest_16bit_ + offset;
+  // Output depends on |bitdepth|.
+  Pixel* dst_pixel = dest_clipped_ + offset;
+
+  // Collect the first |kMinimumViableRuns| into one md5 buffer.
+  libvpx_test::MD5 md5_digest;
+
+  absl::Duration elapsed_time;
+  for (int i = 0; i < num_runs; ++i) {
+    // Test every filter.
+    // Because of masking |subpixel_{x,y}| values roll over every 16 iterations.
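+    // Concretely, each iteration adds 1 << 6 to |subpixel_x| and
+    // |subpixel_y|, and (subpixel >> 6) & 0xF keeps only the low four bits
+    // of the increment count, so the filter ids below cycle through
+    // 1, 2, ..., 15, 0 and then repeat.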
+    subpixel_x += 1 << 6;
+    subpixel_y += 1 << 6;
+
+    const int horizontal_filter_id = (subpixel_x >> 6) & 0xF;
+    const int vertical_filter_id = (subpixel_y >> 6) & 0xF;
+
+    // |filter_id| == 0 (copy) must be handled by the appropriate 1D or copy
+    // function.
+    if (horizontal_filter_id == 0 || vertical_filter_id == 0) {
+      continue;
+    }
+
+    // For focused speed testing these can be set to the desired filter. Want
+    // only 8 tap filters? Set |{vertical,horizontal}_index| to 2.
+    vertical_index += static_cast<int>(i % 16 == 0);
+    vertical_index %= 4;
+    horizontal_index += static_cast<int>(i % 16 == 0);
+    horizontal_index %= 4;
+
+    if (type_param_.is_compound) {
+      // Output type is uint16_t.
+      const absl::Time start = absl::Now();
+      cur_convolve_func_(src, src_stride, horizontal_index, vertical_index,
+                         horizontal_filter_id, vertical_filter_id,
+                         param_.width, param_.height, dst_16,
+                         dst_stride_compound);
+      elapsed_time += absl::Now() - start;
+    } else {
+      // Output type is Pixel.
+      const absl::Time start = absl::Now();
+      cur_convolve_func_(src, src_stride, horizontal_index, vertical_index,
+                         horizontal_filter_id, vertical_filter_id,
+                         param_.width, param_.height, dst_pixel, dst_stride);
+      elapsed_time += absl::Now() - start;
+    }
+
+    // Only check the output for the first set. After that it's just repeated
+    // runs for speed timing.
+    if (i >= kMinimumViableRuns) continue;
+
+    if (type_param_.is_compound) {
+      // Need to copy source to a uint16_t buffer for comparison.
+      Pixel* src_ptr = source_;
+      uint16_t* src_ptr_16 = source_16bit_;
+      for (int y = 0; y < kMaxBlockHeight; ++y) {
+        for (int x = 0; x < kMaxBlockWidth; ++x) {
+          src_ptr_16[x] = src_ptr[x];
+        }
+        src_ptr += src_stride_16;
+        src_ptr_16 += src_stride_16;
+      }
+
+      Check16Bit(use_fixed_values, source_16bit_ + offset, dst_16,
+                 &md5_digest);
+    } else {
+      Check(use_fixed_values, src, dst_pixel, &md5_digest);
+    }
+  }
+
+  if (!use_fixed_values) {
+    // md5 sums are only calculated for random input.
+    const char* ref_digest = nullptr;
+    if (bitdepth == 8) {
+      ref_digest = GetConvolveDigest8bpp(GetDigestId());
+    } else {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      ref_digest = GetConvolveDigest10bpp(GetDigestId());
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+    }
+    ASSERT_NE(ref_digest, nullptr);
+    const char* direction;
+    if (type_param_.has_vertical_filter && type_param_.has_horizontal_filter) {
+      direction = "2D";
+    } else if (type_param_.has_vertical_filter) {
+      direction = "Vertical";
+    } else if (type_param_.has_horizontal_filter) {
+      direction = "Horizontal";
+    } else {
+      direction = "Copy";
+    }
+    const auto elapsed_time_us =
+        static_cast<int>(absl::ToInt64Microseconds(elapsed_time));
+    printf("Mode Convolve%s%s%s[%25s]: %5d us MD5: %s\n",
+           type_param_.is_compound ? "Compound" : "",
+           type_param_.is_intra_block_copy ? "IntraBlockCopy" : "",
"IntraBlockCopy" : "", direction, + absl::StrFormat("%dx%d", param_.width, param_.height).c_str(), + elapsed_time_us, md5_digest.Get()); + EXPECT_STREQ(ref_digest, md5_digest.Get()); + } +} + +void ApplyFilterToSignedInput(const int min_input, const int max_input, + const int8_t filter[kSubPixelTaps], + int* min_output, int* max_output) { + int min = 0, max = 0; + for (int i = 0; i < kSubPixelTaps; ++i) { + const int tap = filter[i]; + if (tap > 0) { + max += max_input * tap; + min += min_input * tap; + } else { + min += max_input * tap; + max += min_input * tap; + } + } + *min_output = min; + *max_output = max; +} + +void ApplyFilterToUnsignedInput(const int max_input, + const int8_t filter[kSubPixelTaps], + int* min_output, int* max_output) { + ApplyFilterToSignedInput(0, max_input, filter, min_output, max_output); +} + +// Validate the maximum ranges for different parts of the Convolve process. +template +void ShowRange() { + // Subtract one from the shift bits because the filter is pre-shifted by 1. + constexpr int horizontal_bits = (bitdepth == kBitdepth12) + ? kInterRoundBitsHorizontal12bpp - 1 + : kInterRoundBitsHorizontal - 1; + constexpr int vertical_bits = (bitdepth == kBitdepth12) + ? kInterRoundBitsVertical12bpp - 1 + : kInterRoundBitsVertical - 1; + constexpr int compound_vertical_bits = kInterRoundBitsCompoundVertical - 1; + + constexpr int compound_offset = (bitdepth == 8) ? 0 : kCompoundOffset; + + constexpr int max_input = (1 << bitdepth) - 1; + + const int8_t* worst_convolve_filter = kHalfSubPixelFilters[2][8]; + + // First pass. + printf("Bitdepth: %2d Input range: [%8d, %8d]\n", bitdepth, 0, + max_input); + + int min, max; + ApplyFilterToUnsignedInput(max_input, worst_convolve_filter, &min, &max); + + if (bitdepth == 8) { + // 8bpp can use int16_t for sums. + assert(min > INT16_MIN); + assert(max < INT16_MAX); + } else { + // 10bpp and 12bpp require int32_t. + assert(min > INT32_MIN); + assert(max > INT16_MAX && max < INT32_MAX); + } + + printf(" Horizontal upscaled range: [%8d, %8d]\n", min, max); + + const int first_pass_min = RightShiftWithRounding(min, horizontal_bits); + const int first_pass_max = RightShiftWithRounding(max, horizontal_bits); + + // All bitdepths can use int16_t for first pass output. + assert(first_pass_min > INT16_MIN); + assert(first_pass_max < INT16_MAX); + + printf(" Horizontal downscaled range: [%8d, %8d]\n", first_pass_min, + first_pass_max); + + // Second pass. + ApplyFilterToSignedInput(first_pass_min, first_pass_max, + worst_convolve_filter, &min, &max); + + // All bitdepths require int32_t for second pass sums. + assert(min < INT16_MIN && min > INT32_MIN); + assert(max > INT16_MAX && max < INT32_MAX); + + printf(" Vertical upscaled range: [%8d, %8d]\n", min, max); + + // Second pass non-compound output is clipped to Pixel values. + const int second_pass_min = + Clip3(RightShiftWithRounding(min, vertical_bits), 0, max_input); + const int second_pass_max = + Clip3(RightShiftWithRounding(max, vertical_bits), 0, max_input); + printf(" Pixel output range: [%8d, %8d]\n", second_pass_min, + second_pass_max); + + // Output is Pixel so matches Pixel values. 
+  assert(second_pass_min == 0);
+  assert(second_pass_max == max_input);
+
+  const int compound_second_pass_min =
+      RightShiftWithRounding(min, compound_vertical_bits) + compound_offset;
+  const int compound_second_pass_max =
+      RightShiftWithRounding(max, compound_vertical_bits) + compound_offset;
+
+  printf(" Compound output range: [%8d, %8d]\n",
+         compound_second_pass_min, compound_second_pass_max);
+
+  if (bitdepth == 8) {
+    // 8bpp output is int16_t without an offset.
+    assert(compound_second_pass_min > INT16_MIN);
+    assert(compound_second_pass_max < INT16_MAX);
+  } else {
+    // 10bpp and 12bpp use the offset to fit inside uint16_t.
+    assert(compound_second_pass_min > 0);
+    assert(compound_second_pass_max < UINT16_MAX);
+  }
+
+  printf("\n");
+}
+
+TEST(ConvolveTest, ShowRange) {
+  ShowRange<kBitdepth8>();
+  ShowRange<kBitdepth10>();
+  ShowRange<kBitdepth12>();
+}
+
+using ConvolveTest8bpp = ConvolveTest<8, uint8_t>;
+
+TEST_P(ConvolveTest8bpp, FixedValues) {
+  Test(true, 0);
+  Test(true, 1);
+  Test(true, 128);
+  Test(true, 255);
+}
+
+TEST_P(ConvolveTest8bpp, RandomValues) { Test(false, 0); }
+
+TEST_P(ConvolveTest8bpp, DISABLED_Speed) {
+  const int num_runs =
+      static_cast<int>(1.0e7 / (param_.width * param_.height));
+  Test(false, 0, num_runs);
+}
+
+//------------------------------------------------------------------------------
+template <int bitdepth, typename Pixel>
+class ConvolveScaleTest
+    : public testing::TestWithParam<
+          std::tuple<bool, ConvolveTestParam>> {
+ public:
+  ConvolveScaleTest() = default;
+  ~ConvolveScaleTest() override = default;
+
+  void SetUp() override {
+    ConvolveInit_C();
+
+    const Dsp* const dsp = GetDspTable(bitdepth);
+    ASSERT_NE(dsp, nullptr);
+    base_convolve_scale_func_ = dsp->convolve_scale[is_compound_];
+
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const absl::string_view test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+      base_convolve_scale_func_ = nullptr;
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      if ((GetCpuInfo() & kSSE4_1) != 0) {
+        ConvolveInit_SSE4_1();
+      }
+    } else if (absl::StartsWith(test_case, "AVX2/")) {
+      if ((GetCpuInfo() & kAVX2) != 0) {
+        ConvolveInit_AVX2();
+      }
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+      ConvolveInit_NEON();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      ConvolveInit10bpp_NEON();
+#endif
+    } else {
+      FAIL() << "Unrecognized architecture prefix in test case name: "
+             << test_case;
+    }
+
+    cur_convolve_scale_func_ = dsp->convolve_scale[is_compound_];
+
+    // Skip functions that have not been specialized for this particular
+    // architecture.
+    if (cur_convolve_scale_func_ == base_convolve_scale_func_) {
+      cur_convolve_scale_func_ = nullptr;
+    }
+  }
+
+ protected:
+  int GetDigestId() const {
+    return param_.block_size +
+           param_.kNumBlockSizes * static_cast<int>(is_compound_);
+  }
+
+  void SetInputData(bool use_fixed_values, int value);
+  void Check(bool use_fixed_values, const Pixel* src, const Pixel* dest,
+             libvpx_test::MD5* md5_digest);
+  void Check16Bit(bool use_fixed_values, const uint16_t* src,
+                  const uint16_t* dest, libvpx_test::MD5* md5_digest);
+  // |num_runs| covers the categories of filters (6) and the number of filters
+  // under each category (16).
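+  // (Of the 6 categories noted above, only 4 are cycled directly here;
+  // categories [4] and [5] are reached via GetFilterIndex() when
+  // width/height <= 4.)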
+  void Test(bool use_fixed_values, int value,
+            int num_runs = kMinimumViableRuns);
+
+  const bool is_compound_ = std::get<0>(GetParam());
+  const ConvolveTestParam param_ = std::get<1>(GetParam());
+
+ private:
+  ConvolveScaleFunc base_convolve_scale_func_;
+  ConvolveScaleFunc cur_convolve_scale_func_;
+  // Convolve filters are 7-tap, which need 3 pixels
+  // (kRestorationHorizontalBorder) padding.
+  // The source can be at most 2 times the max width/height.
+  Pixel source_[kMaxBlockHeight * kMaxBlockWidth * 4] = {};
+  uint16_t source_16bit_[kMaxBlockHeight * kMaxBlockWidth * 4] = {};
+  uint16_t dest_16bit_[kMaxBlockHeight * kMaxBlockWidth] = {};
+  Pixel dest_clipped_[kMaxBlockHeight * kMaxBlockWidth] = {};
+
+  const int source_stride_ = kMaxBlockWidth * 2;
+  const int source_height_ = kMaxBlockHeight * 2;
+};
+
+template <int bitdepth, typename Pixel>
+void ConvolveScaleTest<bitdepth, Pixel>::SetInputData(bool use_fixed_values,
+                                                      int value) {
+  if (use_fixed_values) {
+    std::fill(source_, source_ + source_height_ * source_stride_, value);
+  } else {
+    const int offset =
+        kConvolveBorderLeftTop * source_stride_ + kConvolveBorderLeftTop;
+    const int mask = (1 << bitdepth) - 1;
+    libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+    const int height = param_.height * 2;
+    const int width = param_.width * 2;
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        source_[y * source_stride_ + x + offset] = rnd.Rand16() & mask;
+      }
+    }
+    // Copy border pixels to the left and right borders.
+    for (int y = 0; y < height; ++y) {
+      Memset(&source_[(y + kConvolveBorderLeftTop) * source_stride_],
+             source_[y * source_stride_ + offset], kConvolveBorderLeftTop);
+      Memset(&source_[y * source_stride_ + offset + width],
+             source_[y * source_stride_ + offset + width - 1],
+             kConvolveBorderLeftTop);
+    }
+    // Copy border pixels to the top and bottom borders.
+    for (int y = 0; y < kConvolveBorderLeftTop; ++y) {
+      memcpy(&source_[y * source_stride_],
+             &source_[kConvolveBorderLeftTop * source_stride_],
+             source_stride_ * sizeof(Pixel));
+      memcpy(&source_[(y + kConvolveBorderLeftTop + height) * source_stride_],
+             &source_[(kConvolveBorderLeftTop + height - 1) * source_stride_],
+             source_stride_ * sizeof(Pixel));
+    }
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveScaleTest<bitdepth, Pixel>::Check(bool use_fixed_values,
+                                               const Pixel* src,
+                                               const Pixel* dest,
+                                               libvpx_test::MD5* md5_digest) {
+  if (use_fixed_values) {
+    // For fixed values, input and output are identical.
+    const bool success =
+        test_utils::CompareBlocks(src, dest, param_.width, param_.height,
+                                  kMaxBlockWidth, kMaxBlockWidth, false, false);
+    EXPECT_TRUE(success);
+  } else {
+    // For random input, compare md5.
+    const int offset =
+        kConvolveBorderLeftTop * kMaxBlockWidth + kConvolveBorderLeftTop;
+    const size_t size = sizeof(dest_clipped_) - offset * sizeof(Pixel);
+    md5_digest->Add(reinterpret_cast<const uint8_t*>(dest), size);
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveScaleTest<bitdepth, Pixel>::Check16Bit(
+    bool use_fixed_values, const uint16_t* src, const uint16_t* dest,
+    libvpx_test::MD5* md5_digest) {
+  if (use_fixed_values) {
+    // For fixed values, input and output are identical.
+    const bool success =
+        test_utils::CompareBlocks(src, dest, param_.width, param_.height,
+                                  kMaxBlockWidth, kMaxBlockWidth, false);
+    EXPECT_TRUE(success);
+  } else {
+    // For random input, compare md5.
+    const int offset =
+        kConvolveBorderLeftTop * kMaxBlockWidth + kConvolveBorderLeftTop;
+    const size_t size = sizeof(dest_16bit_) - offset * sizeof(uint16_t);
+    md5_digest->Add(reinterpret_cast<const uint8_t*>(dest), size);
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveScaleTest<bitdepth, Pixel>::Test(
+    bool use_fixed_values, int value, int num_runs /*= kMinimumViableRuns*/) {
+  // Testing with fixed input is not meaningful for compound convolve.
+  if (is_compound_ && use_fixed_values) return;
+
+  // The compound function is only used for blocks 4x4 or greater.
+  if (is_compound_) {
+    if (param_.width < 4 || param_.height < 4) {
+      GTEST_SKIP();
+    }
+  }
+
+  // Skip unspecialized functions.
+  if (cur_convolve_scale_func_ == nullptr) {
+    GTEST_SKIP();
+  }
+
+  SetInputData(use_fixed_values, value);
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed() +
+                             GetDigestId());
+  // [1,2048] for |step_[xy]|. This covers a scaling range of 1/1024 to 2x.
+  const int step_x = (rnd.Rand16() & ((1 << 11) - 1)) + 1;
+  const int step_y = (rnd.Rand16() & ((1 << 11) - 1)) + 1;
+  int subpixel_x = 0;
+  int subpixel_y = 0;
+  int vertical_index = 0;
+  int horizontal_index = 0;
+  const int offset =
+      kConvolveBorderLeftTop * kMaxBlockWidth + kConvolveBorderLeftTop;
+  const int offset_scale =
+      kConvolveBorderLeftTop * source_stride_ + kConvolveBorderLeftTop;
+  const Pixel* const src_scale = source_ + offset_scale;
+  const ptrdiff_t src_stride = source_stride_ * sizeof(Pixel);
+  const ptrdiff_t dst_stride = kMaxBlockWidth * sizeof(Pixel);
+  // Pack Compound output since we control the predictor buffer.
+  const ptrdiff_t dst_stride_compound = param_.width;
+
+  // Output is always 16 bits regardless of |bitdepth|.
+  uint16_t* dst_16 = dest_16bit_ + offset;
+  // Output depends on |bitdepth|.
+  Pixel* dst_pixel = dest_clipped_ + offset;
+
+  // Collect the first |kMinimumViableRuns| into one md5 buffer.
+  libvpx_test::MD5 md5_digest;
+
+  absl::Duration elapsed_time;
+  for (int i = 0; i < num_runs; ++i) {
+    // Test every filter.
+    // Because of masking |subpixel_{x,y}| values roll over every 16 iterations.
+    subpixel_x += 1 << 6;
+    subpixel_y += 1 << 6;
+
+    const int horizontal_filter_id = (subpixel_x >> 6) & 0xF;
+    const int vertical_filter_id = (subpixel_y >> 6) & 0xF;
+
+    // |filter_id| == 0 (copy) must be handled by the appropriate 1D or copy
+    // function.
+    if (horizontal_filter_id == 0 || vertical_filter_id == 0) {
+      continue;
+    }
+
+    // For focused speed testing these can be set to the desired filter. Want
+    // only 8 tap filters? Set |{vertical,horizontal}_index| to 2.
+    vertical_index += static_cast<int>(i % 16 == 0);
+    vertical_index %= 4;
+    horizontal_index += static_cast<int>(i % 16 == 0);
+    horizontal_index %= 4;
+
+    // Output type is uint16_t.
+    const absl::Time start = absl::Now();
+    if (is_compound_) {
+      cur_convolve_scale_func_(
+          source_, src_stride, horizontal_index, vertical_index, 0, 0, step_x,
+          step_y, param_.width, param_.height, dst_16, dst_stride_compound);
+    } else {
+      cur_convolve_scale_func_(
+          source_, src_stride, horizontal_index, vertical_index, 0, 0, step_x,
+          step_y, param_.width, param_.height, dst_pixel, dst_stride);
+    }
+    elapsed_time += absl::Now() - start;
+
+    // Only check the output for the first set. After that it's just repeated
+    // runs for speed timing.
+    if (i >= kMinimumViableRuns) continue;
+
+    // The convolve function does not clip its output; clipping is applied
+    // later. libaom does clip its output, so apply clipping here to match
+    // libaom in tests.
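+    // For example, at bitdepth 8 the bias removed below is
+    // (1 << 8) + (1 << 7) = 384; at bitdepth 10 it is 1024 + 512 = 1536.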
+    if (is_compound_) {
+      const int single_round_offset = (1 << bitdepth) + (1 << (bitdepth - 1));
+      Pixel* dest_row = dest_clipped_;
+      for (int y = 0; y < kMaxBlockHeight; ++y) {
+        for (int x = 0; x < kMaxBlockWidth; ++x) {
+          dest_row[x] = static_cast<Pixel>(Clip3(
+              dest_16bit_[y * dst_stride_compound + x] - single_round_offset,
+              0, (1 << bitdepth) - 1));
+        }
+        dest_row += kMaxBlockWidth;
+      }
+    }
+
+    if (is_compound_) {
+      Check16Bit(use_fixed_values, source_16bit_ + offset_scale, dst_16,
+                 &md5_digest);
+    } else {
+      Check(use_fixed_values, src_scale, dst_pixel, &md5_digest);
+    }
+  }
+
+  if (!use_fixed_values) {
+    // md5 sums are only calculated for random input.
+    const char* ref_digest = nullptr;
+    if (bitdepth == 8) {
+      ref_digest = GetConvolveScaleDigest8bpp(GetDigestId());
+    } else {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      ref_digest = GetConvolveScaleDigest10bpp(GetDigestId());
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+    }
+    ASSERT_NE(ref_digest, nullptr);
+
+    const auto elapsed_time_us =
+        static_cast<int>(absl::ToInt64Microseconds(elapsed_time));
+    printf("Mode Convolve%sScale2D[%25s]: %5d us MD5: %s\n",
+           is_compound_ ? "Compound" : "",
+           absl::StrFormat("%dx%d", param_.width, param_.height).c_str(),
+           elapsed_time_us, md5_digest.Get());
+    EXPECT_STREQ(ref_digest, md5_digest.Get());
+  }
+}
+
+using ConvolveScaleTest8bpp = ConvolveScaleTest<8, uint8_t>;
+
+TEST_P(ConvolveScaleTest8bpp, FixedValues) {
+  Test(true, 0);
+  Test(true, 1);
+  Test(true, 128);
+  Test(true, 255);
+}
+
+TEST_P(ConvolveScaleTest8bpp, RandomValues) { Test(false, 0); }
+
+TEST_P(ConvolveScaleTest8bpp, DISABLED_Speed) {
+  const int num_runs =
+      static_cast<int>(1.0e7 / (param_.width * param_.height));
+  Test(false, 0, num_runs);
+}
+
+//------------------------------------------------------------------------------
+const ConvolveTestParam kConvolveParam[] = {
+    ConvolveTestParam(ConvolveTestParam::kBlockSize2x2),
+    ConvolveTestParam(ConvolveTestParam::kBlockSize2x4),
+    ConvolveTestParam(ConvolveTestParam::kBlockSize4x2),
+    ConvolveTestParam(ConvolveTestParam::kBlockSize4x4),
+    ConvolveTestParam(ConvolveTestParam::kBlockSize4x8),
+    ConvolveTestParam(ConvolveTestParam::kBlockSize8x2),
+    ConvolveTestParam(ConvolveTestParam::kBlockSize8x4),
+    ConvolveTestParam(ConvolveTestParam::kBlockSize8x8),
+    ConvolveTestParam(ConvolveTestParam::kBlockSize8x16),
+    ConvolveTestParam(ConvolveTestParam::kBlockSize16x8),
+    ConvolveTestParam(ConvolveTestParam::kBlockSize16x16),
+    ConvolveTestParam(ConvolveTestParam::kBlockSize16x32),
+    ConvolveTestParam(ConvolveTestParam::kBlockSize32x16),
+    ConvolveTestParam(ConvolveTestParam::kBlockSize32x32),
+    ConvolveTestParam(ConvolveTestParam::kBlockSize32x64),
+    ConvolveTestParam(ConvolveTestParam::kBlockSize64x32),
+    ConvolveTestParam(ConvolveTestParam::kBlockSize64x64),
+    ConvolveTestParam(ConvolveTestParam::kBlockSize64x128),
+    ConvolveTestParam(ConvolveTestParam::kBlockSize128x64),
+    ConvolveTestParam(ConvolveTestParam::kBlockSize128x128),
+};
+
+const ConvolveTypeParam kConvolveTypeParam[] = {
+    ConvolveTypeParam(false, false, false, false),
+    ConvolveTypeParam(false, false, false, true),
+    ConvolveTypeParam(false, false, true, false),
+    ConvolveTypeParam(false, false, true, true),
+    ConvolveTypeParam(false, true, false, false),
+    ConvolveTypeParam(false, true, false, true),
+    ConvolveTypeParam(false, true, true, false),
+    ConvolveTypeParam(false, true, true, true),
+    ConvolveTypeParam(true, false, false, false),
+    ConvolveTypeParam(true, false, false, true),
+    ConvolveTypeParam(true, false, true, false),
+    ConvolveTypeParam(true, false, true, true),
+    // This is left to ensure no function exists for |intra_block_copy| when
+    // |is_compound| is true; all combinations aren't necessary.
+    ConvolveTypeParam(true, true, false, false),
+};
+
+INSTANTIATE_TEST_SUITE_P(C, ConvolveTest8bpp,
+                         testing::Combine(testing::ValuesIn(kConvolveTypeParam),
+                                          testing::ValuesIn(kConvolveParam)));
+INSTANTIATE_TEST_SUITE_P(C, ConvolveScaleTest8bpp,
+                         testing::Combine(testing::Bool(),
+                                          testing::ValuesIn(kConvolveParam)));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, ConvolveTest8bpp,
+                         testing::Combine(testing::ValuesIn(kConvolveTypeParam),
+                                          testing::ValuesIn(kConvolveParam)));
+INSTANTIATE_TEST_SUITE_P(NEON, ConvolveScaleTest8bpp,
+                         testing::Combine(testing::Bool(),
+                                          testing::ValuesIn(kConvolveParam)));
+#endif  // LIBGAV1_ENABLE_NEON
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, ConvolveTest8bpp,
+                         testing::Combine(testing::ValuesIn(kConvolveTypeParam),
+                                          testing::ValuesIn(kConvolveParam)));
+INSTANTIATE_TEST_SUITE_P(SSE41, ConvolveScaleTest8bpp,
+                         testing::Combine(testing::Bool(),
+                                          testing::ValuesIn(kConvolveParam)));
+#endif  // LIBGAV1_ENABLE_SSE4_1
+
+#if LIBGAV1_ENABLE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, ConvolveTest8bpp,
+                         testing::Combine(testing::ValuesIn(kConvolveTypeParam),
+                                          testing::ValuesIn(kConvolveParam)));
+INSTANTIATE_TEST_SUITE_P(AVX2, ConvolveScaleTest8bpp,
+                         testing::Combine(testing::Bool(),
+                                          testing::ValuesIn(kConvolveParam)));
+#endif  // LIBGAV1_ENABLE_AVX2
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using ConvolveTest10bpp = ConvolveTest<10, uint16_t>;
+
+TEST_P(ConvolveTest10bpp, FixedValues) {
+  Test(true, 0);
+  Test(true, 1);
+  Test(true, 128);
+  Test(true, (1 << 10) - 1);
+}
+
+TEST_P(ConvolveTest10bpp, RandomValues) { Test(false, 0); }
+
+TEST_P(ConvolveTest10bpp, DISABLED_Speed) {
+  const int num_runs =
+      static_cast<int>(1.0e7 / (param_.width * param_.height));
+  Test(false, 0, num_runs);
+}
+
+using ConvolveScaleTest10bpp = ConvolveScaleTest<10, uint16_t>;
+
+TEST_P(ConvolveScaleTest10bpp, FixedValues) {
+  Test(true, 0);
+  Test(true, 1);
+  Test(true, 128);
+  Test(true, (1 << 10) - 1);
+}
+
+TEST_P(ConvolveScaleTest10bpp, RandomValues) { Test(false, 0); }
+
+TEST_P(ConvolveScaleTest10bpp, DISABLED_Speed) {
+  const int num_runs =
+      static_cast<int>(1.0e7 / (param_.width * param_.height));
+  Test(false, 0, num_runs);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, ConvolveTest10bpp,
+                         testing::Combine(testing::ValuesIn(kConvolveTypeParam),
+                                          testing::ValuesIn(kConvolveParam)));
+INSTANTIATE_TEST_SUITE_P(C, ConvolveScaleTest10bpp,
+                         testing::Combine(testing::Bool(),
+                                          testing::ValuesIn(kConvolveParam)));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, ConvolveTest10bpp,
+                         testing::Combine(testing::ValuesIn(kConvolveTypeParam),
+                                          testing::ValuesIn(kConvolveParam)));
+INSTANTIATE_TEST_SUITE_P(NEON, ConvolveScaleTest10bpp,
+                         testing::Combine(testing::Bool(),
+                                          testing::ValuesIn(kConvolveParam)));
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+}  // namespace
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/distance_weighted_blend.cc b/src/dsp/distance_weighted_blend.cc
new file mode 100644
index 0000000..34d10fc
--- /dev/null
+++ b/src/dsp/distance_weighted_blend.cc
@@ -0,0 +1,103 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/distance_weighted_blend.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+template <int bitdepth, typename Pixel>
+void DistanceWeightedBlend_C(const void* LIBGAV1_RESTRICT prediction_0,
+                             const void* LIBGAV1_RESTRICT prediction_1,
+                             const uint8_t weight_0, const uint8_t weight_1,
+                             const int width, const int height,
+                             void* LIBGAV1_RESTRICT const dest,
+                             const ptrdiff_t dest_stride) {
+  // 7.11.3.2 Rounding variables derivation process
+  //   2 * FILTER_BITS(7) - (InterRound0(3|5) + InterRound1(7))
+  constexpr int inter_post_round_bits = (bitdepth == 12) ? 2 : 4;
+  using PredType =
+      typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
+  const auto* pred_0 = static_cast<const PredType*>(prediction_0);
+  const auto* pred_1 = static_cast<const PredType*>(prediction_1);
+  auto* dst = static_cast<Pixel*>(dest);
+  const ptrdiff_t dst_stride = dest_stride / sizeof(Pixel);
+
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      // See warp.cc and convolve.cc for detailed prediction ranges.
+      // weight_0 + weight_1 = 16.
+      int res = pred_0[x] * weight_0 + pred_1[x] * weight_1;
+      res -= (bitdepth == 8) ? 0 : kCompoundOffset * 16;
+      dst[x] = static_cast<Pixel>(
+          Clip3(RightShiftWithRounding(res, inter_post_round_bits + 4), 0,
+                (1 << bitdepth) - 1));
+    } while (++x < width);
+
+    dst += dst_stride;
+    pred_0 += width;
+    pred_1 += width;
+  } while (++y < height);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->distance_weighted_blend = DistanceWeightedBlend_C<8, uint8_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_DistanceWeightedBlend
+  dsp->distance_weighted_blend = DistanceWeightedBlend_C<8, uint8_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->distance_weighted_blend = DistanceWeightedBlend_C<10, uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_DistanceWeightedBlend
+  dsp->distance_weighted_blend = DistanceWeightedBlend_C<10, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
+}  // namespace
+
+void DistanceWeightedBlendInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/distance_weighted_blend.h b/src/dsp/distance_weighted_blend.h
new file mode 100644
index 0000000..1a782b6
--- /dev/null
+++ b/src/dsp/distance_weighted_blend.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_DISTANCE_WEIGHTED_BLEND_H_ +#define LIBGAV1_SRC_DSP_DISTANCE_WEIGHTED_BLEND_H_ + +// Pull in LIBGAV1_DspXXX defines representing the implementation status +// of each function. The resulting value of each can be used by each module to +// determine whether an implementation is needed at compile time. +// IWYU pragma: begin_exports + +// ARM: +#include "src/dsp/arm/distance_weighted_blend_neon.h" + +// x86: +// Note includes should be sorted in logical order avx2/avx/sse4, etc. +// The order of includes is important as each tests for a superior version +// before setting the base. +// clang-format off +#include "src/dsp/x86/distance_weighted_blend_sse4.h" +// clang-format on + +// IWYU pragma: end_exports + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::distance_weighted_blend. This function is not thread-safe. +void DistanceWeightedBlendInit_C(); + +} // namespace dsp +} // namespace libgav1 + +#endif // LIBGAV1_SRC_DSP_DISTANCE_WEIGHTED_BLEND_H_ diff --git a/src/dsp/distance_weighted_blend_test.cc b/src/dsp/distance_weighted_blend_test.cc new file mode 100644 index 0000000..fdf058e --- /dev/null +++ b/src/dsp/distance_weighted_blend_test.cc @@ -0,0 +1,278 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
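+// The C path exercised below reduces, at 8 bits, to the scalar model in this
+// sketch (illustrative only; it folds the 8bpp constants of
+// DistanceWeightedBlend_C into one expression, where |p0| and |p1| are
+// compound predictor values and the weights always sum to 16):
+//
+//   int Blend8bpp(int p0, int p1, int w0, int w1) {
+//     // inter_post_round_bits(4) + 4 == 8; kCompoundOffset is unused at 8bpp.
+//     return Clip3((p0 * w0 + p1 * w1 + 128) >> 8, 0, 255);
+//   }
+//
+// For example, a source pixel of 100 yields a compound predictor value near
+// 1600 (4 fractional bits); blending 1600 with itself under weights {9, 7}
+// gives Clip3((1600 * 16 + 128) >> 8, 0, 255) == 100, as expected.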
+
+#include "src/dsp/distance_weighted_blend.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <ostream>
+#include <type_traits>
+
+#include "absl/strings/match.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kNumSpeedTests = 500000;
+
+constexpr int kQuantizedDistanceLookup[4][2] = {
+    {9, 7}, {11, 5}, {12, 4}, {13, 3}};
+
+template <int bitdepth, typename Pixel>
+class DistanceWeightedBlendTest : public testing::TestWithParam<BlockSize>,
+                                  public test_utils::MaxAlignedAllocable {
+ public:
+  DistanceWeightedBlendTest() = default;
+  ~DistanceWeightedBlendTest() override = default;
+
+  void SetUp() override {
+    test_utils::ResetDspTable(bitdepth);
+    DistanceWeightedBlendInit_C();
+    const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+    ASSERT_NE(dsp, nullptr);
+    base_func_ = dsp->distance_weighted_blend;
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const absl::string_view test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+      base_func_ = nullptr;
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      if ((GetCpuInfo() & kSSE4_1) != 0) {
+        DistanceWeightedBlendInit_SSE4_1();
+      }
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+      DistanceWeightedBlendInit_NEON();
+    } else {
+      FAIL() << "Unrecognized architecture prefix in test case name: "
+             << test_case;
+    }
+    func_ = dsp->distance_weighted_blend;
+  }
+
+ protected:
+  void Test(const char* digest, int num_tests);
+
+ private:
+  using PredType =
+      typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
+  static constexpr int kDestStride = kMaxSuperBlockSizeInPixels;
+  const int width_ = kBlockWidthPixels[GetParam()];
+  const int height_ = kBlockHeightPixels[GetParam()];
+  alignas(kMaxAlignment) PredType
+      source1_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels];
+  alignas(kMaxAlignment) PredType
+      source2_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels];
+  Pixel dest_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels] = {};
+  Pixel reference_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels] =
+      {};
+  dsp::DistanceWeightedBlendFunc base_func_;
+  dsp::DistanceWeightedBlendFunc func_;
+};
+
+template <int bitdepth, typename Pixel>
+void DistanceWeightedBlendTest<bitdepth, Pixel>::Test(const char* digest,
+                                                      int num_tests) {
+  if (func_ == nullptr) return;
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+  PredType* src_1 = source1_;
+  PredType* src_2 = source2_;
+
+  const int index = rnd.Rand8() & 3;
+  const uint8_t weight_0 = kQuantizedDistanceLookup[index][0];
+  const uint8_t weight_1 = kQuantizedDistanceLookup[index][1];
+  // In libgav1, predictors have an offset that is later subtracted and
+  // clipped in distance weighted blending. Therefore we add the offset
+  // here to match libaom's implementation.
+  for (int y = 0; y < height_; ++y) {
+    for (int x = 0; x < width_; ++x) {
+      // distance_weighted_blend is applied to compound prediction values.
+      // This implies a range far exceeding that of pixel values.
+      // The ranges include kCompoundOffset in 10bpp and 12bpp.
+      // see: src/dsp/convolve.cc & src/dsp/warp.cc.
+      static constexpr int kCompoundPredictionRange[3][2] = {
+          // 8bpp
+          {-5132, 9212},
+          // 10bpp
+          {3988, 61532},
+          // 12bpp
+          {3974, 61559},
+      };
+      constexpr int bitdepth_index = (bitdepth - 8) >> 1;
+      const int min_val = kCompoundPredictionRange[bitdepth_index][0];
+      const int max_val = kCompoundPredictionRange[bitdepth_index][1];
+      src_1[x] = static_cast<PredType>(rnd(max_val - min_val) + min_val);
+      src_2[x] = static_cast<PredType>(rnd(max_val - min_val) + min_val);
+    }
+    src_1 += width_;
+    src_2 += width_;
+  }
+  absl::Duration elapsed_time;
+  for (int i = 0; i < num_tests; ++i) {
+    const absl::Time start = absl::Now();
+    func_(source1_, source2_, weight_0, weight_1, width_, height_, dest_,
+          sizeof(Pixel) * kDestStride);
+    elapsed_time += absl::Now() - start;
+  }
+
+  test_utils::CheckMd5Digest("DistanceWeightedBlend", ToString(GetParam()),
+                             digest, dest_, sizeof(dest_), elapsed_time);
+}
+
+const BlockSize kTestParam[] = {
+    kBlock4x4,    kBlock4x8,     kBlock4x16,  kBlock8x4,   kBlock8x8,
+    kBlock8x16,   kBlock8x32,    kBlock16x4,  kBlock16x8,  kBlock16x16,
+    kBlock16x32,  kBlock16x64,   kBlock32x8,  kBlock32x16, kBlock32x32,
+    kBlock32x64,  kBlock64x16,   kBlock64x32, kBlock64x64, kBlock64x128,
+    kBlock128x64, kBlock128x128,
+};
+
+const char* GetDistanceWeightedBlendDigest8bpp(const BlockSize block_size) {
+  static const char* const kDigests[kMaxBlockSizes] = {
+      // 4xN
+      "ebf389f724f8ab46a2cac895e4e073ca",
+      "09acd567b6b12c8cf8eb51d8b86eb4bf",
+      "57bb4d65695d8ec6752f2bd8686b64fd",
+      // 8xN
+      "270905ac76f9a2cba8a552eb0bf7c8c1",
+      "f0801c8574d2c271ef2bbea77a1d7352",
+      "e761b580e3312be33a227492a233ce72",
+      "ff214dab1a7e98e2285961d6421720c6",
+      // 16xN
+      "4f712609a36e817f9752326d58562ff8",
+      "14243f5c5f7c7104160c1f2cef0a0fbc",
+      "3ac3f3161b7c8dd8436b02abfdde104a",
+      "81a00b704e0e41a5dbe6436ac70c098d",
+      "af8fd02017c7acdff788be742d700baa",
+      // 32xN
+      "ee34332c66a6d6ed8ce64031aafe776c",
+      "b5e3d22bd2dbdb624c8b86a1afb5ce6d",
+      "607ffc22098d81b7e37a7bf62f4af5d3",
+      "3823dbf043b4682f56d5ca698e755ea5",
+      // 64xN
+      "4acf556b921956c2bc24659cd5128401",
+      "a298c544c9c3b27924b4c23cc687ea5a",
+      "539e2df267782ce61c70103b23b7d922",
+      "3b0cb2a0b5d384efee4d81401025bec1",
+      // 128xN
+      "8b56b636dd712c2f8d138badb7219991",
+      "8cfc8836908902b8f915639b7bff45b3",
+  };
+  assert(block_size < kMaxBlockSizes);
+  return kDigests[block_size];
+}
+
+using DistanceWeightedBlendTest8bpp = DistanceWeightedBlendTest<8, uint8_t>;
+
+TEST_P(DistanceWeightedBlendTest8bpp, Blending) {
+  Test(GetDistanceWeightedBlendDigest8bpp(GetParam()), 1);
+}
+
+TEST_P(DistanceWeightedBlendTest8bpp, DISABLED_Speed) {
+  Test(GetDistanceWeightedBlendDigest8bpp(GetParam()), kNumSpeedTests);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, DistanceWeightedBlendTest8bpp,
+                         testing::ValuesIn(kTestParam));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, DistanceWeightedBlendTest8bpp,
+                         testing::ValuesIn(kTestParam));
+#endif
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, DistanceWeightedBlendTest8bpp,
+                         testing::ValuesIn(kTestParam));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+const char* GetDistanceWeightedBlendDigest10bpp(const BlockSize block_size) {
+  static const char* const kDigests[] = {
+      // 4xN
+      "55f594b56e16d5c401274affebbcc3d3",
+      "69df14da4bb33a8f7d7087921008e919",
+      "1b61f33604c54015794198a13bfebf46",
+      // 8xN
+      "825a938185b152f7cf09bf1c0723ce2b",
+      "85ea315c51d979bc9b45834d6b40ec6f",
+      "92ebde208e8c39f7ec6de2de82182dbb",
+      "520f84716db5b43684dbb703806383fe",
+      // 16xN
+      "12ca23e3e2930005a0511646e8c83da4",
+      "6208694a6744f4a3906f58c1add670e3",
+
"a33d63889df989a3bbf84ff236614267", + "34830846ecb0572a98bbd192fed02b16", + "34bb2f79c0bd7f9a80691b8af597f2a8", + // 32xN + "fa97f2d0e3143f1f44d3ac018b0d696d", + "3df4a22456c9ab6ed346ab1b9750ae7d", + "6276a058b35c6131bc0c94a4b4a37ebc", + "9ca42da5d2d5eb339df03ae2c7a26914", + // 64xN + "800e692c520f99223bc24c1ac95a0166", + "818b6d20426585ef7fe844015a03aaf5", + "fb48691ccfff083e01d74826e88e613f", + "0bd350bc5bc604a224d77a5f5a422698", + // 128xN + "a130840813cd6bd69d09bcf5f8d0180f", + "6ece1846bea55e8f8f2ed7fbf73718de", + }; + assert(block_size < kMaxBlockSizes); + return kDigests[block_size]; +} + +using DistanceWeightedBlendTest10bpp = DistanceWeightedBlendTest<10, uint16_t>; + +TEST_P(DistanceWeightedBlendTest10bpp, Blending) { + Test(GetDistanceWeightedBlendDigest10bpp(GetParam()), 1); +} + +TEST_P(DistanceWeightedBlendTest10bpp, DISABLED_Speed) { + Test(GetDistanceWeightedBlendDigest10bpp(GetParam()), kNumSpeedTests); +} + +INSTANTIATE_TEST_SUITE_P(C, DistanceWeightedBlendTest10bpp, + testing::ValuesIn(kTestParam)); + +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, DistanceWeightedBlendTest10bpp, + testing::ValuesIn(kTestParam)); +#endif +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, DistanceWeightedBlendTest10bpp, + testing::ValuesIn(kTestParam)); +#endif +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace +} // namespace dsp + +static std::ostream& operator<<(std::ostream& os, const BlockSize param) { + return os << ToString(param); +} + +} // namespace libgav1 diff --git a/src/dsp/dsp.cc b/src/dsp/dsp.cc new file mode 100644 index 0000000..aac0ca0 --- /dev/null +++ b/src/dsp/dsp.cc @@ -0,0 +1,171 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "src/dsp/dsp.h"
+
+#include <mutex>  // NOLINT (unapproved c++11 header)
+
+#include "src/dsp/average_blend.h"
+#include "src/dsp/cdef.h"
+#include "src/dsp/convolve.h"
+#include "src/dsp/distance_weighted_blend.h"
+#include "src/dsp/film_grain.h"
+#include "src/dsp/intra_edge.h"
+#include "src/dsp/intrapred.h"
+#include "src/dsp/intrapred_cfl.h"
+#include "src/dsp/intrapred_directional.h"
+#include "src/dsp/intrapred_filter.h"
+#include "src/dsp/intrapred_smooth.h"
+#include "src/dsp/inverse_transform.h"
+#include "src/dsp/loop_filter.h"
+#include "src/dsp/loop_restoration.h"
+#include "src/dsp/mask_blend.h"
+#include "src/dsp/motion_field_projection.h"
+#include "src/dsp/motion_vector_search.h"
+#include "src/dsp/obmc.h"
+#include "src/dsp/super_res.h"
+#include "src/dsp/warp.h"
+#include "src/dsp/weight_mask.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp_internal {
+
+void DspInit_C() {
+  dsp::AverageBlendInit_C();
+  dsp::CdefInit_C();
+  dsp::ConvolveInit_C();
+  dsp::DistanceWeightedBlendInit_C();
+  dsp::FilmGrainInit_C();
+  dsp::IntraEdgeInit_C();
+  dsp::IntraPredCflInit_C();
+  dsp::IntraPredDirectionalInit_C();
+  dsp::IntraPredFilterInit_C();
+  dsp::IntraPredInit_C();
+  dsp::IntraPredSmoothInit_C();
+  dsp::InverseTransformInit_C();
+  dsp::LoopFilterInit_C();
+  dsp::LoopRestorationInit_C();
+  dsp::MaskBlendInit_C();
+  dsp::MotionFieldProjectionInit_C();
+  dsp::MotionVectorSearchInit_C();
+  dsp::ObmcInit_C();
+  dsp::SuperResInit_C();
+  dsp::WarpInit_C();
+  dsp::WeightMaskInit_C();
+}
+
+dsp::Dsp* GetWritableDspTable(int bitdepth) {
+  switch (bitdepth) {
+    case 8: {
+      static dsp::Dsp dsp_8bpp;
+      return &dsp_8bpp;
+    }
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    case 10: {
+      static dsp::Dsp dsp_10bpp;
+      return &dsp_10bpp;
+    }
+#endif
+  }
+  return nullptr;
+}
+
+}  // namespace dsp_internal
+
+namespace dsp {
+
+void DspInit() {
+  static std::once_flag once;
+  std::call_once(once, []() {
+    dsp_internal::DspInit_C();
+#if LIBGAV1_ENABLE_SSE4_1 || LIBGAV1_ENABLE_AVX2
+    const uint32_t cpu_features = GetCpuInfo();
+#if LIBGAV1_ENABLE_SSE4_1
+    if ((cpu_features & kSSE4_1) != 0) {
+      AverageBlendInit_SSE4_1();
+      CdefInit_SSE4_1();
+      ConvolveInit_SSE4_1();
+      DistanceWeightedBlendInit_SSE4_1();
+      FilmGrainInit_SSE4_1();
+      IntraEdgeInit_SSE4_1();
+      IntraPredCflInit_SSE4_1();
+      IntraPredDirectionalInit_SSE4_1();
+      IntraPredFilterInit_SSE4_1();
+      IntraPredInit_SSE4_1();
+      IntraPredSmoothInit_SSE4_1();
+      InverseTransformInit_SSE4_1();
+      LoopFilterInit_SSE4_1();
+      LoopRestorationInit_SSE4_1();
+      MaskBlendInit_SSE4_1();
+      MotionFieldProjectionInit_SSE4_1();
+      MotionVectorSearchInit_SSE4_1();
+      ObmcInit_SSE4_1();
+      SuperResInit_SSE4_1();
+      WarpInit_SSE4_1();
+      WeightMaskInit_SSE4_1();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      LoopRestorationInit10bpp_SSE4_1();
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+    }
+#endif  // LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_ENABLE_AVX2
+    if ((cpu_features & kAVX2) != 0) {
+      CdefInit_AVX2();
+      ConvolveInit_AVX2();
+      LoopRestorationInit_AVX2();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      LoopRestorationInit10bpp_AVX2();
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+    }
+#endif  // LIBGAV1_ENABLE_AVX2
+#endif  // LIBGAV1_ENABLE_SSE4_1 || LIBGAV1_ENABLE_AVX2
+#if LIBGAV1_ENABLE_NEON
+    AverageBlendInit_NEON();
+    CdefInit_NEON();
+    ConvolveInit_NEON();
+    DistanceWeightedBlendInit_NEON();
+    FilmGrainInit_NEON();
+    IntraEdgeInit_NEON();
+    IntraPredCflInit_NEON();
+    IntraPredDirectionalInit_NEON();
+    IntraPredFilterInit_NEON();
+    IntraPredInit_NEON();
+    IntraPredSmoothInit_NEON();
+    InverseTransformInit_NEON();
+    LoopFilterInit_NEON();
+    LoopRestorationInit_NEON();
+    MaskBlendInit_NEON();
+    MotionFieldProjectionInit_NEON();
+    MotionVectorSearchInit_NEON();
+    ObmcInit_NEON();
+    SuperResInit_NEON();
+    WarpInit_NEON();
+    WeightMaskInit_NEON();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    ConvolveInit10bpp_NEON();
+    InverseTransformInit10bpp_NEON();
+    LoopRestorationInit10bpp_NEON();
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+#endif  // LIBGAV1_ENABLE_NEON
+  });
+}
+
+const Dsp* GetDspTable(int bitdepth) {
+  return dsp_internal::GetWritableDspTable(bitdepth);
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h
new file mode 100644
index 0000000..f9e6b22
--- /dev/null
+++ b/src/dsp/dsp.h
@@ -0,0 +1,963 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_DSP_H_
+#define LIBGAV1_SRC_DSP_DSP_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/film_grain_common.h"
+#include "src/utils/cpu.h"
+#include "src/utils/reference_info.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+
+#if !defined(LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS)
+#define LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS 0
+#endif
+
+enum IntraPredictor : uint8_t {
+  kIntraPredictorDcFill,
+  kIntraPredictorDcTop,
+  kIntraPredictorDcLeft,
+  kIntraPredictorDc,
+  kIntraPredictorVertical,
+  kIntraPredictorHorizontal,
+  kIntraPredictorPaeth,
+  kIntraPredictorSmooth,
+  kIntraPredictorSmoothVertical,
+  kIntraPredictorSmoothHorizontal,
+  kNumIntraPredictors
+};
+
+// List of valid 1D transforms.
+enum Transform1d : uint8_t {
+  kTransform1dDct,   // Discrete Cosine Transform.
+  kTransform1dAdst,  // Asymmetric Discrete Sine Transform.
+  kTransform1dIdentity,
+  kTransform1dWht,  // Walsh Hadamard Transform.
+  kNumTransform1ds
+};
+
+// List of valid 1D transform sizes. Not every transform is available for all
+// of the sizes.
+enum Transform1dSize : uint8_t {
+  kTransform1dSize4,
+  kTransform1dSize8,
+  kTransform1dSize16,
+  kTransform1dSize32,
+  kTransform1dSize64,
+  kNumTransform1dSizes
+};
+
+// The maximum width of the loop filter; fewer pixels may be filtered depending
+// on strength thresholds.
+enum LoopFilterSize : uint8_t {
+  kLoopFilterSize4,
+  kLoopFilterSize6,
+  kLoopFilterSize8,
+  kLoopFilterSize14,
+  kNumLoopFilterSizes
+};
+
+enum : uint8_t {
+  kRow = 0,
+  kColumn = 1,
+};
+
+//------------------------------------------------------------------------------
+// ToString()
+//
+// These functions are meant to be used only in debug logging and within tests.
+// They are defined inline to avoid including the strings in the release
+// library when logging is disabled; unreferenced functions will not be added
+// to any object file in that case.
+ +inline const char* ToString(const IntraPredictor predictor) { + switch (predictor) { + case kIntraPredictorDcFill: + return "kIntraPredictorDcFill"; + case kIntraPredictorDcTop: + return "kIntraPredictorDcTop"; + case kIntraPredictorDcLeft: + return "kIntraPredictorDcLeft"; + case kIntraPredictorDc: + return "kIntraPredictorDc"; + case kIntraPredictorVertical: + return "kIntraPredictorVertical"; + case kIntraPredictorHorizontal: + return "kIntraPredictorHorizontal"; + case kIntraPredictorPaeth: + return "kIntraPredictorPaeth"; + case kIntraPredictorSmooth: + return "kIntraPredictorSmooth"; + case kIntraPredictorSmoothVertical: + return "kIntraPredictorSmoothVertical"; + case kIntraPredictorSmoothHorizontal: + return "kIntraPredictorSmoothHorizontal"; + case kNumIntraPredictors: + return "kNumIntraPredictors"; + } + abort(); +} + +inline const char* ToString(const Transform1d transform) { + switch (transform) { + case kTransform1dDct: + return "kTransform1dDct"; + case kTransform1dAdst: + return "kTransform1dAdst"; + case kTransform1dIdentity: + return "kTransform1dIdentity"; + case kTransform1dWht: + return "kTransform1dWht"; + case kNumTransform1ds: + return "kNumTransform1ds"; + } + abort(); +} + +inline const char* ToString(const Transform1dSize transform_size) { + switch (transform_size) { + case kTransform1dSize4: + return "kTransform1dSize4"; + case kTransform1dSize8: + return "kTransform1dSize8"; + case kTransform1dSize16: + return "kTransform1dSize16"; + case kTransform1dSize32: + return "kTransform1dSize32"; + case kTransform1dSize64: + return "kTransform1dSize64"; + case kNumTransform1dSizes: + return "kNumTransform1dSizes"; + } + abort(); +} + +inline const char* ToString(const LoopFilterSize filter_size) { + switch (filter_size) { + case kLoopFilterSize4: + return "kLoopFilterSize4"; + case kLoopFilterSize6: + return "kLoopFilterSize6"; + case kLoopFilterSize8: + return "kLoopFilterSize8"; + case kLoopFilterSize14: + return "kLoopFilterSize14"; + case kNumLoopFilterSizes: + return "kNumLoopFilterSizes"; + } + abort(); +} + +inline const char* ToString(const LoopFilterType filter_type) { + switch (filter_type) { + case kLoopFilterTypeVertical: + return "kLoopFilterTypeVertical"; + case kLoopFilterTypeHorizontal: + return "kLoopFilterTypeHorizontal"; + case kNumLoopFilterTypes: + return "kNumLoopFilterTypes"; + } + abort(); +} + +//------------------------------------------------------------------------------ +// Intra predictors. Section 7.11.2. +// These require access to one or both of the top row and left column. Some may +// access the top-left (top[-1]), top-right (top[width+N]), bottom-left +// (left[height+N]) or upper-left (left[-1]). + +// Intra predictor function signature. Sections 7.11.2.2, 7.11.2.4 (#10,#11), +// 7.11.2.5, 7.11.2.6. +// |dst| is an unaligned pointer to the output block. Pixel size is determined +// by bitdepth with |stride| given in bytes. |top| is an unaligned pointer to +// the row above |dst|. |left| is an aligned vector of the column to the left +// of |dst|. top-left and bottom-left may be accessed. +// The pointer arguments do not alias one another. +using IntraPredictorFunc = void (*)(void* dst, ptrdiff_t stride, + const void* top, const void* left); +using IntraPredictorFuncs = + IntraPredictorFunc[kNumTransformSizes][kNumIntraPredictors]; + +// Directional intra predictor function signature, zone 1 (0 < angle < 90). +// Section 7.11.2.4 (#7). +// |dst| is an unaligned pointer to the output block. 
Pixel size is determined +// by bitdepth with |stride| given in bytes. |top| is an unaligned pointer to +// the row above |dst|. |width| and |height| give the dimensions of the block. +// |xstep| is the scaled starting index to |top| from +// kDirectionalIntraPredictorDerivative. |upsampled_top| indicates whether +// |top| has been upsampled as described in '7.11.2.11. Intra edge upsample +// process'. This can occur in cases with |width| + |height| <= 16. top-right +// is accessed. +// The pointer arguments do not alias one another. +using DirectionalIntraPredictorZone1Func = void (*)(void* dst, ptrdiff_t stride, + const void* top, int width, + int height, int xstep, + bool upsampled_top); + +// Directional intra predictor function signature, zone 2 (90 < angle < 180). +// Section 7.11.2.4 (#8). +// |dst| is an unaligned pointer to the output block. Pixel size is determined +// by bitdepth with |stride| given in bytes. |top| is an unaligned pointer to +// the row above |dst|. |left| is an aligned vector of the column to the left of +// |dst|. |width| and |height| give the dimensions of the block. |xstep| and +// |ystep| are the scaled starting index to |top| and |left|, respectively, +// from kDirectionalIntraPredictorDerivative. |upsampled_top| and +// |upsampled_left| indicate whether |top| and |left| have been upsampled as +// described in '7.11.2.11. Intra edge upsample process'. This can occur in +// cases with |width| + |height| <= 16. top-left and upper-left are accessed, +// up to [-2] in each if |upsampled_top/left| are set. +// The pointer arguments do not alias one another. +using DirectionalIntraPredictorZone2Func = void (*)( + void* dst, ptrdiff_t stride, const void* top, const void* left, int width, + int height, int xstep, int ystep, bool upsampled_top, bool upsampled_left); + +// Directional intra predictor function signature, zone 3 (180 < angle < 270). +// Section 7.11.2.4 (#9). +// |dst| is an unaligned pointer to the output block. Pixel size is determined +// by bitdepth with |stride| given in bytes. |left| is an aligned vector of the +// column to the left of |dst|. |width| and |height| give the dimensions of the +// block. |ystep| is the scaled starting index to |left| from +// kDirectionalIntraPredictorDerivative. |upsampled_left| indicates whether +// |left| has been upsampled as described in '7.11.2.11. Intra edge upsample +// process'. This can occur in cases with |width| + |height| <= 16. bottom-left +// is accessed. +// The pointer arguments do not alias one another. +using DirectionalIntraPredictorZone3Func = void (*)(void* dst, ptrdiff_t stride, + const void* left, int width, + int height, int ystep, + bool upsampled_left); + +// Filter intra predictor function signature. Section 7.11.2.3. +// |dst| is an unaligned pointer to the output block. Pixel size is determined +// by bitdepth with |stride| given in bytes. |top| is an unaligned pointer to +// the row above |dst|. |left| is an aligned vector of the column to the left +// of |dst|. |width| and |height| are the size of the block in pixels. +// The pointer arguments do not alias one another. +using FilterIntraPredictorFunc = void (*)(void* dst, ptrdiff_t stride, + const void* top, const void* left, + FilterIntraPredictor pred, int width, + int height); + +//------------------------------------------------------------------------------ +// Chroma from Luma (Cfl) prediction. Section 7.11.5. + +// Chroma from Luma (Cfl) intra prediction function signature. 
|dst| is an +// unaligned pointer to the output block. Pixel size is determined by bitdepth +// with |stride| given in bytes. |luma| contains subsampled luma pixels with 3 +// fractional bits of precision. |alpha| is the signed Cfl alpha value for the +// appropriate plane. +using CflIntraPredictorFunc = void (*)( + void* dst, ptrdiff_t stride, + const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], int alpha); +using CflIntraPredictorFuncs = CflIntraPredictorFunc[kNumTransformSizes]; + +// Chroma from Luma (Cfl) subsampler function signature. |luma| is an unaligned +// pointer to the output block. |src| is an unaligned pointer to the input +// block. Pixel size is determined by bitdepth with |stride| given in bytes. +using CflSubsamplerFunc = + void (*)(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + int max_luma_width, int max_luma_height, const void* source, + ptrdiff_t stride); +using CflSubsamplerFuncs = + CflSubsamplerFunc[kNumTransformSizes][kNumSubsamplingTypes]; + +//------------------------------------------------------------------------------ +// Intra Edge Filtering and Upsampling. Step 4 in section 7.11.2.4. + +// Intra edge filter function signature. |buffer| is a pointer to the top_row or +// left_column that needs to be filtered. Typically the -1'th index of |top_row| +// and |left_column| need to be filtered as well, so the caller can merely pass +// the |buffer| as top_row[-1] or left_column[-1]. Pixel size is determined by +// bitdepth. |size| is the number of pixels to be filtered. |strength| is the +// filter strength. Section 7.11.2.12 in the spec. +using IntraEdgeFilterFunc = void (*)(void* buffer, int size, int strength); + +// Intra edge upsampler function signature. |buffer| is a pointer to the top_row +// or left_column that needs to be upsampled. Pixel size is determined by +// bitdepth. |size| is the number of pixels to be upsampled; valid values are: +// 4, 8, 12, 16. This function needs access to negative indices -1 and -2 of +// the |buffer|. Section 7.11.2.11 in the spec. +using IntraEdgeUpsamplerFunc = void (*)(void* buffer, int size); + +//------------------------------------------------------------------------------ +// Inverse transform add function signature. +// +// Steps 2 and 3 of section 7.12.3 (contains the implementation of section +// 7.13.3). +// Apply the inverse transforms and add the residual to the destination frame +// for the transform type and block size |tx_size| starting at position +// |start_x| and |start_y|. |dst_frame| is a pointer to an Array2D of Pixel +// values. |adjusted_tx_height| is the number of rows to process based on the +// non-zero coefficient count in the block. It will be 1 (non-zero coefficient +// count == 1), 4 or a multiple of 8 up to 32 or the original transform height, +// whichever is less. |src_buffer| is a pointer to an Array2D of Residual +// values. On input |src_buffer| contains the dequantized values, on output it +// contains the residual. +// The pointer arguments do not alias one another. +using InverseTransformAddFunc = void (*)(TransformType tx_type, + TransformSize tx_size, + int adjusted_tx_height, + void* src_buffer, int start_x, + int start_y, void* dst_frame); +// The final dimension holds row and column transforms indexed with kRow and +// kColumn. +using InverseTransformAddFuncs = + InverseTransformAddFunc[kNumTransform1ds][kNumTransform1dSizes][2]; + +//------------------------------------------------------------------------------ +// Post processing. 
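+// As a sketch of how the per-size function tables declared in this header are
+// indexed (illustrative only; assumes |dsp| was obtained from GetDspTable()
+// and that all buffers are valid):
+//
+//   // DC prediction for a 4x4 block:
+//   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc](
+//       dst, stride, top_row, left_column);
+//   // Row pass of a 4-point inverse DCT (the column pass uses kColumn):
+//   dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kRow](
+//       tx_type, tx_size, adjusted_tx_height, src_buffer, start_x, start_y,
+//       dst_frame);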
+ +// Loop filter function signature. Section 7.14. +// |dst| is an unaligned pointer to the output block. Pixel size is determined +// by bitdepth with |stride| given in bytes. +// +// |outer_thresh| blimit [7, 193] +// |inner_thresh| limit [1, 63] +// |hev_thresh| thresh [0, 63] +// These are scaled by the implementation by 'bitdepth - 8' to produce +// the spec variables blimitBd, limitBd and threshBd. +// Note these functions are not called when the loop filter level is 0. +using LoopFilterFunc = void (*)(void* dst, ptrdiff_t stride, int outer_thresh, + int inner_thresh, int hev_thresh); +using LoopFilterFuncs = + LoopFilterFunc[kNumLoopFilterSizes][kNumLoopFilterTypes]; + +// Cdef direction function signature. Section 7.15.2. +// |src| is a pointer to the source block. Pixel size is determined by bitdepth +// with |stride| given in bytes. |direction| and |variance| are output +// parameters and must not be nullptr. +// The pointer arguments do not alias one another. +using CdefDirectionFunc = void (*)(const void* src, ptrdiff_t stride, + uint8_t* direction, int* variance); + +// Cdef filtering function signature. Section 7.15.3. +// |source| is a pointer to the input block padded with kCdefLargeValue if at a +// frame border. |source_stride| is given in units of uint16_t. +// |block_width|, |block_height| are the width/height of the input block. +// |primary_strength|, |secondary_strength|, and |damping| are Cdef filtering +// parameters. +// |direction| is the filtering direction. +// |dest| is the output buffer. |dest_stride| is given in bytes. +// The pointer arguments do not alias one another. +using CdefFilteringFunc = void (*)(const uint16_t* source, + ptrdiff_t source_stride, int block_height, + int primary_strength, int secondary_strength, + int damping, int direction, void* dest, + ptrdiff_t dest_stride); + +// The first index is block width: [0]: 4, [1]: 8. The second is based on +// non-zero strengths: [0]: |primary_strength| and |secondary_strength|, [1]: +// |primary_strength| only, [2]: |secondary_strength| only. +using CdefFilteringFuncs = CdefFilteringFunc[2][3]; + +// Upscaling coefficients function signature. Section 7.16. +// This is an auxiliary function for SIMD optimizations and has no corresponding +// C function. Different SIMD versions may have different outputs. So it must +// pair with the corresponding version of SuperResFunc. +// |upscaled_width| is the width of the output frame. +// |step| is the number of subpixels to move the kernel for the next destination +// pixel. +// |initial_subpixel_x| is a base offset from which |step| increments. +// |coefficients| is the upscale filter used by each pixel in a row. +using SuperResCoefficientsFunc = void (*)(int upscaled_width, + int initial_subpixel_x, int step, + void* coefficients); + +// Upscaling process function signature. Section 7.16. +// |coefficients| is the upscale filter used by each pixel in a row. It is not +// used by the C function. +// |source| is the input frame buffer. It will be line extended. +// |source_stride| is given in pixels. +// |dest| is the output buffer. +// |dest_stride| is given in pixels. +// |height| is the height of the block to be processed. +// |downscaled_width| is the width of the input frame. +// |upscaled_width| is the width of the output frame. +// |step| is the number of subpixels to move the kernel for the next destination +// pixel. +// |initial_subpixel_x| is a base offset from which |step| increments. +// The pointer arguments do not alias one another. 
+using SuperResFunc = void (*)(const void* coefficients, void* source, + ptrdiff_t source_stride, int height, + int downscaled_width, int upscaled_width, + int initial_subpixel_x, int step, void* dest, + ptrdiff_t dest_stride); + +// Loop restoration function signature. Sections 7.16, 7.17. +// |restoration_info| contains loop restoration information, such as filter +// type, strength. +// |source| is the input frame buffer, which is deblocked and cdef filtered. +// |top_border| and |bottom_border| are the top and bottom borders. +// |dest| is the output. +// |stride| is given in pixels, and shared by |source| and |dest|. +// |top_border_stride| and |bottom_border_stride| are given in pixels. +// |restoration_buffer| contains buffers required for self guided filter and +// wiener filter. They must be initialized before calling. +// The pointer arguments do not alias one another. +using LoopRestorationFunc = void (*)( + const RestorationUnitInfo& restoration_info, const void* source, + ptrdiff_t stride, const void* top_border, ptrdiff_t top_border_stride, + const void* bottom_border, ptrdiff_t bottom_border_stride, int width, + int height, RestorationBuffer* restoration_buffer, void* dest); + +// Index 0 is Wiener Filter. +// Index 1 is Self Guided Restoration Filter. +// This can be accessed as LoopRestorationType - 2. +using LoopRestorationFuncs = LoopRestorationFunc[2]; + +// Convolve function signature. Section 7.11.3.4. +// This function applies a horizontal filter followed by a vertical filter. +// |reference| is the input block (reference frame buffer). |reference_stride| +// is the corresponding frame stride. +// |vertical_filter_index|/|horizontal_filter_index| is the index to +// retrieve the type of filter to be applied for vertical/horizontal direction +// from the filter lookup table 'kSubPixelFilters'. +// |horizontal_filter_id| and |vertical_filter_id| are the filter ids. +// |width| and |height| are width and height of the block to be filtered. +// |ref_last_x| and |ref_last_y| are the last pixel of the reference frame in +// x/y direction. +// |prediction| is the output block (output frame buffer). +// Rounding precision is derived from the function being called. For horizontal +// filtering kInterRoundBitsHorizontal & kInterRoundBitsHorizontal12bpp will be +// used. For compound vertical filtering kInterRoundBitsCompoundVertical will be +// used. Otherwise kInterRoundBitsVertical & kInterRoundBitsVertical12bpp will +// be used. +// The pointer arguments do not alias one another. +using ConvolveFunc = void (*)(const void* reference, ptrdiff_t reference_stride, + int horizontal_filter_index, + int vertical_filter_index, + int horizontal_filter_id, int vertical_filter_id, + int width, int height, void* prediction, + ptrdiff_t pred_stride); + +// Convolve functions signature. Each points to one convolve function with +// a specific setting: +// ConvolveFunc[is_intra_block_copy][is_compound][has_vertical_filter] +// [has_horizontal_filter]. +// If is_compound is false, the prediction is clipped to Pixel. +// If is_compound is true, the range of prediction is: +// 8bpp: [-5132, 9212] (int16_t) +// 10bpp: [ 3988, 61532] (uint16_t) +// 12bpp: [ 3974, 61559] (uint16_t) +// See src/dsp/convolve.cc +using ConvolveFuncs = ConvolveFunc[2][2][2][2]; + +// Convolve + scale function signature. Section 7.11.3.4. +// This function applies a horizontal filter followed by a vertical filter. +// |reference| is the input block (reference frame buffer). 
|reference_stride| +// is the corresponding frame stride. +// |vertical_filter_index|/|horizontal_filter_index| is the index to +// retrieve the type of filter to be applied for vertical/horizontal direction +// from the filter lookup table 'kSubPixelFilters'. +// |subpixel_x| and |subpixel_y| are starting positions in units of 1/1024. +// |step_x| and |step_y| are step sizes in units of 1/1024 of a pixel. +// |width| and |height| are width and height of the block to be filtered. +// |ref_last_x| and |ref_last_y| are the last pixel of the reference frame in +// x/y direction. +// |prediction| is the output block (output frame buffer). +// Rounding precision is derived from the function being called. For horizontal +// filtering kInterRoundBitsHorizontal & kInterRoundBitsHorizontal12bpp will be +// used. For compound vertical filtering kInterRoundBitsCompoundVertical will be +// used. Otherwise kInterRoundBitsVertical & kInterRoundBitsVertical12bpp will +// be used. +// The pointer arguments do not alias one another. +using ConvolveScaleFunc = void (*)(const void* reference, + ptrdiff_t reference_stride, + int horizontal_filter_index, + int vertical_filter_index, int subpixel_x, + int subpixel_y, int step_x, int step_y, + int width, int height, void* prediction, + ptrdiff_t pred_stride); + +// Convolve functions signature for scaling version. +// 0: single predictor. 1: compound predictor. +using ConvolveScaleFuncs = ConvolveScaleFunc[2]; + +// Weight mask function signature. Section 7.11.3.12. +// |prediction_0| is the first input block. +// |prediction_1| is the second input block. Both blocks are int16_t* when +// bitdepth == 8 and uint16_t* otherwise. +// |width| and |height| are the prediction width and height. +// The stride for the input buffers is equal to |width|. +// The valid range of block size is [8x8, 128x128] for the luma plane. +// |mask| is the output buffer. |mask_stride| is the output buffer stride. +// The pointer arguments do not alias one another. +using WeightMaskFunc = void (*)(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride); + +// Weight mask functions signature. The dimensions (in order) are: +// * Width index (4 => 0, 8 => 1, 16 => 2 and so on). +// * Height index (4 => 0, 8 => 1, 16 => 2 and so on). +// * mask_is_inverse. +using WeightMaskFuncs = WeightMaskFunc[6][6][2]; + +// Average blending function signature. +// Two predictors are averaged to generate the output. +// Input predictor values are int16_t. Output type is uint8_t, with actual +// range of Pixel value. +// Average blending is in the bottom of Section 7.11.3.1 (COMPOUND_AVERAGE). +// |prediction_0| is the first input block. +// |prediction_1| is the second input block. Both blocks are int16_t* when +// bitdepth == 8 and uint16_t* otherwise. +// |width| and |height| are the same for the first and second input blocks. +// The stride for the input buffers is equal to |width|. +// The valid range of block size is [8x8, 128x128] for the luma plane. +// |dest| is the output buffer. |dest_stride| is the output buffer stride. +// The pointer arguments do not alias one another. +using AverageBlendFunc = void (*)(const void* prediction_0, + const void* prediction_1, int width, + int height, void* dest, + ptrdiff_t dest_stride); + +// Distance weighted blending function signature. +// Weights are generated in Section 7.11.3.15. +// Weighted blending is in the bottom of Section 7.11.3.1 (COMPOUND_DISTANCE). 
+// This function takes two blocks (inter frame prediction) and produces a +// weighted output. +// |prediction_0| is the first input block. +// |prediction_1| is the second input block. Both blocks are int16_t* when +// bitdepth == 8 and uint16_t* otherwise. +// |weight_0| is the weight for the first block. It is derived from the relative +// distance of the first reference frame and the current frame. +// |weight_1| is the weight for the second block. It is derived from the +// relative distance of the second reference frame and the current frame. +// |width| and |height| are the same for the first and second input blocks. +// The stride for the input buffers is equal to |width|. +// The valid range of block size is [8x8, 128x128] for the luma plane. +// |dest| is the output buffer. |dest_stride| is the output buffer stride. +// The pointer arguments do not alias one another. +using DistanceWeightedBlendFunc = void (*)(const void* prediction_0, + const void* prediction_1, + uint8_t weight_0, uint8_t weight_1, + int width, int height, void* dest, + ptrdiff_t dest_stride); + +// Mask blending function signature. Section 7.11.3.14. +// This function takes two blocks and produces a blended output stored into the +// output block |dest|. The blending is a weighted average process, controlled +// by values of the mask. +// |prediction_0| is the first input block. When prediction mode is inter_intra +// (or wedge_inter_intra), this refers to the inter frame prediction. It is +// int16_t* when bitdepth == 8 and uint16_t* otherwise. +// The stride for |prediction_0| is equal to |width|. +// |prediction_1| is the second input block. When prediction mode is inter_intra +// (or wedge_inter_intra), this refers to the intra frame prediction and uses +// Pixel values. It is only used for intra frame prediction when bitdepth >= 10. +// It is int16_t* when bitdepth == 8 and uint16_t* otherwise. +// |prediction_stride_1| is the stride, given in units of [u]int16_t. When +// |is_inter_intra| is false (compound prediction) then |prediction_stride_1| is +// equal to |width|. +// |mask| is an integer array, whose value indicates the weight of the blending. +// |mask_stride| is corresponding stride. +// |width|, |height| are the same for both input blocks. +// If it's inter_intra (or wedge_inter_intra), the valid range of block size is +// [8x8, 32x32], no 4:1/1:4 blocks (Section 5.11.28). Otherwise (including +// difference weighted prediction and compound average prediction), the valid +// range is [8x8, 128x128]. +// If there's subsampling, the corresponding width and height are halved for +// chroma planes. +// |is_inter_intra| stands for the prediction mode. If it is true, one of the +// prediction blocks is from intra prediction of current frame. Otherwise, two +// prediction blocks are both inter frame predictions. +// |is_wedge_inter_intra| indicates if the mask is for the wedge prediction. +// |dest| is the output block. +// |dest_stride| is the corresponding stride for dest. +// The pointer arguments do not alias one another. +using MaskBlendFunc = void (*)(const void* prediction_0, + const void* prediction_1, + ptrdiff_t prediction_stride_1, + const uint8_t* mask, ptrdiff_t mask_stride, + int width, int height, void* dest, + ptrdiff_t dest_stride); + +// Mask blending functions signature. Each points to one function with +// a specific setting: +// MaskBlendFunc[subsampling_x + subsampling_y][is_inter_intra]. 
+using MaskBlendFuncs = MaskBlendFunc[3][2]; + +// This function is similar to the MaskBlendFunc. It is only used when +// |is_inter_intra| is true and |bitdepth| == 8. +// |prediction_[01]| are Pixel values (uint8_t). +// |prediction_1| is also the output buffer. +// The pointer arguments do not alias one another. +using InterIntraMaskBlendFunc8bpp = void (*)(const uint8_t* prediction_0, + uint8_t* prediction_1, + ptrdiff_t prediction_stride_1, + const uint8_t* mask, + ptrdiff_t mask_stride, int width, + int height); + +// InterIntra8bpp mask blending functions signature. When is_wedge_inter_intra +// is false, the function at index 0 must be used. Otherwise, the function at +// index subsampling_x + subsampling_y must be used. +using InterIntraMaskBlendFuncs8bpp = InterIntraMaskBlendFunc8bpp[3]; + +// Obmc (overlapped block motion compensation) blending function signature. +// Section 7.11.3.10. +// This function takes two blocks and produces a blended output stored into the +// first input block. The blending is a weighted average process, controlled by +// values of the mask. +// Obmc is not a compound mode. It is different from other compound blending, +// in terms of precision. The current block is computed using convolution with +// clipping to the range of pixel values. Its above and left blocks are also +// clipped. Therefore obmc blending process doesn't need to clip the output. +// |prediction| is the first input block, which will be overwritten. +// |prediction_stride| is the stride, given in bytes. +// |width|, |height| are the same for both input blocks. The range is [4x2, +// 32x32] for kObmcDirectionVertical and [2x4, 32x32] for +// kObmcDirectionHorizontal, see Section 7.11.3.9. +// |obmc_prediction| is the second input block. +// |obmc_prediction_stride| is its stride, given in bytes. +// The pointer arguments do not alias one another. +using ObmcBlendFunc = void (*)(void* prediction, ptrdiff_t prediction_stride, + int width, int height, + const void* obmc_prediction, + ptrdiff_t obmc_prediction_stride); +using ObmcBlendFuncs = ObmcBlendFunc[kNumObmcDirections]; + +// Warp function signature. Section 7.11.3.5. +// This function applies warp filtering for each 8x8 block inside the current +// coding block. The filtering process is similar to 2d convolve filtering. +// The horizontal filter is applied followed by the vertical filter. +// The function has to calculate corresponding pixel positions before and +// after warping. +// |source| is the input reference frame buffer. +// |source_stride|, |source_width|, |source_height| are corresponding frame +// stride, width, and height. |source_stride| is given in bytes. +// |warp_params| is the matrix of warp motion: warp_params[i] = mN. +// [x' (m2 m3 m0 [x +// z . y' = m4 m5 m1 * y +// 1] m6 m7 1) 1] +// |subsampling_x/y| is the current frame's plane subsampling factor. +// |block_start_x| and |block_start_y| are the starting position the current +// coding block. +// |block_width| and |block_height| are width and height of the current coding +// block. |block_width| and |block_height| are at least 8. +// |alpha|, |beta|, |gamma|, |delta| are valid warp parameters. See the +// comments in the definition of struct GlobalMotion for the range of their +// values. +// |dest| is the output buffer of type Pixel. The output values are clipped to +// Pixel values. +// |dest_stride| is the stride, in units of bytes. +// Rounding precision is derived from the function being called. 
For horizontal +// filtering kInterRoundBitsHorizontal & kInterRoundBitsHorizontal12bpp will be +// used. For vertical filtering kInterRoundBitsVertical & +// kInterRoundBitsVertical12bpp will be used. +// +// NOTE: WarpFunc assumes the source frame has left, right, top, and bottom +// borders that extend the frame boundary pixels. +// * The left and right borders must be at least 13 pixels wide. In addition, +// Warp_NEON() may read up to 14 bytes after a row in the |source| buffer. +// Therefore, there must be at least one extra padding byte after the right +// border of the last row in the source buffer. +// * The top and bottom borders must be at least 13 pixels high. +// The pointer arguments do not alias one another. +using WarpFunc = void (*)(const void* source, ptrdiff_t source_stride, + int source_width, int source_height, + const int* warp_params, int subsampling_x, + int subsampling_y, int block_start_x, + int block_start_y, int block_width, int block_height, + int16_t alpha, int16_t beta, int16_t gamma, + int16_t delta, void* dest, ptrdiff_t dest_stride); + +// Warp for compound predictions. Section 7.11.3.5. +// Similar to WarpFunc, but |dest| is a uint16_t predictor buffer, +// |dest_stride| is given in units of uint16_t and |inter_round_bits_vertical| +// is always 7 (kCompoundInterRoundBitsVertical). +// Rounding precision is derived from the function being called. For horizontal +// filtering kInterRoundBitsHorizontal & kInterRoundBitsHorizontal12bpp will be +// used. For vertical filtering kInterRoundBitsCompondVertical will be used. +using WarpCompoundFunc = WarpFunc; + +constexpr int kNumAutoRegressionLags = 4; +// Applies an auto-regressive filter to the white noise in |luma_grain_buffer|. +// Section 7.18.3.3, second code block +// |params| are parameters read from frame header, mainly providing +// auto_regression_coeff_y for the filter and auto_regression_shift to right +// shift the filter sum by. Note: This method assumes +// params.auto_regression_coeff_lag is not 0. Do not call this method if +// params.auto_regression_coeff_lag is 0. +using LumaAutoRegressionFunc = void (*)(const FilmGrainParams& params, + void* luma_grain_buffer); +// Function index is auto_regression_coeff_lag - 1. +using LumaAutoRegressionFuncs = + LumaAutoRegressionFunc[kNumAutoRegressionLags - 1]; + +// Applies an auto-regressive filter to the white noise in u_grain and v_grain. +// Section 7.18.3.3, third code block +// The |luma_grain_buffer| provides samples that are added to the autoregressive +// sum when num_y_points > 0. +// |u_grain_buffer| and |v_grain_buffer| point to the buffers of chroma noise +// that were generated from the stored Gaussian sequence, and are overwritten +// with the results of the autoregressive filter. |params| are parameters read +// from frame header, mainly providing auto_regression_coeff_u and +// auto_regression_coeff_v for each chroma plane's filter, and +// auto_regression_shift to right shift the filter sums by. +// The pointer arguments do not alias one another. +using ChromaAutoRegressionFunc = void (*)(const FilmGrainParams& params, + const void* luma_grain_buffer, + int subsampling_x, int subsampling_y, + void* u_grain_buffer, + void* v_grain_buffer); +using ChromaAutoRegressionFuncs = + ChromaAutoRegressionFunc[/*use_luma*/ 2][kNumAutoRegressionLags]; + +// Build an image-wide "stripe" of grain noise for every 32 rows in the image. +// Section 7.18.3.5, first code block. 
+// Each 32x32 luma block is copied at a random offset specified via +// |grain_seed| from the grain template produced by autoregression, and the same +// is done for chroma grains, subject to subsampling. +// |width| and |height| are the dimensions of the overall image. +// |noise_stripes_buffer| points to an Array2DView with one row for each stripe. +// Because this function treats all planes identically and independently, it is +// simplified to take one grain buffer at a time. This means duplicating some +// random number generations, but that work can be reduced in other ways. +// The pointer arguments do not alias one another. +using ConstructNoiseStripesFunc = void (*)(const void* grain_buffer, + int grain_seed, int width, + int height, int subsampling_x, + int subsampling_y, + void* noise_stripes_buffer); +using ConstructNoiseStripesFuncs = + ConstructNoiseStripesFunc[/*overlap_flag*/ 2]; + +// Compute the one or two overlap rows for each stripe copied to the noise +// image. +// Section 7.18.3.5, second code block. |width| and |height| are the +// dimensions of the overall image. |noise_stripes_buffer| points to an +// Array2DView with one row for each stripe. |noise_image_buffer| points to an +// Array2D containing the allocated plane for this frame. Because this function +// treats all planes identically and independently, it is simplified to take one +// grain buffer at a time. +// The pointer arguments do not alias one another. +using ConstructNoiseImageOverlapFunc = + void (*)(const void* noise_stripes_buffer, int width, int height, + int subsampling_x, int subsampling_y, void* noise_image_buffer); + +// Populate a scaling lookup table with interpolated values of a piecewise +// linear function where values in |point_value| are mapped to the values in +// |point_scaling|. +// |num_points| can be between 0 and 15. When 0, the lookup table is set to +// zero. +// |point_value| and |point_scaling| have |num_points| valid elements. +// The pointer arguments do not alias one another. +using InitializeScalingLutFunc = void (*)(int num_points, + const uint8_t point_value[], + const uint8_t point_scaling[], + int16_t* scaling_lut, + const int scaling_lut_length); + +// Blend noise with image. Section 7.18.3.5, third code block. +// |width| is the width of each row, while |height| is how many rows to compute. +// |start_height| is an offset for the noise image, to support multithreading. +// |min_value|, |max_luma|, and |max_chroma| are computed by the caller of these +// functions, according to the code in the spec. +// |source_plane_y| and |source_plane_uv| are the plane buffers of the decoded +// frame. They are blended with the film grain noise and written to +// |dest_plane_y| and |dest_plane_uv| as final output for display. +// source_plane_* and dest_plane_* may point to the same buffer, in which case +// the film grain noise is added in place. +// |scaling_lut_y| and |scaling_lut| represent a piecewise linear mapping from +// the frame's raw pixel value, to a scaling factor for the noise sample. +// |scaling_shift| is applied as a right shift after scaling, so that scaling +// down is possible. It is found in FilmGrainParams, but supplied directly to +// BlendNoiseWithImageLumaFunc because it's the only member used. +// The dest plane may point to the source plane, depending on the value of +// frame_header.show_existing_frame. |noise_image_ptr| and scaling_lut.* do not +// alias other arguments. 
+using BlendNoiseWithImageLumaFunc = void (*)( + const void* noise_image_ptr, int min_value, int max_value, + int scaling_shift, int width, int height, int start_height, + const int16_t* scaling_lut_y, const void* source_plane_y, + ptrdiff_t source_stride_y, void* dest_plane_y, ptrdiff_t dest_stride_y); + +using BlendNoiseWithImageChromaFunc = void (*)( + Plane plane, const FilmGrainParams& params, const void* noise_image_ptr, + int min_value, int max_value, int width, int height, int start_height, + int subsampling_x, int subsampling_y, const int16_t* scaling_lut, + const void* source_plane_y, ptrdiff_t source_stride_y, + const void* source_plane_uv, ptrdiff_t source_stride_uv, + void* dest_plane_uv, ptrdiff_t dest_stride_uv); + +using BlendNoiseWithImageChromaFuncs = + BlendNoiseWithImageChromaFunc[/*chroma_scaling_from_luma*/ 2]; + +//------------------------------------------------------------------------------ + +struct FilmGrainFuncs { + LumaAutoRegressionFuncs luma_auto_regression; + ChromaAutoRegressionFuncs chroma_auto_regression; + ConstructNoiseStripesFuncs construct_noise_stripes; + ConstructNoiseImageOverlapFunc construct_noise_image_overlap; + InitializeScalingLutFunc initialize_scaling_lut; + BlendNoiseWithImageLumaFunc blend_noise_luma; + BlendNoiseWithImageChromaFuncs blend_noise_chroma; +}; + +// Motion field projection function signature. Section 7.9. +// |reference_info| provides reference information for motion field projection. +// |reference_to_current_with_sign| is the precalculated reference frame id +// distance from current frame. +// |dst_sign| is -1 for LAST_FRAME and LAST2_FRAME, or 0 (1 in spec) for others. +// |y8_start| and |y8_end| are the start and end 8x8 rows of the current tile. +// |x8_start| and |x8_end| are the start and end 8x8 columns of the current +// tile. +// |motion_field| is the output which saves the projected motion field +// information. +// Note: Only the entry from the 8-bit Dsp table is used as this function is +// bitdepth agnostic. +using MotionFieldProjectionKernelFunc = void (*)( + const ReferenceInfo& reference_info, int reference_to_current_with_sign, + int dst_sign, int y8_start, int y8_end, int x8_start, int x8_end, + TemporalMotionField* motion_field); + +// Compound temporal motion vector projection function signature. +// Section 7.9.3 and 7.10.2.10. +// |temporal_mvs| is the aligned set of temporal reference motion vectors. +// |temporal_reference_offsets| specifies the number of frames covered by the +// original motion vector. +// |reference_offsets| specifies the number of frames to be covered by the +// projected motion vector. +// |count| is the number of the temporal motion vectors. +// |candidate_mvs| is the aligned set of projected motion vectors. +// The pointer arguments do not alias one another. +// Note: Only the entry from the 8-bit Dsp table is used as this function is +// bitdepth agnostic. +using MvProjectionCompoundFunc = void (*)( + const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets, + const int reference_offsets[2], int count, + CompoundMotionVector* candidate_mvs); + +// Single temporal motion vector projection function signature. +// Section 7.9.3 and 7.10.2.10. +// |temporal_mvs| is the aligned set of temporal reference motion vectors. +// |temporal_reference_offsets| specifies the number of frames covered by the +// original motion vector. +// |reference_offset| specifies the number of frames to be covered by the +// projected motion vector. 
+// |count| is the number of the temporal motion vectors.
+// |candidate_mvs| is the aligned set of projected motion vectors.
+// The pointer arguments do not alias one another.
+// Note: Only the entry from the 8-bit Dsp table is used as this function is
+// bitdepth agnostic.
+using MvProjectionSingleFunc = void (*)(
+    const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+    int reference_offset, int count, MotionVector* candidate_mvs);
+
+struct Dsp {
+  AverageBlendFunc average_blend;
+  CdefDirectionFunc cdef_direction;
+  CdefFilteringFuncs cdef_filters;
+  CflIntraPredictorFuncs cfl_intra_predictors;
+  CflSubsamplerFuncs cfl_subsamplers;
+  ConvolveFuncs convolve;
+  ConvolveScaleFuncs convolve_scale;
+  DirectionalIntraPredictorZone1Func directional_intra_predictor_zone1;
+  DirectionalIntraPredictorZone2Func directional_intra_predictor_zone2;
+  DirectionalIntraPredictorZone3Func directional_intra_predictor_zone3;
+  DistanceWeightedBlendFunc distance_weighted_blend;
+  FilmGrainFuncs film_grain;
+  FilterIntraPredictorFunc filter_intra_predictor;
+  InterIntraMaskBlendFuncs8bpp inter_intra_mask_blend_8bpp;
+  IntraEdgeFilterFunc intra_edge_filter;
+  IntraEdgeUpsamplerFunc intra_edge_upsampler;
+  IntraPredictorFuncs intra_predictors;
+  InverseTransformAddFuncs inverse_transforms;
+  LoopFilterFuncs loop_filters;
+  LoopRestorationFuncs loop_restorations;
+  MaskBlendFuncs mask_blend;
+  MotionFieldProjectionKernelFunc motion_field_projection_kernel;
+  MvProjectionCompoundFunc mv_projection_compound[3];
+  MvProjectionSingleFunc mv_projection_single[3];
+  ObmcBlendFuncs obmc_blend;
+  SuperResCoefficientsFunc super_res_coefficients;
+  SuperResFunc super_res;
+  WarpCompoundFunc warp_compound;
+  WarpFunc warp;
+  WeightMaskFuncs weight_mask;
+};
+
+// Initializes function pointers based on build config and runtime
+// environment. Must be called once before first use. This function is
+// thread-safe.
+void DspInit();
+
+// Returns the appropriate Dsp table for |bitdepth| or nullptr if one doesn't
+// exist.
+const Dsp* GetDspTable(int bitdepth);
+
+}  // namespace dsp
+
+namespace dsp_internal {
+
+// Visual Studio builds don't have a way to detect SSE4_1. Only exclude the C
+// functions if /arch:AVX2 is used across all sources.
+#if !LIBGAV1_TARGETING_AVX2 && \
+    (defined(_MSC_VER) || (defined(_M_IX86) || defined(_M_X64)))
+#undef LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#define LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS 1
+#endif
+
+// Returns true if a more highly optimized version of |func| is not defined for
+// the associated bitdepth or if it is forcibly enabled with
+// LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS. The define checked for |func| corresponds
+// to the LIBGAV1_Dsp<bitdepth>bpp_|func| define in the header file associated
+// with the module.
+// |func| is one of:
+// - FunctionName, e.g., SelfGuidedFilter.
+// - [sub-table-index1][...-indexN] e.g.,
+//   TransformSize4x4_IntraPredictorDc. The indices correspond to enum values
+//   used as lookups with leading 'k' removed.
+//
+// NEON support is the only extension available for ARM and it is always
+// required. Because of this restriction DSP_ENABLED_8BPP_NEON(func) is always
+// true and can be omitted.
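+//
+// For example, a module's SSE4.1 init function typically guards each
+// registration with the macros defined below. The sketch is illustrative
+// only; AverageBlend serves as a stand-in for any module:
+//   void AverageBlendInit_SSE4_1() {
+//     Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+//   #if DSP_ENABLED_8BPP_SSE4_1(AverageBlend)
+//     dsp->average_blend = AverageBlend_SSE4_1;
+//   #endif
+//   }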
+#define DSP_ENABLED_8BPP_AVX2(func)    \
+  (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+   LIBGAV1_Dsp8bpp_##func == LIBGAV1_CPU_AVX2)
+#define DSP_ENABLED_10BPP_AVX2(func)   \
+  (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+   LIBGAV1_Dsp10bpp_##func == LIBGAV1_CPU_AVX2)
+#define DSP_ENABLED_8BPP_SSE4_1(func)  \
+  (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+   LIBGAV1_Dsp8bpp_##func == LIBGAV1_CPU_SSE4_1)
+#define DSP_ENABLED_10BPP_SSE4_1(func) \
+  (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+   LIBGAV1_Dsp10bpp_##func == LIBGAV1_CPU_SSE4_1)
+
+// Initializes C-only function pointers. Note some entries may be set to
+// nullptr if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS is not defined. This is meant
+// for use in tests only, it is not thread-safe.
+void DspInit_C();
+
+// Returns the appropriate Dsp table for |bitdepth| or nullptr if one doesn't
+// exist. This version is meant for use by test or dsp/*Init() functions only.
+dsp::Dsp* GetWritableDspTable(int bitdepth);
+
+}  // namespace dsp_internal
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_DSP_H_
diff --git a/src/dsp/dsp_test.cc b/src/dsp/dsp_test.cc
new file mode 100644
index 0000000..5c2a3aa
--- /dev/null
+++ b/src/dsp/dsp_test.cc
@@ -0,0 +1,258 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/dsp.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+
+#include "absl/strings/str_cat.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#include "tests/utils.h"
+#endif
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Maps 1D transform to the maximum valid size for the corresponding transform.
+constexpr int kMaxTransform1dSize[kNumTransform1ds] = {
+    kTransform1dSize64,  // Dct.
+    kTransform1dSize16,  // Adst.
+    kTransform1dSize32,  // Identity.
+    kTransform1dSize4,   // Wht.
+};
+
+void CheckTables(bool c_only) {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  static constexpr int kBitdepths[] = {kBitdepth8, kBitdepth10};
+#else
+  static constexpr int kBitdepths[] = {kBitdepth8};
+#endif
+
+  for (const auto& bitdepth : kBitdepths) {
+    const Dsp* const dsp = GetDspTable(bitdepth);
+    ASSERT_NE(dsp, nullptr);
+    SCOPED_TRACE(absl::StrCat("bitdepth: ", bitdepth));
+    for (int i = 0; i < kNumTransformSizes; ++i) {
+      for (int j = 0; j < kNumIntraPredictors; ++j) {
+        EXPECT_NE(dsp->intra_predictors[i][j], nullptr)
+            << "index [" << i << "][" << j << "]";
+      }
+    }
+    EXPECT_NE(dsp->directional_intra_predictor_zone1, nullptr);
+    EXPECT_NE(dsp->directional_intra_predictor_zone2, nullptr);
+    EXPECT_NE(dsp->directional_intra_predictor_zone3, nullptr);
+    EXPECT_NE(dsp->filter_intra_predictor, nullptr);
+    for (int i = 0; i < kNumTransformSizes; ++i) {
+      if (std::max(kTransformWidth[i], kTransformHeight[i]) == 64) {
+        EXPECT_EQ(dsp->cfl_intra_predictors[i], nullptr)
+            << "index [" << i << "]";
+        for (int j = 0; j < kNumSubsamplingTypes; ++j) {
+          EXPECT_EQ(dsp->cfl_subsamplers[i][j], nullptr)
+              << "index [" << i << "][" << j << "]";
+        }
+      } else {
+        EXPECT_NE(dsp->cfl_intra_predictors[i], nullptr)
+            << "index [" << i << "]";
+        for (int j = 0; j < kNumSubsamplingTypes; ++j) {
+          EXPECT_NE(dsp->cfl_subsamplers[i][j], nullptr)
+              << "index [" << i << "][" << j << "]";
+        }
+      }
+    }
+    EXPECT_NE(dsp->intra_edge_filter, nullptr);
+    EXPECT_NE(dsp->intra_edge_upsampler, nullptr);
+    for (int i = 0; i < kNumTransform1ds; ++i) {
+      for (int j = 0; j < kNumTransform1dSizes; ++j) {
+        for (int k = 0; k < 2; ++k) {
+          if (j <= kMaxTransform1dSize[i]) {
+            EXPECT_NE(dsp->inverse_transforms[i][j][k], nullptr)
+                << "index [" << i << "][" << j << "][" << k << "]";
+          } else {
+            EXPECT_EQ(dsp->inverse_transforms[i][j][k], nullptr)
+                << "index [" << i << "][" << j << "][" << k << "]";
+          }
+        }
+      }
+    }
+    for (int i = 0; i < kNumLoopFilterSizes; ++i) {
+      for (int j = 0; j < kNumLoopFilterTypes; ++j) {
+        EXPECT_NE(dsp->loop_filters[i][j], nullptr)
+            << "index [" << i << "][" << j << "]";
+      }
+    }
+    for (int i = 0; i < 2; ++i) {
+      EXPECT_NE(dsp->loop_restorations[i], nullptr) << "index [" << i << "]";
+    }
+
+    bool super_res_coefficients_is_nonnull = LIBGAV1_ENABLE_NEON;
+#if LIBGAV1_ENABLE_SSE4_1
+    const uint32_t cpu_features = GetCpuInfo();
+    super_res_coefficients_is_nonnull = (cpu_features & kSSE4_1) != 0;
+#endif
+    if (c_only) super_res_coefficients_is_nonnull = false;
+    if (super_res_coefficients_is_nonnull) {
+      EXPECT_NE(dsp->super_res_coefficients, nullptr);
+    } else {
+      EXPECT_EQ(dsp->super_res_coefficients, nullptr);
+    }
+
+    EXPECT_NE(dsp->super_res, nullptr);
+    EXPECT_NE(dsp->cdef_direction, nullptr);
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 3; ++j) {
+        EXPECT_NE(dsp->cdef_filters[i][j], nullptr)
+            << "index [" << i << "][" << j << "]";
+      }
+    }
+    for (auto convolve_func : dsp->convolve_scale) {
+      EXPECT_NE(convolve_func, nullptr);
+    }
+    for (int j = 0; j < 2; ++j) {
+      for (int k = 0; k < 2; ++k) {
+        for (int l = 0; l < 2; ++l) {
+          for (int m = 0; m < 2; ++m) {
+            if (j == 1 && k == 1) {
+              EXPECT_EQ(dsp->convolve[j][k][l][m], nullptr);
+            } else {
+              EXPECT_NE(dsp->convolve[j][k][l][m], nullptr);
+            }
+          }
+        }
+      }
+    }
+    for (const auto& m : dsp->mask_blend) {
+      for (int i = 0; i < 2; ++i) {
+        if (i == 0 || bitdepth >= 10) {
+          EXPECT_NE(m[i], nullptr);
+        } else {
+          EXPECT_EQ(m[i], nullptr);
+        }
+      }
+    }
+    for (const auto& m : dsp->inter_intra_mask_blend_8bpp) {
+      if (bitdepth == 8) {
+        EXPECT_NE(m, nullptr);
+      } else {
+        EXPECT_EQ(m, nullptr);
+      }
+    }
+    for (int i = kBlock4x4; i < kMaxBlockSizes; ++i) {
+      const int width_index = k4x4WidthLog2[i] - 1;
+      const int height_index = k4x4HeightLog2[i] - 1;
+      // Only block sizes >= 8x8 are handled with this function.
+      if (width_index < 0 || height_index < 0) continue;
+
+      for (size_t j = 0; j < 2; ++j) {
+        EXPECT_NE(dsp->weight_mask[width_index][height_index][j], nullptr)
+            << ToString(static_cast<BlockSize>(i)) << " index [" << width_index
+            << "]"
+            << "[" << height_index << "][" << j << "]";
+      }
+    }
+
+    EXPECT_NE(dsp->average_blend, nullptr);
+    EXPECT_NE(dsp->distance_weighted_blend, nullptr);
+    for (int i = 0; i < kNumObmcDirections; ++i) {
+      EXPECT_NE(dsp->obmc_blend[i], nullptr)
+          << "index [" << ToString(static_cast<ObmcDirection>(i)) << "]";
+    }
+    EXPECT_NE(dsp->warp, nullptr);
+    EXPECT_NE(dsp->warp_compound, nullptr);
+
+    for (int i = 0; i < kNumAutoRegressionLags - 1; ++i) {
+      EXPECT_NE(dsp->film_grain.luma_auto_regression[i], nullptr)
+          << "index [" << i << "]";
+    }
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < kNumAutoRegressionLags; ++j) {
+        if (i == 0 && j == 0) {
+          EXPECT_EQ(dsp->film_grain.chroma_auto_regression[i][j], nullptr)
+              << " index [" << i << "]"
+              << "[" << j << "]";
+        } else {
+          EXPECT_NE(dsp->film_grain.chroma_auto_regression[i][j], nullptr)
+              << " index [" << i << "]"
+              << "[" << j << "]";
+        }
+      }
+      EXPECT_NE(dsp->film_grain.construct_noise_stripes[i], nullptr)
+          << "index [" << i << "]";
+      EXPECT_NE(dsp->film_grain.blend_noise_chroma[i], nullptr)
+          << "index [" << i << "]";
+    }
+    EXPECT_NE(dsp->film_grain.construct_noise_image_overlap, nullptr);
+    EXPECT_NE(dsp->film_grain.initialize_scaling_lut, nullptr);
+    EXPECT_NE(dsp->film_grain.blend_noise_luma, nullptr);
+
+    if (bitdepth == 8) {
+      EXPECT_NE(dsp->motion_field_projection_kernel, nullptr);
+      EXPECT_NE(dsp->mv_projection_compound[0], nullptr);
+      EXPECT_NE(dsp->mv_projection_compound[1], nullptr);
+      EXPECT_NE(dsp->mv_projection_compound[2], nullptr);
+      EXPECT_NE(dsp->mv_projection_single[0], nullptr);
+      EXPECT_NE(dsp->mv_projection_single[1], nullptr);
+      EXPECT_NE(dsp->mv_projection_single[2], nullptr);
+    } else {
+      EXPECT_EQ(dsp->motion_field_projection_kernel, nullptr);
+      EXPECT_EQ(dsp->mv_projection_compound[0], nullptr);
+      EXPECT_EQ(dsp->mv_projection_compound[1], nullptr);
+      EXPECT_EQ(dsp->mv_projection_compound[2], nullptr);
+      EXPECT_EQ(dsp->mv_projection_single[0], nullptr);
+      EXPECT_EQ(dsp->mv_projection_single[1], nullptr);
+      EXPECT_EQ(dsp->mv_projection_single[2], nullptr);
+    }
+  }
+}
+
+TEST(Dsp, TablesArePopulated) {
+  DspInit();
+  CheckTables(/*c_only=*/false);
+}
+
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+TEST(Dsp, TablesArePopulatedCOnly) {
+  test_utils::ResetDspTable(kBitdepth8);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  test_utils::ResetDspTable(kBitdepth10);
+#endif
+  dsp_internal::DspInit_C();
+  CheckTables(/*c_only=*/true);
+}
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+
+TEST(Dsp, GetDspTable) {
+  EXPECT_EQ(GetDspTable(1), nullptr);
+  EXPECT_NE(GetDspTable(8), nullptr);
+  EXPECT_EQ(dsp_internal::GetWritableDspTable(1), nullptr);
+  EXPECT_NE(dsp_internal::GetWritableDspTable(8), nullptr);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  EXPECT_NE(GetDspTable(10), nullptr);
+  EXPECT_NE(dsp_internal::GetWritableDspTable(10), nullptr);
+#else
+  EXPECT_EQ(GetDspTable(10), nullptr);
+  EXPECT_EQ(dsp_internal::GetWritableDspTable(10), nullptr);
+#endif
+}
+
+}  // namespace
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/film_grain.cc b/src/dsp/film_grain.cc
new file mode 100644
index 0000000..fa12b69
--- /dev/null
+++ b/src/dsp/film_grain.cc
@@ -0,0 +1,880 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/film_grain.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <new>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/film_grain_common.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/logging.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace film_grain {
+namespace {
+
+template <int bitdepth>
+void InitializeScalingLookupTable_C(int num_points, const uint8_t point_value[],
+                                    const uint8_t point_scaling[],
+                                    int16_t* scaling_lut,
+                                    const int scaling_lut_length) {
+  if (num_points == 0) {
+    memset(scaling_lut, 0, sizeof(scaling_lut[0]) * scaling_lut_length);
+    return;
+  }
+  constexpr int index_shift = bitdepth - kBitdepth8;
+  static_assert(sizeof(scaling_lut[0]) == 2, "");
+  Memset(scaling_lut, point_scaling[0],
+         std::max(static_cast<int>(point_value[0]), 1) << index_shift);
+  for (int i = 0; i < num_points - 1; ++i) {
+    const int delta_y = point_scaling[i + 1] - point_scaling[i];
+    const int delta_x = point_value[i + 1] - point_value[i];
+    const int delta = delta_y * ((65536 + (delta_x >> 1)) / delta_x);
+    for (int x = 0; x < delta_x; ++x) {
+      const int v = point_scaling[i] + ((x * delta + 32768) >> 16);
+      assert(v >= 0 && v <= UINT8_MAX);
+      const int lut_index = (point_value[i] + x) << index_shift;
+      scaling_lut[lut_index] = v;
+    }
+  }
+  const int16_t last_point_value = point_value[num_points - 1];
+  const int x_base = last_point_value << index_shift;
+  Memset(&scaling_lut[x_base], point_scaling[num_points - 1],
+         scaling_lut_length - x_base);
+  // Fill in the gaps.
+  if (bitdepth == kBitdepth10) {
+    for (int x = 4; x < x_base + 4; x += 4) {
+      const int start = scaling_lut[x - 4];
+      const int end = scaling_lut[x];
+      const int delta = end - start;
+      scaling_lut[x - 3] = start + RightShiftWithRounding(delta, 2);
+      scaling_lut[x - 2] = start + RightShiftWithRounding(2 * delta, 2);
+      scaling_lut[x - 1] = start + RightShiftWithRounding(3 * delta, 2);
+    }
+  }
+}
+
+// Section 7.18.3.5.
+template <int bitdepth>
+int ScaleLut(const int16_t* scaling_lut, int index) {
+  if (bitdepth <= kBitdepth10) {
+    assert(index < kScalingLookupTableSize << (bitdepth - 2));
+    return scaling_lut[index];
+  }
+  // Performs a piecewise linear interpolation into the scaling table.
+  const int shift = bitdepth - kBitdepth8;
+  const int quotient = index >> shift;
+  const int remainder = index - (quotient << shift);
+  assert(quotient + 1 < kScalingLookupTableSize);
+  const int start = scaling_lut[quotient];
+  const int end = scaling_lut[quotient + 1];
+  return start + RightShiftWithRounding((end - start) * remainder, shift);
+}
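+
+// As a concrete trace of the two functions above (illustrative values, not a
+// conformance vector): for 8bpp with num_points == 2, point_value == {0, 120}
+// and point_scaling == {0, 64}, the table ramps approximately linearly from
+// scaling_lut[0] == 0 to scaling_lut[119] == 63, then holds 64 from
+// scaling_lut[120] through scaling_lut[255]. ScaleLut at 8bpp or 10bpp is
+// then a direct table lookup.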
+
+// Applies an auto-regressive filter to the white noise in luma_grain.
+template <int bitdepth, typename GrainType>
+void ApplyAutoRegressiveFilterToLumaGrain_C(const FilmGrainParams& params,
+                                            void* luma_grain_buffer) {
+  auto* luma_grain = static_cast<GrainType*>(luma_grain_buffer);
+  const int grain_min = GetGrainMin<bitdepth>();
+  const int grain_max = GetGrainMax<bitdepth>();
+  const int auto_regression_coeff_lag = params.auto_regression_coeff_lag;
+  assert(auto_regression_coeff_lag > 0 && auto_regression_coeff_lag <= 3);
+  // A pictorial representation of the auto-regressive filter for various
+  // values of auto_regression_coeff_lag. The letter 'O' represents the
+  // current sample. (The filter always operates on the current sample with
+  // filter coefficient 1.) The letters 'X' represent the neighboring samples
+  // that the filter operates on.
+  //
+  // auto_regression_coeff_lag == 3:
+  //   X X X X X X X
+  //   X X X X X X X
+  //   X X X X X X X
+  //   X X X O
+  // auto_regression_coeff_lag == 2:
+  //   X X X X X
+  //   X X X X X
+  //   X X O
+  // auto_regression_coeff_lag == 1:
+  //   X X X
+  //   X O
+  // auto_regression_coeff_lag == 0:
+  //   O
+  //
+  // Note that if auto_regression_coeff_lag is 0, the filter is the identity
+  // filter and therefore can be skipped. This implementation assumes it is
+  // not called in that case.
+  const int shift = params.auto_regression_shift;
+  for (int y = kAutoRegressionBorder; y < kLumaHeight; ++y) {
+    for (int x = kAutoRegressionBorder; x < kLumaWidth - kAutoRegressionBorder;
+         ++x) {
+      int sum = 0;
+      int pos = 0;
+      int delta_row = -auto_regression_coeff_lag;
+      // The last iteration (delta_row == 0) is shorter and is handled
+      // separately.
+      do {
+        int delta_column = -auto_regression_coeff_lag;
+        do {
+          const int coeff = params.auto_regression_coeff_y[pos];
+          sum += luma_grain[(y + delta_row) * kLumaWidth + (x + delta_column)] *
+                 coeff;
+          ++pos;
+        } while (++delta_column <= auto_regression_coeff_lag);
+      } while (++delta_row < 0);
+      // Last iteration: delta_row == 0.
+      {
+        int delta_column = -auto_regression_coeff_lag;
+        do {
+          const int coeff = params.auto_regression_coeff_y[pos];
+          sum += luma_grain[y * kLumaWidth + (x + delta_column)] * coeff;
+          ++pos;
+        } while (++delta_column < 0);
+      }
+      luma_grain[y * kLumaWidth + x] = Clip3(
+          luma_grain[y * kLumaWidth + x] + RightShiftWithRounding(sum, shift),
+          grain_min, grain_max);
+    }
+  }
+}
+
+template <int bitdepth, typename GrainType, int auto_regression_coeff_lag,
+          bool use_luma>
+void ApplyAutoRegressiveFilterToChromaGrains_C(
+    const FilmGrainParams& params,
+    const void* LIBGAV1_RESTRICT luma_grain_buffer, int subsampling_x,
+    int subsampling_y, void* LIBGAV1_RESTRICT u_grain_buffer,
+    void* LIBGAV1_RESTRICT v_grain_buffer) {
+  static_assert(
+      auto_regression_coeff_lag >= 0 && auto_regression_coeff_lag <= 3,
+      "Unsupported autoregression lag for chroma.");
+  const auto* luma_grain = static_cast<const GrainType*>(luma_grain_buffer);
+  const int grain_min = GetGrainMin<bitdepth>();
+  const int grain_max = GetGrainMax<bitdepth>();
+  auto* u_grain = static_cast<GrainType*>(u_grain_buffer);
+  auto* v_grain = static_cast<GrainType*>(v_grain_buffer);
+  const int shift = params.auto_regression_shift;
+  const int chroma_height =
+      (subsampling_y == 0) ? kMaxChromaHeight : kMinChromaHeight;
+  const int chroma_width =
+      (subsampling_x == 0) ? kMaxChromaWidth : kMinChromaWidth;
+  for (int y = kAutoRegressionBorder; y < chroma_height; ++y) {
+    const int luma_y =
+        ((y - kAutoRegressionBorder) << subsampling_y) + kAutoRegressionBorder;
+    for (int x = kAutoRegressionBorder;
+         x < chroma_width - kAutoRegressionBorder; ++x) {
+      int sum_u = 0;
+      int sum_v = 0;
+      int pos = 0;
+      int delta_row = -auto_regression_coeff_lag;
+      do {
+        int delta_column = -auto_regression_coeff_lag;
+        do {
+          if (delta_row == 0 && delta_column == 0) {
+            break;
+          }
+          const int coeff_u = params.auto_regression_coeff_u[pos];
+          const int coeff_v = params.auto_regression_coeff_v[pos];
+          sum_u +=
+              u_grain[(y + delta_row) * chroma_width + (x + delta_column)] *
+              coeff_u;
+          sum_v +=
+              v_grain[(y + delta_row) * chroma_width + (x + delta_column)] *
+              coeff_v;
+          ++pos;
+        } while (++delta_column <= auto_regression_coeff_lag);
+      } while (++delta_row <= 0);
+      if (use_luma) {
+        int luma = 0;
+        const int luma_x = ((x - kAutoRegressionBorder) << subsampling_x) +
+                           kAutoRegressionBorder;
+        int i = 0;
+        do {
+          int j = 0;
+          do {
+            luma += luma_grain[(luma_y + i) * kLumaWidth + (luma_x + j)];
+          } while (++j <= subsampling_x);
+        } while (++i <= subsampling_y);
+        luma = SubsampledValue(luma, subsampling_x + subsampling_y);
+        const int coeff_u = params.auto_regression_coeff_u[pos];
+        const int coeff_v = params.auto_regression_coeff_v[pos];
+        sum_u += luma * coeff_u;
+        sum_v += luma * coeff_v;
+      }
+      u_grain[y * chroma_width + x] = Clip3(
+          u_grain[y * chroma_width + x] + RightShiftWithRounding(sum_u, shift),
+          grain_min, grain_max);
+      v_grain[y * chroma_width + x] = Clip3(
+          v_grain[y * chroma_width + x] + RightShiftWithRounding(sum_v, shift),
+          grain_min, grain_max);
+    }
+  }
+}
+
+// This implementation is for the condition overlap_flag == false.
+template <int bitdepth, typename GrainType>
+void ConstructNoiseStripes_C(const void* LIBGAV1_RESTRICT grain_buffer,
+                             int grain_seed, int width, int height,
+                             int subsampling_x, int subsampling_y,
+                             void* LIBGAV1_RESTRICT noise_stripes_buffer) {
+  auto* noise_stripes =
+      static_cast<Array2DView<GrainType>*>(noise_stripes_buffer);
+  const auto* grain = static_cast<const GrainType*>(grain_buffer);
+  const int half_width = DivideBy2(width + 1);
+  const int half_height = DivideBy2(height + 1);
+  assert(half_width > 0);
+  assert(half_height > 0);
+  static_assert(kLumaWidth == kMaxChromaWidth,
+                "kLumaWidth width should be equal to kMaxChromaWidth");
+  const int grain_width =
+      (subsampling_x == 0) ? kMaxChromaWidth : kMinChromaWidth;
+  const int plane_width = (width + subsampling_x) >> subsampling_x;
+  constexpr int kNoiseStripeHeight = 34;
+  int luma_num = 0;
+  int y = 0;
+  do {
+    GrainType* const noise_stripe = (*noise_stripes)[luma_num];
+    uint16_t seed = grain_seed;
+    seed ^= ((luma_num * 37 + 178) & 255) << 8;
+    seed ^= ((luma_num * 173 + 105) & 255);
+    int x = 0;
+    do {
+      const int rand = GetFilmGrainRandomNumber(8, &seed);
+      const int offset_x = rand >> 4;
+      const int offset_y = rand & 15;
+      const int plane_offset_x =
+          (subsampling_x != 0) ? 6 + offset_x : 9 + offset_x * 2;
+      const int plane_offset_y =
+          (subsampling_y != 0) ? 6 + offset_y : 9 + offset_y * 2;
+      int i = 0;
+      do {
+        // Section 7.18.3.5 says:
+        //   noiseStripe[ lumaNum ][ 0 ] is 34 samples high and w samples
+        //   wide (a few additional samples across are actually written to
+        //   the array, but these are never read) ...
+        //
+        // Note: The warning in the parentheses also applies to
+        // noiseStripe[ lumaNum ][ 1 ] and noiseStripe[ lumaNum ][ 2 ].
+        //
+        // Writes beyond the width of each row could happen below. To
+        // prevent those writes, we clip the number of pixels to copy against
+        // the remaining width.
+        const int copy_size =
+            std::min(kNoiseStripeHeight >> subsampling_x,
+                     plane_width - (x << (1 - subsampling_x)));
+        memcpy(&noise_stripe[i * plane_width + (x << (1 - subsampling_x))],
+               &grain[(plane_offset_y + i) * grain_width + plane_offset_x],
+               copy_size * sizeof(noise_stripe[0]));
+      } while (++i < (kNoiseStripeHeight >> subsampling_y));
+      x += 16;
+    } while (x < half_width);
+
+    ++luma_num;
+    y += 16;
+  } while (y < half_height);
+}
+
+// This implementation is for the condition overlap_flag == true.
+template <int bitdepth, typename GrainType>
+void ConstructNoiseStripesWithOverlap_C(
+    const void* LIBGAV1_RESTRICT grain_buffer, int grain_seed, int width,
+    int height, int subsampling_x, int subsampling_y,
+    void* LIBGAV1_RESTRICT noise_stripes_buffer) {
+  auto* noise_stripes =
+      static_cast<Array2DView<GrainType>*>(noise_stripes_buffer);
+  const auto* grain = static_cast<const GrainType*>(grain_buffer);
+  const int half_width = DivideBy2(width + 1);
+  const int half_height = DivideBy2(height + 1);
+  assert(half_width > 0);
+  assert(half_height > 0);
+  static_assert(kLumaWidth == kMaxChromaWidth,
+                "kLumaWidth width should be equal to kMaxChromaWidth");
+  const int grain_width =
+      (subsampling_x == 0) ? kMaxChromaWidth : kMinChromaWidth;
+  const int plane_width = (width + subsampling_x) >> subsampling_x;
+  constexpr int kNoiseStripeHeight = 34;
+  int luma_num = 0;
+  int y = 0;
+  do {
+    GrainType* const noise_stripe = (*noise_stripes)[luma_num];
+    uint16_t seed = grain_seed;
+    seed ^= ((luma_num * 37 + 178) & 255) << 8;
+    seed ^= ((luma_num * 173 + 105) & 255);
+    // Begin special iteration for x == 0.
+    const int rand = GetFilmGrainRandomNumber(8, &seed);
+    const int offset_x = rand >> 4;
+    const int offset_y = rand & 15;
+    const int plane_offset_x =
+        (subsampling_x != 0) ? 6 + offset_x : 9 + offset_x * 2;
+    const int plane_offset_y =
+        (subsampling_y != 0) ? 6 + offset_y : 9 + offset_y * 2;
+    // The overlap computation only occurs when x > 0, so it is omitted here.
+    int i = 0;
+    do {
+      const int copy_size =
+          std::min(kNoiseStripeHeight >> subsampling_x, plane_width);
+      memcpy(&noise_stripe[i * plane_width],
+             &grain[(plane_offset_y + i) * grain_width + plane_offset_x],
+             copy_size * sizeof(noise_stripe[0]));
+    } while (++i < (kNoiseStripeHeight >> subsampling_y));
+    // End special iteration for x == 0.
+    for (int x = 16; x < half_width; x += 16) {
+      const int rand = GetFilmGrainRandomNumber(8, &seed);
+      const int offset_x = rand >> 4;
+      const int offset_y = rand & 15;
+      const int plane_offset_x =
+          (subsampling_x != 0) ? 6 + offset_x : 9 + offset_x * 2;
+      const int plane_offset_y =
+          (subsampling_y != 0) ? 6 + offset_y : 9 + offset_y * 2;
+      int i = 0;
+      do {
+        int j = 0;
+        int grain_sample =
+            grain[(plane_offset_y + i) * grain_width + plane_offset_x];
+        // The first pixel(s) of each segment of the noise_stripe are subject
+        // to the "overlap" computation.
+        if (subsampling_x == 0) {
+          // Corresponds to the line in the spec:
+          // if (j < 2 && x > 0)
+          // j = 0
+          int old = noise_stripe[i * plane_width + x * 2];
+          grain_sample = old * 27 + grain_sample * 17;
+          grain_sample =
+              Clip3(RightShiftWithRounding(grain_sample, 5),
+                    GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
+          noise_stripe[i * plane_width + x * 2] = grain_sample;
+
+          // This check prevents overwriting for the iteration j = 1. The
+          // continue applies to the i-loop.
+          if (x * 2 + 1 >= plane_width) continue;
+          // j = 1
+          grain_sample =
+              grain[(plane_offset_y + i) * grain_width + plane_offset_x + 1];
+          old = noise_stripe[i * plane_width + x * 2 + 1];
+          grain_sample = old * 17 + grain_sample * 27;
+          grain_sample =
+              Clip3(RightShiftWithRounding(grain_sample, 5),
+                    GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
+          noise_stripe[i * plane_width + x * 2 + 1] = grain_sample;
+          j = 2;
+        } else {
+          // Corresponds to the line in the spec:
+          // if (j == 0 && x > 0)
+          const int old = noise_stripe[i * plane_width + x];
+          grain_sample = old * 23 + grain_sample * 22;
+          grain_sample =
+              Clip3(RightShiftWithRounding(grain_sample, 5),
+                    GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
+          noise_stripe[i * plane_width + x] = grain_sample;
+          j = 1;
+        }
+        // The following covers the rest of the loop over j as described in
+        // the spec.
+        //
+        // Section 7.18.3.5 says:
+        //   noiseStripe[ lumaNum ][ 0 ] is 34 samples high and w samples
+        //   wide (a few additional samples across are actually written to
+        //   the array, but these are never read) ...
+        //
+        // Note: The warning in the parentheses also applies to
+        // noiseStripe[ lumaNum ][ 1 ] and noiseStripe[ lumaNum ][ 2 ].
+        //
+        // Writes beyond the width of each row could happen below. To
+        // prevent those writes, we clip the number of pixels to copy against
+        // the remaining width.
+        const int copy_size =
+            std::min(kNoiseStripeHeight >> subsampling_x,
+                     plane_width - (x << (1 - subsampling_x))) -
+            j;
+        memcpy(&noise_stripe[i * plane_width + (x << (1 - subsampling_x)) + j],
+               &grain[(plane_offset_y + i) * grain_width + plane_offset_x + j],
+               copy_size * sizeof(noise_stripe[0]));
+      } while (++i < (kNoiseStripeHeight >> subsampling_y));
+    }
+
+    ++luma_num;
+    y += 16;
+  } while (y < half_height);
+}
+
+template <int bitdepth, typename GrainType>
+inline void WriteOverlapLine_C(
+    const GrainType* LIBGAV1_RESTRICT noise_stripe_row,
+    const GrainType* LIBGAV1_RESTRICT noise_stripe_row_prev, int plane_width,
+    int grain_coeff, int old_coeff,
+    GrainType* LIBGAV1_RESTRICT noise_image_row) {
+  int x = 0;
+  do {
+    int grain = noise_stripe_row[x];
+    const int old = noise_stripe_row_prev[x];
+    grain = old * old_coeff + grain * grain_coeff;
+    grain = Clip3(RightShiftWithRounding(grain, 5), GetGrainMin<bitdepth>(),
+                  GetGrainMax<bitdepth>());
+    noise_image_row[x] = grain;
+  } while (++x < plane_width);
+}
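+
+// Worked example of the blend above (illustrative values, not a conformance
+// vector): for 8bpp grain with old == 100, grain == -50, grain_coeff == 17
+// and old_coeff == 27, the blend is 100 * 27 + (-50) * 17 == 1850 and
+// Clip3(RightShiftWithRounding(1850, 5), -128, 127) == 58.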
+
+template <int bitdepth, typename GrainType>
+void ConstructNoiseImageOverlap_C(
+    const void* LIBGAV1_RESTRICT noise_stripes_buffer, int width, int height,
+    int subsampling_x, int subsampling_y,
+    void* LIBGAV1_RESTRICT noise_image_buffer) {
+  const auto* noise_stripes =
+      static_cast<const Array2DView<GrainType>*>(noise_stripes_buffer);
+  auto* noise_image = static_cast<Array2D<GrainType>*>(noise_image_buffer);
+  const int plane_width = (width + subsampling_x) >> subsampling_x;
+  const int plane_height = (height + subsampling_y) >> subsampling_y;
+  const int stripe_height = 32 >> subsampling_y;
+  const int stripe_mask = stripe_height - 1;
+  int y = stripe_height;
+  int luma_num = 1;
+  if (subsampling_y == 0) {
+    // Begin complete stripes section. This is when we are guaranteed to have
+    // two overlap rows in each stripe.
+    for (; y < (plane_height & ~stripe_mask); ++luma_num, y += stripe_height) {
+      const GrainType* noise_stripe = (*noise_stripes)[luma_num];
+      const GrainType* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+      // First overlap row.
+      WriteOverlapLine_C<bitdepth>(noise_stripe,
+                                   &noise_stripe_prev[32 * plane_width],
+                                   plane_width, 17, 27, (*noise_image)[y]);
+      // Second overlap row.
+      WriteOverlapLine_C<bitdepth>(&noise_stripe[plane_width],
+                                   &noise_stripe_prev[(32 + 1) * plane_width],
+                                   plane_width, 27, 17, (*noise_image)[y + 1]);
+    }
+    // End complete stripes section.
+
+    const int remaining_height = plane_height - y;
+    // Either one partial stripe remains (remaining_height > 0),
+    // OR image is less than one stripe high (remaining_height < 0),
+    // OR all stripes are completed (remaining_height == 0).
+    if (remaining_height <= 0) {
+      return;
+    }
+    const GrainType* noise_stripe = (*noise_stripes)[luma_num];
+    const GrainType* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+    WriteOverlapLine_C<bitdepth>(noise_stripe,
+                                 &noise_stripe_prev[32 * plane_width],
+                                 plane_width, 17, 27, (*noise_image)[y]);
+
+    // Check if second overlap row is in the image.
+    if (remaining_height > 1) {
+      WriteOverlapLine_C<bitdepth>(&noise_stripe[plane_width],
+                                   &noise_stripe_prev[(32 + 1) * plane_width],
+                                   plane_width, 27, 17, (*noise_image)[y + 1]);
+    }
+  } else {  // |subsampling_y| == 1
+    // No special checks needed for partial stripes, because if one exists,
+    // the first and only overlap row is guaranteed to exist.
+    for (; y < plane_height; ++luma_num, y += stripe_height) {
+      const GrainType* noise_stripe = (*noise_stripes)[luma_num];
+      const GrainType* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+      WriteOverlapLine_C<bitdepth>(noise_stripe,
+                                   &noise_stripe_prev[16 * plane_width],
+                                   plane_width, 22, 23, (*noise_image)[y]);
+    }
+  }
+}
+
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageLuma_C(const void* LIBGAV1_RESTRICT noise_image_ptr,
+                               int min_value, int max_luma, int scaling_shift,
+                               int width, int height, int start_height,
+                               const int16_t* scaling_lut_y,
+                               const void* source_plane_y,
+                               ptrdiff_t source_stride_y, void* dest_plane_y,
+                               ptrdiff_t dest_stride_y) {
+  const auto* noise_image =
+      static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+  const auto* in_y = static_cast<const Pixel*>(source_plane_y);
+  source_stride_y /= sizeof(Pixel);
+  auto* out_y = static_cast<Pixel*>(dest_plane_y);
+  dest_stride_y /= sizeof(Pixel);
+
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      const int orig = in_y[y * source_stride_y + x];
+      int noise = noise_image[kPlaneY][y + start_height][x];
+      noise = RightShiftWithRounding(
+          ScaleLut<bitdepth>(scaling_lut_y, orig) * noise, scaling_shift);
+      out_y[y * dest_stride_y + x] = Clip3(orig + noise, min_value, max_luma);
+    } while (++x < width);
+  } while (++y < height);
+}
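+
+// Worked example of the luma blend above (illustrative values): with
+// orig == 128, a scaling_lut_y entry of 64 at that index, noise == 40 and
+// scaling_shift == 8, the scaled noise is
+// RightShiftWithRounding(64 * 40, 8) == 10, so the output sample is
+// Clip3(128 + 10, min_value, max_luma).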
+
+// This function is for the case params_.chroma_scaling_from_luma == false.
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageChroma_C(
+    Plane plane, const FilmGrainParams& params,
+    const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value,
+    int max_chroma, int width, int height, int start_height, int subsampling_x,
+    int subsampling_y, const int16_t* scaling_lut_uv,
+    const void* source_plane_y, ptrdiff_t source_stride_y,
+    const void* source_plane_uv, ptrdiff_t source_stride_uv,
+    void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+  const auto* noise_image =
+      static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+
+  const int chroma_width = (width + subsampling_x) >> subsampling_x;
+  const int chroma_height = (height + subsampling_y) >> subsampling_y;
+
+  const auto* in_y = static_cast<const Pixel*>(source_plane_y);
+  source_stride_y /= sizeof(Pixel);
+  const auto* in_uv = static_cast<const Pixel*>(source_plane_uv);
+  source_stride_uv /= sizeof(Pixel);
+  auto* out_uv = static_cast<Pixel*>(dest_plane_uv);
+  dest_stride_uv /= sizeof(Pixel);
+
+  const int offset = (plane == kPlaneU) ? params.u_offset : params.v_offset;
+  const int luma_multiplier =
+      (plane == kPlaneU) ? params.u_luma_multiplier : params.v_luma_multiplier;
+  const int multiplier =
+      (plane == kPlaneU) ? params.u_multiplier : params.v_multiplier;
+
+  const int scaling_shift = params.chroma_scaling;
+  start_height >>= subsampling_y;
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      const int luma_x = x << subsampling_x;
+      const int luma_y = y << subsampling_y;
+      const int luma_next_x = std::min(luma_x + 1, width - 1);
+      int average_luma;
+      if (subsampling_x != 0) {
+        average_luma = RightShiftWithRounding(
+            in_y[luma_y * source_stride_y + luma_x] +
+                in_y[luma_y * source_stride_y + luma_next_x],
+            1);
+      } else {
+        average_luma = in_y[luma_y * source_stride_y + luma_x];
+      }
+      const int orig = in_uv[y * source_stride_uv + x];
+      const int combined = average_luma * luma_multiplier + orig * multiplier;
+      const int merged =
+          Clip3((combined >> 6) + LeftShift(offset, bitdepth - kBitdepth8), 0,
+                (1 << bitdepth) - 1);
+      int noise = noise_image[plane][y + start_height][x];
+      noise = RightShiftWithRounding(
+          ScaleLut<bitdepth>(scaling_lut_uv, merged) * noise, scaling_shift);
+      out_uv[y * dest_stride_uv + x] =
+          Clip3(orig + noise, min_value, max_chroma);
+    } while (++x < chroma_width);
+  } while (++y < chroma_height);
+}
+
+// This function is for the case params_.chroma_scaling_from_luma == true.
+// This further implies that scaling_lut_u == scaling_lut_v == scaling_lut_y.
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageChromaWithCfl_C(
+    Plane plane, const FilmGrainParams& params,
+    const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value,
+    int max_chroma, int width, int height, int start_height,
+    int subsampling_x, int subsampling_y, const int16_t* scaling_lut,
+    const void* source_plane_y, ptrdiff_t source_stride_y,
+    const void* source_plane_uv, ptrdiff_t source_stride_uv,
+    void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+  const auto* noise_image =
+      static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+  const auto* in_y = static_cast<const Pixel*>(source_plane_y);
+  source_stride_y /= sizeof(Pixel);
+  const auto* in_uv = static_cast<const Pixel*>(source_plane_uv);
+  source_stride_uv /= sizeof(Pixel);
+  auto* out_uv = static_cast<Pixel*>(dest_plane_uv);
+  dest_stride_uv /= sizeof(Pixel);
+
+  const int chroma_width = (width + subsampling_x) >> subsampling_x;
+  const int chroma_height = (height + subsampling_y) >> subsampling_y;
+  const int scaling_shift = params.chroma_scaling;
+  start_height >>= subsampling_y;
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      const int luma_x = x << subsampling_x;
+      const int luma_y = y << subsampling_y;
+      const int luma_next_x = std::min(luma_x + 1, width - 1);
+      int average_luma;
+      if (subsampling_x != 0) {
+        average_luma = RightShiftWithRounding(
+            in_y[luma_y * source_stride_y + luma_x] +
+                in_y[luma_y * source_stride_y + luma_next_x],
+            1);
+      } else {
+        average_luma = in_y[luma_y * source_stride_y + luma_x];
+      }
+      const int orig_uv = in_uv[y * source_stride_uv + x];
+      int noise_uv = noise_image[plane][y + start_height][x];
+      noise_uv = RightShiftWithRounding(
+          ScaleLut<bitdepth>(scaling_lut, average_luma) * noise_uv,
+          scaling_shift);
+      out_uv[y * dest_stride_uv + x] =
+          Clip3(orig_uv + noise_uv, min_value, max_chroma);
+    } while (++x < chroma_width);
+  } while (++y < chroma_height);
+}
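+
+// Worked example of the CfL path above (illustrative values): with
+// subsampling_x == 1, the two luma samples covering a chroma position are
+// averaged first, e.g. in_y values of 100 and 103 give
+// average_luma == RightShiftWithRounding(203, 1) == 102, and the noise is
+// scaled by the luma-derived scaling_lut entry at index 102.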
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  // LumaAutoRegressionFunc
+  dsp->film_grain.luma_auto_regression[0] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth8, int8_t>;
+  dsp->film_grain.luma_auto_regression[1] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth8, int8_t>;
+  dsp->film_grain.luma_auto_regression[2] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth8, int8_t>;
+
+  // ChromaAutoRegressionFunc
+  // Chroma autoregression should never be called when lag is 0 and use_luma
+  // is false.
+  dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+  dsp->film_grain.chroma_auto_regression[0][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 1, false>;
+  dsp->film_grain.chroma_auto_regression[0][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 2, false>;
+  dsp->film_grain.chroma_auto_regression[0][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 3, false>;
+  dsp->film_grain.chroma_auto_regression[1][0] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 0, true>;
+  dsp->film_grain.chroma_auto_regression[1][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 1, true>;
+  dsp->film_grain.chroma_auto_regression[1][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 2, true>;
+  dsp->film_grain.chroma_auto_regression[1][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 3, true>;
+
+  // ConstructNoiseStripesFunc
+  dsp->film_grain.construct_noise_stripes[0] =
+      ConstructNoiseStripes_C<kBitdepth8, int8_t>;
+  dsp->film_grain.construct_noise_stripes[1] =
+      ConstructNoiseStripesWithOverlap_C<kBitdepth8, int8_t>;
+
+  // ConstructNoiseImageOverlapFunc
+  dsp->film_grain.construct_noise_image_overlap =
+      ConstructNoiseImageOverlap_C<kBitdepth8, int8_t>;
+
+  // InitializeScalingLutFunc
+  dsp->film_grain.initialize_scaling_lut =
+      InitializeScalingLookupTable_C<kBitdepth8>;
+
+  // BlendNoiseWithImageLumaFunc
+  dsp->film_grain.blend_noise_luma =
+      BlendNoiseWithImageLuma_C<kBitdepth8, int8_t, uint8_t>;
+
+  // BlendNoiseWithImageChromaFunc
+  dsp->film_grain.blend_noise_chroma[0] =
+      BlendNoiseWithImageChroma_C<kBitdepth8, int8_t, uint8_t>;
+  dsp->film_grain.blend_noise_chroma[1] =
+      BlendNoiseWithImageChromaWithCfl_C<kBitdepth8, int8_t, uint8_t>;
+#else   // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainAutoregressionLuma
+  dsp->film_grain.luma_auto_regression[0] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth8, int8_t>;
+  dsp->film_grain.luma_auto_regression[1] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth8, int8_t>;
+  dsp->film_grain.luma_auto_regression[2] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth8, int8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainAutoregressionChroma
+  // Chroma autoregression should never be called when lag is 0 and use_luma
+  // is false.
+  dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+  dsp->film_grain.chroma_auto_regression[0][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 1, false>;
+  dsp->film_grain.chroma_auto_regression[0][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 2, false>;
+  dsp->film_grain.chroma_auto_regression[0][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 3, false>;
+  dsp->film_grain.chroma_auto_regression[1][0] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 0, true>;
+  dsp->film_grain.chroma_auto_regression[1][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 1, true>;
+  dsp->film_grain.chroma_auto_regression[1][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 2, true>;
+  dsp->film_grain.chroma_auto_regression[1][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth8, int8_t, 3, true>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainConstructNoiseStripes
+  dsp->film_grain.construct_noise_stripes[0] =
+      ConstructNoiseStripes_C<kBitdepth8, int8_t>;
+  dsp->film_grain.construct_noise_stripes[1] =
+      ConstructNoiseStripesWithOverlap_C<kBitdepth8, int8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainConstructNoiseImageOverlap
+  dsp->film_grain.construct_noise_image_overlap =
+      ConstructNoiseImageOverlap_C<kBitdepth8, int8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainInitializeScalingLutFunc
+  dsp->film_grain.initialize_scaling_lut =
+      InitializeScalingLookupTable_C<kBitdepth8>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseLuma
+  dsp->film_grain.blend_noise_luma =
+      BlendNoiseWithImageLuma_C<kBitdepth8, int8_t, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChroma
+  dsp->film_grain.blend_noise_chroma[0] =
+      BlendNoiseWithImageChroma_C<kBitdepth8, int8_t, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChromaWithCfl
+  dsp->film_grain.blend_noise_chroma[1] =
+      BlendNoiseWithImageChromaWithCfl_C<kBitdepth8, int8_t, uint8_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+
+  // LumaAutoRegressionFunc
+  dsp->film_grain.luma_auto_regression[0] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth10, int16_t>;
+  dsp->film_grain.luma_auto_regression[1] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth10, int16_t>;
+  dsp->film_grain.luma_auto_regression[2] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth10, int16_t>;
+
+  // ChromaAutoRegressionFunc
+  // Chroma autoregression should never be called when lag is 0 and use_luma
+  // is false.
+  dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+  dsp->film_grain.chroma_auto_regression[0][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 1, false>;
+  dsp->film_grain.chroma_auto_regression[0][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 2, false>;
+  dsp->film_grain.chroma_auto_regression[0][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 3, false>;
+  dsp->film_grain.chroma_auto_regression[1][0] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 0, true>;
+  dsp->film_grain.chroma_auto_regression[1][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 1, true>;
+  dsp->film_grain.chroma_auto_regression[1][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 2, true>;
+  dsp->film_grain.chroma_auto_regression[1][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 3, true>;
+
+  // ConstructNoiseStripesFunc
+  dsp->film_grain.construct_noise_stripes[0] =
+      ConstructNoiseStripes_C<kBitdepth10, int16_t>;
+  dsp->film_grain.construct_noise_stripes[1] =
+      ConstructNoiseStripesWithOverlap_C<kBitdepth10, int16_t>;
+
+  // ConstructNoiseImageOverlapFunc
+  dsp->film_grain.construct_noise_image_overlap =
+      ConstructNoiseImageOverlap_C<kBitdepth10, int16_t>;
+
+  // InitializeScalingLutFunc
+  dsp->film_grain.initialize_scaling_lut =
+      InitializeScalingLookupTable_C<kBitdepth10>;
+
+  // BlendNoiseWithImageLumaFunc
+  dsp->film_grain.blend_noise_luma =
+      BlendNoiseWithImageLuma_C<kBitdepth10, int16_t, uint16_t>;
+
+  // BlendNoiseWithImageChromaFunc
+  dsp->film_grain.blend_noise_chroma[0] =
+      BlendNoiseWithImageChroma_C<kBitdepth10, int16_t, uint16_t>;
+  dsp->film_grain.blend_noise_chroma[1] =
+      BlendNoiseWithImageChromaWithCfl_C<kBitdepth10, int16_t, uint16_t>;
+#else   // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainAutoregressionLuma
+  dsp->film_grain.luma_auto_regression[0] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth10, int16_t>;
+  dsp->film_grain.luma_auto_regression[1] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth10, int16_t>;
+  dsp->film_grain.luma_auto_regression[2] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<kBitdepth10, int16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainAutoregressionChroma
+  // Chroma autoregression should never be called when lag is 0 and use_luma
+  // is false.
+  dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+  dsp->film_grain.chroma_auto_regression[0][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 1, false>;
+  dsp->film_grain.chroma_auto_regression[0][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 2, false>;
+  dsp->film_grain.chroma_auto_regression[0][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 3, false>;
+  dsp->film_grain.chroma_auto_regression[1][0] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 0, true>;
+  dsp->film_grain.chroma_auto_regression[1][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 1, true>;
+  dsp->film_grain.chroma_auto_regression[1][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 2, true>;
+  dsp->film_grain.chroma_auto_regression[1][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<kBitdepth10, int16_t, 3, true>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainConstructNoiseStripes
+  dsp->film_grain.construct_noise_stripes[0] =
+      ConstructNoiseStripes_C<kBitdepth10, int16_t>;
+  dsp->film_grain.construct_noise_stripes[1] =
+      ConstructNoiseStripesWithOverlap_C<kBitdepth10, int16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainConstructNoiseImageOverlap
+  dsp->film_grain.construct_noise_image_overlap =
+      ConstructNoiseImageOverlap_C<kBitdepth10, int16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainInitializeScalingLutFunc
+  dsp->film_grain.initialize_scaling_lut =
+      InitializeScalingLookupTable_C<kBitdepth10>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseLuma
+  dsp->film_grain.blend_noise_luma =
+      BlendNoiseWithImageLuma_C<kBitdepth10, int16_t, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseChroma
+  dsp->film_grain.blend_noise_chroma[0] =
+      BlendNoiseWithImageChroma_C<kBitdepth10, int16_t, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseChromaWithCfl
+  dsp->film_grain.blend_noise_chroma[1] =
+      BlendNoiseWithImageChromaWithCfl_C<kBitdepth10, int16_t, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+}  // namespace
+}  // namespace film_grain
+
+void FilmGrainInit_C() {
+  film_grain::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  film_grain::Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/film_grain.h b/src/dsp/film_grain.h
new file mode 100644
index 0000000..f75a354
--- /dev/null
+++ b/src/dsp/film_grain.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_FILM_GRAIN_H_
+#define LIBGAV1_SRC_DSP_FILM_GRAIN_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/film_grain_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/film_grain_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
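+
+// For illustration, a typical bring-up sequence (the call site lives in the
+// decoder, not in this header):
+//   libgav1::dsp::DspInit();  // registers the C and any SIMD variants
+//   const libgav1::dsp::Dsp* dsp = libgav1::dsp::GetDspTable(8);
+//   // dsp->film_grain.* entries are now non-null and callable.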
+
+// Initializes Dsp::film_grain. This function is not thread-safe.
+void FilmGrainInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_FILM_GRAIN_H_
diff --git a/src/dsp/film_grain_common.h b/src/dsp/film_grain_common.h
new file mode 100644
index 0000000..2e6ad45
--- /dev/null
+++ b/src/dsp/film_grain_common.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_FILM_GRAIN_COMMON_H_
+#define LIBGAV1_SRC_DSP_FILM_GRAIN_COMMON_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <type_traits>
+
+#include "src/dsp/common.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+
+template <int bitdepth>
+int GetGrainMax() {
+  return (1 << (bitdepth - 1)) - 1;
+}
+
+template <int bitdepth>
+int GetGrainMin() {
+  return -(1 << (bitdepth - 1));
+}
+
+inline int GetFilmGrainRandomNumber(int bits, uint16_t* seed) {
+  uint16_t s = *seed;
+  uint16_t bit = (s ^ (s >> 1) ^ (s >> 3) ^ (s >> 12)) & 1;
+  s = (s >> 1) | (bit << 15);
+  *seed = s;
+  return s >> (16 - bits);
+}
+
+enum {
+  kAutoRegressionBorder = 3,
+  // The width of the luma noise array.
+  kLumaWidth = 82,
+  // The height of the luma noise array.
+  kLumaHeight = 73,
+  // The two possible widths of the chroma noise array.
+  kMinChromaWidth = 44,
+  kMaxChromaWidth = 82,
+  // The two possible heights of the chroma noise array.
+  kMinChromaHeight = 38,
+  kMaxChromaHeight = 73,
+  // The standard scaling lookup table maps bytes to bytes, so only uses 256
+  // elements, plus one for overflow in 12bpp lookups. The size is scaled up
+  // for 10bpp.
+  kScalingLookupTableSize = 257,
+  // Padding is added to the scaling lookup table to permit overwrites by
+  // InitializeScalingLookupTable_NEON.
+  kScalingLookupTablePadding = 6,
+  // Padding is added to each row of the noise image to permit overreads by
+  // BlendNoiseWithImageLuma_NEON and overwrites by WriteOverlapLine8bpp_NEON.
+  kNoiseImagePadding = 15,
+  // Padding is added to the end of the |noise_stripes_| buffer to permit
+  // overreads by WriteOverlapLine8bpp_NEON.
+  kNoiseStripePadding = 7,
+};  // anonymous enum
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_FILM_GRAIN_COMMON_H_
diff --git a/src/dsp/intra_edge.cc b/src/dsp/intra_edge.cc
new file mode 100644
index 0000000..fe66db2
--- /dev/null
+++ b/src/dsp/intra_edge.cc
@@ -0,0 +1,115 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intra_edge.h"
+
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kKernelTaps = 5;
+constexpr int kKernels[3][kKernelTaps] = {
+    {0, 4, 8, 4, 0}, {0, 5, 6, 5, 0}, {2, 4, 4, 4, 2}};
+constexpr int kMaxUpsampleSize = 16;
+
+template <typename Pixel>
+void IntraEdgeFilter_C(void* buffer, int size, int strength) {
+  assert(strength > 0);
+  Pixel edge[129];
+  memcpy(edge, buffer, sizeof(edge[0]) * size);
+  auto* const dst_buffer = static_cast<Pixel*>(buffer);
+  const int kernel_index = strength - 1;
+  for (int i = 1; i < size; ++i) {
+    int sum = 0;
+    for (int j = 0; j < kKernelTaps; ++j) {
+      const int k = Clip3(i + j - 2, 0, size - 1);
+      sum += kKernels[kernel_index][j] * edge[k];
+    }
+    dst_buffer[i] = RightShiftWithRounding(sum, 4);
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void IntraEdgeUpsampler_C(void* buffer, int size) {
+  assert(size % 4 == 0 && size <= kMaxUpsampleSize);
+  auto* const pixel_buffer = static_cast<Pixel*>(buffer);
+  Pixel temp[kMaxUpsampleSize + 3];
+  temp[0] = temp[1] = pixel_buffer[-1];
+  memcpy(temp + 2, pixel_buffer, sizeof(temp[0]) * size);
+  temp[size + 2] = pixel_buffer[size - 1];
+
+  pixel_buffer[-2] = temp[0];
+  for (int i = 0; i < size; ++i) {
+    const int sum =
+        -temp[i] + (9 * temp[i + 1]) + (9 * temp[i + 2]) - temp[i + 3];
+    pixel_buffer[2 * i - 1] =
+        Clip3(RightShiftWithRounding(sum, 4), 0, (1 << bitdepth) - 1);
+    pixel_buffer[2 * i] = temp[i + 2];
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->intra_edge_filter = IntraEdgeFilter_C<uint8_t>;
+  dsp->intra_edge_upsampler = IntraEdgeUpsampler_C<8, uint8_t>;
+#else   // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_IntraEdgeFilter
+  dsp->intra_edge_filter = IntraEdgeFilter_C<uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_IntraEdgeUpsampler
+  dsp->intra_edge_upsampler = IntraEdgeUpsampler_C<8, uint8_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->intra_edge_filter = IntraEdgeFilter_C<uint16_t>;
+  dsp->intra_edge_upsampler = IntraEdgeUpsampler_C<10, uint16_t>;
+#else   // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_IntraEdgeFilter
+  dsp->intra_edge_filter = IntraEdgeFilter_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_IntraEdgeUpsampler
+  dsp->intra_edge_upsampler = IntraEdgeUpsampler_C<10, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
+}  // namespace
+
+void IntraEdgeInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/intra_edge.h b/src/dsp/intra_edge.h
new file mode 100644
index 0000000..172ecbb
--- /dev/null
+++ b/src/dsp/intra_edge.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_INTRA_EDGE_H_
+#define LIBGAV1_SRC_DSP_INTRA_EDGE_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/intra_edge_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/intra_edge_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_edge_filter and Dsp::intra_edge_upsampler. This
+// function is not thread-safe.
+void IntraEdgeInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_INTRA_EDGE_H_
diff --git a/src/dsp/intra_edge_test.cc b/src/dsp/intra_edge_test.cc
new file mode 100644
index 0000000..aca6f9e
--- /dev/null
+++ b/src/dsp/intra_edge_test.cc
@@ -0,0 +1,518 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intra_edge.h"
+
+#include <cstdint>
+#include <cstring>
+#include <ostream>
+
+#include "absl/strings/match.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+const char kIntraEdge[] = "IntraEdge";
+const char kIntraEdgeFilterName[] = "Intra Edge Filter";
+const char kIntraEdgeUpsamplerName[] = "Intra Edge Upsampler";
+
+constexpr int kIntraEdgeBufferSize = 144;  // see Tile::IntraPrediction.
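+
+// As a quick orientation for the fixed inputs and digests below: with
+// strength 1 the filter kernel is {0, 4, 8, 4, 0}, so an interior sample is
+// smoothed as (4 * e[i - 1] + 8 * e[i] + 4 * e[i + 1] + 8) >> 4; e.g.
+// neighbors 100, 120, 200 produce (400 + 960 + 800 + 8) >> 4 == 135. This is
+// an illustrative trace of IntraEdgeFilter_C, not a vector from this file.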
+constexpr int kIntraEdgeFilterTestMaxSize = 129;
+constexpr int kIntraEdgeFilterTestFixedInput[kIntraEdgeFilterTestMaxSize] = {
+    159, 208, 54,  136, 205, 124, 125, 165, 164, 63,  171, 143, 210, 236, 253,
+    233, 139, 113, 66,  211, 133, 61,  91,  123, 187, 76,  110, 172, 61,  103,
+    239, 147, 247, 120, 18,  106, 180, 159, 208, 54,  136, 205, 124, 125, 165,
+    164, 63,  171, 143, 210, 236, 253, 233, 139, 113, 66,  211, 133, 61,  91,
+    123, 187, 76,  110, 172, 61,  103, 239, 147, 247, 120, 18,  106, 180, 159,
+    208, 54,  136, 205, 124, 125, 165, 164, 63,  171, 143, 210, 236, 253, 233,
+    139, 113, 66,  211, 133, 61,  91,  123, 187, 76,  110, 172, 61,  103, 239,
+    147, 247, 120, 18,  106, 180, 159, 208, 54,  136, 205, 124, 125, 165, 164,
+    63,  171, 143, 210, 236, 253, 233, 139, 113,
+};
+constexpr int kIntraEdgeUpsamplerTestFixedInput[] = {
+    208, 54,  136, 205, 124, 125, 165, 164, 63,
+    171, 143, 210, 236, 208, 54,  136, 205};
+
+struct EdgeFilterParams {
+  int size;
+  int strength;
+};
+
+std::ostream& operator<<(std::ostream& os, const EdgeFilterParams& param) {
+  return os << "size: " << param.size << ", strength: " << param.strength;
+}
+
+// Each size is paired with strength 1, 2, and 3.
+// In general, the size is expressible as 2^n+1, but all sizes up to 129 are
+// permissible.
+constexpr EdgeFilterParams kIntraEdgeFilterParamList[] = {
+    {1, 1},  {1, 2},  {1, 3},  {2, 1},   {2, 2},   {2, 3},   {5, 1},  {5, 2},
+    {5, 3},  {9, 1},  {9, 2},  {9, 3},   {17, 1},  {17, 2},  {17, 3}, {33, 1},
+    {33, 2}, {33, 3}, {50, 1}, {50, 2},  {50, 3},  {55, 1},  {55, 2}, {55, 3},
+    {65, 1}, {65, 2}, {65, 3}, {129, 1}, {129, 2}, {129, 3}};
+
+template <int bitdepth, typename Pixel>
+class IntraEdgeFilterTest : public testing::TestWithParam<EdgeFilterParams> {
+ public:
+  IntraEdgeFilterTest() = default;
+  IntraEdgeFilterTest(const IntraEdgeFilterTest&) = delete;
+  IntraEdgeFilterTest& operator=(const IntraEdgeFilterTest&) = delete;
+  ~IntraEdgeFilterTest() override = default;
+
+ protected:
+  void SetUp() override {
+    test_utils::ResetDspTable(bitdepth);
+    IntraEdgeInit_C();
+
+    const Dsp* const dsp = GetDspTable(bitdepth);
+    ASSERT_NE(dsp, nullptr);
+    base_intra_edge_filter_ = dsp->intra_edge_filter;
+
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const absl::string_view test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+      base_intra_edge_filter_ = nullptr;
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      if ((GetCpuInfo() & kSSE4_1) != 0) {
+        IntraEdgeInit_SSE4_1();
+      }
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+      IntraEdgeInit_NEON();
+    } else {
+      FAIL() << "Unrecognized architecture prefix in test case name: "
+             << test_case;
+    }
+
+#if LIBGAV1_MSAN
+    // Match the behavior of Tile::IntraPrediction to prevent warnings due to
+    // assembly code (safely) overreading to fill a register.
+#if LIBGAV1_MSAN
+    // Match the behavior of Tile::IntraPrediction to prevent warnings due to
+    // assembly code (safely) overreading to fill a register.
+    memset(buffer_, 0, sizeof(buffer_));
+#endif  // LIBGAV1_MSAN
+    cur_intra_edge_filter_ = dsp->intra_edge_filter;
+  }
+
+  void TestFixedValues(const char* digest);
+  void TestRandomValues(int num_runs);
+
+  Pixel buffer_[kIntraEdgeBufferSize];
+  Pixel base_buffer_[kIntraEdgeBufferSize];
+  int strength_ = GetParam().strength;
+  int size_ = GetParam().size;
+
+  IntraEdgeFilterFunc base_intra_edge_filter_;
+  IntraEdgeFilterFunc cur_intra_edge_filter_;
+};
+
+template <int bitdepth, typename Pixel>
+void IntraEdgeFilterTest<bitdepth, Pixel>::TestFixedValues(
+    const char* const digest) {
+  if (cur_intra_edge_filter_ == nullptr) return;
+  for (int i = 0; i < kIntraEdgeFilterTestMaxSize; ++i) {
+    buffer_[i] = kIntraEdgeFilterTestFixedInput[i];
+  }
+  const absl::Time start = absl::Now();
+  cur_intra_edge_filter_(buffer_, size_, strength_);
+  const absl::Duration elapsed_time = absl::Now() - start;
+  test_utils::CheckMd5Digest(kIntraEdge, kIntraEdgeFilterName, digest, buffer_,
+                             kIntraEdgeFilterTestMaxSize * sizeof(buffer_[0]),
+                             elapsed_time);
+}
+
+template <int bitdepth, typename Pixel>
+void IntraEdgeFilterTest<bitdepth, Pixel>::TestRandomValues(int num_runs) {
+  if (base_intra_edge_filter_ == nullptr) return;
+  if (cur_intra_edge_filter_ == nullptr) return;
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+  absl::Duration elapsed_time;
+  absl::Duration base_elapsed_time;
+  memset(base_buffer_, 0, sizeof(base_buffer_));
+  memset(buffer_, 0, sizeof(buffer_));
+  for (int num_tests = 0; num_tests < num_runs; ++num_tests) {
+    for (int i = 0; i < size_; ++i) {
+      const Pixel val = rnd(1 << bitdepth);
+      buffer_[i] = val;
+      base_buffer_[i] = val;
+    }
+    const absl::Time base_start = absl::Now();
+    base_intra_edge_filter_(base_buffer_, size_, strength_);
+    base_elapsed_time += absl::Now() - base_start;
+    const absl::Time start = absl::Now();
+    cur_intra_edge_filter_(buffer_, size_, strength_);
+    elapsed_time += absl::Now() - start;
+  }
+  if (num_runs > 1) {
+    printf("Mode %s[%31s] Size %3d Strength %d C: %5d us SIMD: %5d us %2.2fx\n",
+           kIntraEdge, kIntraEdgeFilterName, size_, strength_,
+           static_cast<int>(absl::ToInt64Microseconds(base_elapsed_time)),
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)),
+           absl::ToDoubleMicroseconds(base_elapsed_time) /
+               absl::ToDoubleMicroseconds(elapsed_time));
+  } else {
+    printf("Mode %s[%31s] Size %3d Strength %d\n", kIntraEdge,
+           kIntraEdgeFilterName, size_, strength_);
+  }
+  for (int i = 0; i < kIntraEdgeFilterTestMaxSize; ++i) {
+    EXPECT_EQ(buffer_[i], base_buffer_[i]) << "Mismatch in index: " << i;
+  }
+}
+
+using IntraEdgeFilterTest8bpp = IntraEdgeFilterTest<8, uint8_t>;
+
+const char* GetIntraEdgeFilterDigest8bpp(int strength, int size) {
+  static const char* const kDigestsSize1[3] = {
+      "f7f681cf7047602fafc7fb416ecf46e1", "f7f681cf7047602fafc7fb416ecf46e1",
+      "f7f681cf7047602fafc7fb416ecf46e1"};
+  static const char* const kDigestsSize2[3] = {
+      "cb24cc54900fb75d767f3de797451e43", "380c80c89e1e8cda81ee0d3d4b29b8b7",
+      "a7eb3dba95ff35c2df45a274afbc9772"};
+  static const char* const kDigestsSize5[3] = {
+      "23380cb37688d4c3a8f70a276be65eed", "ec1e23d5b996a527ed3d45c0d552bf22",
+      "d313523d3b7646fdbb873c61ffe7a51a"};
+  static const char* const kDigestsSize9[3] = {
+      "e79597e9d62893754fc77d80ca86329a", "f7644e9748984914100e7031c6432272",
+      "bdf4f16734c86338716fb436c196ecc6"};
+  static const char* const kDigestsSize17[3] = {
+      "13ad15c833e850348eecb9fea4f3cadb", "e5988a72391250c702a8192893df40dd",
+      "8f68603598638fa33203fe1233d273b1"};
+  static const char* const kDigestsSize33[3] = {
+      "51156da8f4d527e0c011040769987dbd",
"eff17eaf73a7bb7fd4c921510ade9f67", + "aca87680e0649d0728091c92c6de8871"}; + static const char* const kDigestsSize50[3] = { + "87c1d43751125f1ea4987517a90d378d", "942a9d056231683bdfc52346b6b032c2", + "16a9148daf0e5f69808b9f0caa1ef110"}; + static const char* const kDigestsSize55[3] = { + "833480d74957fb0356dec5b09412eefa", "a307ef31f10affc3b7fb262d05f1b80a", + "0318b2fde088c472215fe155f3b48d36"}; + static const char* const kDigestsSize65[3] = { + "5000dada34ed2e6692bb44a4398ddf53", "8da6c776d897064ecd4a1e84aae92dd3", + "d7c71db339c28d33119974987b2f9d85"}; + static const char* const kDigestsSize129[3] = { + "bf174d8b45b8131404fd4a4686f8c117", "e81518d6d85eed2f1b18c59424561d6b", + "7306715602b0f5536771724a2f0a39bc"}; + + switch (size) { + case 1: + return kDigestsSize1[strength - 1]; + case 2: + return kDigestsSize2[strength - 1]; + case 5: + return kDigestsSize5[strength - 1]; + case 9: + return kDigestsSize9[strength - 1]; + case 17: + return kDigestsSize17[strength - 1]; + case 33: + return kDigestsSize33[strength - 1]; + case 50: + return kDigestsSize50[strength - 1]; + case 55: + return kDigestsSize55[strength - 1]; + case 65: + return kDigestsSize65[strength - 1]; + case 129: + return kDigestsSize129[strength - 1]; + default: + ADD_FAILURE() << "Unknown edge size: " << size; + return nullptr; + } +} + +TEST_P(IntraEdgeFilterTest8bpp, Correctness) { + TestFixedValues(GetIntraEdgeFilterDigest8bpp(strength_, size_)); + TestRandomValues(1); +} + +TEST_P(IntraEdgeFilterTest8bpp, DISABLED_Speed) { TestRandomValues(1e7); } + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using IntraEdgeFilterTest10bpp = IntraEdgeFilterTest<10, uint16_t>; + +const char* GetIntraEdgeFilterDigest10bpp(int strength, int size) { + static const char* const kDigestsSize1[3] = { + "2d2088560e3ccb5b809c97f5299bb1c0", "2d2088560e3ccb5b809c97f5299bb1c0", + "2d2088560e3ccb5b809c97f5299bb1c0"}; + static const char* const kDigestsSize2[3] = { + "db3e785852e98fba18a1fb531f68466c", "8caea330489bc6ed0f99fbf769f53181", + "bcdd1b21f3baf5f6f29caea9ef93fb0c"}; + static const char* const kDigestsSize5[3] = { + "326f4193a62f5a959b86d95f5204608e", "4673e453203f75eae97ef44f43f098f2", + "48d516b06313683aca30e975ce6a3cad"}; + static const char* const kDigestsSize9[3] = { + "79217575a32e36a51d9dd40621af9c2d", "ccec1c16bc09b28ad6513c5e4c48b6d2", + "bb61aa9c5fa720c667a053769e7b7d08"}; + static const char* const kDigestsSize17[3] = { + "46d90e99ba46e89326a5fa547bcd9361", "824aee8950aecb356d5f4a91dbc90a7d", + "37d44d10a2545385af1da55f8c08564f"}; + static const char* const kDigestsSize33[3] = { + "c95108e06eb2aef61ecb6839af306edd", "832c695460b4dd2b85c5f8726e4470d1", + "994902f549eefd83fbcbf7ecb7dc5cca"}; + static const char* const kDigestsSize50[3] = { + "48119ef1436c3a4fe69d275bbaafedf8", "72c221c91c3df0a324ccbc9acea35f89", + "84e40aadcc416ef3f51cea3cc23b30c7"}; + static const char* const kDigestsSize55[3] = { + "6b68e4e0b00c4eb38a6d0d83c0f34658", "43a919f928a80379df5c9e07c9d8000d", + "7c320d55b11f93185b811bdaa379f2db"}; + static const char* const kDigestsSize65[3] = { + "c28de89cf9f3bc5a904647ab2c64caf7", "7ce63b1b28dce0624fc7586e8fb3ab8f", + "d06e6b88585f7f1a1f6af5bb59ee2180"}; + static const char* const kDigestsSize129[3] = { + "79160902c5c85004382d5ffa549b43cc", "3b0df95c3ca7b0b559b79234cf434738", + "500786d8561effec283d4f3d13886f8c"}; + + switch (size) { + case 1: + return kDigestsSize1[strength - 1]; + case 2: + return kDigestsSize2[strength - 1]; + case 5: + return kDigestsSize5[strength - 1]; + case 9: + return kDigestsSize9[strength - 1]; + case 
17:
+      return kDigestsSize17[strength - 1];
+    case 33:
+      return kDigestsSize33[strength - 1];
+    case 50:
+      return kDigestsSize50[strength - 1];
+    case 55:
+      return kDigestsSize55[strength - 1];
+    case 65:
+      return kDigestsSize65[strength - 1];
+    case 129:
+      return kDigestsSize129[strength - 1];
+    default:
+      ADD_FAILURE() << "Unknown edge size: " << size;
+      return nullptr;
+  }
+}
+
+TEST_P(IntraEdgeFilterTest10bpp, FixedInput) {
+  TestFixedValues(GetIntraEdgeFilterDigest10bpp(strength_, size_));
+  TestRandomValues(1);
+}
+
+TEST_P(IntraEdgeFilterTest10bpp, DISABLED_Speed) { TestRandomValues(1e7); }
+#endif
+
+template <int bitdepth, typename Pixel>
+class IntraEdgeUpsamplerTest : public testing::TestWithParam<int> {
+ public:
+  IntraEdgeUpsamplerTest() = default;
+  IntraEdgeUpsamplerTest(const IntraEdgeUpsamplerTest&) = delete;
+  IntraEdgeUpsamplerTest& operator=(const IntraEdgeUpsamplerTest&) = delete;
+  ~IntraEdgeUpsamplerTest() override = default;
+
+ protected:
+  void SetUp() override {
+    test_utils::ResetDspTable(bitdepth);
+    IntraEdgeInit_C();
+
+    const Dsp* const dsp = GetDspTable(bitdepth);
+    ASSERT_NE(dsp, nullptr);
+    base_intra_edge_upsampler_ = dsp->intra_edge_upsampler;
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const absl::string_view test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+      base_intra_edge_upsampler_ = nullptr;
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      if ((GetCpuInfo() & kSSE4_1) != 0) {
+        IntraEdgeInit_SSE4_1();
+      }
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+      IntraEdgeInit_NEON();
+    } else {
+      FAIL() << "Unrecognized architecture prefix in test case name: "
+             << test_case;
+    }
+    cur_intra_edge_upsampler_ = dsp->intra_edge_upsampler;
+#if LIBGAV1_MSAN
+    // Match the behavior of Tile::IntraPrediction to prevent warnings due to
+    // assembly code (safely) overreading to fill a register.
+    memset(buffer_, 0, sizeof(buffer_));
+#endif
+  }
+
+  void TestFixedValues(const char* digest);
+  void TestRandomValues(int num_runs);
+
+  Pixel buffer_[128];
+  Pixel base_buffer_[128];
+  int size_ = GetParam();
+
+  IntraEdgeUpsamplerFunc base_intra_edge_upsampler_;
+  IntraEdgeUpsamplerFunc cur_intra_edge_upsampler_;
+};
+
+template <int bitdepth, typename Pixel>
+void IntraEdgeUpsamplerTest<bitdepth, Pixel>::TestFixedValues(
+    const char* const digest) {
+  if (cur_intra_edge_upsampler_ == nullptr) return;
+  buffer_[0] = 0;
+  for (int i = 0; i < size_ + 1; ++i) {
+    buffer_[i + 1] = kIntraEdgeUpsamplerTestFixedInput[i];
+  }
+  const absl::Time start = absl::Now();
+  cur_intra_edge_upsampler_(buffer_ + 2, size_);
+  const absl::Duration elapsed_time = absl::Now() - start;
+  test_utils::CheckMd5Digest(kIntraEdge, kIntraEdgeUpsamplerName, digest,
+                             buffer_, (size_ * 2 + 1) * sizeof(buffer_[0]),
+                             elapsed_time);
+}
+
+template <int bitdepth, typename Pixel>
+void IntraEdgeUpsamplerTest<bitdepth, Pixel>::TestRandomValues(int num_runs) {
+  if (base_intra_edge_upsampler_ == nullptr) return;
+  if (cur_intra_edge_upsampler_ == nullptr) return;
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+  absl::Duration base_elapsed_time;
+  absl::Duration elapsed_time;
+  for (int num_tests = 0; num_tests < num_runs; ++num_tests) {
+    // Populate what will be buffer[-2..size] when passed to the upsample
+    // function.
+    buffer_[0] = 0;
+    base_buffer_[0] = 0;
+    for (int i = 1; i < size_ + 2; ++i) {
+      const Pixel val = rnd(1 << bitdepth);
+      buffer_[i] = val;
+      base_buffer_[i] = val;
+    }
+    const absl::Time base_start = absl::Now();
+    base_intra_edge_upsampler_(base_buffer_ + 2, size_);
+    base_elapsed_time += absl::Now() - base_start;
+    const absl::Time start = absl::Now();
+    cur_intra_edge_upsampler_(buffer_ + 2, size_);
+    elapsed_time += absl::Now() - start;
+  }
+  if (num_runs > 1) {
+    printf("Mode %s[%31s] size %d C: %5d us SIMD: %5d us %2.2fx\n", kIntraEdge,
+           kIntraEdgeUpsamplerName, size_,
+           static_cast<int>(absl::ToInt64Microseconds(base_elapsed_time)),
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)),
+           absl::ToDoubleMicroseconds(base_elapsed_time) /
+               absl::ToDoubleMicroseconds(elapsed_time));
+  } else {
+    printf("Mode %s[%31s]: size %d\n", kIntraEdge, kIntraEdgeUpsamplerName,
+           size_);
+  }
+
+  for (int i = 0; i < size_ * 2 + 1; ++i) {
+    EXPECT_EQ(buffer_[i], base_buffer_[i]) << "Mismatch in index: " << i;
+  }
+}
+
+using IntraEdgeUpsamplerTest8bpp = IntraEdgeUpsamplerTest<8, uint8_t>;
+
+constexpr int kIntraEdgeUpsampleSizes[] = {4, 8, 12, 16};
+
+const char* GetIntraEdgeUpsampleDigest8bpp(int size) {
+  switch (size) {
+    case 4:
+      return "aa9002e03f8d15eb26bbee76f40bb923";
+    case 8:
+      return "cacfca86d65eff0d951eb21fc15f242a";
+    case 12:
+      return "0529e00a1fa80bc866fa7662ad2d7b9f";
+    case 16:
+      return "03e3b3e0ea438ea48ef05651c0a54986";
+    default:
+      ADD_FAILURE() << "Unknown upsample size: " << size;
+      return "";
+  }
+}
+
+TEST_P(IntraEdgeUpsamplerTest8bpp, Correctness) {
+  TestFixedValues(GetIntraEdgeUpsampleDigest8bpp(size_));
+  TestRandomValues(1);
+}
+
+TEST_P(IntraEdgeUpsamplerTest8bpp, DISABLED_Speed) { TestRandomValues(5e7); }
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using IntraEdgeUpsamplerTest10bpp = IntraEdgeUpsamplerTest<10, uint16_t>;
+
+const char* GetIntraEdgeUpsampleDigest10bpp(int size) {
+  switch (size) {
+    case 4:
+      return "341c6bb705a02bba65b34f92d8ca83cf";
+    case 8:
+      return "fdbe4b3b341921dcb0edf00dfc4d7667";
+    case 12:
+      return "ad69a491287495ec9973d4006d5ac461";
+    case 16:
+      return "04acf32e517d80ce4c4958e711b9b890";
+    default:
+      ADD_FAILURE() << "Unknown upsample size: " << size;
+      return "";
+  }
+}
+
+TEST_P(IntraEdgeUpsamplerTest10bpp, FixedInput) {
+  TestFixedValues(GetIntraEdgeUpsampleDigest10bpp(size_));
+  TestRandomValues(1);
+}
+
+TEST_P(IntraEdgeUpsamplerTest10bpp, DISABLED_Speed) { TestRandomValues(5e7); }
+#endif
+
+INSTANTIATE_TEST_SUITE_P(C, IntraEdgeFilterTest8bpp,
+                         testing::ValuesIn(kIntraEdgeFilterParamList));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, IntraEdgeFilterTest8bpp,
+                         testing::ValuesIn(kIntraEdgeFilterParamList));
+#endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, IntraEdgeFilterTest8bpp,
+                         testing::ValuesIn(kIntraEdgeFilterParamList));
+#endif
+INSTANTIATE_TEST_SUITE_P(C, IntraEdgeUpsamplerTest8bpp,
+                         testing::ValuesIn(kIntraEdgeUpsampleSizes));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, IntraEdgeUpsamplerTest8bpp,
+                         testing::ValuesIn(kIntraEdgeUpsampleSizes));
+#endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, IntraEdgeUpsamplerTest8bpp,
+                         testing::ValuesIn(kIntraEdgeUpsampleSizes));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+INSTANTIATE_TEST_SUITE_P(C, IntraEdgeFilterTest10bpp,
+                         testing::ValuesIn(kIntraEdgeFilterParamList));
+INSTANTIATE_TEST_SUITE_P(C, IntraEdgeUpsamplerTest10bpp,
+                         testing::ValuesIn(kIntraEdgeUpsampleSizes));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, IntraEdgeFilterTest10bpp,
+                         testing::ValuesIn(kIntraEdgeFilterParamList));
+INSTANTIATE_TEST_SUITE_P(NEON, IntraEdgeUpsamplerTest10bpp,
+                         testing::ValuesIn(kIntraEdgeUpsampleSizes));
+#endif
+
+#endif
+}  // namespace
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/intrapred.cc b/src/dsp/intrapred.cc
new file mode 100644
index 0000000..75af279
--- /dev/null
+++ b/src/dsp/intrapred.cc
@@ -0,0 +1,1437 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+template <int block_width, int block_height, typename Pixel>
+struct IntraPredFuncs_C {
+  IntraPredFuncs_C() = delete;
+
+  static void DcTop(void* dest, ptrdiff_t stride, const void* top_row,
+                    const void* left_column);
+  static void DcLeft(void* dest, ptrdiff_t stride, const void* top_row,
+                     const void* left_column);
+  static void Dc(void* dest, ptrdiff_t stride, const void* top_row,
+                 const void* left_column);
+  static void Vertical(void* dest, ptrdiff_t stride, const void* top_row,
+                       const void* left_column);
+  static void Horizontal(void* dest, ptrdiff_t stride, const void* top_row,
+                         const void* left_column);
+  static void Paeth(void* dest, ptrdiff_t stride, const void* top_row,
+                    const void* left_column);
+};
+
+// Intra-predictors that require bitdepth.
+template <int block_width, int block_height, int bitdepth, typename Pixel>
+struct IntraPredBppFuncs_C {
+  IntraPredBppFuncs_C() = delete;
+
+  static void DcFill(void* dest, ptrdiff_t stride, const void* top_row,
+                     const void* left_column);
+};
+
+//------------------------------------------------------------------------------
+// IntraPredFuncs_C::DcPred
+
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::DcTop(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row, const void* /*left_column*/) {
+  int sum = block_width >> 1;  // rounder
+  const auto* const top = static_cast<const Pixel*>(top_row);
+  for (int x = 0; x < block_width; ++x) sum += top[x];
+  const int dc = sum >> FloorLog2(block_width);
+
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int y = 0; y < block_height; ++y) {
+    Memset(dst, dc, block_width);
+    dst += stride;
+  }
+}
+
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::DcLeft(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* /*top_row*/, const void* LIBGAV1_RESTRICT const left_column) {
+  int sum = block_height >> 1;  // rounder
+  const auto* const left = static_cast<const Pixel*>(left_column);
+  for (int y = 0; y < block_height; ++y) sum += left[y];
+  const int dc = sum >> FloorLog2(block_height);
+
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int y = 0; y < block_height; ++y) {
+    Memset(dst, dc, block_width);
+    dst += stride;
+  }
+}
+
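+// Rounding example for the shifts in DcTop()/DcLeft() above: a top row of six
+// 171s and two 170s sums to 1366, and (1366 + 4) >> 3 = 171, while plain
+// truncation would give 1366 >> 3 = 170. The block_width >> 1 term (here 4)
+// added up front is what rounds the average to nearest.
+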
+// Note for square blocks the divide in the Dc() function reduces to a shift.
+// For rectangular block sizes the following multipliers can be used with the
+// corresponding shifts.
+// 8-bit
+//  1:2 (e.g., 4x8):  scale = 0x5556
+//  1:4 (e.g., 4x16): scale = 0x3334
+//  final_descale = 16
+// 10/12-bit
+//  1:2: scale = 0xaaab
+//  1:4: scale = 0x6667
+//  final_descale = 17
+// Note these may be halved to the values used in 8-bit in all cases except
+// when bitdepth == 12 and block_width + block_height is divisible by 5 (as
+// opposed to 3).
+//
+// The calculation becomes:
+//  ((dc_sum >> intermediate_descale) * scale) >> final_descale
+// where intermediate_descale is:
+// sum = block_width + block_height
+// intermediate_descale =
+//   (sum <= 20) ? 2 : (sum <= 40) ? 3 : (sum <= 80) ? 4 : 5
+//
+// The constants (multiplier and shifts) for a given block size are obtained
+// as follows:
+// - Let sum = block width + block height
+// - Shift 'sum' right until we reach an odd number
+// - Let the number of shifts for that block size be called
+//   'intermediate_descale' and let the odd number be 'd' (d has only 2
+//   possible values: d = 3 for a 1:2 rectangular block and d = 5 for a 1:4
+//   rectangular block).
+// - Find multipliers by dividing by 'd' using "Algorithm 1" in:
+//   http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1467632
+//   by ensuring that m + n = 16 (in that algorithm). This ensures that our 2nd
+//   shift will be 16, regardless of the block size.
+// TODO(jzern): the base implementation could be updated to use this method.
+
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::Dc(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const int divisor = block_width + block_height;
+  int sum = divisor >> 1;  // rounder
+
+  const auto* const top = static_cast<const Pixel*>(top_row);
+  const auto* const left = static_cast<const Pixel*>(left_column);
+  for (int x = 0; x < block_width; ++x) sum += top[x];
+  for (int y = 0; y < block_height; ++y) sum += left[y];
+
+  const int dc = sum / divisor;
+
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int y = 0; y < block_height; ++y) {
+    Memset(dst, dc, block_width);
+    dst += stride;
+  }
+}
+
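+// Worked example of the multiplier scheme described above (illustrative only;
+// the C implementation above keeps the plain division): an 8-bit 4x8 block has
+// sum = 12, so intermediate_descale = 2 and d = 12 >> 2 = 3, giving
+// scale = 0x5556 (21846). For an all-128 block, dc_sum = 6 + 12 * 128 = 1542
+// (rounder included), and
+//   ((1542 >> 2) * 0x5556) >> 16 = (385 * 21846) >> 16 = 8410710 >> 16 = 128,
+// which matches the plain division 1542 / 12 = 128.
+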
+//------------------------------------------------------------------------------
+// IntraPredFuncs_C directional predictors
+
+// IntraPredFuncs_C::Vertical -- apply top row vertically
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::Vertical(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row, const void* /*left_column*/) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y = 0; y < block_height; ++y) {
+    memcpy(dst, top_row, block_width * sizeof(Pixel));
+    dst += stride;
+  }
+}
+
+// IntraPredFuncs_C::Horizontal -- apply left column horizontally
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::Horizontal(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* /*top_row*/, const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left = static_cast<const Pixel*>(left_column);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int y = 0; y < block_height; ++y) {
+    Memset(dst, left[y], block_width);
+    dst += stride;
+  }
+}
+
+// IntraPredFuncs_C::Paeth
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::Paeth(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const Pixel*>(top_row);
+  const auto* const left = static_cast<const Pixel*>(left_column);
+  const Pixel top_left = top[-1];
+  const int top_left_x2 = top_left + top_left;
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+
+  for (int y = 0; y < block_height; ++y) {
+    const int left_pixel = left[y];
+    for (int x = 0; x < block_width; ++x) {
+      // The Paeth filter selects the value closest to:
+      // top[x] + left[y] - top_left
+      // To calculate the absolute distance for the left value this would be:
+      // abs((top[x] + left[y] - top_left) - left[y])
+      // or, because left[y] cancels out:
+      // abs(top[x] - top_left)
+      const int left_dist = std::abs(top[x] - top_left);
+      const int top_dist = std::abs(left_pixel - top_left);
+      const int top_left_dist = std::abs(top[x] + left_pixel - top_left_x2);
+
+      // Select the closest value to the initial estimate of 'T + L - TL'.
+      if (left_dist <= top_dist && left_dist <= top_left_dist) {
+        dst[x] = left_pixel;
+      } else if (top_dist <= top_left_dist) {
+        dst[x] = top[x];
+      } else {
+        dst[x] = top_left;
+      }
+    }
+    dst += stride;
+  }
+}
+
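+// Paeth example: with top_left = 100, top[x] = 120 and left[y] = 90 the
+// estimate is 120 + 90 - 100 = 110. The distances are then 20 for the left
+// candidate (|110 - 90|, computed as |120 - 100|), 10 for the top candidate
+// (|110 - 120|, computed as |90 - 100|) and 10 for top-left
+// (|120 + 90 - 200|), so the tie-break order above picks top[x] = 120.
+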
+//------------------------------------------------------------------------------
+// IntraPredBppFuncs_C
+template <int fill, typename Pixel>
+inline void DcFill_C(void* const dest, ptrdiff_t stride, const int block_width,
+                     const int block_height) {
+  static_assert(sizeof(Pixel) == 1 || sizeof(Pixel) == 2,
+                "Only 1 & 2 byte pixels are supported");
+
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int y = 0; y < block_height; ++y) {
+    Memset(dst, fill, block_width);
+    dst += stride;
+  }
+}
+
+template <int block_width, int block_height, int bitdepth, typename Pixel>
+void IntraPredBppFuncs_C<block_width, block_height, bitdepth, Pixel>::DcFill(
+    void* const dest, ptrdiff_t stride, const void* /*top_row*/,
+    const void* /*left_column*/) {
+  DcFill_C<0x80 << (bitdepth - 8), Pixel>(dest, stride, block_width,
+                                          block_height);
+}
+
+// -----------------------------------------------------------------------------
+
+template <typename Pixel>
+struct IntraPredDefs {
+  IntraPredDefs() = delete;
+
+  using _4x4 = IntraPredFuncs_C<4, 4, Pixel>;
+  using _4x8 = IntraPredFuncs_C<4, 8, Pixel>;
+  using _4x16 = IntraPredFuncs_C<4, 16, Pixel>;
+  using _8x4 = IntraPredFuncs_C<8, 4, Pixel>;
+  using _8x8 = IntraPredFuncs_C<8, 8, Pixel>;
+  using _8x16 = IntraPredFuncs_C<8, 16, Pixel>;
+  using _8x32 = IntraPredFuncs_C<8, 32, Pixel>;
+  using _16x4 = IntraPredFuncs_C<16, 4, Pixel>;
+  using _16x8 = IntraPredFuncs_C<16, 8, Pixel>;
+  using _16x16 = IntraPredFuncs_C<16, 16, Pixel>;
+  using _16x32 = IntraPredFuncs_C<16, 32, Pixel>;
+  using _16x64 = IntraPredFuncs_C<16, 64, Pixel>;
+  using _32x8 = IntraPredFuncs_C<32, 8, Pixel>;
+  using _32x16 = IntraPredFuncs_C<32, 16, Pixel>;
+  using _32x32 = IntraPredFuncs_C<32, 32, Pixel>;
+  using _32x64 = IntraPredFuncs_C<32, 64, Pixel>;
+  using _64x16 = IntraPredFuncs_C<64, 16, Pixel>;
+  using _64x32 = IntraPredFuncs_C<64, 32, Pixel>;
+  using _64x64 = IntraPredFuncs_C<64, 64, Pixel>;
+};
+
+template <int bitdepth, typename Pixel>
+struct IntraPredBppDefs {
+  IntraPredBppDefs() = delete;
+
+  using _4x4 = IntraPredBppFuncs_C<4, 4, bitdepth, Pixel>;
+  using _4x8 = IntraPredBppFuncs_C<4, 8, bitdepth, Pixel>;
+  using _4x16 = IntraPredBppFuncs_C<4, 16, bitdepth, Pixel>;
+  using _8x4 = IntraPredBppFuncs_C<8, 4, bitdepth, Pixel>;
+  using _8x8 = IntraPredBppFuncs_C<8, 8, bitdepth, Pixel>;
+  using _8x16 = IntraPredBppFuncs_C<8, 16, bitdepth, Pixel>;
+  using _8x32 = IntraPredBppFuncs_C<8, 32, bitdepth, Pixel>;
+  using _16x4 = IntraPredBppFuncs_C<16, 4, bitdepth, Pixel>;
+  using _16x8 = IntraPredBppFuncs_C<16, 8, bitdepth, Pixel>;
+  using _16x16 = IntraPredBppFuncs_C<16, 16, bitdepth, Pixel>;
+  using _16x32 = IntraPredBppFuncs_C<16, 32, bitdepth, Pixel>;
+  using _16x64 = IntraPredBppFuncs_C<16, 64, bitdepth, Pixel>;
+  using _32x8 = IntraPredBppFuncs_C<32, 8, bitdepth, Pixel>;
+  using _32x16 = IntraPredBppFuncs_C<32, 16, bitdepth, Pixel>;
+  using _32x32 = IntraPredBppFuncs_C<32, 32, bitdepth, Pixel>;
+  using _32x64 = IntraPredBppFuncs_C<32, 64, bitdepth, Pixel>;
+  using _64x16 = IntraPredBppFuncs_C<64, 16, bitdepth, Pixel>;
+  using _64x32 = IntraPredBppFuncs_C<64, 32, bitdepth, Pixel>;
+  using _64x64 = IntraPredBppFuncs_C<64, 64, bitdepth, Pixel>;
+};
+
+using Defs = IntraPredDefs<uint8_t>;
+using Defs8bpp = IntraPredBppDefs<8, uint8_t>;
+
+// Initializes dsp entries for kTransformSize|W|x|H| from |DEFS|/|DEFSBPP| of
+// the same size.
+#define INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, W, H)                         \
+  dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorDcFill] =     \
+      DEFSBPP::_##W##x##H::DcFill;                                            \
+  dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorDcTop] =      \
+      DEFS::_##W##x##H::DcTop;                                                \
+  dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorDcLeft] =     \
+      DEFS::_##W##x##H::DcLeft;                                               \
+  dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorDc] =         \
+      DEFS::_##W##x##H::Dc;                                                   \
+  dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorVertical] =   \
+      DEFS::_##W##x##H::Vertical;                                             \
+  dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorHorizontal] = \
+      DEFS::_##W##x##H::Horizontal;                                           \
+  dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorPaeth] =      \
+      DEFS::_##W##x##H::Paeth
+
+#define INIT_INTRAPREDICTORS(DEFS, DEFSBPP)        \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 4, 4);   \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 4, 8);   \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 4, 16);  \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 8, 4);   \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 8, 8);   \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 8, 16);  \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 8, 32);  \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 16, 4);  \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 16, 8);  \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 16, 16); \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 16, 32); \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 16, 64); \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 32, 8);  \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 32, 16); \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 32, 32); \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 32, 64); \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 64, 16); \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 64, 32); \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 64, 64)
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  INIT_INTRAPREDICTORS(Defs, Defs8bpp);
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcFill] =
+      Defs8bpp::_4x4::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
+      Defs::_4x4::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
+      Defs::_4x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] = Defs::_4x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorVertical] =
+      Defs::_4x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorHorizontal
dsp->intra_predictors[kTransformSize4x4][kIntraPredictorHorizontal] = + Defs::_4x4::Horizontal; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] = + Defs::_4x4::Paeth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcFill] = + Defs8bpp::_4x8::DcFill; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] = + Defs::_4x8::DcTop; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] = + Defs::_4x8::DcLeft; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDc + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] = Defs::_4x8::Dc; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorVertical + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorVertical] = + Defs::_4x8::Vertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] = + Defs::_4x8::Horizontal; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] = + Defs::_4x8::Paeth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcFill] = + Defs8bpp::_4x16::DcFill; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] = + Defs::_4x16::DcTop; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] = + Defs::_4x16::DcLeft; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDc + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] = + Defs::_4x16::Dc; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorVertical + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorVertical] = + Defs::_4x16::Vertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] = + Defs::_4x16::Horizontal; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] = + Defs::_4x16::Paeth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcFill] = + Defs8bpp::_8x4::DcFill; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] = + Defs::_8x4::DcTop; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] = + Defs::_8x4::DcLeft; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDc + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] = Defs::_8x4::Dc; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorVertical + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorVertical] = + Defs::_8x4::Vertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorHorizontal] = + Defs::_8x4::Horizontal; +#endif +#ifndef 
LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] = + Defs::_8x4::Paeth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcFill] = + Defs8bpp::_8x8::DcFill; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] = + Defs::_8x8::DcTop; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] = + Defs::_8x8::DcLeft; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDc + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] = Defs::_8x8::Dc; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorVertical + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorVertical] = + Defs::_8x8::Vertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] = + Defs::_8x8::Horizontal; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] = + Defs::_8x8::Paeth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcFill] = + Defs8bpp::_8x16::DcFill; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] = + Defs::_8x16::DcTop; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] = + Defs::_8x16::DcLeft; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDc + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] = + Defs::_8x16::Dc; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorVertical + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorVertical] = + Defs::_8x16::Vertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorHorizontal] = + Defs::_8x16::Horizontal; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] = + Defs::_8x16::Paeth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcFill] = + Defs8bpp::_8x32::DcFill; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] = + Defs::_8x32::DcTop; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] = + Defs::_8x32::DcLeft; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDc + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] = + Defs::_8x32::Dc; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorVertical + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorVertical] = + Defs::_8x32::Vertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] = + Defs::_8x32::Horizontal; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] = + 
Defs::_8x32::Paeth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcFill] = + Defs8bpp::_16x4::DcFill; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] = + Defs::_16x4::DcTop; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] = + Defs::_16x4::DcLeft; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDc + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] = + Defs::_16x4::Dc; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorVertical + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorVertical] = + Defs::_16x4::Vertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorHorizontal] = + Defs::_16x4::Horizontal; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] = + Defs::_16x4::Paeth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcFill] = + Defs8bpp::_16x8::DcFill; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] = + Defs::_16x8::DcTop; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] = + Defs::_16x8::DcLeft; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDc + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] = + Defs::_16x8::Dc; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorVertical + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorVertical] = + Defs::_16x8::Vertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] = + Defs::_16x8::Horizontal; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] = + Defs::_16x8::Paeth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcFill] = + Defs8bpp::_16x16::DcFill; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] = + Defs::_16x16::DcTop; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] = + Defs::_16x16::DcLeft; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDc + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] = + Defs::_16x16::Dc; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorVertical + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorVertical] = + Defs::_16x16::Vertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorHorizontal] = + Defs::_16x16::Horizontal; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] = + Defs::_16x16::Paeth; +#endif +#ifndef 
LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcFill] = + Defs8bpp::_16x32::DcFill; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] = + Defs::_16x32::DcTop; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] = + Defs::_16x32::DcLeft; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDc + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] = + Defs::_16x32::Dc; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorVertical + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorVertical] = + Defs::_16x32::Vertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorHorizontal] = + Defs::_16x32::Horizontal; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] = + Defs::_16x32::Paeth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcFill] = + Defs8bpp::_16x64::DcFill; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] = + Defs::_16x64::DcTop; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] = + Defs::_16x64::DcLeft; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDc + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] = + Defs::_16x64::Dc; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorVertical + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorVertical] = + Defs::_16x64::Vertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorHorizontal] = + Defs::_16x64::Horizontal; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] = + Defs::_16x64::Paeth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcFill] = + Defs8bpp::_32x8::DcFill; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] = + Defs::_32x8::DcTop; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] = + Defs::_32x8::DcLeft; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDc + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] = + Defs::_32x8::Dc; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorVertical + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorVertical] = + Defs::_32x8::Vertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorHorizontal] = + Defs::_32x8::Horizontal; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] = + Defs::_32x8::Paeth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcFill 
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcFill] = + Defs8bpp::_32x16::DcFill; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] = + Defs::_32x16::DcTop; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] = + Defs::_32x16::DcLeft; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDc + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] = + Defs::_32x16::Dc; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorVertical + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorVertical] = + Defs::_32x16::Vertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorHorizontal] = + Defs::_32x16::Horizontal; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] = + Defs::_32x16::Paeth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcFill] = + Defs8bpp::_32x32::DcFill; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] = + Defs::_32x32::DcTop; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] = + Defs::_32x32::DcLeft; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDc + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] = + Defs::_32x32::Dc; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorVertical + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorVertical] = + Defs::_32x32::Vertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorHorizontal] = + Defs::_32x32::Horizontal; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] = + Defs::_32x32::Paeth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcFill] = + Defs8bpp::_32x64::DcFill; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] = + Defs::_32x64::DcTop; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] = + Defs::_32x64::DcLeft; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDc + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] = + Defs::_32x64::Dc; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorVertical + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorVertical] = + Defs::_32x64::Vertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] = + Defs::_32x64::Horizontal; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] = + Defs::_32x64::Paeth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcFill + 
dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcFill] = + Defs8bpp::_64x16::DcFill; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] = + Defs::_64x16::DcTop; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] = + Defs::_64x16::DcLeft; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDc + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] = + Defs::_64x16::Dc; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorVertical + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorVertical] = + Defs::_64x16::Vertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorHorizontal] = + Defs::_64x16::Horizontal; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] = + Defs::_64x16::Paeth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcFill] = + Defs8bpp::_64x32::DcFill; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] = + Defs::_64x32::DcTop; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] = + Defs::_64x32::DcLeft; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDc + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] = + Defs::_64x32::Dc; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorVertical + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorVertical] = + Defs::_64x32::Vertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorHorizontal] = + Defs::_64x32::Horizontal; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] = + Defs::_64x32::Paeth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcFill] = + Defs8bpp::_64x64::DcFill; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] = + Defs::_64x64::DcTop; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] = + Defs::_64x64::DcLeft; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDc + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] = + Defs::_64x64::Dc; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorVertical + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorVertical] = + Defs::_64x64::Vertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorHorizontal] = + Defs::_64x64::Horizontal; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] = + Defs::_64x64::Paeth; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} // NOLINT(readability/fn_size) + +#if 
LIBGAV1_MAX_BITDEPTH >= 10
+using DefsHbd = IntraPredDefs<uint16_t>;
+using Defs10bpp = IntraPredBppDefs<10, uint16_t>;
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  INIT_INTRAPREDICTORS(DefsHbd, Defs10bpp);
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcFill] =
+      Defs10bpp::_4x4::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
+      DefsHbd::_4x4::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
+      DefsHbd::_4x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
+      DefsHbd::_4x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorVertical] =
+      DefsHbd::_4x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorHorizontal] =
+      DefsHbd::_4x4::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
+      DefsHbd::_4x4::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcFill] =
+      Defs10bpp::_4x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
+      DefsHbd::_4x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] =
+      DefsHbd::_4x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] =
+      DefsHbd::_4x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorVertical] =
+      DefsHbd::_4x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
+      DefsHbd::_4x8::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
+      DefsHbd::_4x8::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcFill] =
+      Defs10bpp::_4x16::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
+      DefsHbd::_4x16::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] =
+      DefsHbd::_4x16::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
+      DefsHbd::_4x16::Dc;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorVertical] =
+      DefsHbd::_4x16::Vertical;
+#endif
+#ifndef
LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] = + DefsHbd::_4x16::Horizontal; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] = + DefsHbd::_4x16::Paeth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcFill] = + Defs10bpp::_8x4::DcFill; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] = + DefsHbd::_8x4::DcTop; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] = + DefsHbd::_8x4::DcLeft; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDc + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] = + DefsHbd::_8x4::Dc; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorVertical + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorVertical] = + DefsHbd::_8x4::Vertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorHorizontal] = + DefsHbd::_8x4::Horizontal; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] = + DefsHbd::_8x4::Paeth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcFill] = + Defs10bpp::_8x8::DcFill; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] = + DefsHbd::_8x8::DcTop; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] = + DefsHbd::_8x8::DcLeft; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDc + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] = + DefsHbd::_8x8::Dc; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorVertical + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorVertical] = + DefsHbd::_8x8::Vertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] = + DefsHbd::_8x8::Horizontal; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] = + DefsHbd::_8x8::Paeth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcFill] = + Defs10bpp::_8x16::DcFill; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] = + DefsHbd::_8x16::DcTop; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] = + DefsHbd::_8x16::DcLeft; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDc + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] = + DefsHbd::_8x16::Dc; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorVertical + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorVertical] = + DefsHbd::_8x16::Vertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorHorizontal + 
dsp->intra_predictors[kTransformSize8x16][kIntraPredictorHorizontal] = + DefsHbd::_8x16::Horizontal; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] = + DefsHbd::_8x16::Paeth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcFill] = + Defs10bpp::_8x32::DcFill; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] = + DefsHbd::_8x32::DcTop; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] = + DefsHbd::_8x32::DcLeft; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDc + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] = + DefsHbd::_8x32::Dc; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorVertical + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorVertical] = + DefsHbd::_8x32::Vertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] = + DefsHbd::_8x32::Horizontal; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] = + DefsHbd::_8x32::Paeth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcFill] = + Defs10bpp::_16x4::DcFill; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] = + DefsHbd::_16x4::DcTop; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] = + DefsHbd::_16x4::DcLeft; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDc + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] = + DefsHbd::_16x4::Dc; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorVertical + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorVertical] = + DefsHbd::_16x4::Vertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorHorizontal] = + DefsHbd::_16x4::Horizontal; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] = + DefsHbd::_16x4::Paeth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcFill] = + Defs10bpp::_16x8::DcFill; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] = + DefsHbd::_16x8::DcTop; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] = + DefsHbd::_16x8::DcLeft; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDc + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] = + DefsHbd::_16x8::Dc; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorVertical + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorVertical] = + DefsHbd::_16x8::Vertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorHorizontal + 
dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] = + DefsHbd::_16x8::Horizontal; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] = + DefsHbd::_16x8::Paeth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcFill] = + Defs10bpp::_16x16::DcFill; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] = + DefsHbd::_16x16::DcTop; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] = + DefsHbd::_16x16::DcLeft; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDc + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] = + DefsHbd::_16x16::Dc; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorVertical + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorVertical] = + DefsHbd::_16x16::Vertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorHorizontal] = + DefsHbd::_16x16::Horizontal; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] = + DefsHbd::_16x16::Paeth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcFill] = + Defs10bpp::_16x32::DcFill; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] = + DefsHbd::_16x32::DcTop; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] = + DefsHbd::_16x32::DcLeft; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDc + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] = + DefsHbd::_16x32::Dc; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorVertical + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorVertical] = + DefsHbd::_16x32::Vertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorHorizontal] = + DefsHbd::_16x32::Horizontal; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] = + DefsHbd::_16x32::Paeth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcFill] = + Defs10bpp::_16x64::DcFill; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] = + DefsHbd::_16x64::DcTop; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] = + DefsHbd::_16x64::DcLeft; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDc + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] = + DefsHbd::_16x64::Dc; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorVertical + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorVertical] = + DefsHbd::_16x64::Vertical; +#endif +#ifndef 
LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorHorizontal] = + DefsHbd::_16x64::Horizontal; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] = + DefsHbd::_16x64::Paeth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcFill] = + Defs10bpp::_32x8::DcFill; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] = + DefsHbd::_32x8::DcTop; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] = + DefsHbd::_32x8::DcLeft; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDc + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] = + DefsHbd::_32x8::Dc; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorVertical + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorVertical] = + DefsHbd::_32x8::Vertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorHorizontal] = + DefsHbd::_32x8::Horizontal; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] = + DefsHbd::_32x8::Paeth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcFill] = + Defs10bpp::_32x16::DcFill; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] = + DefsHbd::_32x16::DcTop; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] = + DefsHbd::_32x16::DcLeft; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDc + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] = + DefsHbd::_32x16::Dc; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorVertical + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorVertical] = + DefsHbd::_32x16::Vertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorHorizontal] = + DefsHbd::_32x16::Horizontal; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] = + DefsHbd::_32x16::Paeth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcFill] = + Defs10bpp::_32x32::DcFill; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] = + DefsHbd::_32x32::DcTop; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] = + DefsHbd::_32x32::DcLeft; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDc + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] = + DefsHbd::_32x32::Dc; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorVertical + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorVertical] = + 
DefsHbd::_32x32::Vertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorHorizontal] = + DefsHbd::_32x32::Horizontal; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] = + DefsHbd::_32x32::Paeth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcFill] = + Defs10bpp::_32x64::DcFill; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] = + DefsHbd::_32x64::DcTop; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] = + DefsHbd::_32x64::DcLeft; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDc + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] = + DefsHbd::_32x64::Dc; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorVertical + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorVertical] = + DefsHbd::_32x64::Vertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] = + DefsHbd::_32x64::Horizontal; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] = + DefsHbd::_32x64::Paeth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcFill] = + Defs10bpp::_64x16::DcFill; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] = + DefsHbd::_64x16::DcTop; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] = + DefsHbd::_64x16::DcLeft; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDc + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] = + DefsHbd::_64x16::Dc; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorVertical + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorVertical] = + DefsHbd::_64x16::Vertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorHorizontal] = + DefsHbd::_64x16::Horizontal; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] = + DefsHbd::_64x16::Paeth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcFill] = + Defs10bpp::_64x32::DcFill; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] = + DefsHbd::_64x32::DcTop; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] = + DefsHbd::_64x32::DcLeft; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDc + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] = + DefsHbd::_64x32::Dc; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorVertical + 
dsp->intra_predictors[kTransformSize64x32][kIntraPredictorVertical] = + DefsHbd::_64x32::Vertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorHorizontal] = + DefsHbd::_64x32::Horizontal; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] = + DefsHbd::_64x32::Paeth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcFill] = + Defs10bpp::_64x64::DcFill; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] = + DefsHbd::_64x64::DcTop; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] = + DefsHbd::_64x64::DcLeft; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDc + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] = + DefsHbd::_64x64::Dc; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorVertical + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorVertical] = + DefsHbd::_64x64::Vertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorHorizontal] = + DefsHbd::_64x64::Horizontal; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] = + DefsHbd::_64x64::Paeth; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} // NOLINT(readability/fn_size) +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +#undef INIT_INTRAPREDICTORS_WxH +#undef INIT_INTRAPREDICTORS +} // namespace + +void IntraPredInit_C() { + Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/intrapred.h b/src/dsp/intrapred.h new file mode 100644 index 0000000..2cb625d --- /dev/null +++ b/src/dsp/intrapred.h @@ -0,0 +1,47 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_INTRAPRED_H_ +#define LIBGAV1_SRC_DSP_INTRAPRED_H_ + +// Pull in LIBGAV1_DspXXX defines representing the implementation status +// of each function. The resulting value of each can be used by each module to +// determine whether an implementation is needed at compile time. +// IWYU pragma: begin_exports + +// ARM: +#include "src/dsp/arm/intrapred_neon.h" + +// x86: +// Note includes should be sorted in logical order avx2/avx/sse4, etc. +// The order of includes is important as each tests for a superior version +// before setting the base. +// clang-format off +#include "src/dsp/x86/intrapred_sse4.h" +// clang-format on + +// IWYU pragma: end_exports + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::intra_predictors. 
// This function is not thread-safe.
+void IntraPredInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_INTRAPRED_H_
diff --git a/src/dsp/intrapred_cfl.cc b/src/dsp/intrapred_cfl.cc
new file mode 100644
index 0000000..0f7f4f2
--- /dev/null
+++ b/src/dsp/intrapred_cfl.cc
@@ -0,0 +1,655 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_cfl.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr TransformSize kTransformSizesLargerThan32x32[] = {
+    kTransformSize16x64, kTransformSize32x64, kTransformSize64x16,
+    kTransformSize64x32, kTransformSize64x64};
+
+//------------------------------------------------------------------------------
+// CflIntraPredictor_C
+
+// |luma| can be within +/-(((1 << bitdepth) - 1) << 3), inclusive.
+// |alpha| can be -16 to 16 (inclusive).
+template <int block_width, int block_height, int bitdepth, typename Pixel>
+void CflIntraPredictor_C(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int alpha) {
+  auto* dst = static_cast<Pixel*>(dest);
+  const int dc = dst[0];
+  stride /= sizeof(Pixel);
+  const int max_value = (1 << bitdepth) - 1;
+  for (int y = 0; y < block_height; ++y) {
+    for (int x = 0; x < block_width; ++x) {
+      assert(luma[y][x] >= -(((1 << bitdepth) - 1) << 3));
+      assert(luma[y][x] <= ((1 << bitdepth) - 1) << 3);
+      dst[x] = Clip3(dc + RightShiftWithRoundingSigned(alpha * luma[y][x], 6),
+                     0, max_value);
+    }
+    dst += stride;
+  }
+}
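+
+// A worked example of the mapping above: with bitdepth == 8, dc == 100,
+// alpha == 8 and a mean-removed luma value of 96, the prediction is
+// Clip3(100 + RightShiftWithRoundingSigned(8 * 96, 6), 0, 255) == 112,
+// since (768 + 32) >> 6 == 12. At the extremes (alpha == -16,
+// luma == -2040), the offset reaches
+// RightShiftWithRoundingSigned(32640, 6) == 510 and the result is clipped
+// to the valid pixel range.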
+
+//------------------------------------------------------------------------------
+// CflSubsampler_C
+
+template <int block_width, int block_height, int bitdepth, typename Pixel,
+          int subsampling_x, int subsampling_y>
+void CflSubsampler_C(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+                     const int max_luma_width, const int max_luma_height,
+                     const void* LIBGAV1_RESTRICT const source,
+                     ptrdiff_t stride) {
+  assert(max_luma_width >= 4);
+  assert(max_luma_height >= 4);
+  const auto* src = static_cast<const Pixel*>(source);
+  stride /= sizeof(Pixel);
+  int sum = 0;
+  for (int y = 0; y < block_height; ++y) {
+    for (int x = 0; x < block_width; ++x) {
+      const ptrdiff_t luma_x =
+          std::min(x << subsampling_x, max_luma_width - (1 << subsampling_x));
+      const ptrdiff_t luma_x_next = luma_x + stride;
+      luma[y][x] =
+          (src[luma_x] + ((subsampling_x != 0) ? src[luma_x + 1] : 0) +
+           ((subsampling_y != 0) ? (src[luma_x_next] + src[luma_x_next + 1])
+                                 : 0))
+          << (3 - subsampling_x - subsampling_y);
+      sum += luma[y][x];
+    }
+    if ((y << subsampling_y) < (max_luma_height - (1 << subsampling_y))) {
+      src += stride << subsampling_y;
+    }
+  }
+  const int average = RightShiftWithRounding(
+      sum, FloorLog2(block_width) + FloorLog2(block_height));
+  for (int y = 0; y < block_height; ++y) {
+    for (int x = 0; x < block_width; ++x) {
+      luma[y][x] -= average;
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+
+// Initializes dsp entries for kTransformSize|W|x|H|.
+#define INIT_CFL_INTRAPREDICTOR_WxH(W, H, BITDEPTH, PIXEL)             \
+  dsp->cfl_intra_predictors[kTransformSize##W##x##H] =                 \
+      CflIntraPredictor_C<W, H, BITDEPTH, PIXEL>;                      \
+  dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType444] = \
+      CflSubsampler_C<W, H, BITDEPTH, PIXEL, 0, 0>;                    \
+  dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType422] = \
+      CflSubsampler_C<W, H, BITDEPTH, PIXEL, 1, 0>;                    \
+  dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType420] = \
+      CflSubsampler_C<W, H, BITDEPTH, PIXEL, 1, 1>
+
+#define INIT_CFL_INTRAPREDICTORS(BITDEPTH, PIXEL)       \
+  INIT_CFL_INTRAPREDICTOR_WxH(4, 4, BITDEPTH, PIXEL);   \
+  INIT_CFL_INTRAPREDICTOR_WxH(4, 8, BITDEPTH, PIXEL);   \
+  INIT_CFL_INTRAPREDICTOR_WxH(4, 16, BITDEPTH, PIXEL);  \
+  INIT_CFL_INTRAPREDICTOR_WxH(8, 4, BITDEPTH, PIXEL);   \
+  INIT_CFL_INTRAPREDICTOR_WxH(8, 8, BITDEPTH, PIXEL);   \
+  INIT_CFL_INTRAPREDICTOR_WxH(8, 16, BITDEPTH, PIXEL);  \
+  INIT_CFL_INTRAPREDICTOR_WxH(8, 32, BITDEPTH, PIXEL);  \
+  INIT_CFL_INTRAPREDICTOR_WxH(16, 4, BITDEPTH, PIXEL);  \
+  INIT_CFL_INTRAPREDICTOR_WxH(16, 8, BITDEPTH, PIXEL);  \
+  INIT_CFL_INTRAPREDICTOR_WxH(16, 16, BITDEPTH, PIXEL); \
+  INIT_CFL_INTRAPREDICTOR_WxH(16, 32, BITDEPTH, PIXEL); \
+  INIT_CFL_INTRAPREDICTOR_WxH(32, 8, BITDEPTH, PIXEL);  \
+  INIT_CFL_INTRAPREDICTOR_WxH(32, 16, BITDEPTH, PIXEL); \
+  INIT_CFL_INTRAPREDICTOR_WxH(32, 32, BITDEPTH, PIXEL)
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  INIT_CFL_INTRAPREDICTORS(8, uint8_t);
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize4x4] =
+      CflIntraPredictor_C<4, 4, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+      CflSubsampler_C<4, 4, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] =
+      CflSubsampler_C<4, 4, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+      CflSubsampler_C<4, 4, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize4x8] =
+      CflIntraPredictor_C<4, 8, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+      CflSubsampler_C<4, 8, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType422] =
+      CflSubsampler_C<4, 8, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+      CflSubsampler_C<4, 8, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor
dsp->cfl_intra_predictors[kTransformSize4x16] = + CflIntraPredictor_C<4, 16, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] = + CflSubsampler_C<4, 16, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType422] = + CflSubsampler_C<4, 16, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] = + CflSubsampler_C<4, 16, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize8x4] = + CflIntraPredictor_C<8, 4, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] = + CflSubsampler_C<8, 4, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType422] = + CflSubsampler_C<8, 4, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] = + CflSubsampler_C<8, 4, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize8x8] = + CflIntraPredictor_C<8, 8, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] = + CflSubsampler_C<8, 8, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType422] = + CflSubsampler_C<8, 8, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] = + CflSubsampler_C<8, 8, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize8x16] = + CflIntraPredictor_C<8, 16, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] = + CflSubsampler_C<8, 16, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType422] = + CflSubsampler_C<8, 16, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] = + CflSubsampler_C<8, 16, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize8x32] = + CflIntraPredictor_C<8, 32, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] = + CflSubsampler_C<8, 32, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType422] = + CflSubsampler_C<8, 32, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] = + CflSubsampler_C<8, 32, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize16x4] = + CflIntraPredictor_C<16, 4, 8, uint8_t>; 
+#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] = + CflSubsampler_C<16, 4, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType422] = + CflSubsampler_C<16, 4, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] = + CflSubsampler_C<16, 4, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize16x8] = + CflIntraPredictor_C<16, 8, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] = + CflSubsampler_C<16, 8, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType422] = + CflSubsampler_C<16, 8, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] = + CflSubsampler_C<16, 8, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize16x16] = + CflIntraPredictor_C<16, 16, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] = + CflSubsampler_C<16, 16, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType422] = + CflSubsampler_C<16, 16, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] = + CflSubsampler_C<16, 16, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize16x32] = + CflIntraPredictor_C<16, 32, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] = + CflSubsampler_C<16, 32, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType422] = + CflSubsampler_C<16, 32, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] = + CflSubsampler_C<16, 32, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize32x8] = + CflIntraPredictor_C<32, 8, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] = + CflSubsampler_C<32, 8, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType422] = + CflSubsampler_C<32, 8, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] = + CflSubsampler_C<32, 8, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize32x16] = + CflIntraPredictor_C<32, 16, 8, uint8_t>; +#endif +#ifndef 
LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] = + CflSubsampler_C<32, 16, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType422] = + CflSubsampler_C<32, 16, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] = + CflSubsampler_C<32, 16, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize32x32] = + CflIntraPredictor_C<32, 32, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] = + CflSubsampler_C<32, 32, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType422] = + CflSubsampler_C<32, 32, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] = + CflSubsampler_C<32, 32, 8, uint8_t, 1, 1>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + // Cfl predictors are available only for transform sizes with max(width, + // height) <= 32. Set all others to nullptr. + for (const auto i : kTransformSizesLargerThan32x32) { + dsp->cfl_intra_predictors[i] = nullptr; + for (int j = 0; j < kNumSubsamplingTypes; ++j) { + dsp->cfl_subsamplers[i][j] = nullptr; + } + } +} // NOLINT(readability/fn_size) + +#if LIBGAV1_MAX_BITDEPTH >= 10 +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(10); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + INIT_CFL_INTRAPREDICTORS(10, uint16_t); +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize4x4] = + CflIntraPredictor_C<4, 4, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] = + CflSubsampler_C<4, 4, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] = + CflSubsampler_C<4, 4, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] = + CflSubsampler_C<4, 4, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize4x8] = + CflIntraPredictor_C<4, 8, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] = + CflSubsampler_C<4, 8, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType422] = + CflSubsampler_C<4, 8, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] = + CflSubsampler_C<4, 8, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize4x16] = + CflIntraPredictor_C<4, 16, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444 + 
dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] = + CflSubsampler_C<4, 16, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType422] = + CflSubsampler_C<4, 16, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] = + CflSubsampler_C<4, 16, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize8x4] = + CflIntraPredictor_C<8, 4, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] = + CflSubsampler_C<8, 4, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType422] = + CflSubsampler_C<8, 4, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] = + CflSubsampler_C<8, 4, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize8x8] = + CflIntraPredictor_C<8, 8, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] = + CflSubsampler_C<8, 8, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType422] = + CflSubsampler_C<8, 8, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] = + CflSubsampler_C<8, 8, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize8x16] = + CflIntraPredictor_C<8, 16, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] = + CflSubsampler_C<8, 16, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType422] = + CflSubsampler_C<8, 16, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] = + CflSubsampler_C<8, 16, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize8x32] = + CflIntraPredictor_C<8, 32, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] = + CflSubsampler_C<8, 32, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType422] = + CflSubsampler_C<8, 32, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] = + CflSubsampler_C<8, 32, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize16x4] = + CflIntraPredictor_C<16, 4, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444 + 
dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] = + CflSubsampler_C<16, 4, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType422] = + CflSubsampler_C<16, 4, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] = + CflSubsampler_C<16, 4, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize16x8] = + CflIntraPredictor_C<16, 8, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] = + CflSubsampler_C<16, 8, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType422] = + CflSubsampler_C<16, 8, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] = + CflSubsampler_C<16, 8, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize16x16] = + CflIntraPredictor_C<16, 16, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] = + CflSubsampler_C<16, 16, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType422] = + CflSubsampler_C<16, 16, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] = + CflSubsampler_C<16, 16, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize16x32] = + CflIntraPredictor_C<16, 32, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] = + CflSubsampler_C<16, 32, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType422] = + CflSubsampler_C<16, 32, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] = + CflSubsampler_C<16, 32, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize32x8] = + CflIntraPredictor_C<32, 8, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] = + CflSubsampler_C<32, 8, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType422] = + CflSubsampler_C<32, 8, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] = + CflSubsampler_C<32, 8, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize32x16] = + CflIntraPredictor_C<32, 16, 10, uint16_t>; +#endif +#ifndef 
LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] = + CflSubsampler_C<32, 16, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType422] = + CflSubsampler_C<32, 16, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] = + CflSubsampler_C<32, 16, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize32x32] = + CflIntraPredictor_C<32, 32, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] = + CflSubsampler_C<32, 32, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType422] = + CflSubsampler_C<32, 32, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] = + CflSubsampler_C<32, 32, 10, uint16_t, 1, 1>; +#endif + +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + // Cfl predictors are available only for transform sizes with max(width, + // height) <= 32. Set all others to nullptr. + for (const auto i : kTransformSizesLargerThan32x32) { + dsp->cfl_intra_predictors[i] = nullptr; + for (int j = 0; j < kNumSubsamplingTypes; ++j) { + dsp->cfl_subsamplers[i][j] = nullptr; + } + } +} // NOLINT(readability/fn_size) +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +#undef INIT_CFL_INTRAPREDICTOR_WxH +#undef INIT_CFL_INTRAPREDICTORS + +} // namespace + +void IntraPredCflInit_C() { + Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/intrapred_cfl.h b/src/dsp/intrapred_cfl.h new file mode 100644 index 0000000..4e8a11f --- /dev/null +++ b/src/dsp/intrapred_cfl.h @@ -0,0 +1,48 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_INTRAPRED_CFL_H_ +#define LIBGAV1_SRC_DSP_INTRAPRED_CFL_H_ + +// Pull in LIBGAV1_DspXXX defines representing the implementation status +// of each function. The resulting value of each can be used by each module to +// determine whether an implementation is needed at compile time. +// IWYU pragma: begin_exports + +// ARM: +#include "src/dsp/arm/intrapred_cfl_neon.h" + +// x86: +// Note includes should be sorted in logical order avx2/avx/sse4, etc. +// The order of includes is important as each tests for a superior version +// before setting the base. +// clang-format off +#include "src/dsp/x86/intrapred_cfl_sse4.h" +// clang-format on + +// IWYU pragma: end_exports + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::cfl_intra_predictors and Dsp::cfl_subsamplers. 
// This function is not thread-safe.
+void IntraPredCflInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_INTRAPRED_CFL_H_
diff --git a/src/dsp/intrapred_cfl_test.cc b/src/dsp/intrapred_cfl_test.cc
new file mode 100644
index 0000000..82f1d2f
--- /dev/null
+++ b/src/dsp/intrapred_cfl_test.cc
@@ -0,0 +1,928 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_cfl.h"
+
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <ostream>
+
+#include "absl/strings/match.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kMaxBlockSize = 64;
+constexpr int kTotalPixels = kMaxBlockSize * kMaxBlockSize;
+
+const char* const kCflIntraPredName = "kCflIntraPredictor";
+
+template <int bitdepth, typename Pixel>
+class IntraPredTestBase : public testing::TestWithParam<TransformSize>,
+                          public test_utils::MaxAlignedAllocable {
+ public:
+  IntraPredTestBase() {
+    switch (tx_size_) {
+      case kNumTransformSizes:
+        EXPECT_NE(tx_size_, kNumTransformSizes);
+        break;
+      default:
+        block_width_ = kTransformWidth[tx_size_];
+        block_height_ = kTransformHeight[tx_size_];
+        break;
+    }
+  }
+
+  IntraPredTestBase(const IntraPredTestBase&) = delete;
+  IntraPredTestBase& operator=(const IntraPredTestBase&) = delete;
+  ~IntraPredTestBase() override = default;
+
+ protected:
+  struct IntraPredMem {
+    void Reset(libvpx_test::ACMRandom* rnd) {
+      ASSERT_NE(rnd, nullptr);
+      Pixel* const left = left_mem + 16;
+      Pixel* const top = top_mem + 16;
+      const int mask = (1 << bitdepth) - 1;
+      for (auto& r : ref_src) r = rnd->Rand16() & mask;
+      for (int i = 0; i < kMaxBlockSize; ++i) left[i] = rnd->Rand16() & mask;
+      for (int i = -1; i < kMaxBlockSize; ++i) top[i] = rnd->Rand16() & mask;
+
+      // Some directional predictors require top-right, bottom-left.
+      for (int i = kMaxBlockSize; i < 2 * kMaxBlockSize; ++i) {
+        left[i] = rnd->Rand16() & mask;
+        top[i] = rnd->Rand16() & mask;
+      }
+      // TODO(jzern): reorder this and regenerate the digests after switching
+      // random number generators.
+      // Upsampling in the directional predictors extends left/top[-1] to [-2].
+      left[-1] = rnd->Rand16() & mask;
+      left[-2] = rnd->Rand16() & mask;
+      top[-2] = rnd->Rand16() & mask;
+      memset(left_mem, 0, sizeof(left_mem[0]) * 14);
+      memset(top_mem, 0, sizeof(top_mem[0]) * 14);
+      memset(top_mem + kMaxBlockSize * 2 + 16, 0,
+             sizeof(top_mem[0]) * kTopMemPadding);
+    }
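+
+    // Layout note: |left| and |top| point 16 pixels into their backing
+    // arrays, so the [-1] and [-2] accesses used by edge upsampling stay in
+    // bounds; the memsets above zero the 14 untouched leading pixels of each
+    // array and the trailing |kTopMemPadding| overread area.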
+
+    // Set ref_src, top-left, top and left to |pixel|.
+    void Set(const Pixel pixel) {
+      Pixel* const left = left_mem + 16;
+      Pixel* const top = top_mem + 16;
+      for (auto& r : ref_src) r = pixel;
+      // Upsampling in the directional predictors extends left/top[-1] to [-2].
+      for (int i = -2; i < 2 * kMaxBlockSize; ++i) {
+        left[i] = top[i] = pixel;
+      }
+    }
+
+    // DirectionalZone1_Large() overreads up to 7 pixels in |top_mem|.
+    static constexpr int kTopMemPadding = 7;
+    alignas(kMaxAlignment) Pixel dst[kTotalPixels];
+    alignas(kMaxAlignment) Pixel ref_src[kTotalPixels];
+    alignas(kMaxAlignment) Pixel left_mem[kMaxBlockSize * 2 + 16];
+    alignas(
+        kMaxAlignment) Pixel top_mem[kMaxBlockSize * 2 + 16 + kTopMemPadding];
+  };
+
+  void SetUp() override { test_utils::ResetDspTable(bitdepth); }
+
+  const TransformSize tx_size_ = GetParam();
+  int block_width_;
+  int block_height_;
+  IntraPredMem intra_pred_mem_;
+};
+
+//------------------------------------------------------------------------------
+// CflIntraPredTest
+
+template <int bitdepth, typename Pixel>
+class CflIntraPredTest : public IntraPredTestBase<bitdepth, Pixel> {
+ public:
+  CflIntraPredTest() = default;
+  CflIntraPredTest(const CflIntraPredTest&) = delete;
+  CflIntraPredTest& operator=(const CflIntraPredTest&) = delete;
+  ~CflIntraPredTest() override = default;
+
+ protected:
+  using IntraPredTestBase<bitdepth, Pixel>::tx_size_;
+  using IntraPredTestBase<bitdepth, Pixel>::block_width_;
+  using IntraPredTestBase<bitdepth, Pixel>::block_height_;
+  using IntraPredTestBase<bitdepth, Pixel>::intra_pred_mem_;
+
+  void SetUp() override {
+    IntraPredTestBase<bitdepth, Pixel>::SetUp();
+    IntraPredCflInit_C();
+
+    const Dsp* const dsp = GetDspTable(bitdepth);
+    ASSERT_NE(dsp, nullptr);
+    base_cfl_intra_pred_ = dsp->cfl_intra_predictors[tx_size_];
+
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const char* const test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+      base_cfl_intra_pred_ = nullptr;
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+      IntraPredCflInit_NEON();
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      if ((GetCpuInfo() & kSSE4_1) != 0) {
+        IntraPredCflInit_SSE4_1();
+      }
+    } else {
+      FAIL() << "Unrecognized architecture prefix in test case name: "
+             << test_case;
+    }
+
+    cur_cfl_intra_pred_ = dsp->cfl_intra_predictors[tx_size_];
+
+    if (cur_cfl_intra_pred_ == base_cfl_intra_pred_) {
+      cur_cfl_intra_pred_ = nullptr;
+    }
+  }
+
+  // This test modifies intra_pred_mem_.
+  void TestSpeed(const char* digest, int num_runs);
+  void TestSaturatedValues();
+  void TestRandomValues();
+
+  CflIntraPredictorFunc base_cfl_intra_pred_;
+  CflIntraPredictorFunc cur_cfl_intra_pred_;
+};
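+
+// Each test case compares |cur_cfl_intra_pred_| (the pointer registered by
+// the arch-specific init selected above) against |base_cfl_intra_pred_| (the
+// C implementation). SetUp() nulls one of the two when there is nothing
+// distinct to run, which the methods below use as an early-out.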
+
+template <int bitdepth, typename Pixel>
+void CflIntraPredTest<bitdepth, Pixel>::TestSpeed(const char* const digest,
+                                                  const int num_runs) {
+  if (cur_cfl_intra_pred_ == nullptr) return;
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+  int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride] = {};
+  const int alpha = rnd(33) - 16;
+  const int dc = rnd(1 << bitdepth);
+  const int max_luma = ((1 << bitdepth) - 1) << 3;
+  for (int i = 0; i < block_height_; ++i) {
+    for (int j = 0; j < block_width_; ++j) {
+      if (i < kCflLumaBufferStride && j < kCflLumaBufferStride) {
+        luma[i][j] = max_luma - rnd(max_luma << 1);
+      }
+    }
+  }
+  for (auto& r : intra_pred_mem_.ref_src) r = dc;
+
+  absl::Duration elapsed_time;
+  for (int run = 0; run < num_runs; ++run) {
+    const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+    memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+           sizeof(intra_pred_mem_.dst));
+    const absl::Time start = absl::Now();
+    cur_cfl_intra_pred_(intra_pred_mem_.dst, stride, luma, alpha);
+    elapsed_time += absl::Now() - start;
+  }
+  test_utils::CheckMd5Digest(ToString(tx_size_), kCflIntraPredName, digest,
+                             intra_pred_mem_.dst, sizeof(intra_pred_mem_.dst),
+                             elapsed_time);
+}
+
+template <int bitdepth, typename Pixel>
+void CflIntraPredTest<bitdepth, Pixel>::TestSaturatedValues() {
+  // Skip the 'C' test case as this is used as the reference.
+  if (base_cfl_intra_pred_ == nullptr) return;
+
+  int16_t luma_buffer[kCflLumaBufferStride][kCflLumaBufferStride];
+  for (auto& line : luma_buffer) {
+    for (auto& luma : line) luma = ((1 << bitdepth) - 1) << 3;
+  }
+
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+  static constexpr int kSaturatedAlpha[] = {-16, 16};
+  for (const int alpha : kSaturatedAlpha) {
+    for (auto& r : intra_pred_mem_.ref_src) r = (1 << bitdepth) - 1;
+    memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+           sizeof(intra_pred_mem_.dst));
+    const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+    base_cfl_intra_pred_(intra_pred_mem_.ref_src, stride, luma_buffer, alpha);
+    cur_cfl_intra_pred_(intra_pred_mem_.dst, stride, luma_buffer, alpha);
+    if (!test_utils::CompareBlocks(intra_pred_mem_.dst,
+                                   intra_pred_mem_.ref_src, block_width_,
+                                   block_height_, kMaxBlockSize, kMaxBlockSize,
+                                   true)) {
+      ADD_FAILURE() << "Result from optimized version of CFL with alpha "
+                    << alpha << " differs from reference.";
+      break;
+    }
+  }
+}
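+
+// With the fully saturated inputs above (8bpp: luma == 2040, dc == 255), the
+// unclipped prediction for alpha == 16 is
+// 255 + RightShiftWithRoundingSigned(16 * 2040, 6) == 255 + 510 == 765, and
+// for alpha == -16 it is 255 - 510 == -255, so both implementations must
+// clamp every pixel to 255 and 0 respectively to match.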
+
+template <int bitdepth, typename Pixel>
+void CflIntraPredTest<bitdepth, Pixel>::TestRandomValues() {
+  // Skip the 'C' test case as this is used as the reference.
+  if (base_cfl_intra_pred_ == nullptr) return;
+  int16_t luma_buffer[kCflLumaBufferStride][kCflLumaBufferStride];
+
+  const int max_luma = ((1 << bitdepth) - 1) << 3;
+  // Use an alternate seed to differentiate this test from TestSpeed().
+  libvpx_test::ACMRandom rnd(test_utils::kAlternateDeterministicSeed);
+  for (auto& line : luma_buffer) {
+    for (auto& luma : line) luma = max_luma - rnd(max_luma << 1);
+  }
+  const int dc = rnd(1 << bitdepth);
+  for (auto& r : intra_pred_mem_.ref_src) r = dc;
+  static constexpr int kSaturatedAlpha[] = {-16, 16};
+  for (const int alpha : kSaturatedAlpha) {
+    intra_pred_mem_.Reset(&rnd);
+    memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+           sizeof(intra_pred_mem_.dst));
+    const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+    base_cfl_intra_pred_(intra_pred_mem_.ref_src, stride, luma_buffer, alpha);
+    cur_cfl_intra_pred_(intra_pred_mem_.dst, stride, luma_buffer, alpha);
+    if (!test_utils::CompareBlocks(intra_pred_mem_.dst,
+                                   intra_pred_mem_.ref_src, block_width_,
+                                   block_height_, kMaxBlockSize, kMaxBlockSize,
+                                   true)) {
+      ADD_FAILURE() << "Result from optimized version of CFL with alpha "
+                    << alpha << " differs from reference.";
+      break;
+    }
+  }
+}
+
+template <int bitdepth, typename Pixel, SubsamplingType subsampling_type>
+class CflSubsamplerTest : public IntraPredTestBase<bitdepth, Pixel> {
+ public:
+  CflSubsamplerTest() = default;
+  CflSubsamplerTest(const CflSubsamplerTest&) = delete;
+  CflSubsamplerTest& operator=(const CflSubsamplerTest&) = delete;
+  ~CflSubsamplerTest() override = default;
+
+ protected:
+  using IntraPredTestBase<bitdepth, Pixel>::tx_size_;
+  using IntraPredTestBase<bitdepth, Pixel>::block_width_;
+  using IntraPredTestBase<bitdepth, Pixel>::block_height_;
+  using IntraPredTestBase<bitdepth, Pixel>::intra_pred_mem_;
+
+  void SetUp() override {
+    IntraPredTestBase<bitdepth, Pixel>::SetUp();
+    IntraPredCflInit_C();
+
+    const Dsp* const dsp = GetDspTable(bitdepth);
+    ASSERT_NE(dsp, nullptr);
+    base_cfl_subsampler_ = dsp->cfl_subsamplers[tx_size_][subsampling_type];
+
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const char* const test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+      base_cfl_subsampler_ = nullptr;
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+      IntraPredCflInit_NEON();
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      if ((GetCpuInfo() & kSSE4_1) != 0) {
+        IntraPredCflInit_SSE4_1();
+      }
+    } else {
+      FAIL() << "Unrecognized architecture prefix in test case name: "
+             << test_case;
+    }
+    cur_cfl_subsampler_ = dsp->cfl_subsamplers[tx_size_][subsampling_type];
+  }
+
+  // This test modifies intra_pred_mem_.
+  void TestSpeed(const char* digest, int num_runs);
+  void TestSaturatedValues();
+  void TestRandomValues();
+
+  enum SubsamplingType SubsamplingType() const { return subsampling_type; }
+
+  CflSubsamplerFunc base_cfl_subsampler_;
+  CflSubsamplerFunc cur_cfl_subsampler_;
+};
+
+// There is no case where both source and output have lowest height or width
+// when that dimension is subsampled.
+int GetLumaWidth(int block_width, SubsamplingType subsampling_type) {
+  if (block_width == 4) {
+    const int width_shift =
+        static_cast<int>(subsampling_type != kSubsamplingType444);
+    return block_width << width_shift;
+  }
+  return block_width;
+}
+
+int GetLumaHeight(int block_height, SubsamplingType subsampling_type) {
+  if (block_height == 4) {
+    const int height_shift =
+        static_cast<int>(subsampling_type == kSubsamplingType420);
+    return block_height << height_shift;
+  }
+  return block_height;
+}
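+
+// For example, a 4x4 chroma block under 4:2:0 subsampling reads from an 8x8
+// luma area (both helpers return 8), under 4:2:2 only the width is doubled
+// (8x4), and under 4:4:4 the luma area matches the block at 4x4.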
+
+template <int bitdepth, typename Pixel, SubsamplingType subsampling_type>
+void CflSubsamplerTest<bitdepth, Pixel, subsampling_type>::TestSpeed(
+    const char* const digest, const int num_runs) {
+  // C declines initializing the table in normal circumstances because there
+  // are assembly implementations.
+  if (cur_cfl_subsampler_ == nullptr) return;
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+
+  const int width = GetLumaWidth(block_width_, subsampling_type);
+  const int height = GetLumaHeight(block_height_, subsampling_type);
+  Pixel* src = intra_pred_mem_.ref_src;
+#if LIBGAV1_MSAN
+  // Quiet 10bpp CflSubsampler420_NEON() msan warning.
+  memset(src, 0, sizeof(intra_pred_mem_.ref_src));
+#endif
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; ++j) {
+      src[j] = rnd.RandRange(1 << bitdepth);
+    }
+    src += kMaxBlockSize;
+  }
+  const absl::Time start = absl::Now();
+  int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride] = {};
+  const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+  for (int run = 0; run < num_runs; ++run) {
+    cur_cfl_subsampler_(luma, width, height, intra_pred_mem_.ref_src, stride);
+  }
+  const absl::Duration elapsed_time = absl::Now() - start;
+  test_utils::CheckMd5Digest(ToString(tx_size_), kCflIntraPredName, digest,
+                             luma, sizeof(luma), elapsed_time);
+}
+
+template <int bitdepth, typename Pixel, SubsamplingType subsampling_type>
+void CflSubsamplerTest<bitdepth, Pixel,
+                       subsampling_type>::TestSaturatedValues() {
+  if (base_cfl_subsampler_ == nullptr) return;
+  const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+  for (int width = GetLumaWidth(block_width_, subsampling_type); width > 0;
+       width -= 8) {
+    for (int height = GetLumaHeight(block_height_, subsampling_type);
+         height > 0; height -= 8) {
+      Pixel* src = intra_pred_mem_.ref_src;
+      for (int y = 0; y < height; ++y) {
+        Memset(src, (1 << bitdepth) - 1, width);
+        Memset(src + width, 0, kMaxBlockSize - width);
+        src += kMaxBlockSize;
+      }
+      Memset(intra_pred_mem_.ref_src + kMaxBlockSize * height, 0,
+             kMaxBlockSize * (kMaxBlockSize - height));
+
+      int16_t luma_base[kCflLumaBufferStride][kCflLumaBufferStride] = {};
+      int16_t luma_cur[kCflLumaBufferStride][kCflLumaBufferStride] = {};
+      base_cfl_subsampler_(luma_base, width, height, intra_pred_mem_.ref_src,
+                           stride);
+      cur_cfl_subsampler_(luma_cur, width, height, intra_pred_mem_.ref_src,
+                          stride);
+      if (!test_utils::CompareBlocks(reinterpret_cast<uint16_t*>(luma_cur[0]),
+                                     reinterpret_cast<uint16_t*>(luma_base[0]),
+                                     block_width_, block_height_,
+                                     kCflLumaBufferStride,
+                                     kCflLumaBufferStride, true)) {
+        FAIL() << "Result from optimized version of CFL subsampler"
+               << " differs from reference. max_luma_width: " << width
+               << " max_luma_height: " << height;
+      }
+    }
+  }
+}
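+
+// The loops above step |width| and |height| down by 8 so the
+// |max_luma_width|/|max_luma_height| clamping paths inside the subsampler
+// are exercised at several extents; e.g. a 32x32 block is checked with luma
+// extents of 32, 24, 16 and 8 in each dimension.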
+
+template <int bitdepth, typename Pixel, SubsamplingType subsampling_type>
+void CflSubsamplerTest<bitdepth, Pixel, subsampling_type>::TestRandomValues() {
+  if (base_cfl_subsampler_ == nullptr) return;
+  const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+  // Use an alternate seed to differentiate this test from TestSpeed().
+  libvpx_test::ACMRandom rnd(test_utils::kAlternateDeterministicSeed);
+  for (int width = GetLumaWidth(block_width_, subsampling_type); width > 0;
+       width -= 8) {
+    for (int height = GetLumaHeight(block_height_, subsampling_type);
+         height > 0; height -= 8) {
+      Pixel* src = intra_pred_mem_.ref_src;
+      for (int i = 0; i < height; ++i) {
+        for (int j = 0; j < width; ++j) {
+          src[j] = rnd.RandRange(1 << bitdepth);
+        }
+        Memset(src + width, 0, kMaxBlockSize - width);
+        src += kMaxBlockSize;
+      }
+      Memset(intra_pred_mem_.ref_src + kMaxBlockSize * height, 0,
+             kMaxBlockSize * (kMaxBlockSize - height));
+
+      int16_t luma_base[kCflLumaBufferStride][kCflLumaBufferStride] = {};
+      int16_t luma_cur[kCflLumaBufferStride][kCflLumaBufferStride] = {};
+      base_cfl_subsampler_(luma_base, width, height, intra_pred_mem_.ref_src,
+                           stride);
+      cur_cfl_subsampler_(luma_cur, width, height, intra_pred_mem_.ref_src,
+                          stride);
+      if (!test_utils::CompareBlocks(reinterpret_cast<uint16_t*>(luma_cur[0]),
+                                     reinterpret_cast<uint16_t*>(luma_base[0]),
+                                     block_width_, block_height_,
+                                     kCflLumaBufferStride,
+                                     kCflLumaBufferStride, true)) {
+        FAIL() << "Result from optimized version of CFL subsampler"
+               << " differs from reference. max_luma_width: " << width
+               << " max_luma_height: " << height;
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+
+using CflIntraPredTest8bpp = CflIntraPredTest<8, uint8_t>;
+
+const char* GetCflIntraPredDigest8bpp(TransformSize tx_size) {
+  static const char* const kDigest4x4 = "9ea7088e082867fd5ae394ca549fe1ed";
+  static const char* const kDigest4x8 = "323b0b4784b6658da781398e61f2da3d";
+  static const char* const kDigest4x16 = "99eb9c65f227ca7f71dcac24645a4fec";
+  static const char* const kDigest8x4 = "e8e782e31c94f3974b87b93d455262d8";
+  static const char* const kDigest8x8 = "23ab9fb65e7bbbdb985709e115115eb5";
+  static const char* const kDigest8x16 = "52f5add2fc4bbb2ff893148645e95b9c";
+  static const char* const kDigest8x32 = "283fdee9af8afdb76f72dd7339c92c3c";
+  static const char* const kDigest16x4 = "eead35f515b1aa8b5175b283192b86e6";
+  static const char* const kDigest16x8 = "5778e934254eaab04230bc370f64f778";
+  static const char* const kDigest16x16 = "4e8ed38ccba0d62f1213171da2212ed3";
+  static const char* const kDigest16x32 = "61a29bd7699e18ca6ea5641d1d023bfd";
+  static const char* const kDigest32x8 = "7f31607bd4f9ec879aa47f4daf9c7bb0";
+  static const char* const kDigest32x16 = "eb84dfab900fa6a90e132b186b4c6c36";
+  static const char* const kDigest32x32 = "e0ff35d407cb214578d61ef419c94237";
+
+  switch (tx_size) {
+    case kTransformSize4x4:
+      return kDigest4x4;
+    case kTransformSize4x8:
+      return kDigest4x8;
+    case kTransformSize4x16:
+      return kDigest4x16;
+    case kTransformSize8x4:
+      return kDigest8x4;
+    case kTransformSize8x8:
+      return kDigest8x8;
+    case kTransformSize8x16:
+      return kDigest8x16;
+    case kTransformSize8x32:
+      return kDigest8x32;
+    case kTransformSize16x4:
+      return kDigest16x4;
+    case kTransformSize16x8:
+      return kDigest16x8;
+    case kTransformSize16x16:
+      return kDigest16x16;
+    case kTransformSize16x32:
+      return kDigest16x32;
+    case kTransformSize32x8:
+      return kDigest32x8;
+    case kTransformSize32x16:
+      return kDigest32x16;
+    case kTransformSize32x32:
+      return kDigest32x32;
+    default:
+      ADD_FAILURE() << "Unknown transform size: " << tx_size;
+      return nullptr;
+  }
+}
+
+TEST_P(CflIntraPredTest8bpp, DISABLED_Speed) {
+  const auto num_runs =
+      static_cast<int>(2.0e9 / (block_width_ * block_height_));
+  TestSpeed(GetCflIntraPredDigest8bpp(tx_size_), num_runs);
+}
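+
+// 2.0e9 / (block_width_ * block_height_) keeps the pixel throughput of the
+// speed test roughly constant across block sizes: 125000000 runs for 4x4
+// versus 1953125 runs for 32x32.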
+
+TEST_P(CflIntraPredTest8bpp, FixedInput) {
+  TestSpeed(GetCflIntraPredDigest8bpp(tx_size_), 1);
+}
+
+TEST_P(CflIntraPredTest8bpp, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflIntraPredTest8bpp, Random) { TestRandomValues(); }
+
+//------------------------------------------------------------------------------
+
+using CflSubsamplerTest8bpp444 =
+    CflSubsamplerTest<8, uint8_t, kSubsamplingType444>;
+using CflSubsamplerTest8bpp422 =
+    CflSubsamplerTest<8, uint8_t, kSubsamplingType422>;
+using CflSubsamplerTest8bpp420 =
+    CflSubsamplerTest<8, uint8_t, kSubsamplingType420>;
+
+const char* GetCflSubsamplerDigest8bpp(TransformSize tx_size,
+                                       SubsamplingType subsampling_type) {
+  static const char* const kDigests4x4[3] = {
+      "a8fa98d76cc3ccffcffc0d02dfae052c", "929cf2c23d926b500616797f8b1baf5b",
+      "1d03f091956838e7f2b113aabd8b9da9"};
+  static const char* const kDigests4x8[3] = {
+      "717b84f867f413c87c90a7c5d0125c8c", "6ccd9f48842b1a802e128b46b8f4885d",
+      "68a334f5d2abecbc78562b3280b5fb0c"};
+  static const char* const kDigests4x16[3] = {
+      "ecd1340b7e065dd8807fd9861abb7d99", "042c3fee17df7ef8fb8cef616f212a91",
+      "b0600f0bc3fbfc374bb3628360dcae5c"};
+  static const char* const kDigests8x4[3] = {
+      "4ea5617f4ed8e9edc2fff88d0ab8e53f", "b02288905f218c9f54ce4a472ec7b22e",
+      "3522d3a4dd3839d1a86fb39b31a86d52"};
+  static const char* const kDigests8x8[3] = {
+      "a0488493e6bcdb868713a95f9b4a0091", "ff6c1ac1d94fce63c282ba49186529bf",
+      "082e34ba04d04d7cd6fe408823987602"};
+  static const char* const kDigests8x16[3] = {
+      "e01dd4bb21daaa6e991cd5b1e6f30300", "2a1b13f932e39cc5f561afea9956f47a",
+      "d8d266282cb7123f780bd7266e8f5913"};
+  static const char* const kDigests8x32[3] = {
+      "0fc95e4ab798b95ccd2966ff75028b03", "6bc6e45ef2f664134449342fe76006ff",
+      "d294fb6399edaa267aa167407c0ebccb"};
+  static const char* const kDigests16x4[3] = {
+      "4798c2cf649b786bd153ad88353d52aa", "43a4bfa3b8caf4b72f58c6a1d1054f64",
+      "a928ebbec2db1508c8831a440d82eb98"};
+  static const char* const kDigests16x8[3] = {
+      "736b7f5b603cb34abcbe1b7e69b6ce93", "90422000ab20ecb519e4d277a9b3ea2b",
+      "c8e71c2fddbb850c5a50592ee5975368"};
+  static const char* const kDigests16x16[3] = {
+      "4f15a694966ee50a9e987e9a0aa2423b", "9e31e2f5a7ce7bef738b135755e25dcd",
+      "2ffeed4d592a0455f6d888913969827f"};
+  static const char* const kDigests16x32[3] = {
+      "3a10438bfe17ea39efad20608a0520eb", "79e8e8732a6ffc29dfbb0b3fc29c2883",
+      "185ca976ccbef7fb5f3f8c6aa22d5a79"};
+  static const char* const kDigests32x8[3] = {
+      "683704f08839a15e42603e4977a3e815", "13d311635372aee8998fca1758e75e20",
+      "9847d88eaaa57c086a2e6aed583048d3"};
+  static const char* const kDigests32x16[3] = {
+      "14b6761bf9f1156cf2496f532512aa99", "ee57bb7f0aa2302d29cdc1bfce72d5fc",
+      "a4189655fe714b82eb88cb5092c0ad76"};
+  static const char* const kDigests32x32[3] = {
+      "dcfbe71b70a37418ccb90dbf27f04226", "c578556a584019c1bdc2d0c3b9fd0c88",
+      "db200bc8ccbeacd6a42d6b8e5ad1d931"};
+
+  switch (tx_size) {
+    case kTransformSize4x4:
+      return kDigests4x4[subsampling_type];
+    case kTransformSize4x8:
+      return kDigests4x8[subsampling_type];
+    case kTransformSize4x16:
+      return kDigests4x16[subsampling_type];
+    case kTransformSize8x4:
+      return kDigests8x4[subsampling_type];
+    case kTransformSize8x8:
+      return kDigests8x8[subsampling_type];
+    case kTransformSize8x16:
+      return kDigests8x16[subsampling_type];
+    case kTransformSize8x32:
+      return kDigests8x32[subsampling_type];
+    case kTransformSize16x4:
+      return kDigests16x4[subsampling_type];
+    case kTransformSize16x8:
+      return kDigests16x8[subsampling_type];
+    case kTransformSize16x16:
+      return kDigests16x16[subsampling_type];
+    case kTransformSize16x32:
+      return kDigests16x32[subsampling_type];
+    case kTransformSize32x8:
+      return kDigests32x8[subsampling_type];
+    case kTransformSize32x16:
+      return kDigests32x16[subsampling_type];
+    case kTransformSize32x32:
+      return kDigests32x32[subsampling_type];
+    default:
+      ADD_FAILURE() << "Unknown transform size: " << tx_size;
+      return nullptr;
+  }
+}
+
+TEST_P(CflSubsamplerTest8bpp444, DISABLED_Speed) {
+  const auto num_runs =
+      static_cast<int>(2.0e9 / (block_width_ * block_height_));
+  TestSpeed(GetCflSubsamplerDigest8bpp(tx_size_, SubsamplingType()), num_runs);
+}
+
+TEST_P(CflSubsamplerTest8bpp444, FixedInput) {
+  TestSpeed(GetCflSubsamplerDigest8bpp(tx_size_, SubsamplingType()), 1);
+}
+
+TEST_P(CflSubsamplerTest8bpp444, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflSubsamplerTest8bpp444, Random) { TestRandomValues(); }
+
+TEST_P(CflSubsamplerTest8bpp422, DISABLED_Speed) {
+  const auto num_runs =
+      static_cast<int>(2.0e9 / (block_width_ * block_height_));
+  TestSpeed(GetCflSubsamplerDigest8bpp(tx_size_, SubsamplingType()), num_runs);
+}
+
+TEST_P(CflSubsamplerTest8bpp422, FixedInput) {
+  TestSpeed(GetCflSubsamplerDigest8bpp(tx_size_, SubsamplingType()), 1);
+}
+
+TEST_P(CflSubsamplerTest8bpp422, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflSubsamplerTest8bpp422, Random) { TestRandomValues(); }
+
+TEST_P(CflSubsamplerTest8bpp420, DISABLED_Speed) {
+  const auto num_runs =
+      static_cast<int>(2.0e9 / (block_width_ * block_height_));
+  TestSpeed(GetCflSubsamplerDigest8bpp(tx_size_, SubsamplingType()), num_runs);
+}
+
+TEST_P(CflSubsamplerTest8bpp420, FixedInput) {
+  TestSpeed(GetCflSubsamplerDigest8bpp(tx_size_, SubsamplingType()), 1);
+}
+
+TEST_P(CflSubsamplerTest8bpp420, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflSubsamplerTest8bpp420, Random) { TestRandomValues(); }
+
+//------------------------------------------------------------------------------
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+//------------------------------------------------------------------------------
+
+using CflIntraPredTest10bpp = CflIntraPredTest<10, uint16_t>;
+
+const char* GetCflIntraPredDigest10bpp(TransformSize tx_size) {
+  static const char* const kDigest4x4 = "b4ca5f6fbb643a94eb05d59976d44c5d";
+  static const char* const kDigest4x8 = "040139b76ee22af05c56baf887d3d43b";
+  static const char* const kDigest4x16 = "4a1d59ace84ff07e68a0d30e9b1cebdd";
+  static const char* const kDigest8x4 = "c2c149cea5fdcd18bfe5c19ec2a8aa90";
+  static const char* const kDigest8x8 = "68ad90bd6f409548fa5551496b7cb0d0";
+  static const char* const kDigest8x16 = "bdc54eff4de8c5d597b03afaa705d3fe";
+  static const char* const kDigest8x32 = "362aebc6d68ff0d312d55dcd6a8a927d";
+  static const char* const kDigest16x4 = "349e813aedd211581c5e64ba1938eaa7";
+  static const char* const kDigest16x8 = "35c64f6da17f836618b5804185cf3eef";
+  static const char* const kDigest16x16 = "95be0c78dbd8dda793c62c6635b4bfb7";
+  static const char* const kDigest16x32 = "4752b9eda069854d3f5c56d3f2057e79";
+  static const char* const kDigest32x8 = "dafc5e973e4b6a55861f4586a11b7dd1";
+  static const char* const kDigest32x16 = "1e177ed3914a165183916aca1d01bb74";
+  static const char* const kDigest32x32 = "4c9ab3cf9baa27bb34e29729dabc1ea6";
+
+  switch (tx_size) {
+    case kTransformSize4x4:
+      return kDigest4x4;
+    case kTransformSize4x8:
+      return kDigest4x8;
+    case kTransformSize4x16:
+      return kDigest4x16;
+    case kTransformSize8x4:
+      return kDigest8x4;
+    case kTransformSize8x8:
+      return kDigest8x8;
+    case kTransformSize8x16:
+      return kDigest8x16;
+    case kTransformSize8x32:
+      return kDigest8x32;
+    case kTransformSize16x4:
+      return kDigest16x4;
+    case kTransformSize16x8:
+      return kDigest16x8;
+    case kTransformSize16x16:
+      return kDigest16x16;
+    case kTransformSize16x32:
+      return kDigest16x32;
+    case kTransformSize32x8:
+      return kDigest32x8;
+    case kTransformSize32x16:
+      return kDigest32x16;
+    case kTransformSize32x32:
+      return kDigest32x32;
+    default:
+      ADD_FAILURE() << "Unknown transform size: " << tx_size;
+      return nullptr;
+  }
+}
+
+TEST_P(CflIntraPredTest10bpp, DISABLED_Speed) {
+  const auto num_runs =
+      static_cast<int>(2.0e9 / (block_width_ * block_height_));
+  TestSpeed(GetCflIntraPredDigest10bpp(tx_size_), num_runs);
+}
+
+TEST_P(CflIntraPredTest10bpp, FixedInput) {
+  TestSpeed(GetCflIntraPredDigest10bpp(tx_size_), 1);
+}
+
+TEST_P(CflIntraPredTest10bpp, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflIntraPredTest10bpp, Random) { TestRandomValues(); }
+
+//------------------------------------------------------------------------------
+
+using CflSubsamplerTest10bpp444 =
+    CflSubsamplerTest<10, uint16_t, kSubsamplingType444>;
+using CflSubsamplerTest10bpp422 =
+    CflSubsamplerTest<10, uint16_t, kSubsamplingType422>;
+using CflSubsamplerTest10bpp420 =
+    CflSubsamplerTest<10, uint16_t, kSubsamplingType420>;
+
+const char* GetCflSubsamplerDigest10bpp(TransformSize tx_size,
+                                        SubsamplingType subsampling_type) {
+  static const char* const kDigests4x4[3] = {
+      "a8abcad9a6c9b046a100689135a108cb", "01081c2a0d0c15dabdbc725be5660451",
+      "93d1d9df2861240d88f5618e42178654"};
+  static const char* const kDigests4x8[3] = {
+      "d1fd8cd0709ca6634ad85f3e331672e1", "0d603fcc910aca3db41fc7f64e826c27",
+      "cf88b6d1b7b025cfa0082361775aeb75"};
+  static const char* const kDigests4x16[3] = {
+      "ce2e036a950388a564d8637b1416a6c6", "6c36c46cd72057a6b36bc12188b6d22c",
+      "0884a0e53384cd5173035ad8966d8f2f"};
+  static const char* const kDigests8x4[3] = {
+      "174e961983ed71fb105ed71aa3f9daf5", "330946cc369a534618a1014b4e3f6f18",
+      "8070668aa389c1d09f8aaf43c1223e8c"};
+  static const char* const kDigests8x8[3] = {
+      "86884feb35217010f73ccdbadecb635e", "b8cbc646e1bf1352e5b4b599eaef1193",
+      "4a1110382e56b42d3b7a4132bccc01ee"};
+  static const char* const kDigests8x16[3] = {
+      "a694c4e1f89648ffb49efd6a1d35b300", "864b9da67d23a2f8284b28b2a1e5aa30",
+      "bd012ca1cea256dd02c231339a4cf200"};
+  static const char* const kDigests8x32[3] = {
+      "60c42201bc24e518c1a3b3b6306d8125", "4d530e47c2b7555d5f311ee910d61842",
+      "71888b17b832ef55c0cd9449c0e6b077"};
+  static const char* const kDigests16x4[3] = {
+      "6b6d5ae4cc294c070ce65ab31c5a7d4f", "0fbecee20d294939e7a0183c2b4a0b96",
+      "917cd884923139d5c05a11000722e3b6"};
+  static const char* const kDigests16x8[3] = {
+      "688c41726d9ac35fb5b18c57bca76b9c", "d439a2e0a60d672b644cd1189e2858b9",
+      "edded6d166a77a6c3ff46fddc13f372f"};
+  static const char* const kDigests16x16[3] = {
+      "feb2bad9f6bb3f60eaeaf6c1bfd89ca5", "d65cabce5fcd9a29d1dfc530e4764f3a",
+      "2f1a91898812d2c9320c7506b3a72eb4"};
+  static const char* const kDigests16x32[3] = {
+      "6f23b1851444d29633e62ce77bf09559", "4a449fd078bd0c9657cdc24b709c0796",
+      "e44e18cb8bda2d34b52c96d5b6b510be"};
+  static const char* const kDigests32x8[3] = {
+      "77bf9ba56f7e1d2f04068a8a00b139da", "a85a1dea82963dedab9a2f7ad4169b5f",
+      "d12746071bee96ddc075c6368bc9fbaf"};
+  static const char* const kDigests32x16[3] = {
+      "cce3422f7f8cf57145f979359ac92f98", "1c18738d40bfa91296e5fdb7230bf9a7",
+      "02513142d109aee10f081cacfb33d1c5"};
+  static const char* const kDigests32x32[3] = {
+      "789008e49d0276de186af968196dd4a7", "b8848b00968a7ba4787765b7214da05f",
+      "12d13828db57605b00ce99469489651d"};
+
+  switch (tx_size) {
+    case kTransformSize4x4:
+      return kDigests4x4[subsampling_type];
+    case kTransformSize4x8:
+      return kDigests4x8[subsampling_type];
+    case kTransformSize4x16:
+      return kDigests4x16[subsampling_type];
+    case kTransformSize8x4:
+      return kDigests8x4[subsampling_type];
+    case kTransformSize8x8:
+      return kDigests8x8[subsampling_type];
+    case kTransformSize8x16:
+      return kDigests8x16[subsampling_type];
+    case kTransformSize8x32:
+      return kDigests8x32[subsampling_type];
+    case kTransformSize16x4:
+      return kDigests16x4[subsampling_type];
+    case kTransformSize16x8:
+      return kDigests16x8[subsampling_type];
+    case kTransformSize16x16:
+      return kDigests16x16[subsampling_type];
+    case kTransformSize16x32:
+      return kDigests16x32[subsampling_type];
+    case kTransformSize32x8:
+      return kDigests32x8[subsampling_type];
+    case kTransformSize32x16:
+      return kDigests32x16[subsampling_type];
+    case kTransformSize32x32:
+      return kDigests32x32[subsampling_type];
+    default:
+      ADD_FAILURE() << "Unknown transform size: " << tx_size;
+      return nullptr;
+  }
+}
+
+TEST_P(CflSubsamplerTest10bpp444, DISABLED_Speed) {
+  const auto num_runs =
+      static_cast<int>(2.0e9 / (block_width_ * block_height_));
+  TestSpeed(GetCflSubsamplerDigest10bpp(tx_size_, SubsamplingType()), num_runs);
+}
+
+TEST_P(CflSubsamplerTest10bpp444, FixedInput) {
+  TestSpeed(GetCflSubsamplerDigest10bpp(tx_size_, SubsamplingType()), 1);
+}
+
+TEST_P(CflSubsamplerTest10bpp444, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflSubsamplerTest10bpp444, Random) { TestRandomValues(); }
+
+TEST_P(CflSubsamplerTest10bpp422, DISABLED_Speed) {
+  const auto num_runs =
+      static_cast<int>(2.0e9 / (block_width_ * block_height_));
+  TestSpeed(GetCflSubsamplerDigest10bpp(tx_size_, SubsamplingType()), num_runs);
+}
+
+TEST_P(CflSubsamplerTest10bpp422, FixedInput) {
+  TestSpeed(GetCflSubsamplerDigest10bpp(tx_size_, SubsamplingType()), 1);
+}
+
+TEST_P(CflSubsamplerTest10bpp422, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflSubsamplerTest10bpp422, Random) { TestRandomValues(); }
+
+TEST_P(CflSubsamplerTest10bpp420, DISABLED_Speed) {
+  const auto num_runs =
+      static_cast<int>(2.0e9 / (block_width_ * block_height_));
+  TestSpeed(GetCflSubsamplerDigest10bpp(tx_size_, SubsamplingType()), num_runs);
+}
+
+TEST_P(CflSubsamplerTest10bpp420, FixedInput) {
+  TestSpeed(GetCflSubsamplerDigest10bpp(tx_size_, SubsamplingType()), 1);
+}
+
+TEST_P(CflSubsamplerTest10bpp420, Overflow) { TestSaturatedValues(); }
+
+TEST_P(CflSubsamplerTest10bpp420, Random) { TestRandomValues(); }
+
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+// Cfl predictors are available only for transform sizes with
+// max(width, height) <= 32.
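+// Transform sizes with a 64-pixel dimension are therefore excluded from the
+// list below.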
+constexpr TransformSize kTransformSizesSmallerThan32x32[] = {
+    kTransformSize4x4,   kTransformSize4x8,   kTransformSize4x16,
+    kTransformSize8x4,   kTransformSize8x8,   kTransformSize8x16,
+    kTransformSize8x32,  kTransformSize16x4,  kTransformSize16x8,
+    kTransformSize16x16, kTransformSize16x32, kTransformSize32x8,
+    kTransformSize32x16, kTransformSize32x32};
+
+INSTANTIATE_TEST_SUITE_P(C, CflIntraPredTest8bpp,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest8bpp444,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest8bpp422,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest8bpp420,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, CflIntraPredTest8bpp,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(SSE41, CflSubsamplerTest8bpp444,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(SSE41, CflSubsamplerTest8bpp420,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#endif  // LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, CflIntraPredTest8bpp,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(NEON, CflSubsamplerTest8bpp444,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(NEON, CflSubsamplerTest8bpp420,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#endif  // LIBGAV1_ENABLE_NEON
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+INSTANTIATE_TEST_SUITE_P(C, CflIntraPredTest10bpp,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest10bpp444,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest10bpp422,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(C, CflSubsamplerTest10bpp420,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, CflIntraPredTest10bpp,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(SSE41, CflSubsamplerTest10bpp444,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(SSE41, CflSubsamplerTest10bpp420,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#endif  // LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, CflIntraPredTest10bpp,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(NEON, CflSubsamplerTest10bpp444,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+INSTANTIATE_TEST_SUITE_P(NEON, CflSubsamplerTest10bpp420,
+                         testing::ValuesIn(kTransformSizesSmallerThan32x32));
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+}  // namespace
+}  // namespace dsp
+
+static std::ostream& operator<<(std::ostream& os, const TransformSize tx_size) {
+  return os << ToString(tx_size);
+}
+
+}  // namespace libgav1
diff --git a/src/dsp/intrapred_directional.cc b/src/dsp/intrapred_directional.cc
new file mode 100644
index 0000000..21a40b5
--- /dev/null
+++ b/src/dsp/intrapred_directional.cc
@@ -0,0 +1,249 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_directional.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+//------------------------------------------------------------------------------
+// 7.11.2.4. Directional intra prediction process
+
+template <typename Pixel>
+void DirectionalIntraPredictorZone1_C(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row, const int width,
+    const int height, const int xstep, const bool upsampled_top) {
+  const auto* const top = static_cast<const Pixel*>(top_row);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+
+  assert(xstep > 0);
+
+  // If xstep == 64 then |shift| always evaluates to 0 which sets |val| to
+  // |top[top_base_x]|. This corresponds to a 45 degree prediction.
+  if (xstep == 64) {
+    // 7.11.2.10. Intra edge upsample selection process
+    //   if ( d <= 0 || d >= 40 ) useUpsample = 0
+    // For |upsampled_top| the delta is |predictor_angle - 90|. Since the
+    // |predictor_angle| is 45 the delta is also 45.
+    assert(!upsampled_top);
+    const Pixel* top_ptr = top + 1;
+    for (int y = 0; y < height; ++y, dst += stride, ++top_ptr) {
+      memcpy(dst, top_ptr, sizeof(*top_ptr) * width);
+    }
+    return;
+  }
+
+  const int upsample_shift = static_cast<int>(upsampled_top);
+  const int max_base_x = ((width + height) - 1) << upsample_shift;
+  const int scale_bits = 6 - upsample_shift;
+  const int base_step = 1 << upsample_shift;
+  int top_x = xstep;
+  int y = 0;
+  do {
+    int top_base_x = top_x >> scale_bits;
+
+    if (top_base_x >= max_base_x) {
+      for (int i = y; i < height; ++i) {
+        Memset(dst, top[max_base_x], width);
+        dst += stride;
+      }
+      return;
+    }
+
+    const int shift = ((top_x << upsample_shift) & 0x3F) >> 1;
+    int x = 0;
+    do {
+      if (top_base_x >= max_base_x) {
+        Memset(dst + x, top[max_base_x], width - x);
+        break;
+      }
+
+      const int val =
+          top[top_base_x] * (32 - shift) + top[top_base_x + 1] * shift;
+      dst[x] = RightShiftWithRounding(val, 5 /*log2(32)*/);
+      top_base_x += base_step;
+    } while (++x < width);
+
+    dst += stride;
+    top_x += xstep;
+  } while (++y < height);
+}
+
+template <typename Pixel>
+void DirectionalIntraPredictorZone2_C(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column, const int width,
+    const int height, const int xstep, const int ystep,
+    const bool upsampled_top, const bool upsampled_left) {
+  const auto* const top = static_cast<const Pixel*>(top_row);
+  const auto* const left = static_cast<const Pixel*>(left_column);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+
+  assert(xstep > 0);
+  assert(ystep > 0);
+
+  const int upsample_top_shift = static_cast<int>(upsampled_top);
+  const int upsample_left_shift = static_cast<int>(upsampled_left);
+  const int scale_bits_x = 6 - upsample_top_shift;
+  const int scale_bits_y = 6 - upsample_left_shift;
+  const int min_base_x = -(1 << upsample_top_shift);
+  const int base_step_x = 1 << upsample_top_shift;
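+  // Edge positions are tracked in 1/64-pel fixed point: the upper bits of
+  // top_x / left_y select the integer sample and the low six bits, halved to
+  // a 0..31 weight, blend the two neighboring samples. For example, an edge
+  // position of 156 with no upsampling selects sample 156 >> 6 == 2 and a
+  // weight of ((156 & 0x3F) >> 1) == 14, giving
+  // sample[2] * (32 - 14) + sample[3] * 14, rounded and divided by 32.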
+  int y = 0;
+  int top_x = -xstep;
+  do {
+    int top_base_x = top_x >> scale_bits_x;
+    int left_y = (y << 6) - ystep;
+    int x = 0;
+    do {
+      int val;
+      if (top_base_x >= min_base_x) {
+        const int shift = ((top_x * (1 << upsample_top_shift)) & 0x3F) >> 1;
+        val = top[top_base_x] * (32 - shift) + top[top_base_x + 1] * shift;
+      } else {
+        // Note this assumes an arithmetic shift to handle negative values.
+        const int left_base_y = left_y >> scale_bits_y;
+        const int shift = ((left_y * (1 << upsample_left_shift)) & 0x3F) >> 1;
+        assert(left_base_y >= -(1 << upsample_left_shift));
+        val = left[left_base_y] * (32 - shift) + left[left_base_y + 1] * shift;
+      }
+      dst[x] = RightShiftWithRounding(val, 5);
+      top_base_x += base_step_x;
+      left_y -= ystep;
+    } while (++x < width);
+
+    top_x -= xstep;
+    dst += stride;
+  } while (++y < height);
+}
+
+template <typename Pixel>
+void DirectionalIntraPredictorZone3_C(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const left_column, const int width,
+    const int height, const int ystep, const bool upsampled_left) {
+  const auto* const left = static_cast<const Pixel*>(left_column);
+  stride /= sizeof(Pixel);
+
+  assert(ystep > 0);
+
+  const int upsample_shift = static_cast<int>(upsampled_left);
+  const int scale_bits = 6 - upsample_shift;
+  const int base_step = 1 << upsample_shift;
+  // Zone3 never runs out of left_column values.
+  assert((width + height - 1) << upsample_shift >  // max_base_y
+         ((ystep * width) >> scale_bits) +
+             base_step * (height - 1));  // left_base_y
+
+  int left_y = ystep;
+  int x = 0;
+  do {
+    auto* dst = static_cast<Pixel*>(dest);
+
+    int left_base_y = left_y >> scale_bits;
+    int y = 0;
+    do {
+      const int shift = ((left_y << upsample_shift) & 0x3F) >> 1;
+      const int val =
+          left[left_base_y] * (32 - shift) + left[left_base_y + 1] * shift;
+      dst[x] = RightShiftWithRounding(val, 5);
+      dst += stride;
+      left_base_y += base_step;
+    } while (++y < height);
+
+    left_y += ystep;
+  } while (++x < width);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->directional_intra_predictor_zone1 =
+      DirectionalIntraPredictorZone1_C<uint8_t>;
+  dsp->directional_intra_predictor_zone2 =
+      DirectionalIntraPredictorZone2_C<uint8_t>;
+  dsp->directional_intra_predictor_zone3 =
+      DirectionalIntraPredictorZone3_C<uint8_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1
+  dsp->directional_intra_predictor_zone1 =
+      DirectionalIntraPredictorZone1_C<uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2
+  dsp->directional_intra_predictor_zone2 =
+      DirectionalIntraPredictorZone2_C<uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3
+  dsp->directional_intra_predictor_zone3 =
+      DirectionalIntraPredictorZone3_C<uint8_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->directional_intra_predictor_zone1 =
+      DirectionalIntraPredictorZone1_C<uint16_t>;
+  dsp->directional_intra_predictor_zone2 =
+      DirectionalIntraPredictorZone2_C<uint16_t>;
+  dsp->directional_intra_predictor_zone3 =
+      DirectionalIntraPredictorZone3_C<uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1
+  dsp->directional_intra_predictor_zone1 =
+      DirectionalIntraPredictorZone1_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone2
+  dsp->directional_intra_predictor_zone2 =
+      DirectionalIntraPredictorZone2_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3
+  dsp->directional_intra_predictor_zone3 =
+      DirectionalIntraPredictorZone3_C<uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+}  // namespace
+
+void IntraPredDirectionalInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/intrapred_directional.h b/src/dsp/intrapred_directional.h
new file mode 100644
index 0000000..bcd1bc1
--- /dev/null
+++ b/src/dsp/intrapred_directional.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_INTRAPRED_DIRECTIONAL_H_
+#define LIBGAV1_SRC_DSP_INTRAPRED_DIRECTIONAL_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/intrapred_directional_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/intrapred_directional_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::directional_intra_predictor_zone*. This function is not
+// thread-safe.
+void IntraPredDirectionalInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_INTRAPRED_DIRECTIONAL_H_
diff --git a/src/dsp/intrapred_directional_test.cc b/src/dsp/intrapred_directional_test.cc
new file mode 100644
index 0000000..9e98242
--- /dev/null
+++ b/src/dsp/intrapred_directional_test.cc
@@ -0,0 +1,951 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "src/dsp/intrapred_directional.h" + +#include +#include +#include +#include +#include +#include + +#include "absl/strings/match.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/compiler_attributes.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "tests/block_utils.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr int kMaxBlockSize = 64; +constexpr int kTotalPixels = kMaxBlockSize * kMaxBlockSize; +constexpr int kNumDirectionalIntraPredictors = 3; + +constexpr int kBaseAngles[] = {45, 67, 90, 113, 135, 157, 180, 203}; + +const char* const kDirectionalPredNames[kNumDirectionalIntraPredictors] = { + "kDirectionalIntraPredictorZone1", "kDirectionalIntraPredictorZone2", + "kDirectionalIntraPredictorZone3"}; + +int16_t GetDirectionalIntraPredictorDerivative(const int angle) { + EXPECT_GE(angle, 3); + EXPECT_LE(angle, 87); + return kDirectionalIntraPredictorDerivative[DivideBy2(angle) - 1]; +} + +template +class IntraPredTestBase : public testing::TestWithParam, + public test_utils::MaxAlignedAllocable { + public: + IntraPredTestBase() { + switch (tx_size_) { + case kNumTransformSizes: + EXPECT_NE(tx_size_, kNumTransformSizes); + break; + default: + block_width_ = kTransformWidth[tx_size_]; + block_height_ = kTransformHeight[tx_size_]; + break; + } + } + + IntraPredTestBase(const IntraPredTestBase&) = delete; + IntraPredTestBase& operator=(const IntraPredTestBase&) = delete; + ~IntraPredTestBase() override = default; + + protected: + struct IntraPredMem { + void Reset(libvpx_test::ACMRandom* rnd) { + ASSERT_NE(rnd, nullptr); +#if LIBGAV1_MSAN + // Match the behavior of Tile::IntraPrediction to prevent warnings due to + // assembly code (safely) overreading to fill a register. + memset(left_mem, 0, sizeof(left_mem)); + memset(top_mem, 0, sizeof(top_mem)); +#endif // LIBGAV1_MSAN + Pixel* const left = left_mem + 16; + Pixel* const top = top_mem + 16; + const int mask = (1 << bitdepth) - 1; + for (auto& r : ref_src) r = rnd->Rand16() & mask; + for (int i = 0; i < kMaxBlockSize; ++i) left[i] = rnd->Rand16() & mask; + for (int i = -1; i < kMaxBlockSize; ++i) top[i] = rnd->Rand16() & mask; + + // Some directional predictors require top-right, bottom-left. + for (int i = kMaxBlockSize; i < 2 * kMaxBlockSize; ++i) { + left[i] = rnd->Rand16() & mask; + top[i] = rnd->Rand16() & mask; + } + // TODO(jzern): reorder this and regenerate the digests after switching + // random number generators. + // Upsampling in the directional predictors extends left/top[-1] to [-2]. + left[-1] = rnd->Rand16() & mask; + left[-2] = rnd->Rand16() & mask; + top[-2] = rnd->Rand16() & mask; + memset(left_mem, 0, sizeof(left_mem[0]) * 14); + memset(top_mem, 0, sizeof(top_mem[0]) * 14); + memset(top_mem + kMaxBlockSize * 2 + 16, 0, + sizeof(top_mem[0]) * kTopMemPadding); + } + + // Set ref_src, top-left, top and left to |pixel|. + void Set(const Pixel pixel) { +#if LIBGAV1_MSAN + // Match the behavior of Tile::IntraPrediction to prevent warnings due to + // assembly code (safely) overreading to fill a register. 
+      memset(left_mem, 0, sizeof(left_mem));
+      memset(top_mem, 0, sizeof(top_mem));
+#endif  // LIBGAV1_MSAN
+      Pixel* const left = left_mem + 16;
+      Pixel* const top = top_mem + 16;
+      for (auto& r : ref_src) r = pixel;
+      // Upsampling in the directional predictors extends left/top[-1] to [-2].
+      for (int i = -2; i < 2 * kMaxBlockSize; ++i) {
+        left[i] = top[i] = pixel;
+      }
+    }
+
+    // DirectionalZone1_Large() overreads up to 7 pixels in |top_mem|.
+    static constexpr int kTopMemPadding = 7;
+    alignas(kMaxAlignment) Pixel dst[kTotalPixels];
+    alignas(kMaxAlignment) Pixel ref_src[kTotalPixels];
+    alignas(kMaxAlignment) Pixel left_mem[kMaxBlockSize * 2 + 16];
+    alignas(
+        kMaxAlignment) Pixel top_mem[kMaxBlockSize * 2 + 16 + kTopMemPadding];
+  };
+
+  void SetUp() override { test_utils::ResetDspTable(bitdepth); }
+
+  const TransformSize tx_size_ = GetParam();
+  int block_width_;
+  int block_height_;
+  IntraPredMem intra_pred_mem_;
+};
+
+//------------------------------------------------------------------------------
+// DirectionalIntraPredTest
+
+template <int bitdepth, typename Pixel>
+class DirectionalIntraPredTest : public IntraPredTestBase<bitdepth, Pixel> {
+ public:
+  DirectionalIntraPredTest() = default;
+  DirectionalIntraPredTest(const DirectionalIntraPredTest&) = delete;
+  DirectionalIntraPredTest& operator=(const DirectionalIntraPredTest&) = delete;
+  ~DirectionalIntraPredTest() override = default;
+
+ protected:
+  using IntraPredTestBase<bitdepth, Pixel>::tx_size_;
+  using IntraPredTestBase<bitdepth, Pixel>::block_width_;
+  using IntraPredTestBase<bitdepth, Pixel>::block_height_;
+  using IntraPredTestBase<bitdepth, Pixel>::intra_pred_mem_;
+
+  enum Zone { kZone1, kZone2, kZone3, kNumZones };
+
+  enum { kAngleDeltaStart = -9, kAngleDeltaStop = 9, kAngleDeltaStep = 3 };
+
+  void SetUp() override {
+    IntraPredTestBase<bitdepth, Pixel>::SetUp();
+    IntraPredDirectionalInit_C();
+
+    const Dsp* const dsp = GetDspTable(bitdepth);
+    ASSERT_NE(dsp, nullptr);
+    base_directional_intra_pred_zone1_ = dsp->directional_intra_predictor_zone1;
+    base_directional_intra_pred_zone2_ = dsp->directional_intra_predictor_zone2;
+    base_directional_intra_pred_zone3_ = dsp->directional_intra_predictor_zone3;
+
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const char* const test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+      base_directional_intra_pred_zone1_ = nullptr;
+      base_directional_intra_pred_zone2_ = nullptr;
+      base_directional_intra_pred_zone3_ = nullptr;
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+      IntraPredDirectionalInit_NEON();
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      if ((GetCpuInfo() & kSSE4_1) != 0) {
+        IntraPredDirectionalInit_SSE4_1();
+      }
+    } else {
+      FAIL() << "Unrecognized architecture prefix in test case name: "
+             << test_case;
+    }
+
+    cur_directional_intra_pred_zone1_ = dsp->directional_intra_predictor_zone1;
+    cur_directional_intra_pred_zone2_ = dsp->directional_intra_predictor_zone2;
+    cur_directional_intra_pred_zone3_ = dsp->directional_intra_predictor_zone3;
+
+    // Skip functions that haven't been specialized for this particular
+    // architecture.
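+    // When the dispatch table entry is unchanged after the arch-specific
+    // init, there is no optimized version to compare against the C reference,
+    // so the pointer is cleared and the corresponding zone is skipped.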
+    if (cur_directional_intra_pred_zone1_ ==
+        base_directional_intra_pred_zone1_) {
+      cur_directional_intra_pred_zone1_ = nullptr;
+    }
+    if (cur_directional_intra_pred_zone2_ ==
+        base_directional_intra_pred_zone2_) {
+      cur_directional_intra_pred_zone2_ = nullptr;
+    }
+    if (cur_directional_intra_pred_zone3_ ==
+        base_directional_intra_pred_zone3_) {
+      cur_directional_intra_pred_zone3_ = nullptr;
+    }
+  }
+
+  bool IsEdgeUpsampled(int delta, const int filter_type) const {
+    delta = std::abs(delta);
+    if (delta == 0 || delta >= 40) return false;
+    const int block_wh = block_width_ + block_height_;
+    return (filter_type == 1) ? block_wh <= 8 : block_wh <= 16;
+  }
+
+  // Returns the minimum and maximum (exclusive) range of angles that the
+  // predictor should be applied to.
+  void GetZoneAngleRange(const Zone zone, int* const min_angle,
+                         int* const max_angle) const {
+    ASSERT_NE(min_angle, nullptr);
+    ASSERT_NE(max_angle, nullptr);
+    switch (zone) {
+      // The overall minimum angle comes from mode D45_PRED, yielding:
+      // min_angle = 45-(MAX_ANGLE_DELTA*ANGLE_STEP) = 36
+      // The overall maximum angle comes from mode D203_PRED, yielding:
+      // max_angle = 203+(MAX_ANGLE_DELTA*ANGLE_STEP) = 212
+      // The angles 180 and 90 are not permitted because they correspond to
+      // V_PRED and H_PRED, which are handled in distinct functions.
+      case kZone1:
+        *min_angle = 36;
+        *max_angle = 87;
+        break;
+      case kZone2:
+        *min_angle = 93;
+        *max_angle = 177;
+        break;
+      case kZone3:
+        *min_angle = 183;
+        *max_angle = 212;
+        break;
+      case kNumZones:
+        FAIL() << "Invalid zone value: " << zone;
+        break;
+    }
+  }
+
+  // These tests modify intra_pred_mem_.
+  void TestSpeed(const char* const digests[kNumDirectionalIntraPredictors],
+                 Zone zone, int num_runs);
+  void TestSaturatedValues();
+  void TestRandomValues();
+
+  DirectionalIntraPredictorZone1Func base_directional_intra_pred_zone1_;
+  DirectionalIntraPredictorZone2Func base_directional_intra_pred_zone2_;
+  DirectionalIntraPredictorZone3Func base_directional_intra_pred_zone3_;
+  DirectionalIntraPredictorZone1Func cur_directional_intra_pred_zone1_;
+  DirectionalIntraPredictorZone2Func cur_directional_intra_pred_zone2_;
+  DirectionalIntraPredictorZone3Func cur_directional_intra_pred_zone3_;
+};
+
+template <int bitdepth, typename Pixel>
+void DirectionalIntraPredTest<bitdepth, Pixel>::TestSpeed(
+    const char* const digests[kNumDirectionalIntraPredictors], const Zone zone,
+    const int num_runs) {
+  switch (zone) {
+    case kZone1:
+      if (cur_directional_intra_pred_zone1_ == nullptr) return;
+      break;
+    case kZone2:
+      if (cur_directional_intra_pred_zone2_ == nullptr) return;
+      break;
+    case kZone3:
+      if (cur_directional_intra_pred_zone3_ == nullptr) return;
+      break;
+    case kNumZones:
+      FAIL() << "Invalid zone value: " << zone;
+      break;
+  }
+  ASSERT_NE(digests, nullptr);
+  const Pixel* const left = intra_pred_mem_.left_mem + 16;
+  const Pixel* const top = intra_pred_mem_.top_mem + 16;
+
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+  intra_pred_mem_.Reset(&rnd);
+
+  // Allocate separate blocks for each angle + filter + upsampled combination.
+  // Add a 1 pixel right border to test for overwrites.
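+  // Each (base angle, filter type, angle delta) combination writes its own
+  // block side by side within a shared allocation, so one MD5 digest over
+  // |dest| covers every combination at once.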
+  static constexpr int kMaxZoneAngles = 27;  // zone 2
+  static constexpr int kMaxFilterTypes = 2;
+  static constexpr int kBlockBorder = 1;
+  static constexpr int kBorderSize =
+      kBlockBorder * kMaxZoneAngles * kMaxFilterTypes;
+  const int ref_stride =
+      kMaxZoneAngles * kMaxFilterTypes * block_width_ + kBorderSize;
+  const size_t ref_alloc_size = sizeof(Pixel) * ref_stride * block_height_;
+
+  using AlignedPtr = std::unique_ptr<Pixel[], decltype(&AlignedFree)>;
+  AlignedPtr ref_src(static_cast<Pixel*>(AlignedAlloc(16, ref_alloc_size)),
+                     &AlignedFree);
+  AlignedPtr dest(static_cast<Pixel*>(AlignedAlloc(16, ref_alloc_size)),
+                  &AlignedFree);
+  ASSERT_NE(ref_src, nullptr);
+  ASSERT_NE(dest, nullptr);
+
+  const int mask = (1 << bitdepth) - 1;
+  for (size_t i = 0; i < ref_alloc_size / sizeof(ref_src[0]); ++i) {
+    ref_src[i] = rnd.Rand16() & mask;
+  }
+
+  int min_angle = 0, max_angle = 0;
+  ASSERT_NO_FATAL_FAILURE(GetZoneAngleRange(zone, &min_angle, &max_angle));
+
+  absl::Duration elapsed_time;
+  for (int run = 0; run < num_runs; ++run) {
+    Pixel* dst = dest.get();
+    memcpy(dst, ref_src.get(), ref_alloc_size);
+    for (const auto& base_angle : kBaseAngles) {
+      for (int filter_type = 0; filter_type <= 1; ++filter_type) {
+        for (int angle_delta = kAngleDeltaStart; angle_delta <= kAngleDeltaStop;
+             angle_delta += kAngleDeltaStep) {
+          const int predictor_angle = base_angle + angle_delta;
+          if (predictor_angle < min_angle || predictor_angle > max_angle) {
+            continue;
+          }
+
+          ASSERT_GT(predictor_angle, 0) << "base_angle: " << base_angle
+                                        << " angle_delta: " << angle_delta;
+          const bool upsampled_left =
+              IsEdgeUpsampled(predictor_angle - 180, filter_type);
+          const bool upsampled_top =
+              IsEdgeUpsampled(predictor_angle - 90, filter_type);
+          const ptrdiff_t stride = ref_stride * sizeof(ref_src[0]);
+          if (predictor_angle < 90) {
+            ASSERT_EQ(zone, kZone1);
+            const int xstep =
+                GetDirectionalIntraPredictorDerivative(predictor_angle);
+            const absl::Time start = absl::Now();
+            cur_directional_intra_pred_zone1_(dst, stride, top, block_width_,
+                                              block_height_, xstep,
+                                              upsampled_top);
+            elapsed_time += absl::Now() - start;
+          } else if (predictor_angle < 180) {
+            ASSERT_EQ(zone, kZone2);
+            const int xstep =
+                GetDirectionalIntraPredictorDerivative(180 - predictor_angle);
+            const int ystep =
+                GetDirectionalIntraPredictorDerivative(predictor_angle - 90);
+            const absl::Time start = absl::Now();
+            cur_directional_intra_pred_zone2_(
+                dst, stride, top, left, block_width_, block_height_, xstep,
+                ystep, upsampled_top, upsampled_left);
+            elapsed_time += absl::Now() - start;
+          } else {
+            ASSERT_EQ(zone, kZone3);
+            ASSERT_LT(predictor_angle, 270);
+            const int ystep =
+                GetDirectionalIntraPredictorDerivative(270 - predictor_angle);
+            const absl::Time start = absl::Now();
+            cur_directional_intra_pred_zone3_(dst, stride, left, block_width_,
+                                              block_height_, ystep,
+                                              upsampled_left);
+            elapsed_time += absl::Now() - start;
+          }
+          dst += block_width_ + kBlockBorder;
+        }
+      }
+    }
+  }
+
+  test_utils::CheckMd5Digest(ToString(tx_size_), kDirectionalPredNames[zone],
+                             digests[zone], dest.get(), ref_alloc_size,
+                             elapsed_time);
+}
+
+template <int bitdepth, typename Pixel>
+void DirectionalIntraPredTest<bitdepth, Pixel>::TestSaturatedValues() {
+  const Pixel* const left = intra_pred_mem_.left_mem + 16;
+  const Pixel* const top = intra_pred_mem_.top_mem + 16;
+  const auto kMaxPixel = static_cast<Pixel>((1 << bitdepth) - 1);
+  intra_pred_mem_.Set(kMaxPixel);
+
+  for (int i = kZone1; i < kNumZones; ++i) {
+    switch (i) {
+      case kZone1:
+        if (cur_directional_intra_pred_zone1_ == nullptr) continue;
+        break;
+      case kZone2:
+        if (cur_directional_intra_pred_zone2_ == nullptr) continue;
+        break;
+      case kZone3:
+        if (cur_directional_intra_pred_zone3_ == nullptr) continue;
+        break;
+      case kNumZones:
+        FAIL() << "Invalid zone value: " << i;
+        break;
+    }
+    int min_angle = 0, max_angle = 0;
+    ASSERT_NO_FATAL_FAILURE(
+        GetZoneAngleRange(static_cast<Zone>(i), &min_angle, &max_angle));
+
+    for (const auto& base_angle : kBaseAngles) {
+      for (int filter_type = 0; filter_type <= 1; ++filter_type) {
+        for (int angle_delta = kAngleDeltaStart; angle_delta <= kAngleDeltaStop;
+             angle_delta += kAngleDeltaStep) {
+          const int predictor_angle = base_angle + angle_delta;
+          if (predictor_angle <= min_angle || predictor_angle >= max_angle) {
+            continue;
+          }
+          ASSERT_GT(predictor_angle, 0) << "base_angle: " << base_angle
+                                        << " angle_delta: " << angle_delta;
+
+          memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+                 sizeof(intra_pred_mem_.dst));
+
+          const bool upsampled_left =
+              IsEdgeUpsampled(predictor_angle - 180, filter_type);
+          const bool upsampled_top =
+              IsEdgeUpsampled(predictor_angle - 90, filter_type);
+          const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+          if (predictor_angle < 90) {
+            const int xstep =
+                GetDirectionalIntraPredictorDerivative(predictor_angle);
+            cur_directional_intra_pred_zone1_(intra_pred_mem_.dst, stride, top,
+                                              block_width_, block_height_,
+                                              xstep, upsampled_top);
+          } else if (predictor_angle < 180) {
+            const int xstep =
+                GetDirectionalIntraPredictorDerivative(180 - predictor_angle);
+            const int ystep =
+                GetDirectionalIntraPredictorDerivative(predictor_angle - 90);
+            cur_directional_intra_pred_zone2_(
+                intra_pred_mem_.dst, stride, top, left, block_width_,
+                block_height_, xstep, ystep, upsampled_top, upsampled_left);
+          } else {
+            ASSERT_LT(predictor_angle, 270);
+            const int ystep =
+                GetDirectionalIntraPredictorDerivative(270 - predictor_angle);
+            cur_directional_intra_pred_zone3_(intra_pred_mem_.dst, stride, left,
+                                              block_width_, block_height_,
+                                              ystep, upsampled_left);
+          }
+
+          if (!test_utils::CompareBlocks(
+                  intra_pred_mem_.dst, intra_pred_mem_.ref_src, block_width_,
+                  block_height_, kMaxBlockSize, kMaxBlockSize, true)) {
+            ADD_FAILURE() << "Expected " << kDirectionalPredNames[i]
+                          << " (angle: " << predictor_angle
+                          << " filter type: " << filter_type
+                          << ") to produce a block containing '"
+                          << static_cast<int>(kMaxPixel) << "'";
+            return;
+          }
+        }
+      }
+    }
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void DirectionalIntraPredTest<bitdepth, Pixel>::TestRandomValues() {
+  const Pixel* const left = intra_pred_mem_.left_mem + 16;
+  const Pixel* const top = intra_pred_mem_.top_mem + 16;
+  // Use an alternate seed to differentiate this test from TestSpeed().
+  libvpx_test::ACMRandom rnd(test_utils::kAlternateDeterministicSeed);
+
+  for (int i = kZone1; i < kNumZones; ++i) {
+    // Only run when there is a reference version (base) and a different
+    // optimized version (cur).
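+    // base_* is nullptr for the C instantiation and cur_* was cleared in
+    // SetUp() when it matched base_*, so both pointers must be checked.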
+    switch (i) {
+      case kZone1:
+        if (base_directional_intra_pred_zone1_ == nullptr ||
+            cur_directional_intra_pred_zone1_ == nullptr) {
+          continue;
+        }
+        break;
+      case kZone2:
+        if (base_directional_intra_pred_zone2_ == nullptr ||
+            cur_directional_intra_pred_zone2_ == nullptr) {
+          continue;
+        }
+        break;
+      case kZone3:
+        if (base_directional_intra_pred_zone3_ == nullptr ||
+            cur_directional_intra_pred_zone3_ == nullptr) {
+          continue;
+        }
+        break;
+      case kNumZones:
+        FAIL() << "Invalid zone value: " << i;
+        break;
+    }
+    int min_angle = 0, max_angle = 0;
+    ASSERT_NO_FATAL_FAILURE(
+        GetZoneAngleRange(static_cast<Zone>(i), &min_angle, &max_angle));
+
+    for (const auto& base_angle : kBaseAngles) {
+      for (int n = 0; n < 1000; ++n) {
+        for (int filter_type = 0; filter_type <= 1; ++filter_type) {
+          for (int angle_delta = kAngleDeltaStart;
+               angle_delta <= kAngleDeltaStop; angle_delta += kAngleDeltaStep) {
+            const int predictor_angle = base_angle + angle_delta;
+            if (predictor_angle <= min_angle || predictor_angle >= max_angle) {
+              continue;
+            }
+            ASSERT_GT(predictor_angle, 0) << "base_angle: " << base_angle
+                                          << " angle_delta: " << angle_delta;
+
+            intra_pred_mem_.Reset(&rnd);
+            memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+                   sizeof(intra_pred_mem_.dst));
+
+            const bool upsampled_left =
+                IsEdgeUpsampled(predictor_angle - 180, filter_type);
+            const bool upsampled_top =
+                IsEdgeUpsampled(predictor_angle - 90, filter_type);
+            const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+            if (predictor_angle < 90) {
+              const int xstep =
+                  GetDirectionalIntraPredictorDerivative(predictor_angle);
+              base_directional_intra_pred_zone1_(
+                  intra_pred_mem_.ref_src, stride, top, block_width_,
+                  block_height_, xstep, upsampled_top);
+              cur_directional_intra_pred_zone1_(
+                  intra_pred_mem_.dst, stride, top, block_width_,
+                  block_height_, xstep, upsampled_top);
+            } else if (predictor_angle < 180) {
+              const int xstep =
+                  GetDirectionalIntraPredictorDerivative(180 - predictor_angle);
+              const int ystep =
+                  GetDirectionalIntraPredictorDerivative(predictor_angle - 90);
+              base_directional_intra_pred_zone2_(
+                  intra_pred_mem_.ref_src, stride, top, left, block_width_,
+                  block_height_, xstep, ystep, upsampled_top, upsampled_left);
+              cur_directional_intra_pred_zone2_(
+                  intra_pred_mem_.dst, stride, top, left, block_width_,
+                  block_height_, xstep, ystep, upsampled_top, upsampled_left);
+            } else {
+              ASSERT_LT(predictor_angle, 270);
+              const int ystep =
+                  GetDirectionalIntraPredictorDerivative(270 - predictor_angle);
+              base_directional_intra_pred_zone3_(
+                  intra_pred_mem_.ref_src, stride, left, block_width_,
+                  block_height_, ystep, upsampled_left);
+              cur_directional_intra_pred_zone3_(
+                  intra_pred_mem_.dst, stride, left, block_width_,
+                  block_height_, ystep, upsampled_left);
+            }
+
+            if (!test_utils::CompareBlocks(
+                    intra_pred_mem_.dst, intra_pred_mem_.ref_src, block_width_,
+                    block_height_, kMaxBlockSize, kMaxBlockSize, true)) {
+              ADD_FAILURE() << "Result from optimized version of "
+                            << kDirectionalPredNames[i]
+                            << " differs from reference at angle "
+                            << predictor_angle << " with filter type "
+                            << filter_type << " in iteration #" << n;
+              return;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+using DirectionalIntraPredTest8bpp = DirectionalIntraPredTest<8, uint8_t>;
+
+const char* const* GetDirectionalIntraPredDigests8bpp(TransformSize tx_size) {
+  static const char* const kDigests4x4[kNumDirectionalIntraPredictors] = {
+      "9cfc1da729ad08682e165826c29b280b",
+      "bb73539c7afbda7bddd2184723b932d6",
+      "9d2882800ffe948196e984a26a2da72c",
+  };
+  static const char* const kDigests4x8[kNumDirectionalIntraPredictors] = {
+      "090efe6f83cc6fa301f65d3bbd5c38d2",
+      "d0fba4cdfb90f8bd293a94cae9db1a15",
+      "f7ad0eeab4389d0baa485d30fec87617",
+  };
+  static const char* const kDigests4x16[kNumDirectionalIntraPredictors] = {
+      "1d32b33c75fe85248c48cdc8caa78d84",
+      "7000e18159443d366129a6cc6ef8fcee",
+      "06c02fac5f8575f687abb3f634eb0b4c",
+  };
+  static const char* const kDigests8x4[kNumDirectionalIntraPredictors] = {
+      "1b591799685bc135982114b731293f78",
+      "5cd9099acb9f7b2618dafa6712666580",
+      "d023883efede88f99c19d006044d9fa1",
+  };
+  static const char* const kDigests8x8[kNumDirectionalIntraPredictors] = {
+      "f1e46ecf62a2516852f30c5025adb7ea",
+      "864442a209c16998065af28d8cdd839a",
+      "411a6e554868982af577de69e53f12e8",
+  };
+  static const char* const kDigests8x16[kNumDirectionalIntraPredictors] = {
+      "89278302be913a85cfb06feaea339459",
+      "6c42f1a9493490cd4529fd40729cec3c",
+      "2516b5e1c681e5dcb1acedd5f3d41106",
+  };
+  static const char* const kDigests8x32[kNumDirectionalIntraPredictors] = {
+      "aea7078f3eeaa8afbfe6c959c9e676f1",
+      "cad30babf12729dda5010362223ba65c",
+      "ff384ebdc832007775af418a2aae1463",
+  };
+  static const char* const kDigests16x4[kNumDirectionalIntraPredictors] = {
+      "964a821c313c831e12f4d32e616c0b55",
+      "adf6dad3a84ab4d16c16eea218bec57a",
+      "a54fa008d43895e523474686c48a81c2",
+  };
+  static const char* const kDigests16x8[kNumDirectionalIntraPredictors] = {
+      "fe2851b4e4f9fcf924cf17d50415a4c0",
+      "50a0e279c481437ff315d08eb904c733",
+      "0682065c8fb6cbf9be4949316c87c9e5",
+  };
+  static const char* const kDigests16x16[kNumDirectionalIntraPredictors] = {
+      "ef15503b1943642e7a0bace1616c0e11",
+      "bf1a4d3f855f1072a902a88ec6ce0350",
+      "7e87a03e29cd7fd843fd71b729a18f3f",
+  };
+  static const char* const kDigests16x32[kNumDirectionalIntraPredictors] = {
+      "f7b636615d2e5bf289b5db452a6f188d",
+      "e95858c532c10d00b0ce7a02a02121dd",
+      "34a18ccf58ef490f32268e85ce8c7de4",
+  };
+  static const char* const kDigests16x64[kNumDirectionalIntraPredictors] = {
+      "b250099986c2fab9670748598058846b",
+      "f25d80af4da862a9b6b72979f1e17cb4",
+      "5347dc7bc346733b4887f6c8ad5e0898",
+  };
+  static const char* const kDigests32x8[kNumDirectionalIntraPredictors] = {
+      "72e4c9f8af043b1cb1263490351818ab",
+      "1fc010d2df011b9e4e3d0957107c78df",
+      "f4cbfa3ca941ef08b972a68d7e7bafc4",
+  };
+  static const char* const kDigests32x16[kNumDirectionalIntraPredictors] = {
+      "37e5a1aaf7549d2bce08eece9d20f0f6",
+      "6a2794025d0aca414ab17baa3cf8251a",
+      "63dd37a6efdc91eeefef166c99ce2db1",
+  };
+  static const char* const kDigests32x32[kNumDirectionalIntraPredictors] = {
+      "198aabc958992eb49cceab97d1acb43e",
+      "aee88b6c8bacfcf38799fe338e6c66e7",
+      "01e8f8f96696636f6d79d33951907a16",
+  };
+  static const char* const kDigests32x64[kNumDirectionalIntraPredictors] = {
+      "0611390202c4f90f7add7aec763ded58",
+      "960240c7ceda2ccfac7c90b71460578a",
+      "7e7d97594aab8ad56e8c01c340335607",
+  };
+  static const char* const kDigests64x16[kNumDirectionalIntraPredictors] = {
+      "7e1f567e7fc510757f2d89d638bc826f",
+      "c929d687352ce40a58670be2ce3c8c90",
+      "f6881e6a9ba3c3d3d730b425732656b1",
+  };
+  static const char* const kDigests64x32[kNumDirectionalIntraPredictors] = {
+      "27b4c2a7081d4139f22003ba8b6dfdf2",
+      "301e82740866b9274108a04c872fa848",
+      "98d3aa4fef838f4abf00dac33806659f",
+  };
+  static const char* const kDigests64x64[kNumDirectionalIntraPredictors] = {
+      "b31816db8fade3accfd975b21aa264c7",
+      "2adce01a03b9452633d5830e1a9b4e23",
+      "7b988fadba8b07c36e88d7be6b270494",
+  };
+
+  switch (tx_size) {
+    case kTransformSize4x4:
+      return kDigests4x4;
+    case kTransformSize4x8:
+      return kDigests4x8;
+    case kTransformSize4x16:
+      return kDigests4x16;
+    case kTransformSize8x4:
+      return kDigests8x4;
+    case kTransformSize8x8:
+      return kDigests8x8;
+    case kTransformSize8x16:
+      return kDigests8x16;
+    case kTransformSize8x32:
+      return kDigests8x32;
+    case kTransformSize16x4:
+      return kDigests16x4;
+    case kTransformSize16x8:
+      return kDigests16x8;
+    case kTransformSize16x16:
+      return kDigests16x16;
+    case kTransformSize16x32:
+      return kDigests16x32;
+    case kTransformSize16x64:
+      return kDigests16x64;
+    case kTransformSize32x8:
+      return kDigests32x8;
+    case kTransformSize32x16:
+      return kDigests32x16;
+    case kTransformSize32x32:
+      return kDigests32x32;
+    case kTransformSize32x64:
+      return kDigests32x64;
+    case kTransformSize64x16:
+      return kDigests64x16;
+    case kTransformSize64x32:
+      return kDigests64x32;
+    case kTransformSize64x64:
+      return kDigests64x64;
+    default:
+      ADD_FAILURE() << "Unknown transform size: " << tx_size;
+      return nullptr;
+  }
+}
+
+TEST_P(DirectionalIntraPredTest8bpp, DISABLED_Speed) {
+#if LIBGAV1_ENABLE_NEON
+  const auto num_runs = static_cast<int>(2e7 / (block_width_ * block_height_));
+#else
+  const int num_runs = static_cast<int>(4e7 / (block_width_ * block_height_));
+#endif
+  for (int i = kZone1; i < kNumZones; ++i) {
+    TestSpeed(GetDirectionalIntraPredDigests8bpp(tx_size_),
+              static_cast<Zone>(i), num_runs);
+  }
+}
+
+TEST_P(DirectionalIntraPredTest8bpp, FixedInput) {
+  for (int i = kZone1; i < kNumZones; ++i) {
+    TestSpeed(GetDirectionalIntraPredDigests8bpp(tx_size_),
+              static_cast<Zone>(i), 1);
+  }
+}
+
+TEST_P(DirectionalIntraPredTest8bpp, Overflow) { TestSaturatedValues(); }
+TEST_P(DirectionalIntraPredTest8bpp, Random) { TestRandomValues(); }
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+
+using DirectionalIntraPredTest10bpp = DirectionalIntraPredTest<10, uint16_t>;
+
+const char* const* GetDirectionalIntraPredDigests10bpp(TransformSize tx_size) {
+  static const char* const kDigests4x4[kNumDirectionalIntraPredictors] = {
+      "a683f4d7ccd978737615f61ecb4d638d",
+      "90c94374eaf7e9501f197863937b8639",
+      "0d3969cd081523ac6a906eecc7980c43",
+  };
+  static const char* const kDigests4x8[kNumDirectionalIntraPredictors] = {
+      "c3ffa2979b325644e4a56c882fe27347",
+      "1f61f5ee413a9a3b8d1d93869ec2aee0",
+      "4795ea944779ec4a783408769394d874",
+  };
+  static const char* const kDigests4x16[kNumDirectionalIntraPredictors] = {
+      "45c3282c9aa51024c1d64a40f230aa45",
+      "5cd47dd69f8bd0b15365a0c5cfc0a49a",
+      "06336c507b05f98c1d6a21abc43e6182",
+  };
+  static const char* const kDigests8x4[kNumDirectionalIntraPredictors] = {
+      "7370476ff0abbdc5e92f811b8879c861",
+      "a239a50adb28a4791b52a0dfff3bee06",
+      "4779a17f958a9ca04e8ec08c5aba1d36",
+  };
+  static const char* const kDigests8x8[kNumDirectionalIntraPredictors] = {
+      "305463f346c376594f82aad8304e0362",
+      "0cd481e5bda286c87a645417569fd948",
+      "48c7899dc9b7163b0b1f61b3a2b4b73e",
+  };
+  static const char* const kDigests8x16[kNumDirectionalIntraPredictors] = {
+      "5c18fd5339be90628c82b1fb6af50d5e",
+      "35eaa566ebd3bb7c903cfead5dc9ac78",
+      "9fdb0e790e5965810d02c02713c84071",
+  };
+  static const char* const kDigests8x32[kNumDirectionalIntraPredictors] = {
+      "2168d6cc858c704748b7b343ced2ac3a",
+      "1d3ce273107447faafd2e55877e48ffb",
+      "d344164049d1fe9b65a3ae8764bbbd37",
+  };
+  static const char* const kDigests16x4[kNumDirectionalIntraPredictors] = {
+      "dcef2cf51abe3fe150f388a14c762d30",
"6a810b289b1c14f8eab8ca1274e91ecd", + "c94da7c11f3fb11963d85c8804fce2d9", + }; + static const char* const kDigests16x8[kNumDirectionalIntraPredictors] = { + "50a0d08b0d99b7a574bad2cfb36efc39", + "2dcb55874db39da70c8ca1318559f9fe", + "6390bcd30ff3bc389ecc0a0952bea531", + }; + static const char* const kDigests16x16[kNumDirectionalIntraPredictors] = { + "7146c83c2620935606d49f3cb5876f41", + "2318ddf30c070a53c9b9cf199cd1b2c5", + "e9042e2124925aa7c1b6110617cb10e8", + }; + static const char* const kDigests16x32[kNumDirectionalIntraPredictors] = { + "c970f401de7b7c5bb4e3ad447fcbef8f", + "a18cc70730eecdaa31dbcf4306ff490f", + "32c1528ad4a576a2210399d6b4ccd46e", + }; + static const char* const kDigests16x64[kNumDirectionalIntraPredictors] = { + "00b3f0007da2e5d01380594a3d7162d5", + "1971af519e4a18967b7311f93efdd1b8", + "e6139769ce5a9c4982cfab9363004516", + }; + static const char* const kDigests32x8[kNumDirectionalIntraPredictors] = { + "08107ad971179cc9f465ae5966bd4901", + "b215212a3c0dfe9182c4f2e903d731f7", + "791274416a0da87c674e1ae318b3ce09", + }; + static const char* const kDigests32x16[kNumDirectionalIntraPredictors] = { + "94ea6cccae35b5d08799aa003ac08ccf", + "ae105e20e63fb55d4fd9d9e59dc62dde", + "973d0b2358ea585e4f486e7e645c5310", + }; + static const char* const kDigests32x32[kNumDirectionalIntraPredictors] = { + "d14c695c4853ddf5e5d8256bc1d1ed60", + "6bd0ebeb53adecc11442b1218b870cb7", + "e03bc402a9999aba8272275dce93e89f", + }; + static const char* const kDigests32x64[kNumDirectionalIntraPredictors] = { + "b21a8a8723758392ee659eeeae518a1e", + "e50285454896210ce44d6f04dfde05a7", + "f0f8ea0c6c2acc8d7d390927c3a90370", + }; + static const char* const kDigests64x16[kNumDirectionalIntraPredictors] = { + "ce51db16fd4fa56e601631397b098c89", + "aa87a8635e02c1e91d13158c61e443f6", + "4c1ee3afd46ef34bd711a34d0bf86f13", + }; + static const char* const kDigests64x32[kNumDirectionalIntraPredictors] = { + "25aaf5971e24e543e3e69a47254af777", + "eb6f444b3df127d69460778ab5bf8fc1", + "2f846cc0d506f90c0a58438600819817", + }; + static const char* const kDigests64x64[kNumDirectionalIntraPredictors] = { + "b26ce5b5f4b5d4a438b52e5987877fb8", + "35721a00a70938111939cf69988d928e", + "0af7ec35939483fac82c246a13845806", + }; + + switch (tx_size) { + case kTransformSize4x4: + return kDigests4x4; + case kTransformSize4x8: + return kDigests4x8; + case kTransformSize4x16: + return kDigests4x16; + case kTransformSize8x4: + return kDigests8x4; + case kTransformSize8x8: + return kDigests8x8; + case kTransformSize8x16: + return kDigests8x16; + case kTransformSize8x32: + return kDigests8x32; + case kTransformSize16x4: + return kDigests16x4; + case kTransformSize16x8: + return kDigests16x8; + case kTransformSize16x16: + return kDigests16x16; + case kTransformSize16x32: + return kDigests16x32; + case kTransformSize16x64: + return kDigests16x64; + case kTransformSize32x8: + return kDigests32x8; + case kTransformSize32x16: + return kDigests32x16; + case kTransformSize32x32: + return kDigests32x32; + case kTransformSize32x64: + return kDigests32x64; + case kTransformSize64x16: + return kDigests64x16; + case kTransformSize64x32: + return kDigests64x32; + case kTransformSize64x64: + return kDigests64x64; + default: + ADD_FAILURE() << "Unknown transform size: " << tx_size; + return nullptr; + } +} + +TEST_P(DirectionalIntraPredTest10bpp, DISABLED_Speed) { +#if LIBGAV1_ENABLE_NEON + const int num_runs = static_cast(2e7 / (block_width_ * block_height_)); +#else + const int num_runs = static_cast(4e7 / (block_width_ * block_height_)); 
+#endif + for (int i = kZone1; i < kNumZones; ++i) { + TestSpeed(GetDirectionalIntraPredDigests10bpp(tx_size_), + static_cast<Zone>(i), num_runs); + } +} + +TEST_P(DirectionalIntraPredTest10bpp, FixedInput) { + for (int i = kZone1; i < kNumZones; ++i) { + TestSpeed(GetDirectionalIntraPredDigests10bpp(tx_size_), + static_cast<Zone>(i), 1); + } +} + +TEST_P(DirectionalIntraPredTest10bpp, Overflow) { TestSaturatedValues(); } +TEST_P(DirectionalIntraPredTest10bpp, Random) { TestRandomValues(); } + +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +constexpr TransformSize kTransformSizes[] = { + kTransformSize4x4, kTransformSize4x8, kTransformSize4x16, + kTransformSize8x4, kTransformSize8x8, kTransformSize8x16, + kTransformSize8x32, kTransformSize16x4, kTransformSize16x8, + kTransformSize16x16, kTransformSize16x32, kTransformSize16x64, + kTransformSize32x8, kTransformSize32x16, kTransformSize32x32, + kTransformSize32x64, kTransformSize64x16, kTransformSize64x32, + kTransformSize64x64}; + +INSTANTIATE_TEST_SUITE_P(C, DirectionalIntraPredTest8bpp, + testing::ValuesIn(kTransformSizes)); +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, DirectionalIntraPredTest8bpp, + testing::ValuesIn(kTransformSizes)); +#endif // LIBGAV1_ENABLE_SSE4_1 +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, DirectionalIntraPredTest8bpp, + testing::ValuesIn(kTransformSizes)); +#endif // LIBGAV1_ENABLE_NEON + +#if LIBGAV1_MAX_BITDEPTH >= 10 +INSTANTIATE_TEST_SUITE_P(C, DirectionalIntraPredTest10bpp, + testing::ValuesIn(kTransformSizes)); +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, DirectionalIntraPredTest10bpp, + testing::ValuesIn(kTransformSizes)); +#endif // LIBGAV1_ENABLE_SSE4_1 +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, DirectionalIntraPredTest10bpp, + testing::ValuesIn(kTransformSizes)); +#endif // LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace +} // namespace dsp + +static std::ostream& operator<<(std::ostream& os, const TransformSize tx_size) { + return os << ToString(tx_size); +} + +} // namespace libgav1 diff --git a/src/dsp/intrapred_filter.cc b/src/dsp/intrapred_filter.cc new file mode 100644 index 0000000..9a45eff --- /dev/null +++ b/src/dsp/intrapred_filter.cc @@ -0,0 +1,144 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/intrapred_filter.h" + +#include +#include +#include +#include +#include +#include + +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/memory.h" + +namespace libgav1 { +namespace dsp { +namespace { + +//------------------------------------------------------------------------------ +// FilterIntraPredictor_C + +// The recursive filter applies a different filter to the top 4 and 2 left +// pixels to produce each pixel in a 4x2 sub-block.
Each successive 4x2 uses the +// prediction output of the blocks above and to the left, unless they are +// adjacent to the |top_row| or |left_column|. The set of 8 filters is selected +// according to |pred|. +template <int bitdepth, typename Pixel> +void FilterIntraPredictor_C(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column, + const FilterIntraPredictor pred, const int width, + const int height) { + const int kMaxPixel = (1 << bitdepth) - 1; + const auto* const top = static_cast<const Pixel*>(top_row); + const auto* const left = static_cast<const Pixel*>(left_column); + + assert(width <= 32 && height <= 32); + + Pixel buffer[3][33]; // cache 2 rows + top & left boundaries + memcpy(buffer[0], &top[-1], (width + 1) * sizeof(top[0])); + + auto* dst = static_cast<Pixel*>(dest); + stride /= sizeof(Pixel); + int row0 = 0, row2 = 2; + int ystep = 1; + int y = 0; + do { + buffer[1][0] = left[y]; + buffer[row2][0] = left[y + 1]; + int x = 1; + do { + const Pixel p0 = buffer[row0][x - 1]; // top-left + const Pixel p1 = buffer[row0][x + 0]; // top 0 + const Pixel p2 = buffer[row0][x + 1]; // top 1 + const Pixel p3 = buffer[row0][x + 2]; // top 2 + const Pixel p4 = buffer[row0][x + 3]; // top 3 + const Pixel p5 = buffer[1][x - 1]; // left 0 + const Pixel p6 = buffer[row2][x - 1]; // left 1 + for (int i = 0; i < 8; ++i) { + const int xoffset = i & 0x03; + const int yoffset = (i >> 2) * ystep; + const int value = kFilterIntraTaps[pred][i][0] * p0 + + kFilterIntraTaps[pred][i][1] * p1 + + kFilterIntraTaps[pred][i][2] * p2 + + kFilterIntraTaps[pred][i][3] * p3 + + kFilterIntraTaps[pred][i][4] * p4 + + kFilterIntraTaps[pred][i][5] * p5 + + kFilterIntraTaps[pred][i][6] * p6; + // Section 7.11.2.3 specifies the right-hand side of the assignment as + // Clip1( Round2Signed( pr, INTRA_FILTER_SCALE_BITS ) ). + // Since Clip1() clips a negative value to 0, it is safe to replace + // Round2Signed() with Round2(). + buffer[1 + yoffset][x + xoffset] = static_cast<Pixel>( + Clip3(RightShiftWithRounding(value, 4), 0, kMaxPixel)); + } + x += 4; + } while (x < width); + memcpy(dst, &buffer[1][1], width * sizeof(dst[0])); + dst += stride; + memcpy(dst, &buffer[row2][1], width * sizeof(dst[0])); + dst += stride; + + // The final row becomes the top for the next pass.
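+  // Illustrative walk-through (editorial addition, not in the original
+  // source): with row0/row2 toggling between 0 and 2, buffer[] behaves as a
+  // three-row ring. Pass 1 reads its "top" from buffer[0] and writes to
+  // buffer[1]/buffer[2] (ystep = 1). After the swap below, pass 2 reads its
+  // top from buffer[2] -- the row just produced -- and, with ystep = -1,
+  // fills buffer[1]/buffer[0]. No row data is copied between passes; only
+  // the indices move.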
+ row0 ^= 2; + row2 ^= 2; + ystep = -ystep; + y += 2; + } while (y < height); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(8); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->filter_intra_predictor = FilterIntraPredictor_C<8, uint8_t>; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp8bpp_FilterIntraPredictor + dsp->filter_intra_predictor = FilterIntraPredictor_C<8, uint8_t>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(10); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->filter_intra_predictor = FilterIntraPredictor_C<10, uint16_t>; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp10bpp_FilterIntraPredictor + dsp->filter_intra_predictor = FilterIntraPredictor_C<10, uint16_t>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace + +void IntraPredFilterInit_C() { + Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/intrapred_filter.h b/src/dsp/intrapred_filter.h new file mode 100644 index 0000000..8146b82 --- /dev/null +++ b/src/dsp/intrapred_filter.h @@ -0,0 +1,49 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_INTRAPRED_FILTER_H_ +#define LIBGAV1_SRC_DSP_INTRAPRED_FILTER_H_ + +// Pull in LIBGAV1_DspXXX defines representing the implementation status +// of each function. The resulting value of each can be used by each module to +// determine whether an implementation is needed at compile time. +// IWYU pragma: begin_exports + +// ARM: +#include "src/dsp/arm/intrapred_filter_neon.h" + +// x86: +// Note includes should be sorted in logical order avx2/avx/sse4, etc. +// The order of includes is important as each tests for a superior version +// before setting the base. +// clang-format off +#include "src/dsp/x86/intrapred_filter_sse4.h" +// clang-format on + +// IWYU pragma: end_exports + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::filter_intra_predictor. This function is not thread-safe. +void IntraPredFilterInit_C(); + +} // namespace dsp +} // namespace libgav1 + +#endif // LIBGAV1_SRC_DSP_INTRAPRED_FILTER_H_ diff --git a/src/dsp/intrapred_filter_test.cc b/src/dsp/intrapred_filter_test.cc new file mode 100644 index 0000000..fe1efdc --- /dev/null +++ b/src/dsp/intrapred_filter_test.cc @@ -0,0 +1,559 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/intrapred_filter.h" + +#include +#include +#include +#include +#include +#include + +#include "absl/strings/match.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "tests/block_utils.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr int kMaxBlockSize = 64; +constexpr int kTotalPixels = kMaxBlockSize * kMaxBlockSize; + +const char* const kFilterIntraPredNames[kNumFilterIntraPredictors] = { + "kFilterIntraPredictorDc", "kFilterIntraPredictorVertical", + "kFilterIntraPredictorHorizontal", "kFilterIntraPredictorD157", + "kFilterIntraPredictorPaeth", +}; + +template <int bitdepth, typename Pixel> +class IntraPredTestBase : public testing::TestWithParam<TransformSize>, + public test_utils::MaxAlignedAllocable { + public: + IntraPredTestBase() { + switch (tx_size_) { + case kNumTransformSizes: + EXPECT_NE(tx_size_, kNumTransformSizes); + break; + default: + block_width_ = kTransformWidth[tx_size_]; + block_height_ = kTransformHeight[tx_size_]; + break; + } + } + + IntraPredTestBase(const IntraPredTestBase&) = delete; + IntraPredTestBase& operator=(const IntraPredTestBase&) = delete; + ~IntraPredTestBase() override = default; + + protected: + struct IntraPredMem { + void Reset(libvpx_test::ACMRandom* rnd) { + ASSERT_NE(rnd, nullptr); + Pixel* const left = left_mem + 16; + Pixel* const top = top_mem + 16; + const int mask = (1 << bitdepth) - 1; + for (auto& r : ref_src) r = rnd->Rand16() & mask; + for (int i = 0; i < kMaxBlockSize; ++i) left[i] = rnd->Rand16() & mask; + for (int i = -1; i < kMaxBlockSize; ++i) top[i] = rnd->Rand16() & mask; + + // Some directional predictors require top-right, bottom-left. + for (int i = kMaxBlockSize; i < 2 * kMaxBlockSize; ++i) { + left[i] = rnd->Rand16() & mask; + top[i] = rnd->Rand16() & mask; + } + // TODO(jzern): reorder this and regenerate the digests after switching + // random number generators. + // Upsampling in the directional predictors extends left/top[-1] to [-2]. + left[-1] = rnd->Rand16() & mask; + left[-2] = rnd->Rand16() & mask; + top[-2] = rnd->Rand16() & mask; + memset(left_mem, 0, sizeof(left_mem[0]) * 14); + memset(top_mem, 0, sizeof(top_mem[0]) * 14); + memset(top_mem + kMaxBlockSize * 2 + 16, 0, + sizeof(top_mem[0]) * kTopMemPadding); + } + + // Set ref_src, top-left, top and left to |pixel|. + void Set(const Pixel pixel) { + Pixel* const left = left_mem + 16; + Pixel* const top = top_mem + 16; + for (auto& r : ref_src) r = pixel; + // Upsampling in the directional predictors extends left/top[-1] to [-2]. + for (int i = -2; i < 2 * kMaxBlockSize; ++i) { + left[i] = top[i] = pixel; + } + } + + // DirectionalZone1_Large() overreads up to 7 pixels in |top_mem|.
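+    // Editorial aside (assumption, not in the original source): 7 matches
+    // the worst case of an 8-pixel vector load anchored on the last valid
+    // top pixel, which touches up to 7 pixels beyond it; the padding keeps
+    // that overread inside this allocation.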
+ static constexpr int kTopMemPadding = 7; + alignas(kMaxAlignment) Pixel dst[kTotalPixels]; + alignas(kMaxAlignment) Pixel ref_src[kTotalPixels]; + alignas(kMaxAlignment) Pixel left_mem[kMaxBlockSize * 2 + 16]; + alignas( + kMaxAlignment) Pixel top_mem[kMaxBlockSize * 2 + 16 + kTopMemPadding]; + }; + + void SetUp() override { test_utils::ResetDspTable(bitdepth); } + + const TransformSize tx_size_ = GetParam(); + int block_width_; + int block_height_; + IntraPredMem intra_pred_mem_; +}; + +//------------------------------------------------------------------------------ +// FilterIntraPredTest + +template <int bitdepth, typename Pixel> +class FilterIntraPredTest : public IntraPredTestBase<bitdepth, Pixel> { + public: + FilterIntraPredTest() = default; + FilterIntraPredTest(const FilterIntraPredTest&) = delete; + FilterIntraPredTest& operator=(const FilterIntraPredTest&) = delete; + ~FilterIntraPredTest() override = default; + + protected: + using IntraPredTestBase<bitdepth, Pixel>::tx_size_; + using IntraPredTestBase<bitdepth, Pixel>::block_width_; + using IntraPredTestBase<bitdepth, Pixel>::block_height_; + using IntraPredTestBase<bitdepth, Pixel>::intra_pred_mem_; + + void SetUp() override { + IntraPredTestBase<bitdepth, Pixel>::SetUp(); + IntraPredFilterInit_C(); + + const Dsp* const dsp = GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + base_filter_intra_pred_ = dsp->filter_intra_predictor; + + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const char* const test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + // No need to compare C with itself. + base_filter_intra_pred_ = nullptr; + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + IntraPredFilterInit_SSE4_1(); + } + } else if (absl::StartsWith(test_case, "NEON/")) { + IntraPredFilterInit_NEON(); + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + + // Put the current architecture-specific implementation up for testing and + // comparison against C version. + cur_filter_intra_pred_ = dsp->filter_intra_predictor; + } + + // These tests modify intra_pred_mem_. + void TestSpeed(const char* const digests[kNumFilterIntraPredictors], + int num_runs); + void TestSaturatedValues(); + void TestRandomValues(); + + FilterIntraPredictorFunc base_filter_intra_pred_; + FilterIntraPredictorFunc cur_filter_intra_pred_; +}; + +template <int bitdepth, typename Pixel> +void FilterIntraPredTest<bitdepth, Pixel>::TestSpeed( + const char* const digests[kNumFilterIntraPredictors], const int num_runs) { + ASSERT_NE(digests, nullptr); + const Pixel* const left = intra_pred_mem_.left_mem + 16; + const Pixel* const top = intra_pred_mem_.top_mem + 16; + + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + intra_pred_mem_.Reset(&rnd); + + // IntraPredInit_C() leaves the filter function empty.
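+  // Clarifying note (editorial, hedged): when LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  // is unset and LIBGAV1_Dsp8bpp_FilterIntraPredictor is defined, the C init
+  // intentionally skips this entry, so the pointer may legitimately be null
+  // here; the early return below keeps the digest loop from dereferencing it.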
+ if (cur_filter_intra_pred_ == nullptr) return; + for (int i = 0; i < kNumFilterIntraPredictors; ++i) { + memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src, + sizeof(intra_pred_mem_.dst)); + const absl::Time start = absl::Now(); + for (int run = 0; run < num_runs; ++run) { + const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel); + cur_filter_intra_pred_(intra_pred_mem_.dst, stride, top, left, + static_cast<FilterIntraPredictor>(i), block_width_, + block_height_); + } + const absl::Duration elapsed_time = absl::Now() - start; + test_utils::CheckMd5Digest(ToString(tx_size_), kFilterIntraPredNames[i], + digests[i], intra_pred_mem_.dst, + sizeof(intra_pred_mem_.dst), elapsed_time); + } +} + +template <int bitdepth, typename Pixel> +void FilterIntraPredTest<bitdepth, Pixel>::TestSaturatedValues() { + Pixel* const left = intra_pred_mem_.left_mem + 16; + Pixel* const top = intra_pred_mem_.top_mem + 16; + const auto kMaxPixel = static_cast<Pixel>((1 << bitdepth) - 1); + intra_pred_mem_.Set(kMaxPixel); + + // IntraPredInit_C() leaves the filter function empty. + if (cur_filter_intra_pred_ == nullptr) return; + for (int i = 0; i < kNumFilterIntraPredictors; ++i) { + memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src, + sizeof(intra_pred_mem_.dst)); + const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel); + cur_filter_intra_pred_(intra_pred_mem_.dst, stride, top, left, + static_cast<FilterIntraPredictor>(i), block_width_, + block_height_); + if (!test_utils::CompareBlocks(intra_pred_mem_.dst, intra_pred_mem_.ref_src, + block_width_, block_height_, kMaxBlockSize, + kMaxBlockSize, true)) { + ADD_FAILURE() << "Expected " << kFilterIntraPredNames[i] + << " to produce a block containing '" + << static_cast<int>(kMaxPixel) << "'"; + } + } +} + +template <int bitdepth, typename Pixel> +void FilterIntraPredTest<bitdepth, Pixel>::TestRandomValues() { + // Skip the 'C' test case as this is used as the reference. + if (base_filter_intra_pred_ == nullptr) return; + + // Use an alternate seed to differentiate this test from TestSpeed(). + libvpx_test::ACMRandom rnd(test_utils::kAlternateDeterministicSeed); + for (int i = 0; i < kNumFilterIntraPredictors; ++i) { + // It may be worthwhile to temporarily increase this loop size when testing + // changes that specifically affect this test.
+ for (int n = 0; n < 10000; ++n) { + intra_pred_mem_.Reset(&rnd); + + memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src, + sizeof(intra_pred_mem_.dst)); + const Pixel* const top = intra_pred_mem_.top_mem + 16; + const Pixel* const left = intra_pred_mem_.left_mem + 16; + const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel); + base_filter_intra_pred_(intra_pred_mem_.ref_src, stride, top, left, + static_cast<FilterIntraPredictor>(i), + block_width_, block_height_); + cur_filter_intra_pred_(intra_pred_mem_.dst, stride, top, left, + static_cast<FilterIntraPredictor>(i), block_width_, + block_height_); + if (!test_utils::CompareBlocks( + intra_pred_mem_.dst, intra_pred_mem_.ref_src, block_width_, + block_height_, kMaxBlockSize, kMaxBlockSize, true)) { + ADD_FAILURE() << "Result from optimized version of " + << kFilterIntraPredNames[i] + << " differs from reference in iteration #" << n; + break; + } + } + } +} + +//------------------------------------------------------------------------------ +using FilterIntraPredTest8bpp = FilterIntraPredTest<8, uint8_t>; + +const char* const* GetFilterIntraPredDigests8bpp(TransformSize tx_size) { + static const char* const kDigests4x4[kNumFilterIntraPredictors] = { + "a2486efcfb351d60a8941203073e89c6", "240716ae5ecaedc19edae1bdef49e05d", + "dacf4af66a966aca7c75abe24cd9ba99", "311888773676f3c2ae3334c4e0f141e5", + "2d3711616c8d8798f608e313cb07a72a", + }; + static const char* const kDigests4x8[kNumFilterIntraPredictors] = { + "1cb74ba1abc68d936e87c13511ed5fbf", "d64c2c08586a762dbdfa8e1150bede06", + "73e9d1a9b6fa3e96fbd65c7dce507529", "e3ae17d9338e5aa3420d31d0e2d7ee87", + "750dbfe3bc5508b7031957a1d315b8bc", + }; + static const char* const kDigests4x16[kNumFilterIntraPredictors] = { + "48a1060701bf68ec6342d6e24c10ef17", "0c91ff7988814d192ed95e840a87b4bf", + "efe586b891c8828c4116c9fbf50850cc", "a3bfa10be2b155826f107e9256ac3ba1", + "976273745b94a561fd52f5aa96fb280f", + }; + static const char* const kDigests8x4[kNumFilterIntraPredictors] = { + "73f82633aeb28db1d254d077edefd8a9", "8eee505cdb5828e33b67ff5572445dac", + "9b0f101c28c66a916079fe5ed33b4021", "47fd44a7e5a5b55f067908192698e25c", + "eab59a3710d9bdeca8fa03a15d3f95d6", + }; + static const char* const kDigests8x8[kNumFilterIntraPredictors] = { + "aa07b7a007c4c1d494ddb44a23c27bcd", "d27eee43f15dfcfe4c46cd46b681983b", + "1015d26022cf57acfdb11fd3f6b9ccb0", "4f0e00ef556fbcac2fb31e3b18869070", + "918c2553635763a0756b20154096bca6", + }; + static const char* const kDigests8x16[kNumFilterIntraPredictors] = { + "a8ac58b2efb02092035cca206dbf5fbe", "0b22b000b7f124b32545bc86dd9f0142", + "cd6a08e023cad301c084b6ec2999da63", "c017f5f4fa5c05e7638ae4db98512b13", + "893e6995522e23ed3d613ef3797ca580", + }; + static const char* const kDigests8x32[kNumFilterIntraPredictors] = { + "b3d5d4f09b778ae2b8cc0e9014c22320", "e473874a1e65228707489be9ca6477aa", + "91bda5a2d32780af345bb3d49324732f", "20f2ff26f004f02e8e2be49e6cadc32f", + "00c909b749e36142b133a7357271e83e", + }; + static const char* const kDigests16x4[kNumFilterIntraPredictors] = { + "ef252f074fc3f5367748436e676e78ca", "cd436d8803ea40db3a849e7c869855c7", + "9cd8601b5d66e61fd002f8b11bfa58d9", "b982f17ee36ef0d1c2cfea20197d5666", + "9e350d1cd65d520194281633f566810d", + }; + static const char* const kDigests16x8[kNumFilterIntraPredictors] = { + "9a7e0cf9b023a89ee619ee672ba2a219", "c20186bc642912ecd4d48bc4924a79b1", + "77de044f4c7f717f947a36fc0aa17946", "3f2fc68f11e6ee0220adb8d1ee085c8e", + "2f37e586769dfb88d9d4116b9c28c5ab", + }; + static const char* const kDigests16x16[kNumFilterIntraPredictors] = { +
"36c5b85b9a6b1d2e8f44f09c81adfe9c", "78494ce3a6a78aa2879ad2e24d43a005", + "aa30cd29a74407dbec80161745161eb2", "ae2a0975ef166e05e5e8c3701bd19e93", + "6322fba6f3bcb1f6c8e78160d200809c", + }; + static const char* const kDigests16x32[kNumFilterIntraPredictors] = { + "82d54732c37424946bc73f5a78f64641", "071773c82869bb103c31e05f14ed3c2f", + "3a0094c150bd6e21ce1f17243b21e76b", "998ffef26fc65333ae407bbe9d41a252", + "6491add6b665aafc364c8c104a6a233d", + }; + static const char* const kDigests32x8[kNumFilterIntraPredictors] = { + "c60062105dd727e94f744c35f0d2156e", "36a9e4d543701c4c546016e35e9c4337", + "05a8d07fe271023e63febfb44814d114", "0a28606925519d1ed067d64761619dc8", + "bb8c34b143910ba49b01d13e94d936ac", + }; + static const char* const kDigests32x16[kNumFilterIntraPredictors] = { + "60e6caeec9194fcb409469e6e1393128", "5d764ead046443eb14f76822a569b056", + "b1bf22fcc282614354166fa1eb6e5f8b", "4b188e729fe49ae24100b3ddd8f17313", + "75f430fdea0b7b5b66866fd68a795a6a", + }; + static const char* const kDigests32x32[kNumFilterIntraPredictors] = { + "5bb91a37b1979866eb23b59dd352229d", "589aa983109500749609d7be1cb79711", + "5e8fb1927cdbe21143494b56b5d400f6", "9e28f741d19c64b2a0577d83546d32d9", + "73c73237a5d891096066b186abf96854", + }; + + switch (tx_size) { + case kTransformSize4x4: + return kDigests4x4; + case kTransformSize4x8: + return kDigests4x8; + case kTransformSize4x16: + return kDigests4x16; + case kTransformSize8x4: + return kDigests8x4; + case kTransformSize8x8: + return kDigests8x8; + case kTransformSize8x16: + return kDigests8x16; + case kTransformSize8x32: + return kDigests8x32; + case kTransformSize16x4: + return kDigests16x4; + case kTransformSize16x8: + return kDigests16x8; + case kTransformSize16x16: + return kDigests16x16; + case kTransformSize16x32: + return kDigests16x32; + case kTransformSize32x8: + return kDigests32x8; + case kTransformSize32x16: + return kDigests32x16; + case kTransformSize32x32: + return kDigests32x32; + default: + ADD_FAILURE() << "Unknown transform size: " << tx_size; + return nullptr; + } +} + +TEST_P(FilterIntraPredTest8bpp, DISABLED_Speed) { + const auto num_runs = + static_cast(2.5e8 / (block_width_ * block_height_)); + TestSpeed(GetFilterIntraPredDigests8bpp(tx_size_), num_runs); +} + +TEST_P(FilterIntraPredTest8bpp, FixedInput) { + TestSpeed(GetFilterIntraPredDigests8bpp(tx_size_), 1); +} + +TEST_P(FilterIntraPredTest8bpp, Overflow) { TestSaturatedValues(); } +TEST_P(FilterIntraPredTest8bpp, Random) { TestRandomValues(); } + +//------------------------------------------------------------------------------ + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using FilterIntraPredTest10bpp = FilterIntraPredTest<10, uint16_t>; + +const char* const* GetFilterIntraPredDigests10bpp(TransformSize tx_size) { + static const char* const kDigests4x4[kNumFilterIntraPredictors] = { + "13a9014d9e255cde8e3e85abf6ef5151", "aee33aa3f3baec87a8c019743fff40f1", + "fdd8ca2be424501f51fcdb603c2e757c", "aed00c082d1980d4bab45e9318b939f0", + "1b363db246aa5400f49479b7d5d41799", + }; + static const char* const kDigests4x8[kNumFilterIntraPredictors] = { + "e718b9e31ba3da0392fd4b6cfba5d882", "31ba22989cdc3bb80749685f42c6c697", + "6bc5b3a55b94018117569cfdced17bf9", "ec29979fb4936116493dfa1cfc93901c", + "c6bcf564e63c42148d9917f089566432", + }; + static const char* const kDigests4x16[kNumFilterIntraPredictors] = { + "404bddd88dff2c0414b5398287e54f18", "ff4fb3039cec6c9ffed6d259cbbfd854", + "7d6fa3ed9e728ff056a73c40bb6edeb6", "82845d942ad8048578e0037336905146", + "f3c07ea65db08c639136a5a9270f95ff", + }; + 
static const char* const kDigests8x4[kNumFilterIntraPredictors] = { + "2008981638f27ba9123973a733e46c3d", "47efecf1f7628cbd8c22e168fcceb5ce", + "04c857ffbd1edd6e2788b17410a4a39c", "deb0236c4277b4d7b174fba407e1c9d7", + "5b58567f94ae9fa930f700c68c17399d", + }; + static const char* const kDigests8x8[kNumFilterIntraPredictors] = { + "d9bab44a6d1373e758bfa0ee88239093", "29b10ddb32d9de2ff0cad6126f010ff6", + "1a03f9a18bdbab0811138cd969bf1f93", "e3273c24e77095ffa033a073f5bbcf7b", + "5187bb3df943d154cb01fb2f244ff86f", + }; + static const char* const kDigests8x16[kNumFilterIntraPredictors] = { + "a2199f792634a56f1c4e88510e408773", "8fd8a98969d19832975ee7131cca9dbb", + "d897380941f75b04b1327e63f136d7d6", "d36f52a157027d53b15b7c02a7983436", + "0a8c23047b0364f5687b62b01f043359", + }; + static const char* const kDigests8x32[kNumFilterIntraPredictors] = { + "5b74ea8e4f60151cf2db9b23d803a2e2", "e0d6bb5fa7d181589c31fcf2755d7c0b", + "42e590ffc88b8940b7aade22e13bbb6a", "e47c39ec1761aa7b5a9b1368ede7cfdc", + "6e963a89beac6f3a362c269d1017f9a8", + }; + static const char* const kDigests16x4[kNumFilterIntraPredictors] = { + "9eaa079622b5dd95ad3a8feb68fa9bbb", "17e3aa6a0034e9eedcfc65b8ce6e7205", + "eac5a5337dbaf9bcbc3d320745c8e190", "c6ba9a7e518be04f725bc1dbd399c204", + "19020b82ce8bb49a511820c7e1d58e99", + }; + static const char* const kDigests16x8[kNumFilterIntraPredictors] = { + "2d2c3255d5dfc1479a5d82a7d5a0d42e", "0fbb4ee851b4ee58c6d30dd820d19e38", + "fa77a1b056e8dc8efb702c7832531b32", "186269ca219dc663ad9b4a53e011a54b", + "c12180a6dcde0c3579befbb5304ff70b", + }; + static const char* const kDigests16x16[kNumFilterIntraPredictors] = { + "dbb81d7ee7d3c83c271400d0160b2e83", "4da656a3ef238d90bb8339471a6fdb7e", + "d95006bf299b84a1b04e38d5fa8fb4f7", "742a03331f0fbd66c57df0ae31104aca", + "4d20aa440e38b6b7ac83c8c54d313169", + }; + static const char* const kDigests16x32[kNumFilterIntraPredictors] = { + "6247730c93789cc25bcb837781dfa05b", "9a93e14b06dd145e35ab21a0353bdebe", + "6c5866353e30296a67d9bd7a65d6998d", "389d7f038d7997871745bb1305156ff9", + "e7640d81f891e1d06e7da75c6ae74d93", + }; + static const char* const kDigests32x8[kNumFilterIntraPredictors] = { + "68f3a603b7c25dd78deffe91aef22834", "48c735e4aa951d6333d99e571bfeadc8", + "35239df0993a429fc599a3037c731e4b", "ba7dd72e04af1a1fc1b30784c11df783", + "78e9017f7434665d32ec59795aed0012", + }; + static const char* const kDigests32x16[kNumFilterIntraPredictors] = { + "8cf2f11f7f77901cb0c522ad191eb998", "204c76d68c5117b89b5c3a05d5548883", + "f3751e41e7a595f43d8aaf9a40644e05", "81ea1a7d608d7b91dd3ede0f87e750ee", + "b5951334dfbe6229d828e03cd2d98538", + }; + static const char* const kDigests32x32[kNumFilterIntraPredictors] = { + "9d8630188c3d1a4f28a6106e343c9380", "c6c92e059faa17163522409b7bf93230", + "62e4c959cb06ec661d98769981fbd555", "01e61673f11011571246668e36cc61c5", + "4530222ea1de546e202630fcf43f4526", + }; + + switch (tx_size) { + case kTransformSize4x4: + return kDigests4x4; + case kTransformSize4x8: + return kDigests4x8; + case kTransformSize4x16: + return kDigests4x16; + case kTransformSize8x4: + return kDigests8x4; + case kTransformSize8x8: + return kDigests8x8; + case kTransformSize8x16: + return kDigests8x16; + case kTransformSize8x32: + return kDigests8x32; + case kTransformSize16x4: + return kDigests16x4; + case kTransformSize16x8: + return kDigests16x8; + case kTransformSize16x16: + return kDigests16x16; + case kTransformSize16x32: + return kDigests16x32; + case kTransformSize32x8: + return kDigests32x8; + case kTransformSize32x16: + return kDigests32x16; + 
case kTransformSize32x32: + return kDigests32x32; + default: + ADD_FAILURE() << "Unknown transform size: " << tx_size; + return nullptr; + } +} + +TEST_P(FilterIntraPredTest10bpp, DISABLED_Speed) { + const auto num_runs = + static_cast<int>(2.5e8 / (block_width_ * block_height_)); + TestSpeed(GetFilterIntraPredDigests10bpp(tx_size_), num_runs); +} + +TEST_P(FilterIntraPredTest10bpp, FixedInput) { + TestSpeed(GetFilterIntraPredDigests10bpp(tx_size_), 1); +} + +TEST_P(FilterIntraPredTest10bpp, Overflow) { TestSaturatedValues(); } +#endif // LIBGAV1_MAX_BITDEPTH >= 10 +// Filter-intra and Cfl predictors are available only for transform sizes +// with max(width, height) <= 32. +constexpr TransformSize kTransformSizesSmallerThan32x32[] = { + kTransformSize4x4, kTransformSize4x8, kTransformSize4x16, + kTransformSize8x4, kTransformSize8x8, kTransformSize8x16, + kTransformSize8x32, kTransformSize16x4, kTransformSize16x8, + kTransformSize16x16, kTransformSize16x32, kTransformSize32x8, + kTransformSize32x16, kTransformSize32x32}; + +INSTANTIATE_TEST_SUITE_P(C, FilterIntraPredTest8bpp, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, FilterIntraPredTest8bpp, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +#endif // LIBGAV1_ENABLE_SSE4_1 +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, FilterIntraPredTest8bpp, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +#endif // LIBGAV1_ENABLE_NEON + +#if LIBGAV1_MAX_BITDEPTH >= 10 +INSTANTIATE_TEST_SUITE_P(C, FilterIntraPredTest10bpp, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, FilterIntraPredTest10bpp, + testing::ValuesIn(kTransformSizesSmallerThan32x32)); +#endif // LIBGAV1_ENABLE_NEON +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace +} // namespace dsp + +static std::ostream& operator<<(std::ostream& os, const TransformSize tx_size) { + return os << ToString(tx_size); +} + +} // namespace libgav1 diff --git a/src/dsp/intrapred_smooth.cc b/src/dsp/intrapred_smooth.cc new file mode 100644 index 0000000..0c7f272 --- /dev/null +++ b/src/dsp/intrapred_smooth.cc @@ -0,0 +1,729 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#include "src/dsp/intrapred_smooth.h" + +#include +#include +#include +#include +#include + +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" + +namespace libgav1 { +namespace dsp { +namespace { + +template +struct SmoothFuncs_C { + SmoothFuncs_C() = delete; + + static void Smooth(void* dest, ptrdiff_t stride, const void* top_row, + const void* left_column); + static void SmoothVertical(void* dest, ptrdiff_t stride, const void* top_row, + const void* left_column); + static void SmoothHorizontal(void* dest, ptrdiff_t stride, + const void* top_row, const void* left_column); +}; + +constexpr uint8_t kSmoothWeights[] = { +#include "src/dsp/smooth_weights.inc" +}; + +// SmoothFuncs_C::Smooth +template +void SmoothFuncs_C::Smooth( + void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const auto* const top = static_cast(top_row); + const auto* const left = static_cast(left_column); + const Pixel top_right = top[block_width - 1]; + const Pixel bottom_left = left[block_height - 1]; + static_assert( + block_width >= 4 && block_height >= 4, + "Weights for smooth predictor undefined for block width/height < 4"); + const uint8_t* const weights_x = kSmoothWeights + block_width - 4; + const uint8_t* const weights_y = kSmoothWeights + block_height - 4; + const uint16_t scale_value = (1 << kSmoothWeightScale); + auto* dst = static_cast(dest); + stride /= sizeof(Pixel); + + for (int y = 0; y < block_height; ++y) { + for (int x = 0; x < block_width; ++x) { + assert(scale_value >= weights_y[y] && scale_value >= weights_x[x]); + uint32_t pred = weights_y[y] * top[x]; + pred += weights_x[x] * left[y]; + pred += static_cast(scale_value - weights_y[y]) * bottom_left; + pred += static_cast(scale_value - weights_x[x]) * top_right; + // The maximum value of pred with the rounder is 2^9 * (2^bitdepth - 1) + // + 256. With the descale there's no need for saturation. 
+ dst[x] = static_cast<Pixel>( + RightShiftWithRounding(pred, kSmoothWeightScale + 1)); + } + dst += stride; + } +} + +// SmoothFuncs_C::SmoothVertical +template <int block_width, int block_height, typename Pixel> +void SmoothFuncs_C<block_width, block_height, Pixel>::SmoothVertical( + void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const auto* const top = static_cast<const Pixel*>(top_row); + const auto* const left = static_cast<const Pixel*>(left_column); + const Pixel bottom_left = left[block_height - 1]; + static_assert(block_height >= 4, + "Weights for smooth predictor undefined for block height < 4"); + const uint8_t* const weights_y = kSmoothWeights + block_height - 4; + const uint16_t scale_value = (1 << kSmoothWeightScale); + auto* dst = static_cast<Pixel*>(dest); + stride /= sizeof(Pixel); + + for (int y = 0; y < block_height; ++y) { + for (int x = 0; x < block_width; ++x) { + assert(scale_value >= weights_y[y]); + uint32_t pred = weights_y[y] * top[x]; + pred += static_cast<uint32_t>(scale_value - weights_y[y]) * bottom_left; + dst[x] = + static_cast<Pixel>(RightShiftWithRounding(pred, kSmoothWeightScale)); + } + dst += stride; + } +} + +// SmoothFuncs_C::SmoothHorizontal +template <int block_width, int block_height, typename Pixel> +void SmoothFuncs_C<block_width, block_height, Pixel>::SmoothHorizontal( + void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const auto* const top = static_cast<const Pixel*>(top_row); + const auto* const left = static_cast<const Pixel*>(left_column); + const Pixel top_right = top[block_width - 1]; + static_assert(block_width >= 4, + "Weights for smooth predictor undefined for block width < 4"); + const uint8_t* const weights_x = kSmoothWeights + block_width - 4; + const uint16_t scale_value = (1 << kSmoothWeightScale); + auto* dst = static_cast<Pixel*>(dest); + stride /= sizeof(Pixel); + + for (int y = 0; y < block_height; ++y) { + for (int x = 0; x < block_width; ++x) { + assert(scale_value >= weights_x[x]); + uint32_t pred = weights_x[x] * left[y]; + pred += static_cast<uint32_t>(scale_value - weights_x[x]) * top_right; + dst[x] = + static_cast<Pixel>(RightShiftWithRounding(pred, kSmoothWeightScale)); + } + dst += stride; + } +} + +// ----------------------------------------------------------------------------- + +template <typename Pixel> +struct SmoothDefs { + SmoothDefs() = delete; + + using _4x4 = SmoothFuncs_C<4, 4, Pixel>; + using _4x8 = SmoothFuncs_C<4, 8, Pixel>; + using _4x16 = SmoothFuncs_C<4, 16, Pixel>; + using _8x4 = SmoothFuncs_C<8, 4, Pixel>; + using _8x8 = SmoothFuncs_C<8, 8, Pixel>; + using _8x16 = SmoothFuncs_C<8, 16, Pixel>; + using _8x32 = SmoothFuncs_C<8, 32, Pixel>; + using _16x4 = SmoothFuncs_C<16, 4, Pixel>; + using _16x8 = SmoothFuncs_C<16, 8, Pixel>; + using _16x16 = SmoothFuncs_C<16, 16, Pixel>; + using _16x32 = SmoothFuncs_C<16, 32, Pixel>; + using _16x64 = SmoothFuncs_C<16, 64, Pixel>; + using _32x8 = SmoothFuncs_C<32, 8, Pixel>; + using _32x16 = SmoothFuncs_C<32, 16, Pixel>; + using _32x32 = SmoothFuncs_C<32, 32, Pixel>; + using _32x64 = SmoothFuncs_C<32, 64, Pixel>; + using _64x16 = SmoothFuncs_C<64, 16, Pixel>; + using _64x32 = SmoothFuncs_C<64, 32, Pixel>; + using _64x64 = SmoothFuncs_C<64, 64, Pixel>; +}; + +using Defs = SmoothDefs<uint8_t>; + +// Initializes dsp entries for kTransformSize|W|x|H| from |DEFS| of +// the same size.
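+// For instance (editorial illustration), INIT_SMOOTH_WxH(Defs, 4, 4) below
+// expands to three assignments:
+//   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
+//       Defs::_4x4::Smooth;
+// plus the matching SmoothVertical and SmoothHorizontal entries.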
+#define INIT_SMOOTH_WxH(DEFS, W, H) \ + dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorSmooth] = \ + DEFS::_##W##x##H::Smooth; \ + dsp->intra_predictors[kTransformSize##W##x##H] \ + [kIntraPredictorSmoothVertical] = \ + DEFS::_##W##x##H::SmoothVertical; \ + dsp->intra_predictors[kTransformSize##W##x##H] \ + [kIntraPredictorSmoothHorizontal] = \ + DEFS::_##W##x##H::SmoothHorizontal + +#define INIT_SMOOTH(DEFS) \ + INIT_SMOOTH_WxH(DEFS, 4, 4); \ + INIT_SMOOTH_WxH(DEFS, 4, 8); \ + INIT_SMOOTH_WxH(DEFS, 4, 16); \ + INIT_SMOOTH_WxH(DEFS, 8, 4); \ + INIT_SMOOTH_WxH(DEFS, 8, 8); \ + INIT_SMOOTH_WxH(DEFS, 8, 16); \ + INIT_SMOOTH_WxH(DEFS, 8, 32); \ + INIT_SMOOTH_WxH(DEFS, 16, 4); \ + INIT_SMOOTH_WxH(DEFS, 16, 8); \ + INIT_SMOOTH_WxH(DEFS, 16, 16); \ + INIT_SMOOTH_WxH(DEFS, 16, 32); \ + INIT_SMOOTH_WxH(DEFS, 16, 64); \ + INIT_SMOOTH_WxH(DEFS, 32, 8); \ + INIT_SMOOTH_WxH(DEFS, 32, 16); \ + INIT_SMOOTH_WxH(DEFS, 32, 32); \ + INIT_SMOOTH_WxH(DEFS, 32, 64); \ + INIT_SMOOTH_WxH(DEFS, 64, 16); \ + INIT_SMOOTH_WxH(DEFS, 64, 32); \ + INIT_SMOOTH_WxH(DEFS, 64, 64) + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(8); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + INIT_SMOOTH(Defs); +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] = + Defs::_4x4::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] = + Defs::_4x4::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] = + Defs::_4x4::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] = + Defs::_4x8::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] = + Defs::_4x8::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] = + Defs::_4x8::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] = + Defs::_4x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] = + Defs::_4x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] = + Defs::_4x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] = + Defs::_8x4::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] = + Defs::_8x4::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] = + Defs::_8x4::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth +
dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] = + Defs::_8x8::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] = + Defs::_8x8::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] = + Defs::_8x8::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] = + Defs::_8x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] = + Defs::_8x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] = + Defs::_8x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] = + Defs::_8x32::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] = + Defs::_8x32::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] = + Defs::_8x32::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] = + Defs::_16x4::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] = + Defs::_16x4::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] = + Defs::_16x4::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] = + Defs::_16x8::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] = + Defs::_16x8::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] = + Defs::_16x8::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] = + Defs::_16x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] = + Defs::_16x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] = + Defs::_16x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] = + Defs::_16x32::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] = + 
Defs::_16x32::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] = + Defs::_16x32::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] = + Defs::_16x64::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] = + Defs::_16x64::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] = + Defs::_16x64::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] = + Defs::_32x8::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] = + Defs::_32x8::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] = + Defs::_32x8::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] = + Defs::_32x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] = + Defs::_32x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] = + Defs::_32x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] = + Defs::_32x32::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] = + Defs::_32x32::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] = + Defs::_32x32::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] = + Defs::_32x64::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] = + Defs::_32x64::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] = + Defs::_32x64::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] = + Defs::_64x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] = + Defs::_64x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] = + 
Defs::_64x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] = + Defs::_64x32::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] = + Defs::_64x32::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] = + Defs::_64x32::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] = + Defs::_64x64::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] = + Defs::_64x64::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] = + Defs::_64x64::SmoothHorizontal; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} // NOLINT(readability/fn_size) + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using DefsHbd = SmoothDefs<uint16_t>; + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(10); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + INIT_SMOOTH(DefsHbd); +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] = + DefsHbd::_4x4::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] = + DefsHbd::_4x4::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] = + DefsHbd::_4x4::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] = + DefsHbd::_4x8::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] = + DefsHbd::_4x8::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] = + DefsHbd::_4x8::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] = + DefsHbd::_4x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] = + DefsHbd::_4x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] = + DefsHbd::_4x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] = + DefsHbd::_8x4::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] = + DefsHbd::_8x4::SmoothVertical; +#endif +#ifndef
LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] = + DefsHbd::_8x4::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] = + DefsHbd::_8x8::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] = + DefsHbd::_8x8::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] = + DefsHbd::_8x8::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] = + DefsHbd::_8x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] = + DefsHbd::_8x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] = + DefsHbd::_8x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] = + DefsHbd::_8x32::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] = + DefsHbd::_8x32::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] = + DefsHbd::_8x32::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] = + DefsHbd::_16x4::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] = + DefsHbd::_16x4::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] = + DefsHbd::_16x4::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] = + DefsHbd::_16x8::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] = + DefsHbd::_16x8::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] = + DefsHbd::_16x8::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] = + DefsHbd::_16x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] = + DefsHbd::_16x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] = + DefsHbd::_16x16::SmoothHorizontal; +#endif + 
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] = + DefsHbd::_16x32::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] = + DefsHbd::_16x32::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] = + DefsHbd::_16x32::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] = + DefsHbd::_16x64::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] = + DefsHbd::_16x64::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] = + DefsHbd::_16x64::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] = + DefsHbd::_32x8::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] = + DefsHbd::_32x8::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] = + DefsHbd::_32x8::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] = + DefsHbd::_32x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] = + DefsHbd::_32x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] = + DefsHbd::_32x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] = + DefsHbd::_32x32::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] = + DefsHbd::_32x32::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] = + DefsHbd::_32x32::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] = + DefsHbd::_32x64::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] = + DefsHbd::_32x64::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] = + DefsHbd::_32x64::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] = + 
DefsHbd::_64x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
+      DefsHbd::_64x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_64x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
+      DefsHbd::_64x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
+      DefsHbd::_64x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_64x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
+      DefsHbd::_64x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
+      DefsHbd::_64x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_64x64::SmoothHorizontal;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}  // NOLINT(readability/fn_size)
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#undef INIT_SMOOTH_WxH
+#undef INIT_SMOOTH
+}  // namespace
+
+void IntraPredSmoothInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/intrapred_smooth.h b/src/dsp/intrapred_smooth.h
new file mode 100644
index 0000000..6802003
--- /dev/null
+++ b/src/dsp/intrapred_smooth.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_INTRAPRED_SMOOTH_H_
+#define LIBGAV1_SRC_DSP_INTRAPRED_SMOOTH_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/intrapred_smooth_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/intrapred_smooth_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_predictors[][kIntraPredictorSmooth.*].
+// This function is not thread-safe.
+void IntraPredSmoothInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_INTRAPRED_SMOOTH_H_
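The `#ifndef LIBGAV1_Dsp10bpp_...` guards above, together with the header comment about `LIBGAV1_DspXXX` defines, form the library's compile-time dispatch scheme: an architecture-specific header claims a table slot by defining the corresponding macro, and the C init routine only installs the portable fallback for unclaimed slots. Below is a minimal, standalone sketch of that pattern; `LIBGAV1_DspFoo`, `Foo_C`, and `foo_ptr` are hypothetical names invented for illustration, not part of libgav1.

```cpp
// Sketch of the #ifndef dispatch convention (hypothetical names): a SIMD
// header would #define LIBGAV1_DspFoo to claim the slot; the C init then
// leaves that slot alone and only fills in unclaimed entries.
#include <cstdio>

// Uncomment to simulate a SIMD header claiming the slot:
// #define LIBGAV1_DspFoo 1

namespace {
void Foo_C() { std::puts("C fallback"); }
void (*foo_ptr)() = nullptr;

void InitFoo() {
#ifndef LIBGAV1_DspFoo
  // No specialized version claimed this slot at compile time, so the
  // function table keeps the portable C implementation.
  foo_ptr = Foo_C;
#endif
}
}  // namespace

int main() {
  InitFoo();
  if (foo_ptr != nullptr) foo_ptr();
  return 0;
}
```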
diff --git a/src/dsp/intrapred_test.cc b/src/dsp/intrapred_test.cc
new file mode 100644
index 0000000..335aa2f
--- /dev/null
+++ b/src/dsp/intrapred_test.cc
@@ -0,0 +1,710 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <ostream>
+
+#include "absl/strings/match.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/intrapred_smooth.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kMaxBlockSize = 64;
+constexpr int kTotalPixels = kMaxBlockSize * kMaxBlockSize;
+
+template <int bitdepth, typename Pixel>
+class IntraPredTestBase : public testing::TestWithParam<TransformSize>,
+                          public test_utils::MaxAlignedAllocable {
+ public:
+  IntraPredTestBase() {
+    switch (tx_size_) {
+      case kNumTransformSizes:
+        EXPECT_NE(tx_size_, kNumTransformSizes);
+        break;
+      default:
+        block_width_ = kTransformWidth[tx_size_];
+        block_height_ = kTransformHeight[tx_size_];
+        break;
+    }
+  }
+
+  IntraPredTestBase(const IntraPredTestBase&) = delete;
+  IntraPredTestBase& operator=(const IntraPredTestBase&) = delete;
+  ~IntraPredTestBase() override = default;
+
+ protected:
+  struct IntraPredMem {
+    void Reset(libvpx_test::ACMRandom* rnd) {
+      ASSERT_NE(rnd, nullptr);
+      Pixel* const left = left_mem + 16;
+      Pixel* const top = top_mem + 16;
+      const int mask = (1 << bitdepth) - 1;
+      for (auto& r : ref_src) r = rnd->Rand16() & mask;
+      for (int i = 0; i < kMaxBlockSize; ++i) left[i] = rnd->Rand16() & mask;
+      for (int i = -1; i < kMaxBlockSize; ++i) top[i] = rnd->Rand16() & mask;
+
+      // Some directional predictors require top-right, bottom-left.
+      for (int i = kMaxBlockSize; i < 2 * kMaxBlockSize; ++i) {
+        left[i] = rnd->Rand16() & mask;
+        top[i] = rnd->Rand16() & mask;
+      }
+      // TODO(jzern): reorder this and regenerate the digests after switching
+      // random number generators.
+      // Upsampling in the directional predictors extends left/top[-1] to [-2].
+      left[-1] = rnd->Rand16() & mask;
+      left[-2] = rnd->Rand16() & mask;
+      top[-2] = rnd->Rand16() & mask;
+      memset(left_mem, 0, sizeof(left_mem[0]) * 14);
+      memset(top_mem, 0, sizeof(top_mem[0]) * 14);
+      memset(top_mem + kMaxBlockSize * 2 + 16, 0,
+             sizeof(top_mem[0]) * kTopMemPadding);
+    }
+
+    // Set ref_src, top-left, top and left to |pixel|.
+    void Set(const Pixel pixel) {
+      Pixel* const left = left_mem + 16;
+      Pixel* const top = top_mem + 16;
+      for (auto& r : ref_src) r = pixel;
+      // Upsampling in the directional predictors extends left/top[-1] to [-2].
+      for (int i = -2; i < 2 * kMaxBlockSize; ++i) {
+        left[i] = top[i] = pixel;
+      }
+    }
+
+    // DirectionalZone1_Large() overreads up to 7 pixels in |top_mem|.
+    static constexpr int kTopMemPadding = 7;
+    alignas(kMaxAlignment) Pixel dst[kTotalPixels];
+    alignas(kMaxAlignment) Pixel ref_src[kTotalPixels];
+    alignas(kMaxAlignment) Pixel left_mem[kMaxBlockSize * 2 + 16];
+    alignas(
+        kMaxAlignment) Pixel top_mem[kMaxBlockSize * 2 + 16 + kTopMemPadding];
+  };
+
+  void SetUp() override { test_utils::ResetDspTable(bitdepth); }
+
+  const TransformSize tx_size_ = GetParam();
+  int block_width_;
+  int block_height_;
+  IntraPredMem intra_pred_mem_;
+};
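The `IntraPredMem` layout above encodes the predictors' access pattern: the working `left`/`top` pointers sit 16 elements into their backing arrays so the negative indices used by edge upsampling (`left[-1]`, `left[-2]`, `top[-2]`) stay in bounds, and `kTopMemPadding` absorbs the documented 7-pixel overread of `DirectionalZone1_Large()`. A standalone sketch of that border arithmetic, reusing the same constants:

```cpp
// Border arithmetic behind left_mem/top_mem: offset the working pointer by
// 16 so negative indices are legal, and pad the tail for the overread.
#include <cassert>
#include <cstdint>

int main() {
  constexpr int kMaxBlockSize = 64;
  constexpr int kTopMemPadding = 7;  // overread of DirectionalZone1_Large()
  uint16_t top_mem[kMaxBlockSize * 2 + 16 + kTopMemPadding] = {};
  uint16_t* const top = top_mem + 16;  // same offset as in the test
  // Lowest index the predictors may touch is top[-2]; the highest in-bounds
  // access is top[2 * kMaxBlockSize - 1] plus the 7-element overread.
  assert(&top[-2] >= top_mem);
  assert(&top[2 * kMaxBlockSize - 1 + kTopMemPadding] <
         top_mem + sizeof(top_mem) / sizeof(top_mem[0]));
  return 0;
}
```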
+
+//------------------------------------------------------------------------------
+// IntraPredTest
+
+template <int bitdepth, typename Pixel>
+class IntraPredTest : public IntraPredTestBase<bitdepth, Pixel> {
+ public:
+  IntraPredTest() = default;
+  IntraPredTest(const IntraPredTest&) = delete;
+  IntraPredTest& operator=(const IntraPredTest&) = delete;
+  ~IntraPredTest() override = default;
+
+ protected:
+  using IntraPredTestBase<bitdepth, Pixel>::tx_size_;
+  using IntraPredTestBase<bitdepth, Pixel>::block_width_;
+  using IntraPredTestBase<bitdepth, Pixel>::block_height_;
+  using IntraPredTestBase<bitdepth, Pixel>::intra_pred_mem_;
+
+  void SetUp() override {
+    IntraPredTestBase<bitdepth, Pixel>::SetUp();
+    IntraPredInit_C();
+    IntraPredSmoothInit_C();
+
+    const Dsp* const dsp = GetDspTable(bitdepth);
+    ASSERT_NE(dsp, nullptr);
+    memcpy(base_intrapreds_, dsp->intra_predictors[tx_size_],
+           sizeof(base_intrapreds_));
+
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const char* const test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+      memset(base_intrapreds_, 0, sizeof(base_intrapreds_));
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      if ((GetCpuInfo() & kSSE4_1) != 0) {
+        IntraPredInit_SSE4_1();
+        IntraPredSmoothInit_SSE4_1();
+      }
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+      IntraPredInit_NEON();
+      IntraPredSmoothInit_NEON();
+    } else {
+      FAIL() << "Unrecognized architecture prefix in test case name: "
+             << test_case;
+    }
+
+    memcpy(cur_intrapreds_, dsp->intra_predictors[tx_size_],
+           sizeof(cur_intrapreds_));
+
+    for (int i = 0; i < kNumIntraPredictors; ++i) {
+      // skip functions that haven't been specialized for this particular
+      // architecture.
+      if (cur_intrapreds_[i] == base_intrapreds_[i]) {
+        cur_intrapreds_[i] = nullptr;
+      }
+    }
+  }
+
+  // These tests modify intra_pred_mem_.
+  void TestSpeed(const char* const digests[kNumIntraPredictors], int num_runs);
+  void TestSaturatedValues();
+  void TestRandomValues();
+
+  IntraPredictorFunc base_intrapreds_[kNumIntraPredictors];
+  IntraPredictorFunc cur_intrapreds_[kNumIntraPredictors];
+};
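`SetUp()` above uses a snapshot-and-compare idiom: it records the C function table, re-runs the architecture-specific init, then nulls every entry the SIMD init left identical to the baseline, so only genuinely specialized functions get exercised. A self-contained sketch of the same idiom with hypothetical names (`Func`, `table`, `InitSimd` are invented for illustration):

```cpp
// Snapshot the baseline table, re-initialize, then disable (nullptr) any
// entry the specialized init did not actually replace.
#include <cstddef>

namespace {
using Func = int (*)(int);
int BaseImpl(int x) { return x + 1; }
int SimdImpl(int x) { return x + 1; }  // stand-in for a NEON/SSE4 version

constexpr size_t kNumFuncs = 2;
Func table[kNumFuncs] = {BaseImpl, BaseImpl};

void InitSimd() { table[0] = SimdImpl; }  // only slot 0 is specialized
}  // namespace

int main() {
  Func base[kNumFuncs];
  for (size_t i = 0; i < kNumFuncs; ++i) base[i] = table[i];  // snapshot
  InitSimd();
  Func cur[kNumFuncs];
  for (size_t i = 0; i < kNumFuncs; ++i) {
    // An entry identical to the baseline was not specialized; testing it
    // would only re-test the C code, so it is skipped.
    cur[i] = (table[i] == base[i]) ? nullptr : table[i];
  }
  return (cur[0] != nullptr && cur[1] == nullptr) ? 0 : 1;
}
```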
+
+template <int bitdepth, typename Pixel>
+void IntraPredTest<bitdepth, Pixel>::TestSpeed(
+    const char* const digests[kNumIntraPredictors], const int num_runs) {
+  ASSERT_NE(digests, nullptr);
+  const auto* const left =
+      reinterpret_cast<const Pixel*>(intra_pred_mem_.left_mem + 16);
+  const auto* const top =
+      reinterpret_cast<const Pixel*>(intra_pred_mem_.top_mem + 16);
+
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+  intra_pred_mem_.Reset(&rnd);
+
+  for (int i = 0; i < kNumIntraPredictors; ++i) {
+    if (cur_intrapreds_[i] == nullptr) continue;
+    memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+           sizeof(intra_pred_mem_.dst));
+    const absl::Time start = absl::Now();
+    for (int run = 0; run < num_runs; ++run) {
+      const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+      cur_intrapreds_[i](intra_pred_mem_.dst, stride, top, left);
+    }
+    const absl::Duration elapsed_time = absl::Now() - start;
+    test_utils::CheckMd5Digest(ToString(tx_size_),
+                               ToString(static_cast<IntraPredictor>(i)),
+                               digests[i], intra_pred_mem_.dst,
+                               sizeof(intra_pred_mem_.dst), elapsed_time);
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void IntraPredTest<bitdepth, Pixel>::TestSaturatedValues() {
+  Pixel* const left = intra_pred_mem_.left_mem + 16;
+  Pixel* const top = intra_pred_mem_.top_mem + 16;
+  const auto kMaxPixel = static_cast<Pixel>((1 << bitdepth) - 1);
+  intra_pred_mem_.Set(kMaxPixel);
+
+  // skip DcFill
+  for (int i = 1; i < kNumIntraPredictors; ++i) {
+    if (cur_intrapreds_[i] == nullptr) continue;
+    memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src,
+           sizeof(intra_pred_mem_.dst));
+    const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel);
+    cur_intrapreds_[i](intra_pred_mem_.dst, stride, top, left);
+    if (!test_utils::CompareBlocks(intra_pred_mem_.dst,
+                                   intra_pred_mem_.ref_src, block_width_,
+                                   block_height_, kMaxBlockSize,
+                                   kMaxBlockSize, true)) {
+      ADD_FAILURE() << "Expected " << ToString(static_cast<IntraPredictor>(i))
+                    << " to produce a block containing '"
+                    << static_cast<int>(kMaxPixel) << "'";
+    }
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void IntraPredTest<bitdepth, Pixel>::TestRandomValues() {
+  // Use an alternate seed to differentiate this test from TestSpeed().
+  libvpx_test::ACMRandom rnd(test_utils::kAlternateDeterministicSeed);
+  for (int i = 0; i < kNumIntraPredictors; ++i) {
+    // Skip the 'C' test case as this is used as the reference.
+    if (base_intrapreds_[i] == nullptr) continue;
+    if (cur_intrapreds_[i] == nullptr) continue;
+    // It may be worthwhile to temporarily increase this loop size when testing
+    // changes that specifically affect this test.
+ for (int n = 0; n < 10000; ++n) { + intra_pred_mem_.Reset(&rnd); + + memcpy(intra_pred_mem_.dst, intra_pred_mem_.ref_src, + sizeof(intra_pred_mem_.dst)); + const Pixel* const top = intra_pred_mem_.top_mem + 16; + const Pixel* const left = intra_pred_mem_.left_mem + 16; + const ptrdiff_t stride = kMaxBlockSize * sizeof(Pixel); + base_intrapreds_[i](intra_pred_mem_.ref_src, stride, top, left); + cur_intrapreds_[i](intra_pred_mem_.dst, stride, top, left); + if (!test_utils::CompareBlocks( + intra_pred_mem_.dst, intra_pred_mem_.ref_src, block_width_, + block_height_, kMaxBlockSize, kMaxBlockSize, true)) { + ADD_FAILURE() << "Result from optimized version of " + << ToString(static_cast(i)) + << " differs from reference in iteration #" << n; + break; + } + } + } +} + +//------------------------------------------------------------------------------ + +using IntraPredTest8bpp = IntraPredTest<8, uint8_t>; + +const char* const* GetIntraPredDigests8bpp(TransformSize tx_size) { + static const char* const kDigests4x4[kNumIntraPredictors] = { + "7b1c762e28747f885d2b7d83cb8aa75c", "73353f179207f1432d40a132809e3a50", + "80c9237c838b0ec0674ccb070df633d5", "1cd79116b41fda884e7fa047f5eb14df", + "33211425772ee539a59981a2e9dc10c1", "d6f5f65a267f0e9a2752e8151cc1dcd7", + "7ff8c762cb766eb0665682152102ce4b", "2276b861ae4599de15938651961907ec", + "766982bc69f4aaaa8e71014c2dc219bc", "e2c31b5fd2199c49e17c31610339ab3f", + }; + static const char* const kDigests4x8[kNumIntraPredictors] = { + "0a0d8641ecfa0e82f541acdc894d5574", "1a40371af6cff9c278c5b0def9e4b3e7", + "3631a7a99569663b514f15b590523822", "646c7b592136285bd31501494e7393e7", + "ecbe89cc64dc2688123d3cfe865b5237", "79048e70ecbb7d43a4703f62718588c0", + "f3de11bf1198a00675d806d29c41d676", "32bb6cd018f6e871c342fcc21c7180cf", + "6f076a1e5ab3d69cf08811d62293e4be", "2a84460a8b189b4589824cf6b3b39954", + }; + static const char* const kDigests4x16[kNumIntraPredictors] = { + "cb8240be98444ede5ae98ca94afc1557", "460acbcf825a1fa0d8f2aa6bf2d6a21c", + "7896fdbbfe538dce1dc3a5b0873d74b0", "504aea29c6b27f21555d5516b8de2d8a", + "c5738e7fa82b91ea0e39232120da56ea", "19abbd934c243a6d9df7585d81332dd5", + "9e42b7b342e45c842dfa8aedaddbdfaa", "0e9eb07a89f8bf96bc219d5d1c3d9f6d", + "659393c31633e0f498bae384c9df5c7b", "bee3a28312da99dd550ec309ae4fff25", + }; + static const char* const kDigests8x4[kNumIntraPredictors] = { + "5950744064518f77867c8e14ebd8b5d7", "46b6cbdc76efd03f4ac77870d54739f7", + "efe21fd1b98cb1663950e0bf49483b3b", "3c647b64760b298092cbb8e2f5c06bfd", + "c3595929687ffb04c59b128d56e2632f", "d89ad2ddf8a74a520fdd1d7019fd75b4", + "53907cb70ad597ee5885f6c58201f98b", "09d2282a29008b7fb47eb60ed6653d06", + "e341fc1c910d7cb2dac5dbc58b9c9af9", "a8fabd4c259b607a90a2e4d18cae49de", + }; + static const char* const kDigests8x8[kNumIntraPredictors] = { + "06fb7cb52719855a38b4883b4b241749", "2013aafd42a4303efb553e42264ab8b0", + "2f070511d5680c12ca73a20e47fd6e23", "9923705af63e454392625794d5459fe0", + "04007a0d39778621266e2208a22c4fac", "2d296c202d36b4a53f1eaddda274e4a1", + "c87806c220d125c7563c2928e836fbbd", "339b49710a0099087e51ab5afc8d8713", + "c90fbc020afd9327bf35dccae099bf77", "95b356a7c346334d29294a5e2d13cfd9", + }; + static const char* const kDigests8x16[kNumIntraPredictors] = { + "3c5a4574d96b5bb1013429636554e761", "8cf56b17c52d25eb785685f2ab48b194", + "7911e2e02abfbe226f17529ac5db08fc", "064e509948982f66a14293f406d88d42", + "5c443aa713891406d5be3af4b3cf67c6", "5d2cb98e532822ca701110cda9ada968", + "3d58836e17918b8890012dd96b95bb9d", "20e8d61ddc451b9e553a294073349ffd", + 
"a9aa6cf9d0dcf1977a1853ccc264e40b", "103859f85750153f47b81f68ab7881f2", + }; + static const char* const kDigests8x32[kNumIntraPredictors] = { + "b393a2db7a76acaccc39e04d9dc3e8ac", "bbda713ee075a7ef095f0f479b5a1f82", + "f337dce3980f70730d6f6c2c756e3b62", "796189b05dc026e865c9e95491b255d1", + "ea932c21e7189eeb215c1990491320ab", "a9fffdf9455eba5e3b01317cae140289", + "9525dbfdbf5fba61ef9c7aa5fe887503", "8c6a7e3717ff8a459f415c79bb17341c", + "3761071bfaa2363a315fe07223f95a2d", "0e5aeb9b3f485b90df750469f60c15aa", + }; + static const char* const kDigests16x4[kNumIntraPredictors] = { + "1c0a950b3ac500def73b165b6a38467c", "95e7f7300f19da280c6a506e40304462", + "28a6af15e31f76d3ff189012475d78f5", "e330d67b859bceef62b96fc9e1f49a34", + "36eca3b8083ce2fb5f7e6227dfc34e71", "08f567d2abaa8e83e4d9b33b3f709538", + "dc2d0ba13aa9369446932f03b53dc77d", "9ab342944c4b1357aa79d39d7bebdd3a", + "77ec278c5086c88b91d68eef561ed517", "60fbe11bfe216c182aaacdec326c4dae", + }; + static const char* const kDigests16x8[kNumIntraPredictors] = { + "053a2bc4b5b7287fee524af4e77f077a", "619b720b13f14f32391a99ea7ff550d5", + "728d61c11b06baf7fe77881003a918b9", "889997b89a44c9976cb34f573e2b1eea", + "b43bfc31d1c770bb9ca5ca158c9beec4", "9d3fe9f762e0c6e4f114042147c50c7f", + "c74fdd7c9938603b01e7ecf9fdf08d61", "870c7336db1102f80f74526bd5a7cf4e", + "3fd5354a6190903d6a0b661fe177daf6", "409ca6b0b2558aeadf5ef2b8a887e67a", + }; + static const char* const kDigests16x16[kNumIntraPredictors] = { + "1fa9e2086f6594bda60c30384fbf1635", "2098d2a030cd7c6be613edc74dc2faf8", + "f3c72b0c8e73f1ddca04d14f52d194d8", "6b31f2ee24cf88d3844a2fc67e1f39f3", + "d91a22a83575e9359c5e4871ab30ddca", "24c32a0d38b4413d2ef9bf1f842c8634", + "6e9e47bf9da9b2b9ae293e0bbd8ff086", "968b82804b5200b074bcdba9718140d4", + "4e6d7e612c5ae0bbdcc51a453cd1db3f", "ce763a41977647d072f33e277d69c7b9", + }; + static const char* const kDigests16x32[kNumIntraPredictors] = { + "01afd04432026ff56327d6226b720be2", "a6e7be906cc6f1e7a520151bfa7c303d", + "bc05c46f18d0638f0228f1de64f07cd5", "204e613e429935f721a5b29cec7d44bb", + "aa0a7c9a7482dfc06d9685072fc5bafd", "ffb60f090d83c624bb4f7dc3a630ac4f", + "36bcb9ca9bb5eac520b050409de25da5", "34d9a5dd3363668391bc3bd05b468182", + "1e149c28db8b234e43931c347a523794", "6e8aff02470f177c3ff4416db79fc508", + }; + static const char* const kDigests16x64[kNumIntraPredictors] = { + "727797ef15ccd8d325476fe8f12006a3", "f77c544ac8035e01920deae40cee7b07", + "12b0c69595328c465e0b25e0c9e3e9fc", "3b2a053ee8b05a8ac35ad23b0422a151", + "f3be77c0fe67eb5d9d515e92bec21eb7", "f1ece6409e01e9dd98b800d49628247d", + "efd2ec9bfbbd4fd1f6604ea369df1894", "ec703de918422b9e03197ba0ed60a199", + "739418efb89c07f700895deaa5d0b3e3", "9943ae1bbeeebfe1d3a92dc39e049d63", + }; + static const char* const kDigests32x8[kNumIntraPredictors] = { + "4da55401331ed98acec0c516d7307513", "0ae6f3974701a5e6c20baccd26b4ca52", + "79b799f1eb77d5189535dc4e18873a0e", "90e943adf3de4f913864dce4e52b4894", + "5e1b9cc800a89ef45f5bdcc9e99e4e96", "3103405df20d254cbf32ac30872ead4b", + "648550e369b77687bff3c7d6f249b02f", "f9f73bcd8aadfc059fa260325df957a1", + "204cef70d741c25d4fe2b1d10d2649a5", "04c05e18488496eba64100faa25e8baf", + }; + static const char* const kDigests32x16[kNumIntraPredictors] = { + "86ad1e1047abaf9959150222e8f19593", "1908cbe04eb4e5c9d35f1af7ffd7ee72", + "6ad3bb37ebe8374b0a4c2d18fe3ebb6a", "08d3cfe7a1148bff55eb6166da3378c6", + "656a722394764d17b6c42401b9e0ad3b", "4aa00c192102efeb325883737e562f0d", + "9881a90ca88bca4297073e60b3bb771a", "8cd74aada398a3d770fc3ace38ecd311", + 
"0a927e3f5ff8e8338984172cc0653b13", "d881d68b4eb3ee844e35e04ad6721f5f", + }; + static const char* const kDigests32x32[kNumIntraPredictors] = { + "1303ca680644e3d8c9ffd4185bb2835b", "2a4d9f5cc8da307d4cf7dc021df10ba9", + "ced60d3f4e4b011a6a0314dd8a4b1fd8", "ced60d3f4e4b011a6a0314dd8a4b1fd8", + "1464b01aa928e9bd82c66bad0f921693", "90deadfb13d7c3b855ba21b326c1e202", + "af96a74f8033dff010e53a8521bc6f63", "9f1039f2ef082aaee69fcb7d749037c2", + "3f82893e478e204f2d254b34222d14dc", "ddb2b95ffb65b84dd4ff1f7256223305", + }; + static const char* const kDigests32x64[kNumIntraPredictors] = { + "e1e8ed803236367821981500a3d9eebe", "0f46d124ba9f48cdd5d5290acf786d6d", + "4e2a2cfd8f56f15939bdfc753145b303", "0ce332b343934b34cd4417725faa85cb", + "1d2f8e48e3adb7c448be05d9f66f4954", "9fb2e176636a5689b26f73ca73fcc512", + "e720ebccae7e25e36f23da53ae5b5d6a", "86fe4364734169aaa4520d799890d530", + "b1870290764bb1b100d1974e2bd70f1d", "ce5b238e19d85ef69d85badfab4e63ae", + }; + static const char* const kDigests64x16[kNumIntraPredictors] = { + "de1b736e9d99129609d6ef3a491507a0", "516d8f6eb054d74d150e7b444185b6b9", + "69e462c3338a9aaf993c3f7cfbc15649", "821b76b1494d4f84d20817840f719a1a", + "fd9b4276e7affe1e0e4ce4f428058994", "cd82fd361a4767ac29a9f406b480b8f3", + "2792c2f810157a4a6cb13c28529ff779", "1220442d90c4255ba0969d28b91e93a6", + "c7253e10b45f7f67dfee3256c9b94825", "879792198071c7e0b50b9b5010d8c18f", + }; + static const char* const kDigests64x32[kNumIntraPredictors] = { + "e48e1ac15e97191a8fda08d62fff343e", "80c15b303235f9bc2259027bb92dfdc4", + "538424b24bd0830f21788e7238ca762f", "a6c5aeb722615089efbca80b02951ceb", + "12604b37875533665078405ef4582e35", "0048afa17bd3e1632d68b96048836530", + "07a0cfcb56a5eed50c4bd6c26814336b", "529d8a070de5bc6531fa3ee8f450c233", + "33c50a11c7d78f72434064f634305e95", "e0ef7f0559c1a50ec5a8c12011b962f7", + }; + static const char* const kDigests64x64[kNumIntraPredictors] = { + "a1650dbcd56e10288c3e269eca37967d", "be91585259bc37bf4dc1651936e90b3e", + "afe020786b83b793c2bbd9468097ff6e", "6e1094fa7b50bc813aa2ba29f5df8755", + "9e5c34f3797e0cdd3cd9d4c05b0d8950", "bc87be7ac899cc6a28f399d7516c49fe", + "9811fd0d2dd515f06122f5d1bd18b784", "3c140e466f2c2c0d9cb7d2157ab8dc27", + "9543de76c925a8f6adc884cc7f98dc91", "df1df0376cc944afe7e74e94f53e575a", + }; + + switch (tx_size) { + case kTransformSize4x4: + return kDigests4x4; + case kTransformSize4x8: + return kDigests4x8; + case kTransformSize4x16: + return kDigests4x16; + case kTransformSize8x4: + return kDigests8x4; + case kTransformSize8x8: + return kDigests8x8; + case kTransformSize8x16: + return kDigests8x16; + case kTransformSize8x32: + return kDigests8x32; + case kTransformSize16x4: + return kDigests16x4; + case kTransformSize16x8: + return kDigests16x8; + case kTransformSize16x16: + return kDigests16x16; + case kTransformSize16x32: + return kDigests16x32; + case kTransformSize16x64: + return kDigests16x64; + case kTransformSize32x8: + return kDigests32x8; + case kTransformSize32x16: + return kDigests32x16; + case kTransformSize32x32: + return kDigests32x32; + case kTransformSize32x64: + return kDigests32x64; + case kTransformSize64x16: + return kDigests64x16; + case kTransformSize64x32: + return kDigests64x32; + case kTransformSize64x64: + return kDigests64x64; + default: + ADD_FAILURE() << "Unknown transform size: " << tx_size; + return nullptr; + } +} + +TEST_P(IntraPredTest8bpp, DISABLED_Speed) { + const auto num_runs = + static_cast(2.0e9 / (block_width_ * block_height_)); + TestSpeed(GetIntraPredDigests8bpp(tx_size_), num_runs); +} + 
+TEST_P(IntraPredTest8bpp, FixedInput) { + TestSpeed(GetIntraPredDigests8bpp(tx_size_), 1); +} + +TEST_P(IntraPredTest8bpp, Overflow) { TestSaturatedValues(); } +TEST_P(IntraPredTest8bpp, Random) { TestRandomValues(); } + +//------------------------------------------------------------------------------ + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using IntraPredTest10bpp = IntraPredTest<10, uint16_t>; + +const char* const* GetIntraPredDigests10bpp(TransformSize tx_size) { + static const char* const kDigests4x4[kNumIntraPredictors] = { + "432bf9e762416bec582cb3654cbc4545", "8b9707ff4d506e0cb326f2d9a8d78705", + "a076275258cc5af87ed8b075136fb219", "f9587004012a8d2cecaa347331ccdf96", + "1c4e6890c5e6eed495fe54a6b6df8d6f", "0ae15fae8969a3c972ee895f325955a3", + "97db177738b831da8066df4f3fb7adbd", "4add5685b8a56991c9dce4ff7086ec25", + "75c6a655256188e378e70658b8f1631f", "14a27db20f9d5594ef74a7ea10c3e5ef", + }; + static const char* const kDigests4x8[kNumIntraPredictors] = { + "9cbd7c18aca2737fa41db27150798819", "13d1e734692e27339c10b07da33c1113", + "0617cf74e2dd5d34ea517af1767fa47e", "c6a7b01228ccdf74af8528ef8f5f55c6", + "13b05d87b3d566b2f7a4b332cd8a762e", "b26ae0e8da1fe8989dfe2900fa2c3847", + "c30f3acdd386bdac91028fe48b751810", "04d2baf5192c5af97ca18d3b9b0d5968", + "a0ef82983822fc815bf1e8326cd41e33", "20bf218bae5f6b5c6d56b85f3f9bbadb", + }; + static const char* const kDigests4x16[kNumIntraPredictors] = { + "d9b47bdddaa5e22312ff9ece7a3cae08", "cb76c79971b502dd8999a7047b3e2f86", + "3b09a3ff431d03b379acfdc444602540", "88608f6fcd687831e871053723cf76c3", + "a7bd2a17de1cf19c9a4b2c550f277a5c", "29b389f564f266a67687b8d2bc750418", + "4680847c30fe93c06f87e2ee1da544d6", "0e4eda11e1fe6ebe8526c2a2c5390bbb", + "bf3e20197282885acabb158f3a77ba59", "fccea71d1a253316b905f4a073c84a36", + }; + static const char* const kDigests8x4[kNumIntraPredictors] = { + "05ba0ed96aac48cd94e7597f12184320", "d97d04e791904d3cedc34d5430a4d1d2", + "49217081a169c2d30b0a43f816d0b58b", "09e2a6a6bfe35b83e9434ee9c8dcf417", + "4b03c8822169ee4fa058513d65f0e32f", "cabdeebc923837ee3f2d3480354d6a81", + "957eda610a23a011ed25976aee94eaf0", "4a197e3dfce1f0d3870138a9b66423aa", + "18c0d0fbe0e96a0baf2f98fa1908cbb9", "21114e5737328cdbba9940e4f85a0855", + }; + static const char* const kDigests8x8[kNumIntraPredictors] = { + "430e99eecda7e6434e1973dbdcc2a29d", "88864d7402c09b57735db49c58707304", + "8312f80b936380ceb51375e29a4fd75d", "472a7ed9c68bdbd9ecca197b7a8b3f01", + "4f66ee4dc0cb752c3b65d576cd06bb5c", "36383d6f61799143470129e2d5241a6f", + "c96279406c8d2d02771903e93a4e8d37", "4fb64f9700ed0bf08fbe7ab958535348", + "c008c33453ac9cf8c42ae6ec88f9941c", "39c401a9938b23e318ae7819e458daf1", + }; + static const char* const kDigests8x16[kNumIntraPredictors] = { + "bda6b75fedfe0705f9732ff84c918672", "4ff130a47429e0762386557018ec10b2", + "8156557bf938d8e3a266318e57048fc5", "bdfa8e01a825ec7ae2d80519e3c94eec", + "108fc8e5608fe09f9cc30d7a52cbc0c1", "a2271660af5424b64c6399ca5509dee1", + "b09af9729f39516b28ff62363f8c0cb2", "4fe67869dac99048dfcf4d4e621884ec", + "311f498369a9c98f77a961bf91e73e65", "d66e78b9f41d5ee6a4b25e37ec9af324", + }; + static const char* const kDigests8x32[kNumIntraPredictors] = { + "26c45325f02521e7e5c66c0aa0819329", "79dfb68513d4ccd2530c485f0367858e", + "8288e99b4d738b13956882c3ad3f03fe", "7c4993518b1620b8be8872581bb72239", + "2b1c3126012d981f787ed0a2601ee377", "051ba9f0c4d4fecb1fcd81fdea94cae4", + "320362239ad402087303a4df39512bb1", "210df35b2055c9c01b9e3e5ae24e524b", + "f8536db74ce68c0081bbd8799dac25f9", "27f2fe316854282579906d071af6b705", + }; + static 
const char* const kDigests16x4[kNumIntraPredictors] = { + "decff67721ff7e9e65ec641e78f5ccf3", "99e3b2fbdabfa9b76b749cfb6530a9fd", + "accdb3d25629916963a069f1e1c0e061", "ad42855e9146748b0e235b8428487b4b", + "53025e465f267e7af2896ebd028447a0", "577d26fcd2d655cc77a1f1f875648699", + "7a61a3619267221b448b20723840e9f0", "fb4ccc569bdae3614e87bc5be1e84284", + "b866095d8a3e6910cc4f92f8d8d6075a", "6ba9013cba1624872bfbac111e8d344a", + }; + static const char* const kDigests16x8[kNumIntraPredictors] = { + "2832156bd076c75f8be5622f34cb3efe", "da70e516f5a8842dd4965b80cd8d2a76", + "c3e137c6d79c57be2073d1eda22c8d1e", "8c5d28c7b3301b50326582dd7f89a175", + "9d8558775155b201cd178ab61458b642", "ecbddb9c6808e0c609c8fe537b7f7408", + "29a123c22cb4020170f9a80edf1208da", "653d0cd0688aa682334156f7b4599b34", + "1bfa66ae92a22a0346511db1713fe7df", "1802ad1e657e7fc08fc063342f471ca1", + }; + static const char* const kDigests16x16[kNumIntraPredictors] = { + "2270c626de9d49769660ae9184a6428f", "9f069625cdcdd856e2e7ec19ff4fcd50", + "34167b9c413362a377aa7b1faf92ae6d", "3cec2b23d179765daea8dfb87c9efdd5", + "daa8f0863a5df2aef2b20999961cc8f8", "d9e4dd4bc63991e4f09cb97eb25f4db4", + "4e1a182fc3fcf5b9f5a73898f81c2004", "c58e4275406c9fd1c2a74b40c27afff0", + "b8092796fd4e4dd9d2b92afb770129ba", "75424d1f18ff00c4093743d033c6c9b6", + }; + static const char* const kDigests16x32[kNumIntraPredictors] = { + "5aa050947f3d488537f5a68c23bb135b", "9e66143a2c3863b6fe171275a192d378", + "86b0c4777625e84d52913073d234f860", "9e2144fcf2107c76cec4241416bbecd5", + "c72be592efc72c3c86f2359b6f622aba", "c4e0e735545f78f43e21e9c39eab7b8f", + "52122e7c84a4bab67a8a359efb427023", "7b5fd8bb7e0744e81fd6fa4ed4c2e0fb", + "a9950d110bffb0411a8fcd1262dceef0", "2a2dd496f01f5d87f257ed202a703cbe", + }; + static const char* const kDigests16x64[kNumIntraPredictors] = { + "eeb1b873e81ca428b11f162bd5b28843", "39ce7d22791f82562b0ca1e0afdf1604", + "6bd6bdac8982a4b84613f9963d35d5e9", "a9ac2438e87522621c7e6fe6d02c01ab", + "a8b9c471fe6c66ed0717e77fea77bba1", "e050b6aa38aee6e951d3be5a94a8abd0", + "3c5ecc31aa45e8175d37e90af247bca6", "30c0f9e412ea726970f575f910edfb94", + "f3d96395816ce58fb98480a5b4c32ab2", "9c14811957e013fb009dcd4a3716b338", + }; + static const char* const kDigests32x8[kNumIntraPredictors] = { + "d6560d7fc9ae9bd7c25e2983b4a825e3", "90a67154bbdc26cd06ab0fa25fff3c53", + "c42d37c5a634e68fafc982626842db0b", "ecc8646d258cfa431facbc0dba168f80", + "9f3c167b790b52242dc8686c68eac389", "62dc3bc34406636ccec0941579461f65", + "5c0f0ebdb3c936d4decc40d5261aec7c", "dbfc0f056ca25e0331042da6d292e10a", + "14fa525d74e6774781198418d505c595", "5f95e70db03da9ed70cd79e23f19199c", + }; + static const char* const kDigests32x16[kNumIntraPredictors] = { + "dfe3630aa9eeb1adcc8604269a309f26", "ba6180227d09f5a573f69dc6ee1faf80", + "03edea9d71ca3d588e1a0a69aecdf555", "2c8805415f44b4fac6692090dc1b1ddd", + "18efd17ed72a6e92ef8b0a692cf7a2e3", "63a6e0abfb839b43c68c23b2c43c8918", + "be15479205bb60f5a17baaa81a6b47ad", "243d21e1d9f9dd2b981292ac7769315a", + "21de1cb5269e0e1d08930c519e676bf7", "73065b3e27e9c4a3a6d043712d3d8b25", + }; + static const char* const kDigests32x32[kNumIntraPredictors] = { + "c3136bb829088e33401b1affef91f692", "68bbcf93d17366db38bbc7605e07e322", + "2786be5fb7c25eeec4d2596c4154c3eb", "25ac7468e691753b8291be859aac7493", + "a6805ce21bfd26760e749efc8f590fa3", "5a38fd324b466e8ac43f5e289d38107e", + "dd0628fc5cc920b82aa941378fa907c8", "8debadbdb2dec3dc7eb43927e9d36998", + "61e1bc223c9e04c64152cc4531b6c099", "900b00ac1f20c0a8d22f8b026c0ee1cc", + }; + static const char* const 
kDigests32x64[kNumIntraPredictors] = { + "5a591b2b83f0a6cce3c57ce164a5f983", "f42167ec516102b83b2c5176df57316b", + "58f3772d3df511c8289b340beb178d96", "c24166e7dc252d34ac6f92712956d751", + "7dca3acfe2ea09e6292a9ece2078b827", "5c029235fc0820804e40187d2b22a96e", + "375572944368afbc04ca97dab7fb3328", "8867235908736fd99c4022e4ed604e6e", + "63ec336034d62846b75558c49082870f", "46f35d85eb8499d61bfeac1c49e52531", + }; + static const char* const kDigests64x16[kNumIntraPredictors] = { + "67755882209304659a0e6bfc324e16b9", "cd89b272fecb5f23431b3f606f590722", + "9bcff7d971a4af0a2d1cac6d66d83482", "d8d6bb55ebeec4f03926908d391e15ba", + "0eb5b5ced3e7177a1dd6a1e72e7a7d21", "92b47fe431d9cf66f9e601854f0f3017", + "7dc599557eddb2ea480f86fc89c76b30", "4f40175676c164320fe8005440ad9217", + "b00eacb24081a041127f136e9e5983ec", "cb0ab76a5e90f2eb75c38b99b9833ff8", + }; + static const char* const kDigests64x32[kNumIntraPredictors] = { + "21d873011d1b4ef1daedd9aa8c6938ea", "4866da21db0261f738903d97081cb785", + "a722112233a82595a8d001a4078b834d", "24c7a133c6fcb59129c3782ef908a6c1", + "490e40505dd255d3a909d8a72c280cbc", "2afe719fb30bf2a664829bb74c8f9e2a", + "623adad2ebb8f23e355cd77ace4616cd", "d6092541e9262ad009bef79a5d350a86", + "ae86d8fba088683ced8abfd7e1ddf380", "32aa8aa21f2f24333d31f99e12b95c53", + }; + static const char* const kDigests64x64[kNumIntraPredictors] = { + "6d88aeb40dfe3ac43c68808ca3c00806", "6a75d88ac291d6a3aaf0eec0ddf2aa65", + "30ef52d7dc451affdd587c209f5cb2dd", "e073f7969f392258eaa907cf0636452a", + "de10f07016a2343bcd3a9deb29f4361e", "dc35ff273fea4355d2c8351c2ed14e6e", + "01b9a545968ac75c3639ddabb837fa0b", "85c98ed9c0ea1523a15281bc9a909b8c", + "4c255f7ef7fd46db83f323806d79dca4", "fe2fe6ffb19cb8330e2f2534271d6522", + }; + + switch (tx_size) { + case kTransformSize4x4: + return kDigests4x4; + case kTransformSize4x8: + return kDigests4x8; + case kTransformSize4x16: + return kDigests4x16; + case kTransformSize8x4: + return kDigests8x4; + case kTransformSize8x8: + return kDigests8x8; + case kTransformSize8x16: + return kDigests8x16; + case kTransformSize8x32: + return kDigests8x32; + case kTransformSize16x4: + return kDigests16x4; + case kTransformSize16x8: + return kDigests16x8; + case kTransformSize16x16: + return kDigests16x16; + case kTransformSize16x32: + return kDigests16x32; + case kTransformSize16x64: + return kDigests16x64; + case kTransformSize32x8: + return kDigests32x8; + case kTransformSize32x16: + return kDigests32x16; + case kTransformSize32x32: + return kDigests32x32; + case kTransformSize32x64: + return kDigests32x64; + case kTransformSize64x16: + return kDigests64x16; + case kTransformSize64x32: + return kDigests64x32; + case kTransformSize64x64: + return kDigests64x64; + default: + ADD_FAILURE() << "Unknown transform size: " << tx_size; + return nullptr; + } +} + +TEST_P(IntraPredTest10bpp, DISABLED_Speed) { + const auto num_runs = + static_cast(2.0e9 / (block_width_ * block_height_)); + TestSpeed(GetIntraPredDigests10bpp(tx_size_), num_runs); +} + +TEST_P(IntraPredTest10bpp, FixedInput) { + TestSpeed(GetIntraPredDigests10bpp(tx_size_), 1); +} + +TEST_P(IntraPredTest10bpp, Overflow) { TestSaturatedValues(); } +TEST_P(IntraPredTest10bpp, Random) { TestRandomValues(); } +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +constexpr TransformSize kTransformSizes[] = { + kTransformSize4x4, kTransformSize4x8, kTransformSize4x16, + kTransformSize8x4, kTransformSize8x8, kTransformSize8x16, + kTransformSize8x32, kTransformSize16x4, kTransformSize16x8, + kTransformSize16x16, kTransformSize16x32, 
kTransformSize16x64,
+    kTransformSize32x8,  kTransformSize32x16, kTransformSize32x32,
+    kTransformSize32x64, kTransformSize64x16, kTransformSize64x32,
+    kTransformSize64x64};
+
+INSTANTIATE_TEST_SUITE_P(C, IntraPredTest8bpp,
+                         testing::ValuesIn(kTransformSizes));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, IntraPredTest8bpp,
+                         testing::ValuesIn(kTransformSizes));
+#endif  // LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, IntraPredTest8bpp,
+                         testing::ValuesIn(kTransformSizes));
+#endif  // LIBGAV1_ENABLE_NEON
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+INSTANTIATE_TEST_SUITE_P(C, IntraPredTest10bpp,
+                         testing::ValuesIn(kTransformSizes));
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, IntraPredTest10bpp,
+                         testing::ValuesIn(kTransformSizes));
+#endif  // LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, IntraPredTest10bpp,
+                         testing::ValuesIn(kTransformSizes));
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+}  // namespace
+}  // namespace dsp
+
+static std::ostream& operator<<(std::ostream& os, const TransformSize tx_size) {
+  return os << ToString(tx_size);
+}
+
+}  // namespace libgav1
diff --git a/src/dsp/inverse_transform.cc b/src/dsp/inverse_transform.cc
new file mode 100644
index 0000000..1b0064f
--- /dev/null
+++ b/src/dsp/inverse_transform.cc
@@ -0,0 +1,1630 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/inverse_transform.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Include the constants and utility functions inside the anonymous namespace.
+#include "src/dsp/inverse_transform.inc"
+
+constexpr uint8_t kTransformColumnShift = 4;
+
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
+#undef LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK
+#endif
+
+int32_t RangeCheckValue(int32_t value, int8_t range) {
+#if defined(LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK) && \
+    LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK
+  assert(range <= 32);
+  const auto min = static_cast<int32_t>(-(uint32_t{1} << (range - 1)));
+  const auto max = static_cast<int32_t>((uint32_t{1} << (range - 1)) - 1);
+  if (min > value || value > max) {
+    LIBGAV1_DLOG(ERROR, "coeff out of bit range, value: %d bit range %d\n",
+                 value, range);
+    assert(min <= value && value <= max);
+  }
+#endif  // LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK
+  static_cast<void>(range);
+  return value;
+}
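The butterfly below is a planar rotation in Q12 fixed point: the AV1 spec's cos128/sin128 tables hold cos(angle * pi / 128) and sin(angle * pi / 128) scaled by 2^12, and the products are shifted back down by 12 with rounding. A standalone numeric sketch of that shape (the local `RightShiftWithRounding` mirrors the libgav1 helper of the same name; the Q12 cos/sin values are computed here with `std::cos`/`std::sin` rather than taken from `inverse_transform.inc`):

```cpp
// Q12 fixed-point rotation, the same shape as ButterflyRotation_C below.
#include <cmath>
#include <cstdint>
#include <cstdio>

namespace {
int64_t RightShiftWithRounding(int64_t x, int shift) {
  return (x + (int64_t{1} << (shift - 1))) >> shift;
}
}  // namespace

int main() {
  const double kPi = 3.14159265358979323846;
  // cos128(angle) ~= cos(angle * pi / 128) * 2^12; angle = 32 is 45 degrees.
  const int angle = 32;
  const auto cos_q12 =
      static_cast<int32_t>(std::lround(std::cos(angle * kPi / 128) * 4096));
  const auto sin_q12 =
      static_cast<int32_t>(std::lround(std::sin(angle * kPi / 128) * 4096));
  const int32_t a = 100;
  const int32_t b = 50;
  // Form the products, accumulate in 64 bits, then shift out the Q12 scale.
  const int64_t x = int64_t{a} * cos_q12 - int64_t{b} * sin_q12;
  const int64_t y = int64_t{a} * sin_q12 + int64_t{b} * cos_q12;
  // Rotating (100, 50) by 45 degrees gives roughly (35.36, 106.07).
  std::printf("(%lld, %lld)\n",  // prints (35, 106)
              static_cast<long long>(RightShiftWithRounding(x, 12)),
              static_cast<long long>(RightShiftWithRounding(y, 12)));
  return 0;
}
```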
+
+template <typename Residual>
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_C(Residual* const dst, int a,
+                                               int b, int angle, bool flip,
+                                               int8_t range) {
+  // Note that we multiply in 32 bits and then add/subtract the products in 64
+  // bits. The 32-bit multiplications do not overflow. Please see the comment
+  // and assert() in Cos128().
+  const int64_t x = static_cast<int64_t>(dst[a] * Cos128(angle)) -
+                    static_cast<int64_t>(dst[b] * Sin128(angle));
+  const int64_t y = static_cast<int64_t>(dst[a] * Sin128(angle)) +
+                    static_cast<int64_t>(dst[b] * Cos128(angle));
+  // Section 7.13.2.1: It is a requirement of bitstream conformance that the
+  // values saved into the array T by this function are representable by a
+  // signed integer using |range| bits of precision.
+  dst[a] = RangeCheckValue(RightShiftWithRounding(flip ? y : x, 12), range);
+  dst[b] = RangeCheckValue(RightShiftWithRounding(flip ? x : y, 12), range);
+}
+
+template <typename Residual>
+void ButterflyRotationFirstIsZero_C(Residual* const dst, int a, int b,
+                                    int angle, bool flip, int8_t range) {
+  // Note that we multiply in 32 bits and then add/subtract the products in 64
+  // bits. The 32-bit multiplications do not overflow. Please see the comment
+  // and assert() in Cos128().
+  const auto x = static_cast<int64_t>(dst[b] * -Sin128(angle));
+  const auto y = static_cast<int64_t>(dst[b] * Cos128(angle));
+  // Section 7.13.2.1: It is a requirement of bitstream conformance that the
+  // values saved into the array T by this function are representable by a
+  // signed integer using |range| bits of precision.
+  dst[a] = RangeCheckValue(RightShiftWithRounding(flip ? y : x, 12), range);
+  dst[b] = RangeCheckValue(RightShiftWithRounding(flip ? x : y, 12), range);
+}
+
+template <typename Residual>
+void ButterflyRotationSecondIsZero_C(Residual* const dst, int a, int b,
+                                     int angle, bool flip, int8_t range) {
+  // Note that we multiply in 32 bits and then add/subtract the products in 64
+  // bits. The 32-bit multiplications do not overflow. Please see the comment
+  // and assert() in Cos128().
+  const auto x = static_cast<int64_t>(dst[a] * Cos128(angle));
+  const auto y = static_cast<int64_t>(dst[a] * Sin128(angle));
+
+  // Section 7.13.2.1: It is a requirement of bitstream conformance that the
+  // values saved into the array T by this function are representable by a
+  // signed integer using |range| bits of precision.
+  dst[a] = RangeCheckValue(RightShiftWithRounding(flip ? y : x, 12), range);
+  dst[b] = RangeCheckValue(RightShiftWithRounding(flip ? x : y, 12), range);
+}
+
+template <typename Residual>
+void HadamardRotation_C(Residual* const dst, int a, int b, bool flip,
+                        int8_t range) {
+  if (flip) std::swap(a, b);
+  --range;
+  // For Adst and Dct, the maximum possible value for range is 20. So min and
+  // max should always fit into int32_t.
+  const int32_t min = -(1 << range);
+  const int32_t max = (1 << range) - 1;
+  const int32_t x = dst[a] + dst[b];
+  const int32_t y = dst[a] - dst[b];
+  dst[a] = Clip3(x, min, max);
+  dst[b] = Clip3(y, min, max);
+}
+
+template <int bitdepth, typename Residual>
+void ClampIntermediate(Residual* const dst, int size) {
+  // If Residual is int16_t (which implies bitdepth is 8), we don't need to
+  // clip residual[i][j] to 16 bits.
+  if (sizeof(Residual) > 2) {
+    const Residual intermediate_clamp_max =
+        (1 << (std::max(bitdepth + 6, 16) - 1)) - 1;
+    const Residual intermediate_clamp_min = -intermediate_clamp_max - 1;
+    for (int j = 0; j < size; ++j) {
+      dst[j] = Clip3(dst[j], intermediate_clamp_min, intermediate_clamp_max);
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Discrete Cosine Transforms (DCT).
+
+// Value for index (i, j) is computed as bitreverse(j) and interpreting that as
+// an integer with bit-length i + 2.
+// For e.g. index (2, 3) will be computed as follows:
+//   * bitreverse(3) = bitreverse(..000011) = 110000...
+// * interpreting that as an integer with bit-length 2+2 = 4 will be 1100 = 12 +constexpr uint8_t kBitReverseLookup[kNumTransform1dSizes][64] = { + {0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, + 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, + 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3}, + {0, 4, 2, 6, 1, 5, 3, 7, 0, 4, 2, 6, 1, 5, 3, 7, 0, 4, 2, 6, 1, 5, + 3, 7, 0, 4, 2, 6, 1, 5, 3, 7, 0, 4, 2, 6, 1, 5, 3, 7, 0, 4, 2, 6, + 1, 5, 3, 7, 0, 4, 2, 6, 1, 5, 3, 7, 0, 4, 2, 6, 1, 5, 3, 7}, + {0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15, + 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15, + 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15, + 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15}, + {0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30, + 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31, + 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30, + 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31}, + {0, 32, 16, 48, 8, 40, 24, 56, 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, 7, 39, 23, 55, 15, 47, 31, 63}}; + +template +void Dct_C(void* dest, int8_t range) { + static_assert(size_log2 >= 2 && size_log2 <= 6, ""); + auto* const dst = static_cast(dest); + // stage 1. + const int size = 1 << size_log2; + Residual temp[size]; + memcpy(temp, dst, sizeof(temp)); + for (int i = 0; i < size; ++i) { + dst[i] = temp[kBitReverseLookup[size_log2 - 2][i]]; + } + // stages 2-32 are dependent on the value of size_log2. + // stage 2. + if (size_log2 == 6) { + for (int i = 0; i < 16; ++i) { + ButterflyRotation_C(dst, i + 32, 63 - i, + 63 - MultiplyBy4(kBitReverseLookup[2][i]), false, + range); + } + } + // stage 3 + if (size_log2 >= 5) { + for (int i = 0; i < 8; ++i) { + ButterflyRotation_C(dst, i + 16, 31 - i, + 6 + MultiplyBy8(kBitReverseLookup[1][7 - i]), false, + range); + } + } + // stage 4. + if (size_log2 == 6) { + for (int i = 0; i < 16; ++i) { + HadamardRotation_C(dst, MultiplyBy2(i) + 32, MultiplyBy2(i) + 33, + static_cast(i & 1), range); + } + } + // stage 5. + if (size_log2 >= 4) { + for (int i = 0; i < 4; ++i) { + ButterflyRotation_C(dst, i + 8, 15 - i, + 12 + MultiplyBy16(kBitReverseLookup[0][3 - i]), false, + range); + } + } + // stage 6. + if (size_log2 >= 5) { + for (int i = 0; i < 8; ++i) { + HadamardRotation_C(dst, MultiplyBy2(i) + 16, MultiplyBy2(i) + 17, + static_cast(i & 1), range); + } + } + // stage 7. + if (size_log2 == 6) { + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 2; ++j) { + ButterflyRotation_C( + dst, 62 - MultiplyBy4(i) - j, MultiplyBy4(i) + j + 33, + 60 - MultiplyBy16(kBitReverseLookup[0][i]) + MultiplyBy64(j), true, + range); + } + } + } + // stage 8. + if (size_log2 >= 3) { + for (int i = 0; i < 2; ++i) { + ButterflyRotation_C(dst, i + 4, 7 - i, 56 - 32 * i, false, range); + } + } + // stage 9. + if (size_log2 >= 4) { + for (int i = 0; i < 4; ++i) { + HadamardRotation_C(dst, MultiplyBy2(i) + 8, MultiplyBy2(i) + 9, + static_cast(i & 1), range); + } + } + // stage 10. + if (size_log2 >= 5) { + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + ButterflyRotation_C( + dst, 30 - MultiplyBy4(i) - j, MultiplyBy4(i) + j + 17, + 24 + MultiplyBy64(j) + MultiplyBy32(1 - i), true, range); + } + } + } + // stage 11. 
+ if (size_log2 == 6) { + for (int i = 0; i < 8; ++i) { + for (int j = 0; j < 2; ++j) { + HadamardRotation_C(dst, MultiplyBy4(i) + j + 32, + MultiplyBy4(i) - j + 35, static_cast(i & 1), + range); + } + } + } + // stage 12. + for (int i = 0; i < 2; ++i) { + ButterflyRotation_C(dst, MultiplyBy2(i), MultiplyBy2(i) + 1, 32 + 16 * i, + i == 0, range); + } + // stage 13. + if (size_log2 >= 3) { + for (int i = 0; i < 2; ++i) { + HadamardRotation_C(dst, MultiplyBy2(i) + 4, MultiplyBy2(i) + 5, + /*flip=*/i != 0, range); + } + } + // stage 14. + if (size_log2 >= 4) { + for (int i = 0; i < 2; ++i) { + ButterflyRotation_C(dst, 14 - i, i + 9, 48 + 64 * i, true, range); + } + } + // stage 15. + if (size_log2 >= 5) { + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 2; ++j) { + HadamardRotation_C(dst, MultiplyBy4(i) + j + 16, + MultiplyBy4(i) - j + 19, static_cast(i & 1), + range); + } + } + } + // stage 16. + if (size_log2 == 6) { + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 4; ++j) { + ButterflyRotation_C( + dst, 61 - MultiplyBy8(i) - j, MultiplyBy8(i) + j + 34, + 56 - MultiplyBy32(i) + MultiplyBy64(DivideBy2(j)), true, range); + } + } + } + // stage 17. + for (int i = 0; i < 2; ++i) { + HadamardRotation_C(dst, i, 3 - i, false, range); + } + // stage 18. + if (size_log2 >= 3) { + ButterflyRotation_C(dst, 6, 5, 32, true, range); + } + // stage 19. + if (size_log2 >= 4) { + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + HadamardRotation_C(dst, MultiplyBy4(i) + j + 8, MultiplyBy4(i) - j + 11, + /*flip=*/i != 0, range); + } + } + } + // stage 20. + if (size_log2 >= 5) { + for (int i = 0; i < 4; ++i) { + ButterflyRotation_C(dst, 29 - i, i + 18, 48 + 64 * DivideBy2(i), true, + range); + } + } + // stage 21. + if (size_log2 == 6) { + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + HadamardRotation_C(dst, MultiplyBy8(i) + j + 32, + MultiplyBy8(i) - j + 39, static_cast(i & 1), + range); + } + } + } + // stage 22. + if (size_log2 >= 3) { + for (int i = 0; i < 4; ++i) { + HadamardRotation_C(dst, i, 7 - i, false, range); + } + } + // stage 23. + if (size_log2 >= 4) { + for (int i = 0; i < 2; ++i) { + ButterflyRotation_C(dst, 13 - i, i + 10, 32, true, range); + } + } + // stage 24. + if (size_log2 >= 5) { + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 4; ++j) { + HadamardRotation_C(dst, MultiplyBy8(i) + j + 16, + MultiplyBy8(i) - j + 23, i == 1, range); + } + } + } + // stage 25. + if (size_log2 == 6) { + for (int i = 0; i < 8; ++i) { + ButterflyRotation_C(dst, 59 - i, i + 36, (i < 4) ? 48 : 112, true, range); + } + } + // stage 26. + if (size_log2 >= 4) { + for (int i = 0; i < 8; ++i) { + HadamardRotation_C(dst, i, 15 - i, false, range); + } + } + // stage 27. + if (size_log2 >= 5) { + for (int i = 0; i < 4; ++i) { + ButterflyRotation_C(dst, 27 - i, i + 20, 32, true, range); + } + } + // stage 28. + if (size_log2 == 6) { + for (int i = 0; i < 8; ++i) { + HadamardRotation_C(dst, i + 32, 47 - i, false, range); + HadamardRotation_C(dst, i + 48, 63 - i, true, range); + } + } + // stage 29. + if (size_log2 >= 5) { + for (int i = 0; i < 16; ++i) { + HadamardRotation_C(dst, i, 31 - i, false, range); + } + } + // stage 30. + if (size_log2 == 6) { + for (int i = 0; i < 8; ++i) { + ButterflyRotation_C(dst, 55 - i, i + 40, 32, true, range); + } + } + // stage 31. 
+  if (size_log2 == 6) {
+    for (int i = 0; i < 32; ++i) {
+      HadamardRotation_C(dst, i, 63 - i, false, range);
+    }
+  }
+}
+
+template <int bitdepth, typename Residual, int size_log2>
+void DctDcOnly_C(void* dest, int8_t range, bool should_round, int row_shift,
+                 bool is_row) {
+  auto* const dst = static_cast<Residual*>(dest);
+
+  if (is_row && should_round) {
+    dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12);
+  }
+
+  ButterflyRotationSecondIsZero_C(dst, 0, 1, 32, true, range);
+
+  if (is_row && row_shift > 0) {
+    dst[0] = RightShiftWithRounding(dst[0], row_shift);
+  }
+
+  ClampIntermediate<bitdepth, Residual>(dst, 1);
+
+  const int size = 1 << size_log2;
+  for (int i = 1; i < size; ++i) {
+    dst[i] = dst[0];
+  }
+}
+
+//------------------------------------------------------------------------------
+// Asymmetric Discrete Sine Transforms (ADST).
+
+/*
+ * Row transform max range in bits for bitdepths 8/10/12: 28/30/32.
+ * Column transform max range in bits for bitdepths 8/10/12: 28/28/30.
+ */
+template <typename Residual>
+void Adst4_C(void* dest, int8_t range) {
+  auto* const dst = static_cast<Residual*>(dest);
+  if ((dst[0] | dst[1] | dst[2] | dst[3]) == 0) {
+    return;
+  }
+
+  // stage 1.
+  // Section 7.13.2.6: It is a requirement of bitstream conformance that all
+  // values stored in the s and x arrays by this process are representable by
+  // a signed integer using range + 12 bits of precision.
+  int32_t s[7];
+  s[0] = RangeCheckValue(kAdst4Multiplier[0] * dst[0], range + 12);
+  s[1] = RangeCheckValue(kAdst4Multiplier[1] * dst[0], range + 12);
+  s[2] = RangeCheckValue(kAdst4Multiplier[2] * dst[1], range + 12);
+  s[3] = RangeCheckValue(kAdst4Multiplier[3] * dst[2], range + 12);
+  s[4] = RangeCheckValue(kAdst4Multiplier[0] * dst[2], range + 12);
+  s[5] = RangeCheckValue(kAdst4Multiplier[1] * dst[3], range + 12);
+  s[6] = RangeCheckValue(kAdst4Multiplier[3] * dst[3], range + 12);
+  // stage 2.
+  // Section 7.13.2.6: It is a requirement of bitstream conformance that
+  // values stored in the variable a7 by this process are representable by a
+  // signed integer using range + 1 bits of precision.
+  const int32_t a7 = RangeCheckValue(dst[0] - dst[2], range + 1);
+  // Section 7.13.2.6: It is a requirement of bitstream conformance that
+  // values stored in the variable b7 by this process are representable by a
+  // signed integer using |range| bits of precision.
+  const int32_t b7 = RangeCheckValue(a7 + dst[3], range);
+  // stage 3.
+  s[0] = RangeCheckValue(s[0] + s[3], range + 12);
+  s[1] = RangeCheckValue(s[1] - s[4], range + 12);
+  s[3] = s[2];
+  s[2] = RangeCheckValue(kAdst4Multiplier[2] * b7, range + 12);
+  // stage 4.
+  s[0] = RangeCheckValue(s[0] + s[5], range + 12);
+  s[1] = RangeCheckValue(s[1] - s[6], range + 12);
+  // stages 5 and 6.
+  const int32_t x0 = RangeCheckValue(s[0] + s[3], range + 12);
+  const int32_t x1 = RangeCheckValue(s[1] + s[3], range + 12);
+  int32_t x3 = RangeCheckValue(s[0] + s[1], range + 12);
+  x3 = RangeCheckValue(x3 - s[3], range + 12);
+  int32_t dst_0 = RightShiftWithRounding(x0, 12);
+  int32_t dst_1 = RightShiftWithRounding(x1, 12);
+  int32_t dst_2 = RightShiftWithRounding(s[2], 12);
+  int32_t dst_3 = RightShiftWithRounding(x3, 12);
+  if (sizeof(Residual) == 2) {
+    // If the first argument to RightShiftWithRounding(..., 12) is only
+    // slightly smaller than 2^27 - 1 (e.g., 0x7fffe4e), adding 2^11 to it
+    // in RightShiftWithRounding(..., 12) will cause the function to return
+    // 0x8000, which cannot be represented as an int16_t. Change it to 0x7fff.
+    dst_0 -= (dst_0 == 0x8000);
+    dst_1 -= (dst_1 == 0x8000);
+    dst_3 -= (dst_3 == 0x8000);
+  }
+  dst[0] = dst_0;
+  dst[1] = dst_1;
+  dst[2] = dst_2;
+  dst[3] = dst_3;
+}
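The `0x8000` correction in `Adst4_C` above is easy to verify in isolation: a value just below 2^27 rounds up to exactly 0x8000 (32768) after the rounding shift by 12, which does not fit in `int16_t`. A standalone check of that arithmetic (the local `RightShiftWithRounding` mirrors the libgav1 helper):

```cpp
// Worked check of the int16_t edge case handled by "dst_0 -= (dst_0 == 0x8000)".
#include <cstdint>
#include <cstdio>

namespace {
int32_t RightShiftWithRounding(int32_t x, int shift) {
  return (x + (1 << (shift - 1))) >> shift;
}
}  // namespace

int main() {
  int32_t dst_0 = RightShiftWithRounding(0x7fffe4e, 12);
  std::printf("before: 0x%x\n", dst_0);  // prints 0x8000 (== 32768)
  dst_0 -= (dst_0 == 0x8000);            // pull back to 0x7fff (== 32767)
  std::printf("after:  0x%x\n", dst_0);  // now representable as int16_t
  return 0;
}
```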
+
+template <int bitdepth, typename Residual>
+void Adst4DcOnly_C(void* dest, int8_t range, bool should_round, int row_shift,
+                   bool is_row) {
+  auto* const dst = static_cast<Residual*>(dest);
+
+  if (is_row && should_round) {
+    dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12);
+  }
+
+  // stage 1.
+  // Section 7.13.2.6: It is a requirement of bitstream conformance that all
+  // values stored in the s and x arrays by this process are representable by
+  // a signed integer using range + 12 bits of precision.
+  int32_t s[3];
+  s[0] = RangeCheckValue(kAdst4Multiplier[0] * dst[0], range + 12);
+  s[1] = RangeCheckValue(kAdst4Multiplier[1] * dst[0], range + 12);
+  s[2] = RangeCheckValue(kAdst4Multiplier[2] * dst[0], range + 12);
+  // stage 3.
+  // stage 4.
+  // stages 5 and 6.
+  int32_t dst_0 = RightShiftWithRounding(s[0], 12);
+  int32_t dst_1 = RightShiftWithRounding(s[1], 12);
+  int32_t dst_2 = RightShiftWithRounding(s[2], 12);
+  int32_t dst_3 =
+      RightShiftWithRounding(RangeCheckValue(s[0] + s[1], range + 12), 12);
+  if (sizeof(Residual) == 2) {
+    // If the first argument to RightShiftWithRounding(..., 12) is only
+    // slightly smaller than 2^27 - 1 (e.g., 0x7fffe4e), adding 2^11 to it
+    // in RightShiftWithRounding(..., 12) will cause the function to return
+    // 0x8000, which cannot be represented as an int16_t. Change it to 0x7fff.
+    dst_0 -= (dst_0 == 0x8000);
+    dst_1 -= (dst_1 == 0x8000);
+    dst_3 -= (dst_3 == 0x8000);
+  }
+  dst[0] = dst_0;
+  dst[1] = dst_1;
+  dst[2] = dst_2;
+  dst[3] = dst_3;
+
+  const int size = 4;
+  if (is_row && row_shift > 0) {
+    for (int j = 0; j < size; ++j) {
+      dst[j] = RightShiftWithRounding(dst[j], row_shift);
+    }
+  }
+
+  ClampIntermediate<bitdepth, Residual>(dst, 4);
+}
+
+template <typename Residual>
+void AdstInputPermutation(int32_t* LIBGAV1_RESTRICT const dst,
+                          const Residual* LIBGAV1_RESTRICT const src, int n) {
+  assert(n == 8 || n == 16);
+  for (int i = 0; i < n; ++i) {
+    dst[i] = src[((i & 1) == 0) ? n - i - 1 : i - 1];
+  }
+}
+
+constexpr int8_t kAdstOutputPermutationLookup[16] = {
+    0, 8, 12, 4, 6, 14, 10, 2, 3, 11, 15, 7, 5, 13, 9, 1};
+
+template <typename Residual>
+void AdstOutputPermutation(Residual* LIBGAV1_RESTRICT const dst,
+                           const int32_t* LIBGAV1_RESTRICT const src, int n) {
+  assert(n == 8 || n == 16);
+  const auto shift = static_cast<int>(n == 8);
+  for (int i = 0; i < n; ++i) {
+    const int8_t index = kAdstOutputPermutationLookup[i] >> shift;
+    int32_t dst_i = ((i & 1) == 0) ? src[index] : -src[index];
+    if (sizeof(Residual) == 2) {
+      // If i is odd and src[index] is -32768, dst_i will be 32768, which
+      // cannot be represented as an int16_t.
+      dst_i -= (dst_i == 0x8000);
+    }
+    dst[i] = dst_i;
+  }
+}
+
+template <typename Residual>
+void Adst8_C(void* dest, int8_t range) {
+  auto* const dst = static_cast<Residual*>(dest);
+  // stage 1.
+  int32_t temp[8];
+  AdstInputPermutation(temp, dst, 8);
+  // stage 2.
+  for (int i = 0; i < 4; ++i) {
+    ButterflyRotation_C(temp, MultiplyBy2(i), MultiplyBy2(i) + 1, 60 - 16 * i,
+                        true, range);
+  }
+  // stage 3.
+  for (int i = 0; i < 4; ++i) {
+    HadamardRotation_C(temp, i, i + 4, false, range);
+  }
+  // stage 4.
+  for (int i = 0; i < 2; ++i) {
+    ButterflyRotation_C(temp, i * 3 + 4, i + 5, 48 - 32 * i, true, range);
+  }
+  // stage 5.
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      HadamardRotation_C(temp, i + MultiplyBy4(j), i + MultiplyBy4(j) + 2,
+                         false, range);
+    }
+  }
+  // stage 6.
+  for (int i = 0; i < 2; ++i) {
+    ButterflyRotation_C(temp, MultiplyBy4(i) + 2, MultiplyBy4(i) + 3, 32, true,
+                        range);
+  }
+  // stage 7.
+  AdstOutputPermutation(dst, temp, 8);
+}
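`AdstInputPermutation` above reorders coefficients with the index expression `((i & 1) == 0) ? n - i - 1 : i - 1`: even outputs read from the end of the input, odd outputs from just behind their own position (which is why, after permuting a DC-only block, the DC value lands in `temp[1]`). A standalone illustration of the n = 8 mapping:

```cpp
// Print the source index each output slot of AdstInputPermutation reads
// from when n == 8.
#include <cstdio>

int main() {
  const int n = 8;
  const int src[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  int dst[8];
  for (int i = 0; i < n; ++i) {
    dst[i] = src[((i & 1) == 0) ? n - i - 1 : i - 1];
  }
  // Resulting source indices: 7 0 5 2 3 4 1 6.
  for (int i = 0; i < n; ++i) std::printf("%d ", dst[i]);
  std::printf("\n");
  return 0;
}
```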
+
+template <int bitdepth, typename Residual>
+void Adst8DcOnly_C(void* dest, int8_t range, bool should_round, int row_shift,
+                   bool is_row) {
+  auto* const dst = static_cast<Residual*>(dest);
+
+  // stage 1.
+  int32_t temp[8];
+  // After the permutation, the dc value is in temp[1]. The remaining are zero.
+  AdstInputPermutation(temp, dst, 8);
+
+  if (is_row && should_round) {
+    temp[1] = RightShiftWithRounding(temp[1] * kTransformRowMultiplier, 12);
+  }
+
+  // stage 2.
+  ButterflyRotationFirstIsZero_C(temp, 0, 1, 60, true, range);
+
+  // stage 3.
+  temp[4] = temp[0];
+  temp[5] = temp[1];
+
+  // stage 4.
+  ButterflyRotation_C(temp, 4, 5, 48, true, range);
+
+  // stage 5.
+  temp[2] = temp[0];
+  temp[3] = temp[1];
+  temp[6] = temp[4];
+  temp[7] = temp[5];
+
+  // stage 6.
+  ButterflyRotation_C(temp, 2, 3, 32, true, range);
+  ButterflyRotation_C(temp, 6, 7, 32, true, range);
+
+  // stage 7.
+  AdstOutputPermutation(dst, temp, 8);
+
+  const int size = 8;
+  if (is_row && row_shift > 0) {
+    for (int j = 0; j < size; ++j) {
+      dst[j] = RightShiftWithRounding(dst[j], row_shift);
+    }
+  }
+
+  ClampIntermediate<bitdepth, Residual>(dst, 8);
+}
+
+template <typename Residual>
+void Adst16_C(void* dest, int8_t range) {
+  auto* const dst = static_cast<Residual*>(dest);
+  // stage 1.
+  int32_t temp[16];
+  AdstInputPermutation(temp, dst, 16);
+  // stage 2.
+  for (int i = 0; i < 8; ++i) {
+    ButterflyRotation_C(temp, MultiplyBy2(i), MultiplyBy2(i) + 1, 62 - 8 * i,
+                        true, range);
+  }
+  // stage 3.
+  for (int i = 0; i < 8; ++i) {
+    HadamardRotation_C(temp, i, i + 8, false, range);
+  }
+  // stage 4.
+  for (int i = 0; i < 2; ++i) {
+    ButterflyRotation_C(temp, MultiplyBy2(i) + 8, MultiplyBy2(i) + 9,
+                        56 - 32 * i, true, range);
+    ButterflyRotation_C(temp, MultiplyBy2(i) + 13, MultiplyBy2(i) + 12,
+                        8 + 32 * i, true, range);
+  }
+  // stage 5.
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      HadamardRotation_C(temp, i + MultiplyBy8(j), i + MultiplyBy8(j) + 4,
+                         false, range);
+    }
+  }
+  // stage 6.
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      ButterflyRotation_C(temp, i * 3 + MultiplyBy8(j) + 4,
+                          i + MultiplyBy8(j) + 5, 48 - 32 * i, true, range);
+    }
+  }
+  // stage 7.
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      HadamardRotation_C(temp, i + MultiplyBy4(j), i + MultiplyBy4(j) + 2,
+                         false, range);
+    }
+  }
+  // stage 8.
+  for (int i = 0; i < 4; ++i) {
+    ButterflyRotation_C(temp, MultiplyBy4(i) + 2, MultiplyBy4(i) + 3, 32, true,
+                        range);
+  }
+  // stage 9.
+  AdstOutputPermutation(dst, temp, 16);
+}
+
+template <int bitdepth, typename Residual>
+void Adst16DcOnly_C(void* dest, int8_t range, bool should_round, int row_shift,
+                    bool is_row) {
+  auto* const dst = static_cast<Residual*>(dest);
+
+  // stage 1.
+  int32_t temp[16];
+  // After the permutation, the dc value is in temp[1]. The remaining are zero.
+  AdstInputPermutation(temp, dst, 16);
+
+  if (is_row && should_round) {
+    temp[1] = RightShiftWithRounding(temp[1] * kTransformRowMultiplier, 12);
+  }
+
+  // stage 2.
+  ButterflyRotationFirstIsZero_C(temp, 0, 1, 62, true, range);
+
+  // stage 3.
+  temp[8] = temp[0];
+  temp[9] = temp[1];
+
+  // stage 4.
+  ButterflyRotation_C(temp, 8, 9, 56, true, range);
+
+  // stage 5.
+  temp[4] = temp[0];
+  temp[5] = temp[1];
+  temp[12] = temp[8];
+  temp[13] = temp[9];
+
+  // stage 6.
+  ButterflyRotation_C(temp, 4, 5, 48, true, range);
+  ButterflyRotation_C(temp, 12, 13, 48, true, range);
+
+  // stage 7.
+ temp[2] = temp[0]; + temp[3] = temp[1]; + temp[10] = temp[8]; + temp[11] = temp[9]; + + temp[6] = temp[4]; + temp[7] = temp[5]; + temp[14] = temp[12]; + temp[15] = temp[13]; + + // stage 8. + for (int i = 0; i < 4; ++i) { + ButterflyRotation_C(temp, MultiplyBy4(i) + 2, MultiplyBy4(i) + 3, 32, true, + range); + } + + // stage 9. + AdstOutputPermutation(dst, temp, 16); + + const int size = 16; + if (is_row && row_shift > 0) { + for (int j = 0; j < size; ++j) { + dst[j] = RightShiftWithRounding(dst[j], row_shift); + } + } + + ClampIntermediate(dst, 16); +} + +//------------------------------------------------------------------------------ +// Identity Transforms. +// +// In the spec, the inverse identity transform is followed by a Round2() call: +// The row transforms with i = 0..(h-1) are applied as follows: +// ... +// * Otherwise, invoke the inverse identity transform process specified in +// section 7.13.2.15 with the input variable n equal to log2W. +// * Set Residual[ i ][ j ] equal to Round2( T[ j ], rowShift ) +// for j = 0..(w-1). +// ... +// The column transforms with j = 0..(w-1) are applied as follows: +// ... +// * Otherwise, invoke the inverse identity transform process specified in +// section 7.13.2.15 with the input variable n equal to log2H. +// * Residual[ i ][ j ] is set equal to Round2( T[ i ], colShift ) +// for i = 0..(h-1). +// +// Therefore, we define the identity transform functions to perform both the +// inverse identity transform and the Round2() call. This has two advantages: +// 1. The outputs of the inverse identity transform do not need to be stored +// in the Residual array. They can be stored in int32_t local variables, +// which have a larger range if Residual is an int16_t array. +// 2. The inverse identity transform and the Round2() call can be jointly +// optimized. +// +// The identity transform functions have the following prototype: +// void Identity_C(void* dest, int8_t shift); +// +// The |shift| parameter is the amount of shift for the Round2() call. For row +// transforms, |shift| is 0, 1, or 2. For column transforms, |shift| is always +// 4. Therefore, an identity transform function can detect whether it is being +// invoked as a row transform or a column transform by checking whether |shift| +// is equal to 4. +// +// Input Range +// +// The inputs of row transforms, stored in the 2D array Dequant, are +// representable by a signed integer using 8 + BitDepth bits of precision: +// f. Dequant[ i ][ j ] is set equal to +// Clip3( - ( 1 << ( 7 + BitDepth ) ), ( 1 << ( 7 + BitDepth ) ) - 1, dq2 ). +// +// The inputs of column transforms are representable by a signed integer using +// Max( BitDepth + 6, 16 ) bits of precision: +// Set the variable colClampRange equal to Max( BitDepth + 6, 16 ). +// ... +// Between the row and column transforms, Residual[ i ][ j ] is set equal to +// Clip3( - ( 1 << ( colClampRange - 1 ) ), +// ( 1 << (colClampRange - 1 ) ) - 1, +// Residual[ i ][ j ] ) +// for i = 0..(h-1), for j = 0..(w-1). +// +// Output Range +// +// The outputs of row transforms are representable by a signed integer using +// 8 + BitDepth + 1 = 9 + BitDepth bits of precision, because the net effect +// of the multiplicative factor of inverse identity transforms minus the +// smallest row shift is an increase of at most one bit. 
+// +// Transform | Multiplicative factor | Smallest row | Net increase +// width | (in bits) | shift | in bits +// --------------------------------------------------------------- +// 4 | sqrt(2) (0.5 bits) | 0 | +0.5 +// 8 | 2 (1 bit) | 0 | +1 +// 16 | 2*sqrt(2) (1.5 bits) | 1 | +0.5 +// 32 | 4 (2 bits) | 1 | +1 +// +// If BitDepth is 8 and Residual is an int16_t array, to avoid truncation we +// clip the outputs (which have 17 bits of precision) to the range of int16_t +// before storing them in the Residual array. This clipping happens to be the +// same as the required clipping after the row transform (see the spec quoted +// above), so we remain compliant with the spec. (In this case, +// TransformLoop_C() skips clipping the outputs of row transforms to avoid +// duplication of effort.) +// +// The outputs of column transforms are representable by a signed integer using +// Max( BitDepth + 6, 16 ) + 2 - 4 = Max( BitDepth + 4, 14 ) bits of precision, +// because the multiplicative factor of inverse identity transforms is at most +// 4 (2 bits) and |shift| is always 4. + +template +void Identity4Row_C(void* dest, int8_t shift) { + assert(shift == 0 || shift == 1); + auto* const dst = static_cast(dest); + // If |shift| is 0, |rounding| should be 1 << 11. If |shift| is 1, |rounding| + // should be (1 + (1 << 1)) << 11. The following expression works for both + // values of |shift|. + const int32_t rounding = (1 + (shift << 1)) << 11; + for (int i = 0; i < 4; ++i) { + // The intermediate value here will have to fit into an int32_t for it to be + // bitstream conformant. The multiplication is promoted to int32_t by + // defining kIdentity4Multiplier as int32_t. + int32_t dst_i = (dst[i] * kIdentity4Multiplier + rounding) >> (12 + shift); + if (sizeof(Residual) == 2) { + dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX); + } + dst[i] = static_cast(dst_i); + } +} + +template +void Identity4Column_C(void* dest, int8_t /*shift*/) { + auto* const dst = static_cast(dest); + const int32_t rounding = (1 + (1 << kTransformColumnShift)) << 11; + for (int i = 0; i < 4; ++i) { + // The intermediate value here will have to fit into an int32_t for it to be + // bitstream conformant. The multiplication is promoted to int32_t by + // defining kIdentity4Multiplier as int32_t. 
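+    // Expository note (not in the upstream file): the rounding constant
+    // fuses the spec's two Round2() calls into a single shift. With
+    // kTransformColumnShift == 4,
+    //   Round2(Round2(x * 5793, 12), 4)
+    //       == (x * 5793 + (1 << 11) + (1 << 15)) >> 16,
+    // and (1 + (1 << 4)) << 11 == (1 << 11) + (1 << 15) is exactly that
+    // combined rounding term.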
+ dst[i] = static_cast((dst[i] * kIdentity4Multiplier + rounding) >> + (12 + kTransformColumnShift)); + } +} + +template +void Identity4DcOnly_C(void* dest, int8_t /*range*/, bool should_round, + int row_shift, bool is_row) { + auto* const dst = static_cast(dest); + + if (is_row) { + if (should_round) { + dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12); + } + + const int32_t rounding = (1 + (row_shift << 1)) << 11; + int32_t dst_i = + (dst[0] * kIdentity4Multiplier + rounding) >> (12 + row_shift); + if (sizeof(Residual) == 2) { + dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX); + } + dst[0] = static_cast(dst_i); + + ClampIntermediate(dst, 1); + return; + } + + const int32_t rounding = (1 + (1 << kTransformColumnShift)) << 11; + dst[0] = static_cast((dst[0] * kIdentity4Multiplier + rounding) >> + (12 + kTransformColumnShift)); +} + +template +void Identity8Row_C(void* dest, int8_t shift) { + assert(shift == 0 || shift == 1 || shift == 2); + auto* const dst = static_cast(dest); + for (int i = 0; i < 8; ++i) { + int32_t dst_i = RightShiftWithRounding(MultiplyBy2(dst[i]), shift); + if (sizeof(Residual) == 2) { + dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX); + } + dst[i] = static_cast(dst_i); + } +} + +template +void Identity8Column_C(void* dest, int8_t /*shift*/) { + auto* const dst = static_cast(dest); + for (int i = 0; i < 8; ++i) { + dst[i] = static_cast( + RightShiftWithRounding(dst[i], kTransformColumnShift - 1)); + } +} + +template +void Identity8DcOnly_C(void* dest, int8_t /*range*/, bool should_round, + int row_shift, bool is_row) { + auto* const dst = static_cast(dest); + + if (is_row) { + if (should_round) { + dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12); + } + + int32_t dst_i = RightShiftWithRounding(MultiplyBy2(dst[0]), row_shift); + if (sizeof(Residual) == 2) { + dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX); + } + dst[0] = static_cast(dst_i); + + // If Residual is int16_t (which implies bitdepth is 8), we don't need to + // clip residual[i][j] to 16 bits. + if (sizeof(Residual) > 2) { + const Residual intermediate_clamp_max = + (1 << (std::max(bitdepth + 6, 16) - 1)) - 1; + const Residual intermediate_clamp_min = -intermediate_clamp_max - 1; + dst[0] = Clip3(dst[0], intermediate_clamp_min, intermediate_clamp_max); + } + return; + } + + dst[0] = static_cast( + RightShiftWithRounding(dst[0], kTransformColumnShift - 1)); +} + +template +void Identity16Row_C(void* dest, int8_t shift) { + assert(shift == 1 || shift == 2); + auto* const dst = static_cast(dest); + const int32_t rounding = (1 + (1 << shift)) << 11; + for (int i = 0; i < 16; ++i) { + // The intermediate value here will have to fit into an int32_t for it to be + // bitstream conformant. The multiplication is promoted to int32_t by + // defining kIdentity16Multiplier as int32_t. + int32_t dst_i = (dst[i] * kIdentity16Multiplier + rounding) >> (12 + shift); + if (sizeof(Residual) == 2) { + dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX); + } + dst[i] = static_cast(dst_i); + } +} + +template +void Identity16Column_C(void* dest, int8_t /*shift*/) { + auto* const dst = static_cast(dest); + const int32_t rounding = (1 + (1 << kTransformColumnShift)) << 11; + for (int i = 0; i < 16; ++i) { + // The intermediate value here will have to fit into an int32_t for it to be + // bitstream conformant. The multiplication is promoted to int32_t by + // defining kIdentity16Multiplier as int32_t. 
+ dst[i] = + static_cast((dst[i] * kIdentity16Multiplier + rounding) >> + (12 + kTransformColumnShift)); + } +} + +template +void Identity16DcOnly_C(void* dest, int8_t /*range*/, bool should_round, + int row_shift, bool is_row) { + auto* const dst = static_cast(dest); + + if (is_row) { + if (should_round) { + dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12); + } + + const int32_t rounding = (1 + (1 << row_shift)) << 11; + int32_t dst_i = + (dst[0] * kIdentity16Multiplier + rounding) >> (12 + row_shift); + if (sizeof(Residual) == 2) { + dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX); + } + dst[0] = static_cast(dst_i); + + ClampIntermediate(dst, 1); + return; + } + + const int32_t rounding = (1 + (1 << kTransformColumnShift)) << 11; + dst[0] = static_cast((dst[0] * kIdentity16Multiplier + rounding) >> + (12 + kTransformColumnShift)); +} + +template +void Identity32Row_C(void* dest, int8_t shift) { + assert(shift == 1 || shift == 2); + auto* const dst = static_cast(dest); + for (int i = 0; i < 32; ++i) { + int32_t dst_i = RightShiftWithRounding(MultiplyBy4(dst[i]), shift); + if (sizeof(Residual) == 2) { + dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX); + } + dst[i] = static_cast(dst_i); + } +} + +template +void Identity32Column_C(void* dest, int8_t /*shift*/) { + auto* const dst = static_cast(dest); + for (int i = 0; i < 32; ++i) { + dst[i] = static_cast( + RightShiftWithRounding(dst[i], kTransformColumnShift - 2)); + } +} + +template +void Identity32DcOnly_C(void* dest, int8_t /*range*/, bool should_round, + int row_shift, bool is_row) { + auto* const dst = static_cast(dest); + + if (is_row) { + if (should_round) { + dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12); + } + + int32_t dst_i = RightShiftWithRounding(MultiplyBy4(dst[0]), row_shift); + if (sizeof(Residual) == 2) { + dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX); + } + dst[0] = static_cast(dst_i); + + ClampIntermediate(dst, 1); + return; + } + + dst[0] = static_cast( + RightShiftWithRounding(dst[0], kTransformColumnShift - 2)); +} + +//------------------------------------------------------------------------------ +// Walsh Hadamard Transform. + +template +void Wht4_C(void* dest, int8_t shift) { + auto* const dst = static_cast(dest); + Residual temp[4]; + temp[0] = dst[0] >> shift; + temp[2] = dst[1] >> shift; + temp[3] = dst[2] >> shift; + temp[1] = dst[3] >> shift; + temp[0] += temp[2]; + temp[3] -= temp[1]; + // This signed right shift must be an arithmetic shift. + Residual e = (temp[0] - temp[3]) >> 1; + dst[1] = e - temp[1]; + dst[2] = e - temp[2]; + dst[0] = temp[0] - dst[1]; + dst[3] = temp[3] + dst[2]; +} + +template +void Wht4DcOnly_C(void* dest, int8_t range, bool /*should_round*/, + int /*row_shift*/, bool /*is_row*/) { + auto* const dst = static_cast(dest); + const int shift = range; + + Residual temp = dst[0] >> shift; + // This signed right shift must be an arithmetic shift. 
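+  // Worked example (expository addition, not in the upstream source): with
+  // only the DC input set, the full Wht4_C above reduces to exactly the four
+  // assignments below. Taking temp = -5 shows why the shift must be
+  // arithmetic: e = -5 >> 1 = -3, so dst = {-2, -3, -3, -3}; a logical shift
+  // would instead yield a large positive e and corrupt all four outputs.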
+  Residual e = temp >> 1;
+  dst[0] = temp - e;
+  dst[1] = e;
+  dst[2] = e;
+  dst[3] = e;
+
+  ClampIntermediate<Residual, bitdepth>(dst, 4);
+}
+
+//------------------------------------------------------------------------------
+// row/column transform loop
+
+using InverseTransform1dFunc = void (*)(void* dst, int8_t range);
+using InverseTransformDcOnlyFunc = void (*)(void* dest, int8_t range,
+                                            bool should_round, int row_shift,
+                                            bool is_row);
+
+template <int bitdepth, typename Residual, typename Pixel,
+          Transform1d transform1d_type,
+          InverseTransformDcOnlyFunc dconly_transform1d,
+          InverseTransform1dFunc transform1d_func, bool is_row>
+void TransformLoop_C(TransformType tx_type, TransformSize tx_size,
+                     int adjusted_tx_height, void* LIBGAV1_RESTRICT src_buffer,
+                     int start_x, int start_y,
+                     void* LIBGAV1_RESTRICT dst_frame) {
+  constexpr bool lossless = transform1d_type == kTransform1dWht;
+  constexpr bool is_identity = transform1d_type == kTransform1dIdentity;
+  // The transform size of the WHT is always 4x4. Setting tx_width and
+  // tx_height to the constant 4 for the WHT speeds the code up.
+  assert(!lossless || tx_size == kTransformSize4x4);
+  const int tx_width = lossless ? 4 : kTransformWidth[tx_size];
+  const int tx_height = lossless ? 4 : kTransformHeight[tx_size];
+  const int tx_width_log2 = kTransformWidthLog2[tx_size];
+  const int tx_height_log2 = kTransformHeightLog2[tx_size];
+  auto* frame = static_cast<Array2DView<Pixel>*>(dst_frame);
+
+  // Initially this points to the dequantized values. After the transforms are
+  // applied, this buffer contains the residual.
+  Array2DView<Residual> residual(tx_height, tx_width,
+                                 static_cast<Residual*>(src_buffer));
+
+  if (is_row) {
+    // Row transform.
+    const uint8_t row_shift = lossless ? 0 : kTransformRowShift[tx_size];
+    // This is the |range| parameter of the InverseTransform1dFunc. For lossy
+    // transforms, this will be equal to the clamping range.
+    const int8_t row_clamp_range = lossless ? 2 : (bitdepth + 8);
+    // If the width:height ratio of the transform size is 2:1 or 1:2, multiply
+    // the input to the row transform by 1 / sqrt(2), which is approximated by
+    // the fraction 2896 / 2^12.
+    const bool should_round = std::abs(tx_width_log2 - tx_height_log2) == 1;
+
+    if (adjusted_tx_height == 1) {
+      dconly_transform1d(residual[0], row_clamp_range, should_round, row_shift,
+                         true);
+      return;
+    }
+
+    // Row transforms need to be done only up to 32 because the rest of the
+    // rows are always all zero if |tx_height| is 64. Otherwise, only process
+    // the rows that have nonzero coefficients.
+    for (int i = 0; i < adjusted_tx_height; ++i) {
+      // If lossless, the transform size is 4x4, so should_round is false.
+      if (!lossless && should_round) {
+        // The last 32 values of every row are always zero if the |tx_width|
+        // is 64.
+        for (int j = 0; j < std::min(tx_width, 32); ++j) {
+          residual[i][j] = RightShiftWithRounding(
+              residual[i][j] * kTransformRowMultiplier, 12);
+        }
+      }
+      // For identity transform, |transform1d_func| also performs the
+      // Round2(T[j], rowShift) call in the spec.
+      transform1d_func(residual[i], is_identity ? row_shift : row_clamp_range);
+      if (!lossless && !is_identity && row_shift > 0) {
+        for (int j = 0; j < tx_width; ++j) {
+          residual[i][j] = RightShiftWithRounding(residual[i][j], row_shift);
+        }
+      }
+
+      ClampIntermediate<Residual, bitdepth>(residual[i], tx_width);
+    }
+    return;
+  }
+
+  assert(!is_row);
+  constexpr uint8_t column_shift = lossless ? 0 : kTransformColumnShift;
+  // This is the |range| parameter of the InverseTransform1dFunc. For lossy
+  // transforms, this will be equal to the clamping range.
+  const int8_t column_clamp_range = lossless ?
0 : std::max(bitdepth + 6, 16); + const bool flip_rows = transform1d_type == kTransform1dAdst && + kTransformFlipRowsMask.Contains(tx_type); + const bool flip_columns = + !lossless && kTransformFlipColumnsMask.Contains(tx_type); + const int min_value = 0; + const int max_value = (1 << bitdepth) - 1; + // Note: 64 is the maximum size of a 1D transform buffer (the largest + // transform size is kTransformSize64x64). + Residual tx_buffer[64]; + for (int j = 0; j < tx_width; ++j) { + const int flipped_j = flip_columns ? tx_width - j - 1 : j; + int i = 0; + do { + tx_buffer[i] = residual[i][flipped_j]; + } while (++i != tx_height); + if (adjusted_tx_height == 1) { + dconly_transform1d(tx_buffer, column_clamp_range, false, 0, false); + } else { + // For identity transform, |transform1d_func| also performs the + // Round2(T[i], colShift) call in the spec. + transform1d_func(tx_buffer, + is_identity ? column_shift : column_clamp_range); + } + const int x = start_x + j; + for (int i = 0; i < tx_height; ++i) { + const int y = start_y + i; + const int index = flip_rows ? tx_height - i - 1 : i; + Residual residual_value = tx_buffer[index]; + if (!lossless && !is_identity) { + residual_value = RightShiftWithRounding(residual_value, column_shift); + } + (*frame)[y][x] = + Clip3((*frame)[y][x] + residual_value, min_value, max_value); + } + } +} + +//------------------------------------------------------------------------------ + +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +template +void InitAll(Dsp* const dsp) { + // Maximum transform size for Dct is 64. + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kRow] = + TransformLoop_C, Dct_C, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kColumn] = + TransformLoop_C, Dct_C, + /*is_row=*/false>; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kRow] = + TransformLoop_C, Dct_C, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kColumn] = + TransformLoop_C, Dct_C, + /*is_row=*/false>; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kRow] = + TransformLoop_C, Dct_C, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kColumn] = + TransformLoop_C, Dct_C, + /*is_row=*/false>; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kRow] = + TransformLoop_C, Dct_C, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kColumn] = + TransformLoop_C, Dct_C, + /*is_row=*/false>; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kRow] = + TransformLoop_C, Dct_C, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kColumn] = + TransformLoop_C, Dct_C, + /*is_row=*/false>; + + // Maximum transform size for Adst is 16. 
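+  // Expository note (added; not in the upstream source): each entry below
+  // binds TransformLoop_C to a (1D transform, 1D size, row/column) slot,
+  // pairing the full 1D function with its dc-only fast path, which
+  // TransformLoop_C selects when adjusted_tx_height == 1.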
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kRow] = + TransformLoop_C, Adst4_C, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kColumn] = + TransformLoop_C, Adst4_C, + /*is_row=*/false>; + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kRow] = + TransformLoop_C, Adst8_C, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kColumn] = + TransformLoop_C, Adst8_C, + /*is_row=*/false>; + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kRow] = + TransformLoop_C, Adst16_C, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kColumn] = + TransformLoop_C, Adst16_C, + /*is_row=*/false>; + + // Maximum transform size for Identity transform is 32. + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kRow] = + TransformLoop_C, + Identity4Row_C, /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kColumn] = + TransformLoop_C, + Identity4Column_C, /*is_row=*/false>; + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kRow] = + TransformLoop_C, + Identity8Row_C, /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kColumn] = + TransformLoop_C, + Identity8Column_C, /*is_row=*/false>; + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kRow] = + TransformLoop_C, + Identity16Row_C, /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kColumn] = + TransformLoop_C, + Identity16Column_C, /*is_row=*/false>; + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kRow] = + TransformLoop_C, + Identity32Row_C, /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kColumn] = + TransformLoop_C, + Identity32Column_C, /*is_row=*/false>; + + // Maximum transform size for Wht is 4. 
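+  // Note (expository addition): the WHT is the lossless-only transform;
+  // TransformLoop_C asserts that it is used exclusively with
+  // kTransformSize4x4 blocks.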
+ dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kRow] = + TransformLoop_C, Wht4_C, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kColumn] = + TransformLoop_C, Wht4_C, + /*is_row=*/false>; +} +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(8); + assert(dsp != nullptr); + static_cast(dsp); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + InitAll<8, int16_t, uint8_t>(dsp); +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +#ifndef LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dDct + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kRow] = + TransformLoop_C<8, int16_t, uint8_t, kTransform1dDct, + DctDcOnly_C<8, int16_t, 2>, Dct_C, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kColumn] = + TransformLoop_C<8, int16_t, uint8_t, kTransform1dDct, + DctDcOnly_C<8, int16_t, 2>, Dct_C, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dDct + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kRow] = + TransformLoop_C<8, int16_t, uint8_t, kTransform1dDct, + DctDcOnly_C<8, int16_t, 3>, Dct_C, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kColumn] = + TransformLoop_C<8, int16_t, uint8_t, kTransform1dDct, + DctDcOnly_C<8, int16_t, 3>, Dct_C, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dDct + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kRow] = + TransformLoop_C<8, int16_t, uint8_t, kTransform1dDct, + DctDcOnly_C<8, int16_t, 4>, Dct_C, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kColumn] = + TransformLoop_C<8, int16_t, uint8_t, kTransform1dDct, + DctDcOnly_C<8, int16_t, 4>, Dct_C, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp8bpp_Transform1dSize32_Transform1dDct + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kRow] = + TransformLoop_C<8, int16_t, uint8_t, kTransform1dDct, + DctDcOnly_C<8, int16_t, 5>, Dct_C, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kColumn] = + TransformLoop_C<8, int16_t, uint8_t, kTransform1dDct, + DctDcOnly_C<8, int16_t, 5>, Dct_C, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp8bpp_Transform1dSize64_Transform1dDct + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kRow] = + TransformLoop_C<8, int16_t, uint8_t, kTransform1dDct, + DctDcOnly_C<8, int16_t, 6>, Dct_C, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kColumn] = + TransformLoop_C<8, int16_t, uint8_t, kTransform1dDct, + DctDcOnly_C<8, int16_t, 6>, Dct_C, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dAdst + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kRow] = + TransformLoop_C<8, int16_t, uint8_t, kTransform1dAdst, + Adst4DcOnly_C<8, int16_t>, Adst4_C, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kColumn] = + TransformLoop_C<8, int16_t, uint8_t, kTransform1dAdst, + Adst4DcOnly_C<8, int16_t>, Adst4_C, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dAdst + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kRow] = + TransformLoop_C<8, int16_t, uint8_t, kTransform1dAdst, + Adst8DcOnly_C<8, int16_t>, Adst8_C, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kColumn] = + TransformLoop_C<8, int16_t, uint8_t, kTransform1dAdst, + Adst8DcOnly_C<8, 
int16_t>, Adst8_C, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dAdst + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kRow] = + TransformLoop_C<8, int16_t, uint8_t, kTransform1dAdst, + Adst16DcOnly_C<8, int16_t>, Adst16_C, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kColumn] = + TransformLoop_C<8, int16_t, uint8_t, kTransform1dAdst, + Adst16DcOnly_C<8, int16_t>, Adst16_C, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dIdentity + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kRow] = + TransformLoop_C<8, int16_t, uint8_t, kTransform1dIdentity, + Identity4DcOnly_C<8, int16_t>, Identity4Row_C, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kColumn] = + TransformLoop_C<8, int16_t, uint8_t, kTransform1dIdentity, + Identity4DcOnly_C<8, int16_t>, Identity4Column_C, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dIdentity + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kRow] = + TransformLoop_C<8, int16_t, uint8_t, kTransform1dIdentity, + Identity8DcOnly_C<8, int16_t>, Identity8Row_C, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kColumn] = + TransformLoop_C<8, int16_t, uint8_t, kTransform1dIdentity, + Identity8DcOnly_C<8, int16_t>, Identity8Column_C, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dIdentity + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kRow] = + TransformLoop_C<8, int16_t, uint8_t, kTransform1dIdentity, + Identity16DcOnly_C<8, int16_t>, Identity16Row_C, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kColumn] = + TransformLoop_C<8, int16_t, uint8_t, kTransform1dIdentity, + Identity16DcOnly_C<8, int16_t>, + Identity16Column_C, /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp8bpp_Transform1dSize32_Transform1dIdentity + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kRow] = + TransformLoop_C<8, int16_t, uint8_t, kTransform1dIdentity, + Identity32DcOnly_C<8, int16_t>, Identity32Row_C, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kColumn] = + TransformLoop_C<8, int16_t, uint8_t, kTransform1dIdentity, + Identity32DcOnly_C<8, int16_t>, + Identity32Column_C, /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dWht + dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kRow] = + TransformLoop_C<8, int16_t, uint8_t, kTransform1dWht, + Wht4DcOnly_C<8, int16_t>, Wht4_C, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kColumn] = + TransformLoop_C<8, int16_t, uint8_t, kTransform1dWht, + Wht4DcOnly_C<8, int16_t>, Wht4_C, + /*is_row=*/false>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(10); + assert(dsp != nullptr); + static_cast(dsp); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + InitAll<10, int32_t, uint16_t>(dsp); +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +#ifndef LIBGAV1_Dsp10bpp_Transform1dSize4_Transform1dDct + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kRow] = + TransformLoop_C<10, int32_t, uint16_t, kTransform1dDct, + DctDcOnly_C<10, int32_t, 2>, Dct_C, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kColumn] = + 
TransformLoop_C<10, int32_t, uint16_t, kTransform1dDct, + DctDcOnly_C<10, int32_t, 2>, Dct_C, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp10bpp_Transform1dSize8_Transform1dDct + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kRow] = + TransformLoop_C<10, int32_t, uint16_t, kTransform1dDct, + DctDcOnly_C<10, int32_t, 3>, Dct_C, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kColumn] = + TransformLoop_C<10, int32_t, uint16_t, kTransform1dDct, + DctDcOnly_C<10, int32_t, 3>, Dct_C, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp10bpp_Transform1dSize16_Transform1dDct + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kRow] = + TransformLoop_C<10, int32_t, uint16_t, kTransform1dDct, + DctDcOnly_C<10, int32_t, 4>, Dct_C, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kColumn] = + TransformLoop_C<10, int32_t, uint16_t, kTransform1dDct, + DctDcOnly_C<10, int32_t, 4>, Dct_C, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp10bpp_Transform1dSize32_Transform1dDct + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kRow] = + TransformLoop_C<10, int32_t, uint16_t, kTransform1dDct, + DctDcOnly_C<10, int32_t, 5>, Dct_C, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kColumn] = + TransformLoop_C<10, int32_t, uint16_t, kTransform1dDct, + DctDcOnly_C<10, int32_t, 5>, Dct_C, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp10bpp_Transform1dSize64_Transform1dDct + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kRow] = + TransformLoop_C<10, int32_t, uint16_t, kTransform1dDct, + DctDcOnly_C<10, int32_t, 6>, Dct_C, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kColumn] = + TransformLoop_C<10, int32_t, uint16_t, kTransform1dDct, + DctDcOnly_C<10, int32_t, 6>, Dct_C, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp10bpp_Transform1dSize4_Transform1dAdst + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kRow] = + TransformLoop_C<10, int32_t, uint16_t, kTransform1dAdst, + Adst4DcOnly_C<10, int32_t>, Adst4_C, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kColumn] = + TransformLoop_C<10, int32_t, uint16_t, kTransform1dAdst, + Adst4DcOnly_C<10, int32_t>, Adst4_C, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp10bpp_Transform1dSize8_Transform1dAdst + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kRow] = + TransformLoop_C<10, int32_t, uint16_t, kTransform1dAdst, + Adst8DcOnly_C<10, int32_t>, Adst8_C, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kColumn] = + TransformLoop_C<10, int32_t, uint16_t, kTransform1dAdst, + Adst8DcOnly_C<10, int32_t>, Adst8_C, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp10bpp_Transform1dSize16_Transform1dAdst + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kRow] = + TransformLoop_C<10, int32_t, uint16_t, kTransform1dAdst, + Adst16DcOnly_C<10, int32_t>, Adst16_C, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kColumn] = + TransformLoop_C<10, int32_t, uint16_t, kTransform1dAdst, + Adst16DcOnly_C<10, int32_t>, Adst16_C, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp10bpp_Transform1dSize4_Transform1dIdentity + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kRow] = + TransformLoop_C<10, int32_t, uint16_t, kTransform1dIdentity, + Identity4DcOnly_C<10, int32_t>, Identity4Row_C, + /*is_row=*/true>; + 
dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kColumn] = + TransformLoop_C<10, int32_t, uint16_t, kTransform1dIdentity, + Identity4DcOnly_C<10, int32_t>, + Identity4Column_C, /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp10bpp_Transform1dSize8_Transform1dIdentity + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kRow] = + TransformLoop_C<10, int32_t, uint16_t, kTransform1dIdentity, + Identity8DcOnly_C<10, int32_t>, Identity8Row_C, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kColumn] = + TransformLoop_C<10, int32_t, uint16_t, kTransform1dIdentity, + Identity8DcOnly_C<10, int32_t>, + Identity8Column_C, /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp10bpp_Transform1dSize16_Transform1dIdentity + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kRow] = + TransformLoop_C<10, int32_t, uint16_t, kTransform1dIdentity, + Identity16DcOnly_C<10, int32_t>, Identity16Row_C, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kColumn] = + TransformLoop_C<10, int32_t, uint16_t, kTransform1dIdentity, + Identity16DcOnly_C<10, int32_t>, + Identity16Column_C, /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp10bpp_Transform1dSize32_Transform1dIdentity + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kRow] = + TransformLoop_C<10, int32_t, uint16_t, kTransform1dIdentity, + Identity32DcOnly_C<10, int32_t>, Identity32Row_C, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kColumn] = + TransformLoop_C<10, int32_t, uint16_t, kTransform1dIdentity, + Identity32DcOnly_C<10, int32_t>, + Identity32Column_C, /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp10bpp_Transform1dSize4_Transform1dWht + dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kRow] = + TransformLoop_C<10, int32_t, uint16_t, kTransform1dWht, + Wht4DcOnly_C<10, int32_t>, Wht4_C, + /*is_row=*/true>; + dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kColumn] = + TransformLoop_C<10, int32_t, uint16_t, kTransform1dWht, + Wht4DcOnly_C<10, int32_t>, Wht4_C, + /*is_row=*/false>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace + +void InverseTransformInit_C() { + Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + Init10bpp(); +#endif + + // Local functions that may be unused depending on the optimizations + // available. + static_cast(RangeCheckValue); + static_cast(kBitReverseLookup); +} + +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/inverse_transform.h b/src/dsp/inverse_transform.h new file mode 100644 index 0000000..0916665 --- /dev/null +++ b/src/dsp/inverse_transform.h @@ -0,0 +1,47 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_INVERSE_TRANSFORM_H_ +#define LIBGAV1_SRC_DSP_INVERSE_TRANSFORM_H_ + +// Pull in LIBGAV1_DspXXX defines representing the implementation status +// of each function. 
The resulting value of each can be used by each module to +// determine whether an implementation is needed at compile time. +// IWYU pragma: begin_exports + +// ARM: +#include "src/dsp/arm/inverse_transform_neon.h" + +// x86: +// Note includes should be sorted in logical order avx2/avx/sse4, etc. +// The order of includes is important as each tests for a superior version +// before setting the base. +// clang-format off +#include "src/dsp/x86/inverse_transform_sse4.h" +// clang-format on + +// IWYU pragma: end_exports + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::inverse_transforms. This function is not thread-safe. +void InverseTransformInit_C(); + +} // namespace dsp +} // namespace libgav1 + +#endif // LIBGAV1_SRC_DSP_INVERSE_TRANSFORM_H_ diff --git a/src/dsp/inverse_transform.inc b/src/dsp/inverse_transform.inc new file mode 100644 index 0000000..55e68b6 --- /dev/null +++ b/src/dsp/inverse_transform.inc @@ -0,0 +1,64 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Constants and utility functions used for inverse transform implementations. +// This will be included inside an anonymous namespace on files where these are +// necessary. + +// The value at index i is derived as: round(cos(pi * i / 128) * (1 << 12)). +constexpr int16_t kCos128[65] = { + 4096, 4095, 4091, 4085, 4076, 4065, 4052, 4036, 4017, 3996, 3973, + 3948, 3920, 3889, 3857, 3822, 3784, 3745, 3703, 3659, 3612, 3564, + 3513, 3461, 3406, 3349, 3290, 3229, 3166, 3102, 3035, 2967, 2896, + 2824, 2751, 2675, 2598, 2520, 2440, 2359, 2276, 2191, 2106, 2019, + 1931, 1842, 1751, 1660, 1567, 1474, 1380, 1285, 1189, 1092, 995, + 897, 799, 700, 601, 501, 401, 301, 201, 101, 0}; + +inline int16_t Cos128(int angle) { + angle &= 0xff; + + // If |angle| is 128, this function returns -4096 (= -2^12), which will + // cause the 32-bit multiplications in ButterflyRotation() to overflow if + // dst[a] or dst[b] is -2^19 (a possible corner case when |range| is 20): + // + // (-2^12) * (-2^19) = 2^31, which cannot be represented as an int32_t. + // + // Note: |range| is 20 when bitdepth is 12 and a row transform is performed. + // + // Assert that this angle is never used by DCT or ADST. + assert(angle != 128); + if (angle <= 64) return kCos128[angle]; + if (angle <= 128) return -kCos128[128 - angle]; + if (angle <= 192) return -kCos128[angle - 128]; + return kCos128[256 - angle]; +} + +inline int16_t Sin128(int angle) { return Cos128(angle - 64); } + +// The value for index i is derived as: +// round(sqrt(2) * sin(i * pi / 9) * 2 / 3 * (1 << 12)). 
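+// Spot check (expository addition, not in the upstream source):
+// round(sqrt(2) * sin(pi / 9) * 2 / 3 * 4096) = round(1320.8) = 1321 and
+// round(sqrt(2) * sin(2 * pi / 9) * 2 / 3 * 4096) = round(2482.3) = 2482,
+// matching kAdst4Multiplier[0] and kAdst4Multiplier[1] below.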
+constexpr int16_t kAdst4Multiplier[4] = {1321, 2482, 3344, 3803}; + +constexpr uint8_t kTransformRowShift[kNumTransformSizes] = { + 0, 0, 1, 0, 1, 1, 2, 1, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2}; + +constexpr bool kShouldRound[kNumTransformSizes] = { + false, true, false, true, false, true, false, false, true, false, + true, false, false, true, false, true, false, true, false}; + +constexpr int16_t kIdentity4Multiplier /* round(2^12 * sqrt(2)) */ = 0x16A1; +constexpr int16_t kIdentity4MultiplierFraction /* round(2^12 * (sqrt(2) - 1))*/ + = 0x6A1; +constexpr int16_t kIdentity16Multiplier /* 2 * round(2^12 * sqrt(2)) */ = 11586; +constexpr int16_t kTransformRowMultiplier /* round(2^12 / sqrt(2)) */ = 2896; diff --git a/src/dsp/inverse_transform_test.cc b/src/dsp/inverse_transform_test.cc new file mode 100644 index 0000000..0ae23df --- /dev/null +++ b/src/dsp/inverse_transform_test.cc @@ -0,0 +1,543 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/inverse_transform.h" + +#include +#include +#include +#include +#include + +#include "absl/strings/match.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/array_2d.h" +#include "src/utils/bit_mask_set.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "tests/block_utils.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr int kMaxBlockSize = 64; +constexpr int kTotalPixels = kMaxBlockSize * kMaxBlockSize; + +const char* const kTransform1dSizeNames[kNumTransform1dSizes] = { + "kTransform1dSize4", "kTransform1dSize8", "kTransform1dSize16", + "kTransform1dSize32", "kTransform1dSize64"}; + +constexpr Transform1dSize kRowTransform1dSizes[] = { + kTransform1dSize4, kTransform1dSize4, kTransform1dSize4, + kTransform1dSize8, kTransform1dSize8, kTransform1dSize8, + kTransform1dSize8, kTransform1dSize16, kTransform1dSize16, + kTransform1dSize16, kTransform1dSize16, kTransform1dSize16, + kTransform1dSize32, kTransform1dSize32, kTransform1dSize32, + kTransform1dSize32, kTransform1dSize64, kTransform1dSize64, + kTransform1dSize64}; + +constexpr Transform1dSize kColTransform1dSizes[] = { + kTransform1dSize4, kTransform1dSize8, kTransform1dSize16, + kTransform1dSize4, kTransform1dSize8, kTransform1dSize16, + kTransform1dSize32, kTransform1dSize4, kTransform1dSize8, + kTransform1dSize16, kTransform1dSize32, kTransform1dSize64, + kTransform1dSize8, kTransform1dSize16, kTransform1dSize32, + kTransform1dSize64, kTransform1dSize16, kTransform1dSize32, + kTransform1dSize64}; + +template +class InverseTransformTestBase : public testing::TestWithParam, + public test_utils::MaxAlignedAllocable { + public: + InverseTransformTestBase() { + switch (tx_size_) { + case kNumTransformSizes: + EXPECT_NE(tx_size_, 
kNumTransformSizes); + break; + default: + block_width_ = kTransformWidth[tx_size_]; + block_height_ = kTransformHeight[tx_size_]; + break; + } + } + + InverseTransformTestBase(const InverseTransformTestBase&) = delete; + InverseTransformTestBase& operator=(const InverseTransformTestBase&) = delete; + ~InverseTransformTestBase() override = default; + + protected: + struct InverseTransformMem { + void Reset(libvpx_test::ACMRandom* rnd, int width, int height) { + ASSERT_NE(rnd, nullptr); + // Limit the size of the residual values to bitdepth + sign in order + // to prevent outranging in the transforms. + const int num_bits = bitdepth + 1; + const int sign_shift = (bitdepth == 8 ? 16 : 32) - num_bits; + const int mask = (1 << num_bits) - 1; + // Fill residual with random data. For widths == 64, only fill the upper + // left 32 x min(block_height_, 32). + memset(ref_src, 0, sizeof(ref_src)); + SrcPixel* r = ref_src; + const int stride = width; + for (int y = 0; y < std::min(height, 32); ++y) { + for (int x = 0; x < std::min(width, 32); ++x) { + r[x] = rnd->Rand16() & mask; + // The msb of num_bits is the sign bit, so force each 16 bit value to + // the correct sign. + r[x] = (r[x] << sign_shift) >> sign_shift; + } + r += stride; + } + + // Set frame data to random values. + for (int y = 0; y < kMaxBlockSize; ++y) { + for (int x = 0; x < kMaxBlockSize; ++x) { + const int mask = (1 << bitdepth) - 1; + cur_frame[y * kMaxBlockSize + x] = base_frame[y * kMaxBlockSize + x] = + rnd->Rand16() & mask; + } + } + } + + // Set ref_src to |pixel|. + void Set(const SrcPixel pixel) { + for (auto& r : ref_src) r = pixel; + } + + alignas(kMaxAlignment) DstPixel base_frame[kTotalPixels]; + alignas(kMaxAlignment) DstPixel cur_frame[kTotalPixels]; + + alignas(kMaxAlignment) SrcPixel base_residual[kTotalPixels]; + alignas(kMaxAlignment) SrcPixel cur_residual[kTotalPixels]; + + alignas(kMaxAlignment) SrcPixel ref_src[kTotalPixels]; + }; + + void SetUp() override { test_utils::ResetDspTable(bitdepth); } + + const TransformSize tx_size_ = GetParam(); + int block_width_; + int block_height_; + InverseTransformMem inverse_transform_mem_; +}; + +//------------------------------------------------------------------------------ +// InverseTransformTest + +template +class InverseTransformTest + : public InverseTransformTestBase { + public: + InverseTransformTest() = default; + InverseTransformTest(const InverseTransformTest&) = delete; + InverseTransformTest& operator=(const InverseTransformTest&) = delete; + ~InverseTransformTest() override = default; + + protected: + using InverseTransformTestBase::tx_size_; + using InverseTransformTestBase::block_width_; + using InverseTransformTestBase::block_height_; + using InverseTransformTestBase::inverse_transform_mem_; + + void SetUp() override { + InverseTransformTestBase::SetUp(); + InverseTransformInit_C(); + + const Dsp* const dsp = GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + + tx_size_1d_row_ = kRowTransform1dSizes[tx_size_]; + tx_size_1d_column_ = kColTransform1dSizes[tx_size_]; + + memcpy(base_inverse_transforms_, dsp->inverse_transforms, + sizeof(base_inverse_transforms_)); + + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const char* const test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + memset(base_inverse_transforms_, 0, sizeof(base_inverse_transforms_)); + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + 
InverseTransformInit_SSE4_1(); + } + } else if (absl::StartsWith(test_case, "NEON/")) { + InverseTransformInit_NEON(); + InverseTransformInit10bpp_NEON(); + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + + memcpy(cur_inverse_transforms_, dsp->inverse_transforms, + sizeof(cur_inverse_transforms_)); + + for (int i = 0; i < kNumTransform1ds; ++i) { + // skip functions that haven't been specialized for this particular + // architecture. + if (cur_inverse_transforms_[i][tx_size_1d_row_][kRow] == + base_inverse_transforms_[i][tx_size_1d_row_][kRow]) { + cur_inverse_transforms_[i][tx_size_1d_row_][kRow] = nullptr; + } + if (cur_inverse_transforms_[i][tx_size_1d_column_][kColumn] == + base_inverse_transforms_[i][tx_size_1d_column_][kColumn]) { + cur_inverse_transforms_[i][tx_size_1d_column_][kColumn] = nullptr; + } + } + + base_frame_buffer_.Reset(kMaxBlockSize, kMaxBlockSize, + inverse_transform_mem_.base_frame); + + cur_frame_buffer_.Reset(kMaxBlockSize, kMaxBlockSize, + inverse_transform_mem_.cur_frame); + } + + // These tests modify inverse_transform_mem_. + void TestRandomValues(int num_tests); + void TestDcOnlyRandomValue(int num_tests); + + Array2DView base_frame_buffer_; + Array2DView cur_frame_buffer_; + + Transform1dSize tx_size_1d_row_ = kTransform1dSize4; + Transform1dSize tx_size_1d_column_ = kTransform1dSize4; + + InverseTransformAddFuncs base_inverse_transforms_; + InverseTransformAddFuncs cur_inverse_transforms_; +}; + +constexpr TransformType kLibgav1TxType[kNumTransformTypes] = { + kTransformTypeDctDct, kTransformTypeAdstDct, + kTransformTypeDctAdst, kTransformTypeAdstAdst, + kTransformTypeFlipadstDct, kTransformTypeDctFlipadst, + kTransformTypeFlipadstFlipadst, kTransformTypeAdstFlipadst, + kTransformTypeFlipadstAdst, kTransformTypeIdentityIdentity, + kTransformTypeIdentityDct, kTransformTypeDctIdentity, + kTransformTypeIdentityAdst, kTransformTypeAdstIdentity, + kTransformTypeIdentityFlipadst, kTransformTypeFlipadstIdentity}; + +// Maps TransformType to dsp::Transform1d for the row transforms. +constexpr Transform1d kRowTransform[kNumTransformTypes] = { + kTransform1dDct, kTransform1dAdst, kTransform1dDct, + kTransform1dAdst, kTransform1dAdst, kTransform1dDct, + kTransform1dAdst, kTransform1dAdst, kTransform1dAdst, + kTransform1dIdentity, kTransform1dIdentity, kTransform1dDct, + kTransform1dIdentity, kTransform1dAdst, kTransform1dIdentity, + kTransform1dAdst}; + +// Maps TransformType to dsp::Transform1d for the column transforms. +constexpr Transform1d kColumnTransform[kNumTransformTypes] = { + kTransform1dDct, kTransform1dDct, kTransform1dAdst, + kTransform1dAdst, kTransform1dDct, kTransform1dAdst, + kTransform1dAdst, kTransform1dAdst, kTransform1dAdst, + kTransform1dIdentity, kTransform1dDct, kTransform1dIdentity, + kTransform1dAdst, kTransform1dIdentity, kTransform1dAdst, + kTransform1dIdentity}; + +// Mask indicating whether the transform sets contain a particular transform +// type. If |tx_type| is present in |tx_set|, then the |tx_type|th LSB is set. 
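+// Example (expository addition, assuming the enum values follow the order
+// shown in kLibgav1TxType above): the last entry, BitMaskSet(0x201), sets
+// bits 0 and 9 only, i.e. kTransformTypeDctDct and
+// kTransformTypeIdentityIdentity.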
+constexpr BitMaskSet kTransformTypeInSetMask[kNumTransformSets] = { + BitMaskSet(0x1), BitMaskSet(0xE0F), BitMaskSet(0x20F), + BitMaskSet(0xFFFF), BitMaskSet(0xFFF), BitMaskSet(0x201)}; + +bool IsTxSizeTypeValid(TransformSize tx_size, TransformType tx_type) { + const TransformSize tx_size_square_max = kTransformSizeSquareMax[tx_size]; + TransformSet tx_set; + if (tx_size_square_max > kTransformSize32x32) { + tx_set = kTransformSetDctOnly; + } else if (tx_size_square_max == kTransformSize32x32) { + tx_set = kTransformSetInter3; + } else if (tx_size_square_max == kTransformSize16x16) { + tx_set = kTransformSetInter2; + } else { + tx_set = kTransformSetInter1; + } + return kTransformTypeInSetMask[tx_set].Contains(tx_type); +} + +template +void InverseTransformTest::TestRandomValues( + int num_tests) { + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + + for (int tx_type_idx = -1; tx_type_idx < kNumTransformTypes; ++tx_type_idx) { + const TransformType tx_type = (tx_type_idx == -1) + ? kTransformTypeDctDct + : kLibgav1TxType[tx_type_idx]; + const Transform1d row_transform = + (tx_type_idx == -1) ? kTransform1dWht : kRowTransform[tx_type]; + const Transform1d column_transform = + (tx_type_idx == -1) ? kTransform1dWht : kColumnTransform[tx_type]; + + // Skip the 'C' test case as this is used as the reference. + if (base_inverse_transforms_[row_transform][tx_size_1d_row_][kRow] == + nullptr || + cur_inverse_transforms_[row_transform][tx_size_1d_row_][kRow] == + nullptr || + base_inverse_transforms_[column_transform][tx_size_1d_column_] + [kColumn] == nullptr || + cur_inverse_transforms_[column_transform][tx_size_1d_column_] + [kColumn] == nullptr) { + continue; + } + + // Only test valid tx_size for given tx_type. See 5.11.40. + if (!IsTxSizeTypeValid(tx_size_, tx_type)) continue; + + absl::Duration base_elapsed_time[2]; + absl::Duration cur_elapsed_time[2]; + + for (int n = 0; n < num_tests; ++n) { + const int tx_height = std::min(block_height_, 32); + const int start_x = 0; + const int start_y = 0; + + inverse_transform_mem_.Reset(&rnd, block_width_, block_height_); + memcpy(inverse_transform_mem_.base_residual, + inverse_transform_mem_.ref_src, + sizeof(inverse_transform_mem_.ref_src)); + memcpy(inverse_transform_mem_.cur_residual, + inverse_transform_mem_.ref_src, + sizeof(inverse_transform_mem_.ref_src)); + + const absl::Time base_row_start = absl::Now(); + base_inverse_transforms_[row_transform][tx_size_1d_row_][kRow]( + tx_type, tx_size_, tx_height, inverse_transform_mem_.base_residual, + start_x, start_y, &base_frame_buffer_); + base_elapsed_time[kRow] += absl::Now() - base_row_start; + + const absl::Time cur_row_start = absl::Now(); + cur_inverse_transforms_[row_transform][tx_size_1d_row_][kRow]( + tx_type, tx_size_, tx_height, inverse_transform_mem_.cur_residual, + start_x, start_y, &cur_frame_buffer_); + cur_elapsed_time[kRow] += absl::Now() - cur_row_start; + + const absl::Time base_column_start = absl::Now(); + base_inverse_transforms_[column_transform][tx_size_1d_column_][kColumn]( + tx_type, tx_size_, tx_height, inverse_transform_mem_.base_residual, + start_x, start_y, &base_frame_buffer_); + base_elapsed_time[kColumn] += absl::Now() - base_column_start; + + const absl::Time cur_column_start = absl::Now(); + cur_inverse_transforms_[column_transform][tx_size_1d_column_][kColumn]( + tx_type, tx_size_, tx_height, inverse_transform_mem_.cur_residual, + start_x, start_y, &cur_frame_buffer_); + cur_elapsed_time[kColumn] += absl::Now() - cur_column_start; + + if 
(!test_utils::CompareBlocks(inverse_transform_mem_.base_frame, + inverse_transform_mem_.cur_frame, + block_width_, block_height_, kMaxBlockSize, + kMaxBlockSize, false)) { + ADD_FAILURE() << "Result from optimized version of " + << ToString( + static_cast(tx_size_1d_column_)) + << " differs from reference in iteration #" << n + << " tx_type_idx:" << tx_type_idx; + break; + } + } + + if (num_tests > 1) { + const auto base_row_elapsed_time_us = + static_cast(absl::ToInt64Microseconds(base_elapsed_time[kRow])); + const auto cur_row_elapsed_time_us = + static_cast(absl::ToInt64Microseconds(cur_elapsed_time[kRow])); + printf("TxType %30s[%19s]:: base_row: %5d us cur_row: %5d us %2.2fx \n", + (tx_type_idx == -1) ? ToString(row_transform) : ToString(tx_type), + kTransform1dSizeNames[tx_size_1d_row_], base_row_elapsed_time_us, + cur_row_elapsed_time_us, + static_cast(base_row_elapsed_time_us) / + static_cast(cur_row_elapsed_time_us)); + const auto base_column_elapsed_time_us = static_cast( + absl::ToInt64Microseconds(base_elapsed_time[kColumn])); + const auto cur_column_elapsed_time_us = static_cast( + absl::ToInt64Microseconds(cur_elapsed_time[kColumn])); + printf( + "TxType %30s[%19s]:: base_col: %5d us cur_col: %5d us %2.2fx \n", + (tx_type_idx == -1) ? ToString(column_transform) : ToString(tx_type), + kTransform1dSizeNames[tx_size_1d_column_], + base_column_elapsed_time_us, cur_column_elapsed_time_us, + static_cast(base_column_elapsed_time_us) / + static_cast(cur_column_elapsed_time_us)); + } + } +} + +template +void InverseTransformTest::TestDcOnlyRandomValue( + int num_tests) { + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + + for (int tx_type_idx = 0; tx_type_idx < kNumTransformTypes; ++tx_type_idx) { + const TransformType tx_type = kLibgav1TxType[tx_type_idx]; + const Transform1d row_transform = kRowTransform[tx_type]; + const Transform1d column_transform = kColumnTransform[tx_type]; + + if (cur_inverse_transforms_[row_transform][tx_size_1d_row_][kRow] == + nullptr || + cur_inverse_transforms_[column_transform][tx_size_1d_column_] + [kColumn] == nullptr) { + continue; + } + + // Only test valid tx_size for given tx_type. See 5.11.40. + if (IsTxSizeTypeValid(tx_size_, tx_type) == 0) continue; + + absl::Duration base_elapsed_time[2]; + absl::Duration cur_elapsed_time[2]; + + for (int n = 0; n < num_tests; ++n) { + const int tx_height = std::min(block_height_, 32); + const int start_x = 0; + const int start_y = 0; + + // Using width == 1 and height == 1 will reset only the dc value. + inverse_transform_mem_.Reset(&rnd, 1, 1); + memcpy(inverse_transform_mem_.base_residual, + inverse_transform_mem_.ref_src, + sizeof(inverse_transform_mem_.ref_src)); + memcpy(inverse_transform_mem_.cur_residual, + inverse_transform_mem_.ref_src, + sizeof(inverse_transform_mem_.ref_src)); + + // For this test, the "base" contains the output when the + // tx_height is set to the max for the given block size. The + // "cur" contains the output when the passed in tx_height is 1. + // Compare the outputs for match. 
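+      // Note (expository addition): unlike TestRandomValues(), both
+      // invocations below use cur_inverse_transforms_; the test checks the
+      // optimized function against itself with the full and dc-only heights
+      // rather than against the C reference.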
+ const absl::Time base_row_start = absl::Now(); + cur_inverse_transforms_[row_transform][tx_size_1d_row_][kRow]( + tx_type, tx_size_, tx_height, inverse_transform_mem_.base_residual, + start_x, start_y, &base_frame_buffer_); + base_elapsed_time[kRow] += absl::Now() - base_row_start; + + const absl::Time cur_row_start = absl::Now(); + cur_inverse_transforms_[row_transform][tx_size_1d_row_][kRow]( + tx_type, tx_size_, /*adjusted_tx_height=*/1, + inverse_transform_mem_.cur_residual, start_x, start_y, + &cur_frame_buffer_); + cur_elapsed_time[kRow] += absl::Now() - cur_row_start; + + const absl::Time base_column_start = absl::Now(); + cur_inverse_transforms_[column_transform][tx_size_1d_column_][kColumn]( + tx_type, tx_size_, tx_height, inverse_transform_mem_.base_residual, + start_x, start_y, &base_frame_buffer_); + base_elapsed_time[kColumn] += absl::Now() - base_column_start; + + const absl::Time cur_column_start = absl::Now(); + cur_inverse_transforms_[column_transform][tx_size_1d_column_][kColumn]( + tx_type, tx_size_, /*adjusted_tx_height=*/1, + inverse_transform_mem_.cur_residual, start_x, start_y, + &cur_frame_buffer_); + cur_elapsed_time[kColumn] += absl::Now() - cur_column_start; + + if (!test_utils::CompareBlocks(inverse_transform_mem_.base_frame, + inverse_transform_mem_.cur_frame, + block_width_, block_height_, kMaxBlockSize, + kMaxBlockSize, false)) { + ADD_FAILURE() << "Result from dc only version of " + << ToString( + static_cast(tx_size_1d_column_)) + << " differs from reference in iteration #" << n + << "tx_type_idx:" << tx_type_idx; + break; + } + } + + if (num_tests > 1) { + const auto base_row_elapsed_time_us = + static_cast(absl::ToInt64Microseconds(base_elapsed_time[kRow])); + const auto cur_row_elapsed_time_us = + static_cast(absl::ToInt64Microseconds(cur_elapsed_time[kRow])); + printf("TxType %30s[%19s]:: base_row: %5d us cur_row: %5d us %2.2fx \n", + ToString(tx_type), kTransform1dSizeNames[tx_size_1d_row_], + base_row_elapsed_time_us, cur_row_elapsed_time_us, + static_cast(base_row_elapsed_time_us) / + static_cast(cur_row_elapsed_time_us)); + const auto base_column_elapsed_time_us = static_cast( + absl::ToInt64Microseconds(base_elapsed_time[kColumn])); + const auto cur_column_elapsed_time_us = static_cast( + absl::ToInt64Microseconds(cur_elapsed_time[kColumn])); + printf("TxType %30s[%19s]:: base_col: %5d us cur_col: %5d us %2.2fx \n", + ToString(tx_type), kTransform1dSizeNames[tx_size_1d_column_], + base_column_elapsed_time_us, cur_column_elapsed_time_us, + static_cast(base_column_elapsed_time_us) / + static_cast(cur_column_elapsed_time_us)); + } + } +} + +using InverseTransformTest8bpp = InverseTransformTest<8, int16_t, uint8_t>; + +TEST_P(InverseTransformTest8bpp, Random) { TestRandomValues(1); } + +TEST_P(InverseTransformTest8bpp, DISABLED_Speed) { TestRandomValues(10000); } + +TEST_P(InverseTransformTest8bpp, DcRandom) { TestDcOnlyRandomValue(1); } + +constexpr TransformSize kTransformSizesAll[] = { + kTransformSize4x4, kTransformSize4x8, kTransformSize4x16, + kTransformSize8x4, kTransformSize8x8, kTransformSize8x16, + kTransformSize8x32, kTransformSize16x4, kTransformSize16x8, + kTransformSize16x16, kTransformSize16x32, kTransformSize16x64, + kTransformSize32x8, kTransformSize32x16, kTransformSize32x32, + kTransformSize32x64, kTransformSize64x16, kTransformSize64x32, + kTransformSize64x64}; + +INSTANTIATE_TEST_SUITE_P(C, InverseTransformTest8bpp, + testing::ValuesIn(kTransformSizesAll)); +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, 
+                         testing::ValuesIn(kTransformSizesAll));
+#endif
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, InverseTransformTest8bpp,
+                         testing::ValuesIn(kTransformSizesAll));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using InverseTransformTest10bpp = InverseTransformTest<10, int32_t, uint16_t>;
+
+TEST_P(InverseTransformTest10bpp, Random) { TestRandomValues(1); }
+
+TEST_P(InverseTransformTest10bpp, DISABLED_Speed) { TestRandomValues(10000); }
+
+TEST_P(InverseTransformTest10bpp, DcRandom) { TestDcOnlyRandomValue(1); }
+
+INSTANTIATE_TEST_SUITE_P(C, InverseTransformTest10bpp,
+                         testing::ValuesIn(kTransformSizesAll));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, InverseTransformTest10bpp,
+                         testing::ValuesIn(kTransformSizesAll));
+#endif
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+}  // namespace
+}  // namespace dsp
+
+static std::ostream& operator<<(std::ostream& os, const TransformSize param) {
+  return os << ToString(param);
+}
+
+}  // namespace libgav1
diff --git a/src/dsp/libgav1_dsp.cmake b/src/dsp/libgav1_dsp.cmake
new file mode 100644
index 0000000..4bd1443
--- /dev/null
+++ b/src/dsp/libgav1_dsp.cmake
@@ -0,0 +1,203 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_SRC_DSP_LIBGAV1_DSP_CMAKE_)
+  return()
+endif() # LIBGAV1_SRC_DSP_LIBGAV1_DSP_CMAKE_
+set(LIBGAV1_SRC_DSP_LIBGAV1_DSP_CMAKE_ 1)
+
+include("${libgav1_root}/cmake/libgav1_targets.cmake")
+
+list(APPEND libgav1_dsp_sources
+            "${libgav1_source}/dsp/average_blend.cc"
+            "${libgav1_source}/dsp/average_blend.h"
+            "${libgav1_source}/dsp/cdef.cc"
+            "${libgav1_source}/dsp/cdef.h"
+            "${libgav1_source}/dsp/cdef.inc"
+            "${libgav1_source}/dsp/common.h"
+            "${libgav1_source}/dsp/constants.cc"
+            "${libgav1_source}/dsp/constants.h"
+            "${libgav1_source}/dsp/convolve.cc"
+            "${libgav1_source}/dsp/convolve.h"
+            "${libgav1_source}/dsp/convolve.inc"
+            "${libgav1_source}/dsp/distance_weighted_blend.cc"
+            "${libgav1_source}/dsp/distance_weighted_blend.h"
+            "${libgav1_source}/dsp/dsp.cc"
+            "${libgav1_source}/dsp/dsp.h"
+            "${libgav1_source}/dsp/film_grain.cc"
+            "${libgav1_source}/dsp/film_grain.h"
+            "${libgav1_source}/dsp/film_grain_common.h"
+            "${libgav1_source}/dsp/intra_edge.cc"
+            "${libgav1_source}/dsp/intra_edge.h"
+            "${libgav1_source}/dsp/intrapred_cfl.cc"
+            "${libgav1_source}/dsp/intrapred_cfl.h"
+            "${libgav1_source}/dsp/intrapred_directional.cc"
+            "${libgav1_source}/dsp/intrapred_directional.h"
+            "${libgav1_source}/dsp/intrapred_filter.cc"
+            "${libgav1_source}/dsp/intrapred_filter.h"
+            "${libgav1_source}/dsp/intrapred.cc"
+            "${libgav1_source}/dsp/intrapred.h"
+            "${libgav1_source}/dsp/intrapred_smooth.cc"
+            "${libgav1_source}/dsp/intrapred_smooth.h"
+            "${libgav1_source}/dsp/inverse_transform.cc"
+            "${libgav1_source}/dsp/inverse_transform.h"
+            "${libgav1_source}/dsp/inverse_transform.inc"
+            "${libgav1_source}/dsp/loop_filter.cc"
+            "${libgav1_source}/dsp/loop_filter.h"
+            "${libgav1_source}/dsp/loop_restoration.cc"
"${libgav1_source}/dsp/loop_restoration.h" + "${libgav1_source}/dsp/mask_blend.cc" + "${libgav1_source}/dsp/mask_blend.h" + "${libgav1_source}/dsp/motion_field_projection.cc" + "${libgav1_source}/dsp/motion_field_projection.h" + "${libgav1_source}/dsp/motion_vector_search.cc" + "${libgav1_source}/dsp/motion_vector_search.h" + "${libgav1_source}/dsp/obmc.cc" + "${libgav1_source}/dsp/obmc.h" + "${libgav1_source}/dsp/obmc.inc" + "${libgav1_source}/dsp/smooth_weights.inc" + "${libgav1_source}/dsp/super_res.cc" + "${libgav1_source}/dsp/super_res.h" + "${libgav1_source}/dsp/warp.cc" + "${libgav1_source}/dsp/warp.h" + "${libgav1_source}/dsp/weight_mask.cc" + "${libgav1_source}/dsp/weight_mask.h") + +list(APPEND libgav1_dsp_sources_avx2 + ${libgav1_dsp_sources_avx2} + "${libgav1_source}/dsp/x86/cdef_avx2.cc" + "${libgav1_source}/dsp/x86/cdef_avx2.h" + "${libgav1_source}/dsp/x86/convolve_avx2.cc" + "${libgav1_source}/dsp/x86/convolve_avx2.h" + "${libgav1_source}/dsp/x86/loop_restoration_10bit_avx2.cc" + "${libgav1_source}/dsp/x86/loop_restoration_avx2.cc" + "${libgav1_source}/dsp/x86/loop_restoration_avx2.h") + +list(APPEND libgav1_dsp_sources_neon + ${libgav1_dsp_sources_neon} + "${libgav1_source}/dsp/arm/average_blend_neon.cc" + "${libgav1_source}/dsp/arm/average_blend_neon.h" + "${libgav1_source}/dsp/arm/cdef_neon.cc" + "${libgav1_source}/dsp/arm/cdef_neon.h" + "${libgav1_source}/dsp/arm/common_neon.h" + "${libgav1_source}/dsp/arm/convolve_10bit_neon.cc" + "${libgav1_source}/dsp/arm/convolve_neon.cc" + "${libgav1_source}/dsp/arm/convolve_neon.h" + "${libgav1_source}/dsp/arm/distance_weighted_blend_neon.cc" + "${libgav1_source}/dsp/arm/distance_weighted_blend_neon.h" + "${libgav1_source}/dsp/arm/film_grain_neon.cc" + "${libgav1_source}/dsp/arm/film_grain_neon.h" + "${libgav1_source}/dsp/arm/intra_edge_neon.cc" + "${libgav1_source}/dsp/arm/intra_edge_neon.h" + "${libgav1_source}/dsp/arm/intrapred_cfl_neon.cc" + "${libgav1_source}/dsp/arm/intrapred_cfl_neon.h" + "${libgav1_source}/dsp/arm/intrapred_directional_neon.h" + "${libgav1_source}/dsp/arm/intrapred_directional_neon.cc" + "${libgav1_source}/dsp/arm/intrapred_filter_neon.cc" + "${libgav1_source}/dsp/arm/intrapred_filter_neon.h" + "${libgav1_source}/dsp/arm/intrapred_neon.cc" + "${libgav1_source}/dsp/arm/intrapred_neon.h" + "${libgav1_source}/dsp/arm/intrapred_smooth_neon.cc" + "${libgav1_source}/dsp/arm/intrapred_smooth_neon.h" + "${libgav1_source}/dsp/arm/inverse_transform_10bit_neon.cc" + "${libgav1_source}/dsp/arm/inverse_transform_neon.cc" + "${libgav1_source}/dsp/arm/inverse_transform_neon.h" + "${libgav1_source}/dsp/arm/loop_filter_neon.cc" + "${libgav1_source}/dsp/arm/loop_filter_neon.h" + "${libgav1_source}/dsp/arm/loop_restoration_10bit_neon.cc" + "${libgav1_source}/dsp/arm/loop_restoration_neon.cc" + "${libgav1_source}/dsp/arm/loop_restoration_neon.h" + "${libgav1_source}/dsp/arm/mask_blend_neon.cc" + "${libgav1_source}/dsp/arm/mask_blend_neon.h" + "${libgav1_source}/dsp/arm/motion_field_projection_neon.cc" + "${libgav1_source}/dsp/arm/motion_field_projection_neon.h" + "${libgav1_source}/dsp/arm/motion_vector_search_neon.cc" + "${libgav1_source}/dsp/arm/motion_vector_search_neon.h" + "${libgav1_source}/dsp/arm/obmc_neon.cc" + "${libgav1_source}/dsp/arm/obmc_neon.h" + "${libgav1_source}/dsp/arm/super_res_neon.cc" + "${libgav1_source}/dsp/arm/super_res_neon.h" + "${libgav1_source}/dsp/arm/warp_neon.cc" + "${libgav1_source}/dsp/arm/warp_neon.h" + "${libgav1_source}/dsp/arm/weight_mask_neon.cc" + 
"${libgav1_source}/dsp/arm/weight_mask_neon.h") + +list(APPEND libgav1_dsp_sources_sse4 + ${libgav1_dsp_sources_sse4} + "${libgav1_source}/dsp/x86/average_blend_sse4.cc" + "${libgav1_source}/dsp/x86/average_blend_sse4.h" + "${libgav1_source}/dsp/x86/common_sse4.h" + "${libgav1_source}/dsp/x86/cdef_sse4.cc" + "${libgav1_source}/dsp/x86/cdef_sse4.h" + "${libgav1_source}/dsp/x86/convolve_sse4.cc" + "${libgav1_source}/dsp/x86/convolve_sse4.h" + "${libgav1_source}/dsp/x86/convolve_sse4.inc" + "${libgav1_source}/dsp/x86/distance_weighted_blend_sse4.cc" + "${libgav1_source}/dsp/x86/distance_weighted_blend_sse4.h" + "${libgav1_source}/dsp/x86/film_grain_sse4.cc" + "${libgav1_source}/dsp/x86/film_grain_sse4.h" + "${libgav1_source}/dsp/x86/intra_edge_sse4.cc" + "${libgav1_source}/dsp/x86/intra_edge_sse4.h" + "${libgav1_source}/dsp/x86/intrapred_cfl_sse4.cc" + "${libgav1_source}/dsp/x86/intrapred_cfl_sse4.h" + "${libgav1_source}/dsp/x86/intrapred_directional_sse4.cc" + "${libgav1_source}/dsp/x86/intrapred_directional_sse4.h" + "${libgav1_source}/dsp/x86/intrapred_filter_sse4.cc" + "${libgav1_source}/dsp/x86/intrapred_filter_sse4.h" + "${libgav1_source}/dsp/x86/intrapred_sse4.cc" + "${libgav1_source}/dsp/x86/intrapred_sse4.h" + "${libgav1_source}/dsp/x86/intrapred_smooth_sse4.cc" + "${libgav1_source}/dsp/x86/intrapred_smooth_sse4.h" + "${libgav1_source}/dsp/x86/inverse_transform_sse4.cc" + "${libgav1_source}/dsp/x86/inverse_transform_sse4.h" + "${libgav1_source}/dsp/x86/loop_filter_sse4.cc" + "${libgav1_source}/dsp/x86/loop_filter_sse4.h" + "${libgav1_source}/dsp/x86/loop_restoration_10bit_sse4.cc" + "${libgav1_source}/dsp/x86/loop_restoration_sse4.cc" + "${libgav1_source}/dsp/x86/loop_restoration_sse4.h" + "${libgav1_source}/dsp/x86/mask_blend_sse4.cc" + "${libgav1_source}/dsp/x86/mask_blend_sse4.h" + "${libgav1_source}/dsp/x86/motion_field_projection_sse4.cc" + "${libgav1_source}/dsp/x86/motion_field_projection_sse4.h" + "${libgav1_source}/dsp/x86/motion_vector_search_sse4.cc" + "${libgav1_source}/dsp/x86/motion_vector_search_sse4.h" + "${libgav1_source}/dsp/x86/obmc_sse4.cc" + "${libgav1_source}/dsp/x86/obmc_sse4.h" + "${libgav1_source}/dsp/x86/super_res_sse4.cc" + "${libgav1_source}/dsp/x86/super_res_sse4.h" + "${libgav1_source}/dsp/x86/transpose_sse4.h" + "${libgav1_source}/dsp/x86/warp_sse4.cc" + "${libgav1_source}/dsp/x86/warp_sse4.h" + "${libgav1_source}/dsp/x86/weight_mask_sse4.cc" + "${libgav1_source}/dsp/x86/weight_mask_sse4.h") + +macro(libgav1_add_dsp_targets) + unset(dsp_sources) + list(APPEND dsp_sources ${libgav1_dsp_sources} + ${libgav1_dsp_sources_neon} + ${libgav1_dsp_sources_avx2} + ${libgav1_dsp_sources_sse4}) + + libgav1_add_library(NAME + libgav1_dsp + TYPE + OBJECT + SOURCES + ${dsp_sources} + DEFINES + ${libgav1_defines} + $<$:LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS> + INCLUDES + ${libgav1_include_paths}) +endmacro() diff --git a/src/dsp/loop_filter.cc b/src/dsp/loop_filter.cc new file mode 100644 index 0000000..14d47bf --- /dev/null +++ b/src/dsp/loop_filter.cc @@ -0,0 +1,619 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_filter.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// 7.14.6.1.
+template <int bitdepth, typename Pixel>
+struct LoopFilterFuncs_C {
+  LoopFilterFuncs_C() = delete;
+
+  static constexpr int kMaxPixel = (1 << bitdepth) - 1;
+  static constexpr int kMinSignedPixel = -(1 << (bitdepth - 1));
+  static constexpr int kMaxSignedPixel = (1 << (bitdepth - 1)) - 1;
+  static constexpr int kFlatThresh = 1 << (bitdepth - 8);
+
+  static void Vertical4(void* dest, ptrdiff_t stride, int outer_thresh,
+                        int inner_thresh, int hev_thresh);
+  static void Horizontal4(void* dest, ptrdiff_t stride, int outer_thresh,
+                          int inner_thresh, int hev_thresh);
+  static void Vertical6(void* dest, ptrdiff_t stride, int outer_thresh,
+                        int inner_thresh, int hev_thresh);
+  static void Horizontal6(void* dest, ptrdiff_t stride, int outer_thresh,
+                          int inner_thresh, int hev_thresh);
+  static void Vertical8(void* dest, ptrdiff_t stride, int outer_thresh,
+                        int inner_thresh, int hev_thresh);
+  static void Horizontal8(void* dest, ptrdiff_t stride, int outer_thresh,
+                          int inner_thresh, int hev_thresh);
+  static void Vertical14(void* dest, ptrdiff_t stride, int outer_thresh,
+                         int inner_thresh, int hev_thresh);
+  static void Horizontal14(void* dest, ptrdiff_t stride, int outer_thresh,
+                           int inner_thresh, int hev_thresh);
+};
+
+inline void AdjustThresholds(const int bitdepth, int* const outer_thresh,
+                             int* const inner_thresh, int* const hev_thresh) {
+  assert(*outer_thresh >= 7 && *outer_thresh <= 3 * kMaxLoopFilterValue + 4);
+  assert(*inner_thresh >= 1 && *inner_thresh <= kMaxLoopFilterValue);
+  assert(*hev_thresh >= 0 && *hev_thresh <= 3);
+  *outer_thresh <<= bitdepth - 8;
+  *inner_thresh <<= bitdepth - 8;
+  *hev_thresh <<= bitdepth - 8;
+}
+
+//------------------------------------------------------------------------------
+// 4-tap filters
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool NeedsFilter4(const Pixel* p, ptrdiff_t step, int outer_thresh,
+                         int inner_thresh) {
+  const int p1 = p[-2 * step], p0 = p[-step];
+  const int q0 = p[0], q1 = p[step];
+  return std::abs(p1 - p0) <= inner_thresh &&
+         std::abs(q1 - q0) <= inner_thresh &&
+         std::abs(p0 - q0) * 2 + std::abs(p1 - q1) / 2 <= outer_thresh;
+}
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool Hev(const Pixel* p, ptrdiff_t step, int thresh) {
+  const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+  return (std::abs(p1 - p0) > thresh) || (std::abs(q1 - q0) > thresh);
+}
+
+// 7.14.6.3.
+// 4 pixels in, 2 pixels out.
+template <int bitdepth, typename Pixel>
+inline void Filter2_C(Pixel* p, ptrdiff_t step) {
+  const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+  const int min_signed_val =
+      LoopFilterFuncs_C<bitdepth, Pixel>::kMinSignedPixel;
+  const int max_signed_val =
+      LoopFilterFuncs_C<bitdepth, Pixel>::kMaxSignedPixel;
+  // 8bpp: [-893,892], 10bpp: [-3581,3580], 12bpp [-14333,14332]
+  const int a = 3 * (q0 - p0) + Clip3(p1 - q1, min_signed_val, max_signed_val);
+  // 8bpp: [-16,15], 10bpp: [-64,63], 12bpp: [-256,255]
+  const int a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3;
+  const int a2 = Clip3(a + 3, min_signed_val, max_signed_val) >> 3;
+  const int max_unsigned_val = LoopFilterFuncs_C<bitdepth, Pixel>::kMaxPixel;
+  p[-step] = Clip3(p0 + a2, 0, max_unsigned_val);
+  p[0] = Clip3(q0 - a1, 0, max_unsigned_val);
+}
+
+// 7.14.6.3.
+// 4 pixels in, 4 pixels out.
+template <int bitdepth, typename Pixel>
+inline void Filter4_C(Pixel* p, ptrdiff_t step) {
+  const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+  const int a = 3 * (q0 - p0);
+  const int min_signed_val =
+      LoopFilterFuncs_C<bitdepth, Pixel>::kMinSignedPixel;
+  const int max_signed_val =
+      LoopFilterFuncs_C<bitdepth, Pixel>::kMaxSignedPixel;
+  const int a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3;
+  const int a2 = Clip3(a + 3, min_signed_val, max_signed_val) >> 3;
+  const int a3 = (a1 + 1) >> 1;
+  const int max_unsigned_val = LoopFilterFuncs_C<bitdepth, Pixel>::kMaxPixel;
+  p[-2 * step] = Clip3(p1 + a3, 0, max_unsigned_val);
+  p[-1 * step] = Clip3(p0 + a2, 0, max_unsigned_val);
+  p[0 * step] = Clip3(q0 - a1, 0, max_unsigned_val);
+  p[1 * step] = Clip3(q1 - a3, 0, max_unsigned_val);
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Vertical4(void* dest,
+                                                   ptrdiff_t stride,
+                                                   int outer_thresh,
+                                                   int inner_thresh,
+                                                   int hev_thresh) {
+  AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int i = 0; i < 4; ++i) {
+    if (NeedsFilter4(dst, 1, outer_thresh, inner_thresh)) {
+      if (Hev(dst, 1, hev_thresh)) {
+        Filter2_C<bitdepth>(dst, 1);
+      } else {
+        Filter4_C<bitdepth>(dst, 1);
+      }
+    }
+    dst += stride;
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Horizontal4(void* dest,
+                                                     ptrdiff_t stride,
+                                                     int outer_thresh,
+                                                     int inner_thresh,
+                                                     int hev_thresh) {
+  AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int i = 0; i < 4; ++i) {
+    if (NeedsFilter4(dst, stride, outer_thresh, inner_thresh)) {
+      if (Hev(dst, stride, hev_thresh)) {
+        Filter2_C<bitdepth>(dst, stride);
+      } else {
+        Filter4_C<bitdepth>(dst, stride);
+      }
+    }
+    ++dst;
+  }
+}
+
+//------------------------------------------------------------------------------
+// 5-tap (chroma) filters
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool NeedsFilter6(const Pixel* p, ptrdiff_t step, int outer_thresh,
+                         int inner_thresh) {
+  const int p2 = p[-3 * step], p1 = p[-2 * step], p0 = p[-step];
+  const int q0 = p[0], q1 = p[step], q2 = p[2 * step];
+  return std::abs(p2 - p1) <= inner_thresh &&
+         std::abs(p1 - p0) <= inner_thresh &&
+         std::abs(q1 - q0) <= inner_thresh &&
+         std::abs(q2 - q1) <= inner_thresh &&
+         std::abs(p0 - q0) * 2 + std::abs(p1 - q1) / 2 <= outer_thresh;
+}
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool IsFlat3(const Pixel* p, ptrdiff_t step, int flat_thresh) {
+  const int p2 = p[-3 * step], p1 = p[-2 * step], p0 = p[-step];
+  const int q0 = p[0], q1 = p[step], q2 = p[2 * step];
+  return std::abs(p1 - p0) <= flat_thresh && std::abs(q1 - q0) <= flat_thresh &&
+         std::abs(p2 - p0) <= flat_thresh && std::abs(q2 - q0) <= flat_thresh;
+}
+
+template <typename Pixel>
+inline Pixel ApplyFilter6(int filter_value) {
+  return static_cast<Pixel>(RightShiftWithRounding(filter_value, 3));
+}
+
+// 7.14.6.4.
+// 6 pixels in, 4 pixels out.
+template <typename Pixel>
+inline void Filter6_C(Pixel* p, ptrdiff_t step) {
+  const int p2 = p[-3 * step], p1 = p[-2 * step], p0 = p[-step];
+  const int q0 = p[0], q1 = p[step], q2 = p[2 * step];
+  const int a1 = 2 * p1;
+  const int a0 = 2 * p0;
+  const int b0 = 2 * q0;
+  const int b1 = 2 * q1;
+  // The max is 8 * max_pixel + 4 for the rounder.
+  // 8bpp: 2044 (11 bits), 10bpp: 8188 (13 bits), 12bpp: 32764 (15 bits)
+  p[-2 * step] = ApplyFilter6<Pixel>(3 * p2 + a1 + a0 + q0);
+  p[-1 * step] = ApplyFilter6<Pixel>(p2 + a1 + a0 + b0 + q1);
+  p[0 * step] = ApplyFilter6<Pixel>(p1 + a0 + b0 + b1 + q2);
+  p[1 * step] = ApplyFilter6<Pixel>(p0 + b0 + b1 + 3 * q2);
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Vertical6(void* dest,
+                                                   ptrdiff_t stride,
+                                                   int outer_thresh,
+                                                   int inner_thresh,
+                                                   int hev_thresh) {
+  const int flat_thresh = LoopFilterFuncs_C::kFlatThresh;
+  AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int i = 0; i < 4; ++i) {
+    if (NeedsFilter6(dst, 1, outer_thresh, inner_thresh)) {
+      if (IsFlat3(dst, 1, flat_thresh)) {
+        Filter6_C(dst, 1);
+      } else if (Hev(dst, 1, hev_thresh)) {
+        Filter2_C<bitdepth>(dst, 1);
+      } else {
+        Filter4_C<bitdepth>(dst, 1);
+      }
+    }
+    dst += stride;
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Horizontal6(void* dest,
+                                                     ptrdiff_t stride,
+                                                     int outer_thresh,
+                                                     int inner_thresh,
+                                                     int hev_thresh) {
+  const int flat_thresh = LoopFilterFuncs_C::kFlatThresh;
+  AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int i = 0; i < 4; ++i) {
+    if (NeedsFilter6(dst, stride, outer_thresh, inner_thresh)) {
+      if (IsFlat3(dst, stride, flat_thresh)) {
+        Filter6_C(dst, stride);
+      } else if (Hev(dst, stride, hev_thresh)) {
+        Filter2_C<bitdepth>(dst, stride);
+      } else {
+        Filter4_C<bitdepth>(dst, stride);
+      }
+    }
+    ++dst;
+  }
+}
+
+//------------------------------------------------------------------------------
+// 7-tap filters
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool NeedsFilter8(const Pixel* p, ptrdiff_t step, int outer_thresh,
+                         int inner_thresh) {
+  const int p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step],
+            p0 = p[-step];
+  const int q0 = p[0], q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
+  return std::abs(p3 - p2) <= inner_thresh &&
+         std::abs(p2 - p1) <= inner_thresh &&
+         std::abs(p1 - p0) <= inner_thresh &&
+         std::abs(q1 - q0) <= inner_thresh &&
+         std::abs(q2 - q1) <= inner_thresh &&
+         std::abs(q3 - q2) <= inner_thresh &&
+         std::abs(p0 - q0) * 2 + std::abs(p1 - q1) / 2 <= outer_thresh;
+}
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool IsFlat4(const Pixel* p, ptrdiff_t step, int flat_thresh) {
+  const int p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step],
+            p0 = p[-step];
+  const int q0 = p[0], q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
+  return std::abs(p1 - p0) <= flat_thresh && std::abs(q1 - q0) <= flat_thresh &&
+         std::abs(p2 - p0) <= flat_thresh && std::abs(q2 - q0) <= flat_thresh &&
+         std::abs(p3 - p0) <= flat_thresh && std::abs(q3 - q0) <= flat_thresh;
+}
+
+template <typename Pixel>
+inline Pixel ApplyFilter8(int filter_value) {
+  return static_cast<Pixel>(RightShiftWithRounding(filter_value, 3));
+}
+
+// 7.14.6.4.
+// 8 pixels in, 6 pixels out.
+template <typename Pixel>
+inline void Filter8_C(Pixel* p, ptrdiff_t step) {
+  const int p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step],
+            p0 = p[-step];
+  const int q0 = p[0], q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
+  // The max is 8 * max_pixel + 4 for the rounder.
+  // 8bpp: 2044 (11 bits), 10bpp: 8188 (13 bits), 12bpp: 32764 (15 bits)
+  p[-3 * step] = ApplyFilter8<Pixel>(3 * p3 + 2 * p2 + p1 + p0 + q0);
+  p[-2 * step] = ApplyFilter8<Pixel>(2 * p3 + p2 + 2 * p1 + p0 + q0 + q1);
+  p[-1 * step] = ApplyFilter8<Pixel>(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2);
+  p[0 * step] = ApplyFilter8<Pixel>(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3);
+  p[1 * step] = ApplyFilter8<Pixel>(p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3);
+  p[2 * step] = ApplyFilter8<Pixel>(p0 + q0 + q1 + 2 * q2 + 3 * q3);
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Vertical8(void* dest,
+                                                   ptrdiff_t stride,
+                                                   int outer_thresh,
+                                                   int inner_thresh,
+                                                   int hev_thresh) {
+  const int flat_thresh = LoopFilterFuncs_C::kFlatThresh;
+  AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int i = 0; i < 4; ++i) {
+    if (NeedsFilter8(dst, 1, outer_thresh, inner_thresh)) {
+      if (IsFlat4(dst, 1, flat_thresh)) {
+        Filter8_C(dst, 1);
+      } else if (Hev(dst, 1, hev_thresh)) {
+        Filter2_C<bitdepth>(dst, 1);
+      } else {
+        Filter4_C<bitdepth>(dst, 1);
+      }
+    }
+    dst += stride;
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Horizontal8(void* dest,
+                                                     ptrdiff_t stride,
+                                                     int outer_thresh,
+                                                     int inner_thresh,
+                                                     int hev_thresh) {
+  const int flat_thresh = LoopFilterFuncs_C::kFlatThresh;
+  AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int i = 0; i < 4; ++i) {
+    if (NeedsFilter8(dst, stride, outer_thresh, inner_thresh)) {
+      if (IsFlat4(dst, stride, flat_thresh)) {
+        Filter8_C(dst, stride);
+      } else if (Hev(dst, stride, hev_thresh)) {
+        Filter2_C<bitdepth>(dst, stride);
+      } else {
+        Filter4_C<bitdepth>(dst, stride);
+      }
+    }
+    ++dst;
+  }
+}
+
+//------------------------------------------------------------------------------
+// 13-tap filters
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool IsFlatOuter4(const Pixel* p, ptrdiff_t step, int flat_thresh) {
+  const int p6 = p[-7 * step], p5 = p[-6 * step], p4 = p[-5 * step],
+            p0 = p[-step];
+  const int q0 = p[0], q4 = p[4 * step], q5 = p[5 * step], q6 = p[6 * step];
+  return std::abs(p4 - p0) <= flat_thresh && std::abs(q4 - q0) <= flat_thresh &&
+         std::abs(p5 - p0) <= flat_thresh && std::abs(q5 - q0) <= flat_thresh &&
+         std::abs(p6 - p0) <= flat_thresh && std::abs(q6 - q0) <= flat_thresh;
+}
+
+template <typename Pixel>
+inline Pixel ApplyFilter14(int filter_value) {
+  return static_cast<Pixel>(RightShiftWithRounding(filter_value, 4));
+}
+
+// 7.14.6.4.
+// 14 pixels in, 12 pixels out.
+template <typename Pixel>
+inline void Filter14_C(Pixel* p, ptrdiff_t step) {
+  const int p6 = p[-7 * step], p5 = p[-6 * step], p4 = p[-5 * step],
+            p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step],
+            p0 = p[-step];
+  const int q0 = p[0], q1 = p[step], q2 = p[2 * step], q3 = p[3 * step],
+            q4 = p[4 * step], q5 = p[5 * step], q6 = p[6 * step];
+  // The max is 16 * max_pixel + 8 for the rounder.
+  // 8bpp: 4088 (12 bits), 10bpp: 16376 (14 bits), 12bpp: 65528 (16 bits)
+  p[-6 * step] =
+      ApplyFilter14<Pixel>(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0);
+  p[-5 * step] = ApplyFilter14<Pixel>(p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 +
+                                      p1 + p0 + q0 + q1);
+  p[-4 * step] = ApplyFilter14<Pixel>(p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 +
+                                      p1 + p0 + q0 + q1 + q2);
+  p[-3 * step] = ApplyFilter14<Pixel>(p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 +
+                                      p1 * 2 + p0 + q0 + q1 + q2 + q3);
+  p[-2 * step] = ApplyFilter14<Pixel>(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 +
+                                      p0 * 2 + q0 + q1 + q2 + q3 + q4);
+  p[-1 * step] = ApplyFilter14<Pixel>(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 +
+                                      q0 * 2 + q1 + q2 + q3 + q4 + q5);
+  p[0 * step] = ApplyFilter14<Pixel>(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 +
+                                     q1 * 2 + q2 + q3 + q4 + q5 + q6);
+  p[1 * step] = ApplyFilter14<Pixel>(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 +
+                                     q2 * 2 + q3 + q4 + q5 + q6 * 2);
+  p[2 * step] = ApplyFilter14<Pixel>(p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 +
+                                     q3 * 2 + q4 + q5 + q6 * 3);
+  p[3 * step] = ApplyFilter14<Pixel>(p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 +
+                                     q4 * 2 + q5 + q6 * 4);
+  p[4 * step] = ApplyFilter14<Pixel>(p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 +
+                                     q5 * 2 + q6 * 5);
+  p[5 * step] =
+      ApplyFilter14<Pixel>(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7);
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Vertical14(void* dest,
+                                                    ptrdiff_t stride,
+                                                    int outer_thresh,
+                                                    int inner_thresh,
+                                                    int hev_thresh) {
+  const int flat_thresh = LoopFilterFuncs_C::kFlatThresh;
+  AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int i = 0; i < 4; ++i) {
+    if (NeedsFilter8(dst, 1, outer_thresh, inner_thresh)) {
+      if (IsFlat4(dst, 1, flat_thresh)) {
+        if (IsFlatOuter4(dst, 1, flat_thresh)) {
+          Filter14_C(dst, 1);
+        } else {
+          Filter8_C(dst, 1);
+        }
+      } else if (Hev(dst, 1, hev_thresh)) {
+        Filter2_C<bitdepth>(dst, 1);
+      } else {
+        Filter4_C<bitdepth>(dst, 1);
+      }
+    }
+    dst += stride;
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Horizontal14(void* dest,
+                                                      ptrdiff_t stride,
+                                                      int outer_thresh,
+                                                      int inner_thresh,
+                                                      int hev_thresh) {
+  const int flat_thresh = LoopFilterFuncs_C::kFlatThresh;
+  AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int i = 0; i < 4; ++i) {
+    if (NeedsFilter8(dst, stride, outer_thresh, inner_thresh)) {
+      if (IsFlat4(dst, stride, flat_thresh)) {
+        if (IsFlatOuter4(dst, stride, flat_thresh)) {
+          Filter14_C(dst, stride);
+        } else {
+          Filter8_C(dst, stride);
+        }
+      } else if (Hev(dst, stride, hev_thresh)) {
+        Filter2_C<bitdepth>(dst, stride);
+      } else {
+        Filter4_C<bitdepth>(dst, stride);
+      }
+    }
+    ++dst;
+  }
+}
+
+using Defs8bpp = LoopFilterFuncs_C<8, uint8_t>;
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+      Defs8bpp::Horizontal4;
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] =
+      Defs8bpp::Vertical4;
+
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+      Defs8bpp::Horizontal6;
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] =
+      Defs8bpp::Vertical6;
+
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+      Defs8bpp::Horizontal8;
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] =
+      Defs8bpp::Vertical8;
+
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+      Defs8bpp::Horizontal14;
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+      Defs8bpp::Vertical14;
+#else   // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeHorizontal
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+      Defs8bpp::Horizontal4;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeVertical
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] =
+      Defs8bpp::Vertical4;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeHorizontal
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+      Defs8bpp::Horizontal6;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeVertical
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] =
+      Defs8bpp::Vertical6;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeHorizontal
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+      Defs8bpp::Horizontal8;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeVertical
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] =
+      Defs8bpp::Vertical8;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeHorizontal
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+      Defs8bpp::Horizontal14;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeVertical
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+      Defs8bpp::Vertical14;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using Defs10bpp = LoopFilterFuncs_C<10, uint16_t>;
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+      Defs10bpp::Horizontal4;
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] =
+      Defs10bpp::Vertical4;
+
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+      Defs10bpp::Horizontal6;
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] =
+      Defs10bpp::Vertical6;
+
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+      Defs10bpp::Horizontal8;
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] =
+      Defs10bpp::Vertical8;
+
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+      Defs10bpp::Horizontal14;
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+      Defs10bpp::Vertical14;
+#else   // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeHorizontal
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+      Defs10bpp::Horizontal4;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeVertical
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] =
+      Defs10bpp::Vertical4;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeHorizontal
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+      Defs10bpp::Horizontal6;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeVertical
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] =
+      Defs10bpp::Vertical6;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeHorizontal
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+      Defs10bpp::Horizontal8;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeVertical
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] =
+      Defs10bpp::Vertical8;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeHorizontal
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+      Defs10bpp::Horizontal14;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeVertical
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+      Defs10bpp::Vertical14;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+}  // namespace
+
+void LoopFilterInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+  // Local functions that may be unused depending on the optimizations
+  // available.
+  static_cast<void>(AdjustThresholds);
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/loop_filter.h b/src/dsp/loop_filter.h
new file mode 100644
index 0000000..1ddad71
--- /dev/null
+++ b/src/dsp/loop_filter.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_LOOP_FILTER_H_
+#define LIBGAV1_SRC_DSP_LOOP_FILTER_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/loop_filter_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/loop_filter_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::loop_filters. This function is not thread-safe.
+void LoopFilterInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_LOOP_FILTER_H_
diff --git a/src/dsp/loop_filter_test.cc b/src/dsp/loop_filter_test.cc
new file mode 100644
index 0000000..d013a1b
--- /dev/null
+++ b/src/dsp/loop_filter_test.cc
@@ -0,0 +1,351 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
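+
+// Compares the active (C or SIMD) loop filter implementations against fixed
+// MD5 digests over pseudo-random input and checks behavior on saturated
+// input; a disabled test provides speed measurements.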
+
+#include "src/dsp/loop_filter.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <ostream>
+#include <string>
+
+#include "absl/strings/match.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/third_party/libvpx/md5_helper.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Horizontal and Vertical need 32x32: 8  pixels preceding filtered section
+//                                     16 pixels within filtered section
+//                                     8  pixels following filtered section
+constexpr int kNumPixels = 1024;
+constexpr int kBlockStride = 32;
+
+constexpr int kNumTests = 50000;
+constexpr int kNumSpeedTests = 500000;
+
+template <typename Pixel>
+void InitInput(Pixel* dst, const int stride, const int bitdepth,
+               libvpx_test::ACMRandom& rnd, const uint8_t inner_thresh,
+               const bool transpose) {
+  const int max_pixel = (1 << bitdepth) - 1;
+  const int pixel_range = max_pixel + 1;
+  Pixel tmp[kNumPixels];
+  auto clip_pixel = [max_pixel](int val) {
+    return static_cast<Pixel>(std::max(std::min(val, max_pixel), 0));
+  };
+
+  for (int i = 0; i < kNumPixels;) {
+    const uint8_t val = rnd.Rand8();
+    if (val & 0x80) {  // 50% chance to choose a new value.
+      tmp[i++] = rnd(pixel_range);
+    } else {  // 50% chance to repeat previous value in row X times.
+      int j = 0;
+      while (j++ < ((val & 0x1f) + 1) && i < kNumPixels) {
+        if (i < 1) {
+          tmp[i] = rnd(pixel_range);
+        } else if (val & 0x20) {  // Increment by a value within the limit.
+          tmp[i] = clip_pixel(tmp[i - 1] + (inner_thresh - 1));
+        } else {  // Decrement by a value within the limit.
+          tmp[i] = clip_pixel(tmp[i - 1] - (inner_thresh - 1));
+        }
+        ++i;
+      }
+    }
+  }
+
+  for (int i = 0; i < kNumPixels;) {
+    const uint8_t val = rnd.Rand8();
+    if (val & 0x80) {
+      ++i;
+    } else {  // 50% chance to repeat previous value in column X times.
+      int j = 0;
+      while (j++ < ((val & 0x1f) + 1) && i < kNumPixels) {
+        if (i < 1) {
+          tmp[i] = rnd(pixel_range);
+        } else if (val & 0x20) {  // Increment by a value within the limit.
+          tmp[(i % 32) * 32 + i / 32] = clip_pixel(
+              tmp[((i - 1) % 32) * 32 + (i - 1) / 32] + (inner_thresh - 1));
+        } else {  // Decrement by a value within the inner_thresh.
+          tmp[(i % 32) * 32 + i / 32] = clip_pixel(
+              tmp[((i - 1) % 32) * 32 + (i - 1) / 32] - (inner_thresh - 1));
+        }
+        ++i;
+      }
+    }
+  }
+
+  for (int i = 0; i < kNumPixels; ++i) {
+    const int offset = transpose ? stride * (i % stride) + i / stride : i;
+    dst[i] = tmp[offset];
+  }
+}
+
+template <int bitdepth, typename Pixel>
+class LoopFilterTest : public testing::TestWithParam<LoopFilterSize> {
+ public:
+  LoopFilterTest() = default;
+  LoopFilterTest(const LoopFilterTest&) = delete;
+  LoopFilterTest& operator=(const LoopFilterTest&) = delete;
+  ~LoopFilterTest() override = default;
+
+ protected:
+  void SetUp() override {
+    test_utils::ResetDspTable(bitdepth);
+    LoopFilterInit_C();
+
+    const Dsp* const dsp = GetDspTable(bitdepth);
+    ASSERT_NE(dsp, nullptr);
+    memcpy(base_loop_filters_, dsp->loop_filters[size_],
+           sizeof(base_loop_filters_));
+
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const char* const test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+      memset(base_loop_filters_, 0, sizeof(base_loop_filters_));
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      if ((GetCpuInfo() & kSSE4_1) != 0) {
+        LoopFilterInit_SSE4_1();
+      }
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+      LoopFilterInit_NEON();
+    } else {
+      FAIL() << "Unrecognized architecture prefix in test case name: "
+             << test_case;
+    }
+
+    memcpy(cur_loop_filters_, dsp->loop_filters[size_],
+           sizeof(cur_loop_filters_));
+
+    for (int i = 0; i < kNumLoopFilterTypes; ++i) {
+      // Skip functions that haven't been specialized for this particular
+      // architecture.
+      if (cur_loop_filters_[i] == base_loop_filters_[i]) {
+        cur_loop_filters_[i] = nullptr;
+      }
+    }
+  }
+
+  // Check |digests| if non-NULL, otherwise print the filter timing.
+  void TestRandomValues(const char* const digests[kNumLoopFilterTypes],
+                        int num_runs) const;
+  void TestSaturatedValues() const;
+
+  const LoopFilterSize size_ = GetParam();
+  LoopFilterFunc base_loop_filters_[kNumLoopFilterTypes];
+  LoopFilterFunc cur_loop_filters_[kNumLoopFilterTypes];
+};
+
+template <int bitdepth, typename Pixel>
+void LoopFilterTest<bitdepth, Pixel>::TestRandomValues(
+    const char* const digests[kNumLoopFilterTypes], const int num_runs) const {
+  for (int i = 0; i < kNumLoopFilterTypes; ++i) {
+    libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+    if (cur_loop_filters_[i] == nullptr) continue;
+
+    libvpx_test::MD5 md5_digest;
+    absl::Duration elapsed_time;
+    for (int n = 0; n < num_runs; ++n) {
+      Pixel dst[kNumPixels];
+      const auto outer_thresh = static_cast<uint8_t>(
+          rnd(3 * kMaxLoopFilterValue - 2) + 7);  // [7, 193].
+      const auto inner_thresh =
+          static_cast<uint8_t>(rnd(kMaxLoopFilterValue) + 1);  // [1, 63].
+      const auto hev_thresh =
+          static_cast<uint8_t>(rnd(kMaxLoopFilterValue + 1) >> 4);  // [0, 3].
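+      // Alternate between row-oriented and transposed (column-oriented)
+      // inputs on each iteration so both filter directions see runs of
+      // similar pixels.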
+      InitInput(dst, kBlockStride, bitdepth, rnd, inner_thresh, (n & 1) == 0);
+
+      const absl::Time start = absl::Now();
+      cur_loop_filters_[i](dst + 8 + kBlockStride * 8, kBlockStride,
+                           outer_thresh, inner_thresh, hev_thresh);
+      elapsed_time += absl::Now() - start;
+
+      md5_digest.Add(reinterpret_cast<const uint8_t*>(dst), sizeof(dst));
+    }
+    if (digests == nullptr) {
+      const auto elapsed_time_us =
+          static_cast<int>(absl::ToInt64Microseconds(elapsed_time));
+      printf("Mode %s[%25s]: %5d us\n",
+             ToString(static_cast<LoopFilterSize>(size_)),
+             ToString(static_cast<LoopFilterType>(i)), elapsed_time_us);
+    } else {
+      const std::string digest = md5_digest.Get();
+      printf("Mode %s[%25s]: MD5: %s\n",
+             ToString(static_cast<LoopFilterSize>(size_)),
+             ToString(static_cast<LoopFilterType>(i)), digest.c_str());
+      EXPECT_STREQ(digests[i], digest.c_str());
+    }
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterTest<bitdepth, Pixel>::TestSaturatedValues() const {
+  const LoopFilterType filter = kLoopFilterTypeHorizontal;
+  if (cur_loop_filters_[filter] == nullptr) return;
+
+  Pixel dst[kNumPixels], ref[kNumPixels];
+  const auto value = static_cast<Pixel>((1 << bitdepth) - 1);
+  for (auto& r : dst) r = value;
+  memcpy(ref, dst, sizeof(dst));
+
+  const int outer_thresh = 24;
+  const int inner_thresh = 8;
+  const int hev_thresh = 0;
+  cur_loop_filters_[filter](dst + 8 + kBlockStride * 8, kBlockStride,
+                            outer_thresh, inner_thresh, hev_thresh);
+  ASSERT_TRUE(test_utils::CompareBlocks(ref, dst, kBlockStride, kBlockStride,
+                                        kBlockStride, kBlockStride, true))
+      << "kLoopFilterTypeHorizontal output doesn't match reference";
+}
+
+//------------------------------------------------------------------------------
+
+using LoopFilterTest8bpp = LoopFilterTest<8, uint8_t>;
+
+const char* const* GetDigests8bpp(LoopFilterSize size) {
+  static const char* const kDigestsSize4[kNumLoopFilterTypes] = {
+      "6ba725d697d6209cb36dd199b8ffb47a",
+      "7dbb20e456ed0501fb4e7954f49f5e18",
+  };
+  static const char* const kDigestsSize6[kNumLoopFilterTypes] = {
+      "89bb757faa44298b7f6e9c1a67f455a5",
+      "be75d5a2fcd83709ff0845f7d83f7006",
+  };
+  static const char* const kDigestsSize8[kNumLoopFilterTypes] = {
+      "b09137d68c7b4f8a8a15e33b4b69828f",
+      "ef8a7f1aa073805516d3518a82a5cfa4",
+  };
+  static const char* const kDigestsSize14[kNumLoopFilterTypes] = {
+      "6a7bc061ace0888275af88093f82ca08",
+      "a957ddae005839aa41ba7691788b01e4",
+  };
+
+  switch (size) {
+    case kLoopFilterSize4:
+      return kDigestsSize4;
+    case kLoopFilterSize6:
+      return kDigestsSize6;
+    case kLoopFilterSize8:
+      return kDigestsSize8;
+    case kLoopFilterSize14:
+      return kDigestsSize14;
+    default:
+      ADD_FAILURE() << "Unknown loop filter size: " << size;
+      return nullptr;
+  }
+}
+
+TEST_P(LoopFilterTest8bpp, DISABLED_Speed) {
+  TestRandomValues(nullptr, kNumSpeedTests);
+}
+
+TEST_P(LoopFilterTest8bpp, FixedInput) {
+  TestRandomValues(GetDigests8bpp(size_), kNumTests);
+}
+
+TEST_P(LoopFilterTest8bpp, SaturatedValues) { TestSaturatedValues(); }
+
+constexpr LoopFilterSize kLoopFilterSizes[] = {
+    kLoopFilterSize4, kLoopFilterSize6, kLoopFilterSize8, kLoopFilterSize14};
+
+INSTANTIATE_TEST_SUITE_P(C, LoopFilterTest8bpp,
+                         testing::ValuesIn(kLoopFilterSizes));
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, LoopFilterTest8bpp,
+                         testing::ValuesIn(kLoopFilterSizes));
+#endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, LoopFilterTest8bpp,
+                         testing::ValuesIn(kLoopFilterSizes));
+#endif
+//------------------------------------------------------------------------------
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using LoopFilterTest10bpp = LoopFilterTest<10, uint16_t>;
+const char* const* GetDigests10bpp(LoopFilterSize size) {
+  static const char* const kDigestsSize4[kNumLoopFilterTypes] = {
+      "72e75c478bb130ff1ebfa75f3a70b1a2",
+      "f32d67b611080e0bf1a9d162ff47c133",
+  };
+  static const char* const kDigestsSize6[kNumLoopFilterTypes] = {
+      "8aec73c60c87ac7cc6bc9cc5157a2795",
+      "0e4385d3a0cbb2b1551e05ad2b0f07fb",
+  };
+  static const char* const kDigestsSize8[kNumLoopFilterTypes] = {
+      "85cb2928fae43e1a27b2fe1b78ba7534",
+      "d044fad9d7c64b93ecb60c88ac48e55f",
+  };
+  static const char* const kDigestsSize14[kNumLoopFilterTypes] = {
+      "ebca95ec0db6efbac7ff7cbeabc0e6d0",
+      "754ffaf0ac26a5953a029653bb5dd275",
+  };
+
+  switch (size) {
+    case kLoopFilterSize4:
+      return kDigestsSize4;
+    case kLoopFilterSize6:
+      return kDigestsSize6;
+    case kLoopFilterSize8:
+      return kDigestsSize8;
+    case kLoopFilterSize14:
+      return kDigestsSize14;
+    default:
+      ADD_FAILURE() << "Unknown loop filter size: " << size;
+      return nullptr;
+  }
+}
+
+TEST_P(LoopFilterTest10bpp, DISABLED_Speed) {
+  TestRandomValues(nullptr, kNumSpeedTests);
+}
+
+TEST_P(LoopFilterTest10bpp, FixedInput) {
+  TestRandomValues(GetDigests10bpp(size_), kNumTests);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, LoopFilterTest10bpp,
+                         testing::ValuesIn(kLoopFilterSizes));
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, LoopFilterTest10bpp,
+                         testing::ValuesIn(kLoopFilterSizes));
+#endif
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, LoopFilterTest10bpp,
+                         testing::ValuesIn(kLoopFilterSizes));
+#endif
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+}  // namespace
+
+static std::ostream& operator<<(std::ostream& os, const LoopFilterSize size) {
+  return os << ToString(size);
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/loop_restoration.cc b/src/dsp/loop_restoration.cc
new file mode 100644
index 0000000..2301a3e
--- /dev/null
+++ b/src/dsp/loop_restoration.cc
@@ -0,0 +1,954 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Section 7.17.3.
+// a2: range [1, 256].
+// if (z >= 255)
+//   a2 = 256;
+// else if (z == 0)
+//   a2 = 1;
+// else
+//   a2 = ((z << kSgrProjSgrBits) + (z >> 1)) / (z + 1);
+// ma = 256 - a2;
+alignas(16) const uint8_t kSgrMaLookup[256] = {
+    255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16, 15, 14,
+    13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7,
+    7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5,
+    5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+    4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 0};
+
+namespace {
+
+template <int bitdepth, typename Pixel>
+inline void WienerHorizontal(const Pixel* source, const ptrdiff_t source_stride,
+                             const int width, const int height,
+                             const int16_t* const filter,
+                             const int number_zero_coefficients,
+                             int16_t** wiener_buffer) {
+  constexpr int kCenterTap = kWienerFilterTaps / 2;
+  constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+                                           ? kInterRoundBitsHorizontal12bpp
+                                           : kInterRoundBitsHorizontal;
+  constexpr int offset =
+      1 << (bitdepth + kWienerFilterBits - kRoundBitsHorizontal - 1);
+  constexpr int limit = (offset << 2) - 1;
+  for (int y = 0; y < height; ++y) {
+    int x = 0;
+    do {
+      // sum fits into 16 bits only when bitdepth = 8.
+      int sum = 0;
+      for (int k = number_zero_coefficients; k < kCenterTap; ++k) {
+        sum +=
+            filter[k] * (source[x + k] + source[x + kWienerFilterTaps - 1 - k]);
+      }
+      sum += filter[kCenterTap] * source[x + kCenterTap];
+      const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsHorizontal);
+      (*wiener_buffer)[x] = Clip3(rounded_sum, -offset, limit - offset);
+    } while (++x != width);
+    source += source_stride;
+    *wiener_buffer += width;
+  }
+}
+
+template <int bitdepth, typename Pixel>
+inline void WienerVertical(const int16_t* wiener_buffer, const int width,
+                           const int height, const int16_t* const filter,
+                           const int number_zero_coefficients, void* const dest,
+                           const ptrdiff_t dest_stride) {
+  constexpr int kCenterTap = kWienerFilterTaps / 2;
+  constexpr int kRoundBitsVertical =
+      (bitdepth == 12) ? kInterRoundBitsVertical12bpp : kInterRoundBitsVertical;
+  auto* dst = static_cast<Pixel*>(dest);
+  int y = height;
+  do {
+    int x = 0;
+    do {
+      // sum needs 32 bits.
+      int sum = 0;
+      for (int k = number_zero_coefficients; k < kCenterTap; ++k) {
+        sum += filter[k] *
+               (wiener_buffer[k * width + x] +
+                wiener_buffer[(kWienerFilterTaps - 1 - k) * width + x]);
+      }
+      sum += filter[kCenterTap] * wiener_buffer[kCenterTap * width + x];
+      const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsVertical);
+      dst[x] = static_cast<Pixel>(Clip3(rounded_sum, 0, (1 << bitdepth) - 1));
+    } while (++x != width);
+    wiener_buffer += width;
+    dst += dest_stride;
+  } while (--y != 0);
+}
+
+// Note: bit range for wiener filter.
+// Wiener filter process first applies horizontal filtering to input pixels,
+// followed by rounding with predefined bits (dependent on bitdepth).
+// Then vertical filtering is applied, followed by rounding (dependent on
+// bitdepth).
+// The process is the same as convolution:
+// <input> --> <horizontal filter> --> <rounding 0> --> <vertical filter>
+// --> <rounding 1>
+// By design:
+// (a). horizontal/vertical filtering adds 7 bits to input.
+// (b). The output of first rounding fits into 16 bits.
+// (c). The output of second rounding fits into 16 bits.
+// If input bitdepth > 8, the accumulator of the horizontal filter is larger
+// than 16 bit and smaller than 32 bits.
+// The accumulator of the vertical filter is larger than 16 bits and smaller
+// than 32 bits.
+// Note: range of wiener filter coefficients.
+// Wiener filter coefficients are symmetric, and their sum is 1 (128).
+// The range of each coefficient:
+// filter[0] = filter[6], 4 bits, min = -5, max = 10.
+// filter[1] = filter[5], 5 bits, min = -23, max = 8.
+// filter[2] = filter[4], 6 bits, min = -17, max = 46.
+// filter[3] = 128 - 2 * (filter[0] + filter[1] + filter[2]).
+// The difference from libaom is that in libaom:
+// filter[3] = 0 - 2 * (filter[0] + filter[1] + filter[2]).
+// Thus in libaom's computation, an offset of 128 is needed for filter[3].
+template <int bitdepth, typename Pixel>
+void WienerFilter_C(
+    const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+    const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_border,
+    const ptrdiff_t top_border_stride,
+    const void* LIBGAV1_RESTRICT const bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+    void* LIBGAV1_RESTRICT const dest) {
+  constexpr int kCenterTap = kWienerFilterTaps / 2;
+  const int16_t* const number_leading_zero_coefficients =
+      restoration_info.wiener_info.number_leading_zero_coefficients;
+  const int number_rows_to_skip = std::max(
+      static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+      1);
+  int16_t* const wiener_buffer_org = restoration_buffer->wiener_buffer;
+
+  // horizontal filtering.
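+  // The horizontal pass below filters three vertically adjacent regions in
+  // turn: rows of the top border, the |height| rows of the unit itself, then
+  // rows of the bottom border, appending each result to |wiener_buffer|.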
+  const int height_horizontal =
+      height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+  const int height_extra = (height_horizontal - height) >> 1;
+  assert(height_extra <= 2);
+  const int16_t* const filter_horizontal =
+      restoration_info.wiener_info.filter[WienerInfo::kHorizontal];
+  const auto* src = static_cast<const Pixel*>(source) - kCenterTap;
+  const auto* top = static_cast<const Pixel*>(top_border) - kCenterTap;
+  const auto* bottom = static_cast<const Pixel*>(bottom_border) - kCenterTap;
+  auto* wiener_buffer = wiener_buffer_org + number_rows_to_skip * width;
+
+  if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+    WienerHorizontal<bitdepth, Pixel>(
+        top + (2 - height_extra) * top_border_stride, top_border_stride, width,
+        height_extra, filter_horizontal, 0, &wiener_buffer);
+    WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
+                                      filter_horizontal, 0, &wiener_buffer);
+    WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width,
+                                      height_extra, filter_horizontal, 0,
+                                      &wiener_buffer);
+  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+    WienerHorizontal<bitdepth, Pixel>(
+        top + (2 - height_extra) * top_border_stride, top_border_stride, width,
+        height_extra, filter_horizontal, 1, &wiener_buffer);
+    WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
+                                      filter_horizontal, 1, &wiener_buffer);
+    WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width,
+                                      height_extra, filter_horizontal, 1,
+                                      &wiener_buffer);
+  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+    WienerHorizontal<bitdepth, Pixel>(
+        top + (2 - height_extra) * top_border_stride, top_border_stride, width,
+        height_extra, filter_horizontal, 2, &wiener_buffer);
+    WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
+                                      filter_horizontal, 2, &wiener_buffer);
+    WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width,
+                                      height_extra, filter_horizontal, 2,
+                                      &wiener_buffer);
+  } else {
+    assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+    WienerHorizontal<bitdepth, Pixel>(
+        top + (2 - height_extra) * top_border_stride, top_border_stride, width,
+        height_extra, filter_horizontal, 3, &wiener_buffer);
+    WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
+                                      filter_horizontal, 3, &wiener_buffer);
+    WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width,
+                                      height_extra, filter_horizontal, 3,
+                                      &wiener_buffer);
+  }
+
+  // vertical filtering.
+  const int16_t* const filter_vertical =
+      restoration_info.wiener_info.filter[WienerInfo::kVertical];
+  if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+    // Because the top row of |source| is a duplicate of the second row, and the
+    // bottom row of |source| is a duplicate of its above row, we can duplicate
+    // the top and bottom row of |wiener_buffer| accordingly.
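+    // The copies below fill the outermost |wiener_buffer| rows from their
+    // neighbors rather than filtering the duplicated source rows again.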
+    memcpy(wiener_buffer, wiener_buffer - width,
+           sizeof(*wiener_buffer) * width);
+    memcpy(wiener_buffer_org, wiener_buffer_org + width,
+           sizeof(*wiener_buffer) * width);
+    WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
+                                    filter_vertical, 0, dest, stride);
+  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+    WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
+                                    filter_vertical, 1, dest, stride);
+  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+    WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
+                                    filter_vertical, 2, dest, stride);
+  } else {
+    assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+    WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
+                                    filter_vertical, 3, dest, stride);
+  }
+}
+
+//------------------------------------------------------------------------------
+// SGR
+
+// When |height| is 1, |src_stride| could be set to an arbitrary value.
+template <int size, typename Pixel>
+LIBGAV1_ALWAYS_INLINE void BoxSum(const Pixel* src, const ptrdiff_t src_stride,
+                                  const int height, const int width,
+                                  uint16_t* const* sums,
+                                  uint32_t* const* square_sums) {
+  int y = height;
+  do {
+    uint32_t sum = 0;
+    uint32_t square_sum = 0;
+    for (int dx = 0; dx < size; ++dx) {
+      const Pixel source = src[dx];
+      sum += source;
+      square_sum += source * source;
+    }
+    (*sums)[0] = sum;
+    (*square_sums)[0] = square_sum;
+    int x = 1;
+    do {
+      const Pixel source0 = src[x - 1];
+      const Pixel source1 = src[x - 1 + size];
+      sum -= source0;
+      sum += source1;
+      square_sum -= source0 * source0;
+      square_sum += source1 * source1;
+      (*sums)[x] = sum;
+      (*square_sums)[x] = square_sum;
+    } while (++x != width);
+    src += src_stride;
+    ++sums;
+    ++square_sums;
+  } while (--y != 0);
+}
+
+// When |height| is 1, |src_stride| could be set to an arbitrary value.
+template <typename Pixel>
+LIBGAV1_ALWAYS_INLINE void BoxSum(const Pixel* src, const ptrdiff_t src_stride,
+                                  const int height, const int width,
+                                  uint16_t* const* sum3, uint16_t* const* sum5,
+                                  uint32_t* const* square_sum3,
+                                  uint32_t* const* square_sum5) {
+  int y = height;
+  do {
+    uint32_t sum = 0;
+    uint32_t square_sum = 0;
+    for (int dx = 0; dx < 4; ++dx) {
+      const Pixel source = src[dx];
+      sum += source;
+      square_sum += source * source;
+    }
+    int x = 0;
+    do {
+      const Pixel source0 = src[x];
+      const Pixel source1 = src[x + 4];
+      sum -= source0;
+      square_sum -= source0 * source0;
+      (*sum3)[x] = sum;
+      (*square_sum3)[x] = square_sum;
+      sum += source1;
+      square_sum += source1 * source1;
+      (*sum5)[x] = sum + source0;
+      (*square_sum5)[x] = square_sum + source0 * source0;
+    } while (++x != width);
+    src += src_stride;
+    ++sum3;
+    ++sum5;
+    ++square_sum3;
+    ++square_sum5;
+  } while (--y != 0);
+}
+
+template <int bitdepth, int n>
+inline void CalculateIntermediate(const uint32_t s, uint32_t a,
+                                  const uint32_t b, uint8_t* const ma_ptr,
+                                  uint32_t* const b_ptr) {
+  // a: before shift, max is 25 * (2^(bitdepth) - 1) * (2^(bitdepth) - 1).
+  // since max bitdepth = 12, max < 2^31.
+  // after shift, a < 2^16 * n < 2^22 regardless of bitdepth
+  a = RightShiftWithRounding(a, (bitdepth - 8) << 1);
+  // b: max is 25 * (2^(bitdepth) - 1). If bitdepth = 12, max < 2^19.
+  // d < 2^8 * n < 2^14 regardless of bitdepth
+  const uint32_t d = RightShiftWithRounding(b, bitdepth - 8);
+  // p: Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
+  // and p itself satisfies p < 2^14 * n^2 < 2^26.
+  // This bound on p is due to:
+  // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
+  // Note: Sometimes, in high bitdepth, we can end up with a*n < b*b.
+  // This is an artifact of rounding, and can only happen if all pixels
+  // are (almost) identical, so in this case we saturate to p=0.
+  const uint32_t p = (a * n < d * d) ? 0 : a * n - d * d;
+  // p * s < (2^14 * n^2) * round(2^20 / (n^2 * scale)) < 2^34 / scale <
+  // 2^32 as long as scale >= 4. So p * s fits into a uint32_t, and z < 2^12
+  // (this holds even after accounting for the rounding in s)
+  const uint32_t z = RightShiftWithRounding(p * s, kSgrProjScaleBits);
+  // ma: range [0, 255].
+  const uint32_t ma = kSgrMaLookup[std::min(z, 255u)];
+  const uint32_t one_over_n = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
+  // ma < 2^8, b < 2^(bitdepth) * n,
+  // one_over_n = round(2^12 / n)
+  // => the product here is < 2^(20 + bitdepth) <= 2^32,
+  // and b is set to a value < 2^(8 + bitdepth).
+  // This holds even with the rounding in one_over_n and in the overall result,
+  // as long as ma is strictly less than 2^8.
+  const uint32_t b2 = ma * b * one_over_n;
+  *ma_ptr = ma;
+  *b_ptr = RightShiftWithRounding(b2, kSgrProjReciprocalBits);
+}
+
+template <typename T>
+inline uint32_t Sum343(const T* const src) {
+  return 3 * (src[0] + src[2]) + 4 * src[1];
+}
+
+template <typename T>
+inline uint32_t Sum444(const T* const src) {
+  return 4 * (src[0] + src[1] + src[2]);
+}
+
+template <typename T>
+inline uint32_t Sum565(const T* const src) {
+  return 5 * (src[0] + src[2]) + 6 * src[1];
+}
+
+template <int bitdepth>
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+    const uint16_t* const sum5[5], const uint32_t* const square_sum5[5],
+    const int width, const uint32_t s, SgrBuffer* const sgr_buffer,
+    uint16_t* const ma565, uint32_t* const b565) {
+  int x = 0;
+  do {
+    uint32_t a = 0;
+    uint32_t b = 0;
+    for (int dy = 0; dy < 5; ++dy) {
+      a += square_sum5[dy][x];
+      b += sum5[dy][x];
+    }
+    CalculateIntermediate<bitdepth, 25>(s, a, b, sgr_buffer->ma + x,
+                                        sgr_buffer->b + x);
+  } while (++x != width + 2);
+  x = 0;
+  do {
+    ma565[x] = Sum565(sgr_buffer->ma + x);
+    b565[x] = Sum565(sgr_buffer->b + x);
+  } while (++x != width);
+}
+
+template <int bitdepth>
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+    const uint16_t* const sum3[3], const uint32_t* const square_sum3[3],
+    const int width, const uint32_t s, const bool calculate444,
+    SgrBuffer* const sgr_buffer, uint16_t* const ma343, uint32_t* const b343,
+    uint16_t* const ma444, uint32_t* const b444) {
+  int x = 0;
+  do {
+    uint32_t a = 0;
+    uint32_t b = 0;
+    for (int dy = 0; dy < 3; ++dy) {
+      a += square_sum3[dy][x];
+      b += sum3[dy][x];
+    }
+    CalculateIntermediate<bitdepth, 9>(s, a, b, sgr_buffer->ma + x,
+                                       sgr_buffer->b + x);
+  } while (++x != width + 2);
+  x = 0;
+  do {
+    ma343[x] = Sum343(sgr_buffer->ma + x);
+    b343[x] = Sum343(sgr_buffer->b + x);
+  } while (++x != width);
+  if (calculate444) {
+    x = 0;
+    do {
+      ma444[x] = Sum444(sgr_buffer->ma + x);
+      b444[x] = Sum444(sgr_buffer->b + x);
+    } while (++x != width);
+  }
+}
+
+template <typename Pixel>
+inline int CalculateFilteredOutput(const Pixel src, const uint32_t ma,
+                                   const uint32_t b, const int shift) {
+  const int32_t v = b - ma * src;
+  return RightShiftWithRounding(v,
+                                kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <typename Pixel>
+inline void BoxFilterPass1Kernel(const Pixel src0, const Pixel src1,
+                                 const uint16_t* const ma565[2],
+                                 const uint32_t* const b565[2],
+                                 const ptrdiff_t x, int p[2]) {
+  p[0] = CalculateFilteredOutput(src0, ma565[0][x] + ma565[1][x],
+                                 b565[0][x] + b565[1][x], 5);
+  p[1] = CalculateFilteredOutput(src1, ma565[1][x], b565[1][x], 4);
+}
+
+template <typename Pixel>
+inline int BoxFilterPass2Kernel(const Pixel src, const uint16_t* const ma343[3],
+                                const uint16_t* const ma444,
ma444, + const uint32_t* const b343[3], + const uint32_t* const b444, const ptrdiff_t x) { + const uint32_t ma = ma343[0][x] + ma444[x] + ma343[2][x]; + const uint32_t b = b343[0][x] + b444[x] + b343[2][x]; + return CalculateFilteredOutput(src, ma, b, 5); +} + +template +inline Pixel SelfGuidedFinal(const int src, const int v) { + // if radius_pass_0 == 0 and radius_pass_1 == 0, the range of v is: + // bits(u) + bits(w0/w1/w2) + 2 = bitdepth + 13. + // Then, range of s is bitdepth + 2. This is a rough estimation, taking the + // maximum value of each element. + const int s = src + RightShiftWithRounding( + v, kSgrProjRestoreBits + kSgrProjPrecisionBits); + return static_cast(Clip3(s, 0, (1 << bitdepth) - 1)); +} + +template +inline Pixel SelfGuidedDoubleMultiplier(const int src, const int filter0, + const int filter1, const int16_t w0, + const int16_t w2) { + const int v = w0 * filter0 + w2 * filter1; + return SelfGuidedFinal(src, v); +} + +template +inline Pixel SelfGuidedSingleMultiplier(const int src, const int filter, + const int16_t w0) { + const int v = w0 * filter; + return SelfGuidedFinal(src, v); +} + +template +inline void BoxFilterPass1(const Pixel* const src, const ptrdiff_t stride, + uint16_t* const sum5[5], + uint32_t* const square_sum5[5], const int width, + const uint32_t scale, const int16_t w0, + SgrBuffer* const sgr_buffer, + uint16_t* const ma565[2], uint32_t* const b565[2], + Pixel* dst) { + BoxFilterPreProcess5(sum5, square_sum5, width, scale, sgr_buffer, + ma565[1], b565[1]); + int x = 0; + do { + int p[2]; + BoxFilterPass1Kernel(src[x], src[stride + x], ma565, b565, x, p); + dst[x] = SelfGuidedSingleMultiplier(src[x], p[0], w0); + dst[stride + x] = + SelfGuidedSingleMultiplier(src[stride + x], p[1], w0); + } while (++x != width); +} + +template +inline void BoxFilterPass2(const Pixel* const src, const Pixel* const src0, + const int width, const uint16_t scale, + const int16_t w0, uint16_t* const sum3[4], + uint32_t* const square_sum3[4], + SgrBuffer* const sgr_buffer, + uint16_t* const ma343[4], uint16_t* const ma444[3], + uint32_t* const b343[4], uint32_t* const b444[3], + Pixel* dst) { + BoxSum(src0, 0, 1, width + 2, sum3 + 2, square_sum3 + 2); + BoxFilterPreProcess3(sum3, square_sum3, width, scale, true, + sgr_buffer, ma343[2], b343[2], ma444[1], + b444[1]); + int x = 0; + do { + const int p = + BoxFilterPass2Kernel(src[x], ma343, ma444[0], b343, b444[0], x); + dst[x] = SelfGuidedSingleMultiplier(src[x], p, w0); + } while (++x != width); +} + +template +inline void BoxFilter(const Pixel* const src, const ptrdiff_t stride, + uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], + uint32_t* const square_sum5[5], const int width, + const uint16_t scales[2], const int16_t w0, + const int16_t w2, SgrBuffer* const sgr_buffer, + uint16_t* const ma343[4], uint16_t* const ma444[3], + uint16_t* const ma565[2], uint32_t* const b343[4], + uint32_t* const b444[3], uint32_t* const b565[2], + Pixel* dst) { + BoxFilterPreProcess5(sum5, square_sum5, width, scales[0], + sgr_buffer, ma565[1], b565[1]); + BoxFilterPreProcess3(sum3, square_sum3, width, scales[1], true, + sgr_buffer, ma343[2], b343[2], ma444[1], + b444[1]); + BoxFilterPreProcess3(sum3 + 1, square_sum3 + 1, width, scales[1], + true, sgr_buffer, ma343[3], b343[3], ma444[2], + b444[2]); + int x = 0; + do { + int p[2][2]; + BoxFilterPass1Kernel(src[x], src[stride + x], ma565, b565, x, p[0]); + p[1][0] = + BoxFilterPass2Kernel(src[x], ma343, ma444[0], b343, b444[0], x); + p[1][1] = 
BoxFilterPass2Kernel(src[stride + x], ma343 + 1, ma444[1], + b343 + 1, b444[1], x); + dst[x] = SelfGuidedDoubleMultiplier(src[x], p[0][0], + p[1][0], w0, w2); + dst[stride + x] = SelfGuidedDoubleMultiplier( + src[stride + x], p[0][1], p[1][1], w0, w2); + } while (++x != width); +} + +template +inline void BoxFilterProcess(const RestorationUnitInfo& restoration_info, + const Pixel* src, const ptrdiff_t stride, + const Pixel* const top_border, + const ptrdiff_t top_border_stride, + const Pixel* bottom_border, + const ptrdiff_t bottom_border_stride, + const int width, const int height, + SgrBuffer* const sgr_buffer, Pixel* dst) { + const auto temp_stride = Align(width, 8); + const ptrdiff_t sum_stride = temp_stride + 8; + const int sgr_proj_index = restoration_info.sgr_proj_info.index; + const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12. + const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0]; + const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1]; + const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1; + uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2]; + uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2]; + sum3[0] = sgr_buffer->sum3; + square_sum3[0] = sgr_buffer->square_sum3; + ma343[0] = sgr_buffer->ma343; + b343[0] = sgr_buffer->b343; + for (int i = 1; i <= 3; ++i) { + sum3[i] = sum3[i - 1] + sum_stride; + square_sum3[i] = square_sum3[i - 1] + sum_stride; + ma343[i] = ma343[i - 1] + temp_stride; + b343[i] = b343[i - 1] + temp_stride; + } + sum5[0] = sgr_buffer->sum5; + square_sum5[0] = sgr_buffer->square_sum5; + for (int i = 1; i <= 4; ++i) { + sum5[i] = sum5[i - 1] + sum_stride; + square_sum5[i] = square_sum5[i - 1] + sum_stride; + } + ma444[0] = sgr_buffer->ma444; + b444[0] = sgr_buffer->b444; + for (int i = 1; i <= 2; ++i) { + ma444[i] = ma444[i - 1] + temp_stride; + b444[i] = b444[i - 1] + temp_stride; + } + ma565[0] = sgr_buffer->ma565; + ma565[1] = ma565[0] + temp_stride; + b565[0] = sgr_buffer->b565; + b565[1] = b565[0] + temp_stride; + assert(scales[0] != 0); + assert(scales[1] != 0); + BoxSum(top_border, top_border_stride, 2, width + 2, sum3, sum5 + 1, + square_sum3, square_sum5 + 1); + sum5[0] = sum5[1]; + square_sum5[0] = square_sum5[1]; + BoxSum(src, stride, 1, width + 2, sum3 + 2, sum5 + 3, square_sum3 + 2, + square_sum5 + 3); + const Pixel* const s = (height > 1) ? 
src + stride : bottom_border; + BoxSum(s, 0, 1, width + 2, sum3 + 3, sum5 + 4, square_sum3 + 3, + square_sum5 + 4); + BoxFilterPreProcess5(sum5, square_sum5, width, scales[0], + sgr_buffer, ma565[0], b565[0]); + BoxFilterPreProcess3(sum3, square_sum3, width, scales[1], false, + sgr_buffer, ma343[0], b343[0], nullptr, + nullptr); + BoxFilterPreProcess3(sum3 + 1, square_sum3 + 1, width, scales[1], + true, sgr_buffer, ma343[1], b343[1], ma444[0], + b444[0]); + sum5[0] = sgr_buffer->sum5; + square_sum5[0] = sgr_buffer->square_sum5; + + for (int y = (height >> 1) - 1; y > 0; --y) { + Circulate4PointersBy2(sum3); + Circulate4PointersBy2(square_sum3); + Circulate5PointersBy2(sum5); + Circulate5PointersBy2(square_sum5); + BoxSum(src + 2 * stride, stride, 2, width + 2, sum3 + 2, sum5 + 3, + square_sum3 + 2, square_sum5 + 3); + BoxFilter(src + 3, stride, sum3, sum5, square_sum3, + square_sum5, width, scales, w0, w2, sgr_buffer, + ma343, ma444, ma565, b343, b444, b565, dst); + src += 2 * stride; + dst += 2 * stride; + Circulate4PointersBy2(ma343); + Circulate4PointersBy2(b343); + std::swap(ma444[0], ma444[2]); + std::swap(b444[0], b444[2]); + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + } + + Circulate4PointersBy2(sum3); + Circulate4PointersBy2(square_sum3); + Circulate5PointersBy2(sum5); + Circulate5PointersBy2(square_sum5); + if ((height & 1) == 0 || height > 1) { + const Pixel* sr; + ptrdiff_t s_stride; + if ((height & 1) == 0) { + sr = bottom_border; + s_stride = bottom_border_stride; + } else { + sr = src + 2 * stride; + s_stride = bottom_border - (src + 2 * stride); + } + BoxSum(sr, s_stride, 2, width + 2, sum3 + 2, sum5 + 3, + square_sum3 + 2, square_sum5 + 3); + BoxFilter(src + 3, stride, sum3, sum5, square_sum3, + square_sum5, width, scales, w0, w2, sgr_buffer, + ma343, ma444, ma565, b343, b444, b565, dst); + } + if ((height & 1) != 0) { + src += 3; + if (height > 1) { + src += 2 * stride; + dst += 2 * stride; + Circulate4PointersBy2(sum3); + Circulate4PointersBy2(square_sum3); + Circulate5PointersBy2(sum5); + Circulate5PointersBy2(square_sum5); + Circulate4PointersBy2(ma343); + Circulate4PointersBy2(b343); + std::swap(ma444[0], ma444[2]); + std::swap(b444[0], b444[2]); + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + } + BoxSum(bottom_border + bottom_border_stride, bottom_border_stride, 1, + width + 2, sum3 + 2, sum5 + 3, square_sum3 + 2, + square_sum5 + 3); + sum5[4] = sum5[3]; + square_sum5[4] = square_sum5[3]; + BoxFilterPreProcess5(sum5, square_sum5, width, scales[0], + sgr_buffer, ma565[1], b565[1]); + BoxFilterPreProcess3(sum3, square_sum3, width, scales[1], false, + sgr_buffer, ma343[2], b343[2], nullptr, + nullptr); + int x = 0; + do { + const int p0 = CalculateFilteredOutput( + src[x], ma565[0][x] + ma565[1][x], b565[0][x] + b565[1][x], 5); + const int p1 = BoxFilterPass2Kernel(src[x], ma343, ma444[0], b343, + b444[0], x); + dst[x] = + SelfGuidedDoubleMultiplier(src[x], p0, p1, w0, w2); + } while (++x != width); + } +} + +template +inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, + const Pixel* src, const ptrdiff_t stride, + const Pixel* const top_border, + const ptrdiff_t top_border_stride, + const Pixel* bottom_border, + const ptrdiff_t bottom_border_stride, + const int width, const int height, + SgrBuffer* const sgr_buffer, Pixel* dst) { + const auto temp_stride = Align(width, 8); + const ptrdiff_t sum_stride = temp_stride + 8; + const int sgr_proj_index = restoration_info.sgr_proj_info.index; + const uint32_t 
scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12. + const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0]; + uint16_t *sum5[5], *ma565[2]; + uint32_t *square_sum5[5], *b565[2]; + sum5[0] = sgr_buffer->sum5; + square_sum5[0] = sgr_buffer->square_sum5; + for (int i = 1; i <= 4; ++i) { + sum5[i] = sum5[i - 1] + sum_stride; + square_sum5[i] = square_sum5[i - 1] + sum_stride; + } + ma565[0] = sgr_buffer->ma565; + ma565[1] = ma565[0] + temp_stride; + b565[0] = sgr_buffer->b565; + b565[1] = b565[0] + temp_stride; + assert(scale != 0); + BoxSum(top_border, top_border_stride, 2, width + 2, sum5 + 1, + square_sum5 + 1); + sum5[0] = sum5[1]; + square_sum5[0] = square_sum5[1]; + BoxSum(src, stride, 1, width + 2, sum5 + 3, square_sum5 + 3); + const Pixel* const s = (height > 1) ? src + stride : bottom_border; + BoxSum(s, 0, 1, width + 2, sum5 + 4, square_sum5 + 4); + BoxFilterPreProcess5(sum5, square_sum5, width, scale, sgr_buffer, + ma565[0], b565[0]); + sum5[0] = sgr_buffer->sum5; + square_sum5[0] = sgr_buffer->square_sum5; + + for (int y = (height >> 1) - 1; y > 0; --y) { + Circulate5PointersBy2(sum5); + Circulate5PointersBy2(square_sum5); + BoxSum(src + 2 * stride, stride, 2, width + 2, sum5 + 3, + square_sum5 + 3); + BoxFilterPass1(src + 3, stride, sum5, square_sum5, width, + scale, w0, sgr_buffer, ma565, b565, dst); + src += 2 * stride; + dst += 2 * stride; + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + } + + Circulate5PointersBy2(sum5); + Circulate5PointersBy2(square_sum5); + if ((height & 1) == 0 || height > 1) { + const Pixel* sr; + ptrdiff_t s_stride; + if ((height & 1) == 0) { + sr = bottom_border; + s_stride = bottom_border_stride; + } else { + sr = src + 2 * stride; + s_stride = bottom_border - (src + 2 * stride); + } + BoxSum(sr, s_stride, 2, width + 2, sum5 + 3, square_sum5 + 3); + BoxFilterPass1(src + 3, stride, sum5, square_sum5, width, + scale, w0, sgr_buffer, ma565, b565, dst); + } + if ((height & 1) != 0) { + src += 3; + if (height > 1) { + src += 2 * stride; + dst += 2 * stride; + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + Circulate5PointersBy2(sum5); + Circulate5PointersBy2(square_sum5); + } + BoxSum(bottom_border + bottom_border_stride, bottom_border_stride, + 1, width + 2, sum5 + 3, square_sum5 + 3); + sum5[4] = sum5[3]; + square_sum5[4] = square_sum5[3]; + BoxFilterPreProcess5(sum5, square_sum5, width, scale, sgr_buffer, + ma565[1], b565[1]); + int x = 0; + do { + const int p = CalculateFilteredOutput( + src[x], ma565[0][x] + ma565[1][x], b565[0][x] + b565[1][x], 5); + dst[x] = SelfGuidedSingleMultiplier(src[x], p, w0); + } while (++x != width); + } +} + +template +inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, + const Pixel* src, const ptrdiff_t stride, + const Pixel* const top_border, + const ptrdiff_t top_border_stride, + const Pixel* bottom_border, + const ptrdiff_t bottom_border_stride, + const int width, const int height, + SgrBuffer* const sgr_buffer, Pixel* dst) { + assert(restoration_info.sgr_proj_info.multiplier[0] == 0); + const auto temp_stride = Align(width, 8); + const ptrdiff_t sum_stride = temp_stride + 8; + const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1]; + const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1; + const int sgr_proj_index = restoration_info.sgr_proj_info.index; + const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1]; // < 2^12. 
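+  // Pass 2 needs only the radius-1 (3x3) window. Rather than recomputing
+  // full window sums for every output row, the buffers set up below act as
+  // small rings: three rows of raw box sums, three 3-4-3 weighted rows and
+  // two 4-4-4 weighted rows, rotated by one row per iteration.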
+ uint16_t *sum3[3], *ma343[3], *ma444[2]; + uint32_t *square_sum3[3], *b343[3], *b444[2]; + sum3[0] = sgr_buffer->sum3; + square_sum3[0] = sgr_buffer->square_sum3; + ma343[0] = sgr_buffer->ma343; + b343[0] = sgr_buffer->b343; + for (int i = 1; i <= 2; ++i) { + sum3[i] = sum3[i - 1] + sum_stride; + square_sum3[i] = square_sum3[i - 1] + sum_stride; + ma343[i] = ma343[i - 1] + temp_stride; + b343[i] = b343[i - 1] + temp_stride; + } + ma444[0] = sgr_buffer->ma444; + ma444[1] = ma444[0] + temp_stride; + b444[0] = sgr_buffer->b444; + b444[1] = b444[0] + temp_stride; + assert(scale != 0); + BoxSum(top_border, top_border_stride, 2, width + 2, sum3, + square_sum3); + BoxSum(src, stride, 1, width + 2, sum3 + 2, square_sum3 + 2); + BoxFilterPreProcess3(sum3, square_sum3, width, scale, false, + sgr_buffer, ma343[0], b343[0], nullptr, + nullptr); + Circulate3PointersBy1(sum3); + Circulate3PointersBy1(square_sum3); + const Pixel* s; + if (height > 1) { + s = src + stride; + } else { + s = bottom_border; + bottom_border += bottom_border_stride; + } + BoxSum(s, 0, 1, width + 2, sum3 + 2, square_sum3 + 2); + BoxFilterPreProcess3(sum3, square_sum3, width, scale, true, + sgr_buffer, ma343[1], b343[1], ma444[0], + b444[0]); + + for (int y = height - 2; y > 0; --y) { + Circulate3PointersBy1(sum3); + Circulate3PointersBy1(square_sum3); + BoxFilterPass2(src + 2, src + 2 * stride, width, scale, w0, + sum3, square_sum3, sgr_buffer, ma343, ma444, + b343, b444, dst); + src += stride; + dst += stride; + Circulate3PointersBy1(ma343); + Circulate3PointersBy1(b343); + std::swap(ma444[0], ma444[1]); + std::swap(b444[0], b444[1]); + } + + src += 2; + int y = std::min(height, 2); + do { + Circulate3PointersBy1(sum3); + Circulate3PointersBy1(square_sum3); + BoxFilterPass2(src, bottom_border, width, scale, w0, sum3, + square_sum3, sgr_buffer, ma343, ma444, b343, + b444, dst); + src += stride; + dst += stride; + bottom_border += bottom_border_stride; + Circulate3PointersBy1(ma343); + Circulate3PointersBy1(b343); + std::swap(ma444[0], ma444[1]); + std::swap(b444[0], b444[1]); + } while (--y != 0); +} + +template +void SelfGuidedFilter_C( + const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info, + const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_border, + const ptrdiff_t top_border_stride, + const void* LIBGAV1_RESTRICT const bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, + RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer, + void* LIBGAV1_RESTRICT const dest) { + const int index = restoration_info.sgr_proj_info.index; + const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0 + const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0 + const auto* src = static_cast(source); + const auto* top = static_cast(top_border); + const auto* bottom = static_cast(bottom_border); + auto* dst = static_cast(dest); + SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer; + if (radius_pass_1 == 0) { + // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the + // following assertion. 
+ assert(radius_pass_0 != 0); + BoxFilterProcessPass1( + restoration_info, src - 3, stride, top - 3, top_border_stride, + bottom - 3, bottom_border_stride, width, height, sgr_buffer, dst); + } else if (radius_pass_0 == 0) { + BoxFilterProcessPass2( + restoration_info, src - 2, stride, top - 2, top_border_stride, + bottom - 2, bottom_border_stride, width, height, sgr_buffer, dst); + } else { + BoxFilterProcess( + restoration_info, src - 3, stride, top - 3, top_border_stride, + bottom - 3, bottom_border_stride, width, height, sgr_buffer, dst); + } +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(8); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->loop_restorations[0] = WienerFilter_C<8, uint8_t>; + dsp->loop_restorations[1] = SelfGuidedFilter_C<8, uint8_t>; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast(dsp); +#ifndef LIBGAV1_Dsp8bpp_WienerFilter + dsp->loop_restorations[0] = WienerFilter_C<8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_SelfGuidedFilter + dsp->loop_restorations[1] = SelfGuidedFilter_C<8, uint8_t>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(10); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->loop_restorations[0] = WienerFilter_C<10, uint16_t>; + dsp->loop_restorations[1] = SelfGuidedFilter_C<10, uint16_t>; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast(dsp); +#ifndef LIBGAV1_Dsp10bpp_WienerFilter + dsp->loop_restorations[0] = WienerFilter_C<10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_SelfGuidedFilter + dsp->loop_restorations[1] = SelfGuidedFilter_C<10, uint16_t>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} + +#endif // LIBGAV1_MAX_BITDEPTH >= 10 +} // namespace + +void LoopRestorationInit_C() { + Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/loop_restoration.h b/src/dsp/loop_restoration.h new file mode 100644 index 0000000..de80926 --- /dev/null +++ b/src/dsp/loop_restoration.h @@ -0,0 +1,85 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_LOOP_RESTORATION_H_ +#define LIBGAV1_SRC_DSP_LOOP_RESTORATION_H_ + +// Pull in LIBGAV1_DspXXX defines representing the implementation status +// of each function. The resulting value of each can be used by each module to +// determine whether an implementation is needed at compile time. +// IWYU pragma: begin_exports + +// ARM: +#include "src/dsp/arm/loop_restoration_neon.h" + +// x86: +// Note includes should be sorted in logical order avx2/avx/sse4, etc. +// The order of includes is important as each tests for a superior version +// before setting the base. 
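+// For example, when loop_restoration_avx2.h claims a function by defining
+// a LIBGAV1_Dsp8bpp_WienerFilter-style macro, loop_restoration_sse4.h sees
+// that define and leaves the entry to the AVX2 version.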
+// clang-format off +#include "src/dsp/x86/loop_restoration_avx2.h" +#include "src/dsp/x86/loop_restoration_sse4.h" +// clang-format on + +// IWYU pragma: end_exports + +namespace libgav1 { +namespace dsp { + +enum { + // Precision of a division table (mtable) + kSgrProjScaleBits = 20, + kSgrProjReciprocalBits = 12, + // Core self-guided restoration precision bits. + kSgrProjSgrBits = 8, + // Precision bits of generated values higher than source before projection. + kSgrProjRestoreBits = 4 +}; // anonymous enum + +extern const uint8_t kSgrMaLookup[256]; + +// Initializes Dsp::loop_restorations. This function is not thread-safe. +void LoopRestorationInit_C(); + +template +void Circulate3PointersBy1(T* p[3]) { + T* const p0 = p[0]; + p[0] = p[1]; + p[1] = p[2]; + p[2] = p0; +} + +template +void Circulate4PointersBy2(T* p[4]) { + std::swap(p[0], p[2]); + std::swap(p[1], p[3]); +} + +template +void Circulate5PointersBy2(T* p[5]) { + T* const p0 = p[0]; + T* const p1 = p[1]; + p[0] = p[2]; + p[1] = p[3]; + p[2] = p[4]; + p[3] = p0; + p[4] = p1; +} + +} // namespace dsp +} // namespace libgav1 + +#endif // LIBGAV1_SRC_DSP_LOOP_RESTORATION_H_ diff --git a/src/dsp/loop_restoration_test.cc b/src/dsp/loop_restoration_test.cc new file mode 100644 index 0000000..4c54bc6 --- /dev/null +++ b/src/dsp/loop_restoration_test.cc @@ -0,0 +1,638 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/loop_restoration.h" + +#include +#include +#include +#include + +#include "absl/strings/match.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/common.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "tests/block_utils.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +// in unit of Pixel. 
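+// The test frame is kWidth x kHeight with a kBorder-pixel apron on every
+// side, so kStride covers one padded row, kOffset addresses the first
+// interior pixel, and kMaxBlockSize reserves enough padded rows for the
+// tallest region the tests write.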
+constexpr int kBorder = 16; +constexpr int kWidth = 256; +constexpr int kHeight = 255; +constexpr int kStride = kWidth + 2 * kBorder; +constexpr int kOffset = kBorder * kStride + kBorder; +constexpr int kMaxBlockSize = 288 * kStride; +constexpr int kUnitWidths[] = {32, 64, 128, 256}; + +constexpr int kNumRadiusTypes = 3; +constexpr int kNumWienerOrders = 4; +constexpr int kWienerOrders[] = {7, 5, 3, 1}; +constexpr int kWienerOrderIdLookup[] = {0, 3, 0, 2, 0, 1, 0, 0}; + +template +class SelfGuidedFilterTest : public testing::TestWithParam, + public test_utils::MaxAlignedAllocable { + public: + SelfGuidedFilterTest() = default; + SelfGuidedFilterTest(const SelfGuidedFilterTest&) = delete; + SelfGuidedFilterTest& operator=(const SelfGuidedFilterTest&) = delete; + ~SelfGuidedFilterTest() override = default; + + void SetUp() override { + test_utils::ResetDspTable(bitdepth); + LoopRestorationInit_C(); + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const char* const test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + } else if (absl::StartsWith(test_case, "AVX2/")) { + if ((GetCpuInfo() & kAVX2) != 0) { + LoopRestorationInit_AVX2(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + LoopRestorationInit10bpp_AVX2(); +#endif + } + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + LoopRestorationInit_SSE4_1(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + LoopRestorationInit10bpp_SSE4_1(); +#endif + } + } else if (absl::StartsWith(test_case, "NEON/")) { + LoopRestorationInit_NEON(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + LoopRestorationInit10bpp_NEON(); +#endif + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + const Dsp* const dsp = GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + target_self_guided_filter_func_ = dsp->loop_restorations[1]; + restoration_info_.type = kLoopRestorationTypeSgrProj; + memset(dst_, 0, sizeof(dst_)); + } + + void SetInputData(int type, Pixel value, int radius_index, + libvpx_test::ACMRandom* rnd); + void TestFixedValues(int test_index, Pixel value); + void TestRandomValues(bool speed); + + protected: + const int unit_width_ = GetParam(); + const int unit_height_ = kRestorationUnitHeight; + + private: + alignas(kMaxAlignment) Pixel src_[kMaxBlockSize]; + alignas(kMaxAlignment) Pixel dst_[kMaxBlockSize]; + RestorationUnitInfo restoration_info_; + RestorationBuffer restoration_buffer_; + LoopRestorationFunc target_self_guided_filter_func_; +}; + +template +void SelfGuidedFilterTest::SetInputData( + int type, Pixel value, int radius_index, + libvpx_test::ACMRandom* const rnd) { + const int mask = (1 << bitdepth) - 1; + if (type == 0) { // Set fixed values + for (auto& s : src_) s = value; + } else { // Set random values + for (auto& s : src_) s = rnd->Rand16() & mask; + } + for (auto& d : dst_) d = rnd->Rand16() & mask; + restoration_info_.sgr_proj_info.multiplier[0] = + kSgrProjMultiplierMin[0] + + rnd->PseudoUniform(kSgrProjMultiplierMax[0] - kSgrProjMultiplierMin[0] + + 1); + restoration_info_.sgr_proj_info.multiplier[1] = + kSgrProjMultiplierMin[1] + + rnd->PseudoUniform(kSgrProjMultiplierMax[1] - kSgrProjMultiplierMin[1] + + 1); + // regulate multiplier so that it matches libaom. + // Valid self-guided filter doesn't allow r0 and r1 to be 0 at the same time. + // When r0 or r1 is zero, its corresponding multiplier is set to zero in + // libaom. 
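+  // (The filter derives the implicit weight from the explicit ones, e.g.
+  // w2 = (1 << kSgrProjPrecisionBits) - w0 - w1 in the two-pass case, so
+  // forcing the disabled pass's multiplier keeps the projection normalized
+  // the way libaom expects.)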
+ int index; + if (radius_index == 0) { + index = 0; // r0 = 2, r1 = 1 + } else if (radius_index == 1) { + index = 10; // r0 = 0, r1 = 1 + } else /* if (radius_index == 2) */ { + index = 14; // r0 = 2, r1 = 0 + } + const uint8_t r0 = kSgrProjParams[index][0]; + const uint8_t r1 = kSgrProjParams[index][2]; + static constexpr int kMultiplier[2] = {0, 95}; + restoration_info_.sgr_proj_info.index = index; + if (r0 == 0) { + restoration_info_.sgr_proj_info.multiplier[0] = kMultiplier[0]; + } else if (r1 == 0) { + restoration_info_.sgr_proj_info.multiplier[1] = kMultiplier[1]; + } +} + +template +void SelfGuidedFilterTest::TestFixedValues(int test_index, + Pixel value) { + static const char* const kDigest[][2][kNumRadiusTypes] = { + {{"7b78783ff4f03625a50c2ebfd574adca", "4faa0810639016f11a9f761ce28c38b0", + "a03314fc210bee68c7adbb44d2bbdac7"}, + {"fce031d1339cfef5016e76a643538a71", "d439e1060de3f07b5b29c9b0b7c08e54", + "a6583fe9359877f4a259c81d900fc4fb"}}, + {{"948ea16a90c4cefef87ce5b0ee105fc6", "76740629877b721432b84dbbdb4e352a", + "27100f37b3e42a5f2a051e1566edb6f8"}, + {"dd320de3bc82f4ba69738b2190ea9f85", "bf82f271e30a1aca91e53b086e133fb3", + "69c274ac59c99999e1bfbf2fc4586ebd"}}, + {{"9fbf1b246011250f38532a543cc6dd74", "d5c1e0142390ebb51b075c49f8ee9ff4", + "92f31086ba2f9e1508983b22d93a4e5c"}, + {"2198321e6b95e7199738e60f5ddc6966", "34f74626027ffca010c824ddf0942b13", + "43dd7df2c2a601262c68cd8af1c61b82"}}, + {{"42364ff8dbdbd6706fa3b8855a4258be", "a7843fdfd4d3c0d80ba812b353b4d6b4", + "f8a6a025827f29f857bed3e28ba3ea33"}, + {"b83c1f8d7712e37f9b21b033822e37ed", "589daf2e3e6f8715873920515cfc1b42", + "20dcbe8e317a4373bebf11d56adc5f02"}}}; + if (target_self_guided_filter_func_ == nullptr) return; + ASSERT_LT(value, 1 << bitdepth); + constexpr int bd_index = (bitdepth == 8) ? 
0 : 1; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + const Pixel* const src = src_ + kOffset; + Pixel* const dst = dst_ + kOffset; + for (int radius_index = 0; radius_index < kNumRadiusTypes; ++radius_index) { + SetInputData(0, value, radius_index, &rnd); + const absl::Time start = absl::Now(); + for (int y = 0; y < kHeight; y += unit_height_) { + const int height = std::min(unit_height_, kHeight - y); + for (int x = 0; x < kWidth; x += unit_width_) { + const int width = std::min(unit_width_, kWidth - x); + const Pixel* const source = src + y * kStride + x; + target_self_guided_filter_func_( + restoration_info_, source, kStride, + source - kRestorationVerticalBorder * kStride, kStride, + source + height * kStride, kStride, width, height, + &restoration_buffer_, dst + y * kStride + x); + } + } + const absl::Duration elapsed_time = absl::Now() - start; + test_utils::CheckMd5Digest( + "kLoopRestorationTypeSgrProj", std::to_string(GetParam()).c_str(), + kDigest[test_index][bd_index][radius_index], dst_ + kBorder * kStride, + kHeight * kStride * sizeof(*dst_), elapsed_time); + } +} + +template +void SelfGuidedFilterTest::TestRandomValues(bool speed) { + static const char* const kDigest[][2][kNumRadiusTypes] = { + {{"9f8358ed820943fa0abe3a8ebb5887db", "fb5d48870165522341843bcbfa8674fb", + "ca67159cd29475ac5d52ca4a0df3ea10"}, + {"a78641886ea0cf8757057d1d91e01434", "1b95172a5f2f9c514c78afa4cf8e5678", + "a8ba988283d9e1ad1f0dcdbf6bbdaade"}}, + {{"f219b445e5c80ffb5dd0359cc2cb4dd4", "699b2c9ddca1cbb0d4fc24cbcbe951e9", + "a4005899fa8d3c3c4669910f93ff1290"}, + {"10a75cab3c78b891c8c6d92d55f685d1", "d46f158f57c628136f6f298ee8ca6e0e", + "07203ad761775d5d317f2b7884afd9fe"}}, + {{"000d4e382be4003b514c9135893d0a37", "8fb082dca975be363bfc9c2d317ae084", + "475bcb6a58f87da7723f6227bc2aca0e"}, + {"4d589683f69ccc5b416149dcc5c835d5", "986b6832df1f6020d50be61ae121e42f", + "7cb5c5dbdb3d1c54cfa00def450842dc"}}, + {{"fd43bfe34d63614554dd29fb24b12173", "5c1ba74ba3062c769d5c3c86a85ac9b9", + "f1eda6d15b37172199d9949c2315832f"}, + {"a11be3117fb77e8fe113581b06f98bd1", "df94d12b774ad5cf744c871e707c36c8", + "b23dc0b54c3500248d53377030428a61"}}, + {{"f3079b3b21d8dc6fce7bb1fd104be359", "c6fcbc686cfb97ab3a64f445d73aad36", + "23966cba3e0e7803eeb951905861e0dd"}, + {"7210391a6fe26e5ca5ea205bc38aa035", "4c3e6eccad3ea152d320ecd1077169de", + "dcee48f94126a2132963e86e93dd4903"}}}; + if (target_self_guided_filter_func_ == nullptr) return; + constexpr int bd_index = (bitdepth == 8) ? 0 : 1; + const int num_inputs = speed ? 1 : 5; +#if LIBGAV1_ENABLE_NEON + const int num_tests = speed ? 4000 : 1; +#else + const int num_tests = speed ? 
10000 : 1; +#endif + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + const Pixel* const src = src_ + kOffset; + Pixel* const dst = dst_ + kOffset; + for (int i = 0; i < num_inputs; ++i) { + for (int radius_index = 0; radius_index < kNumRadiusTypes; ++radius_index) { + SetInputData(1, 0, radius_index, &rnd); + const absl::Time start = absl::Now(); + for (int k = 0; k < num_tests; ++k) { + for (int y = 0; y < kHeight; y += unit_height_) { + const int height = std::min(unit_height_, kHeight - y); + for (int x = 0; x < kWidth; x += unit_width_) { + const int width = std::min(unit_width_, kWidth - x); + const Pixel* const source = src + y * kStride + x; + target_self_guided_filter_func_( + restoration_info_, source, kStride, + source - kRestorationVerticalBorder * kStride, kStride, + source + height * kStride, kStride, width, height, + &restoration_buffer_, dst + y * kStride + x); + } + } + } + const absl::Duration elapsed_time = absl::Now() - start; + test_utils::CheckMd5Digest( + "kLoopRestorationTypeSgrProj", std::to_string(GetParam()).c_str(), + kDigest[i][bd_index][radius_index], dst_ + kBorder * kStride, + kHeight * kStride * sizeof(*dst_), elapsed_time); + } + } +} + +using SelfGuidedFilterTest8bpp = SelfGuidedFilterTest<8, uint8_t>; + +TEST_P(SelfGuidedFilterTest8bpp, Correctness) { + TestFixedValues(0, 0); + TestFixedValues(1, 1); + TestFixedValues(2, 128); + TestFixedValues(3, 255); + TestRandomValues(false); +} + +TEST_P(SelfGuidedFilterTest8bpp, DISABLED_Speed) { TestRandomValues(true); } + +INSTANTIATE_TEST_SUITE_P(C, SelfGuidedFilterTest8bpp, + testing::ValuesIn(kUnitWidths)); +#if LIBGAV1_ENABLE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, SelfGuidedFilterTest8bpp, + testing::ValuesIn(kUnitWidths)); +#endif +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, SelfGuidedFilterTest8bpp, + testing::ValuesIn(kUnitWidths)); +#endif +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, SelfGuidedFilterTest8bpp, + testing::ValuesIn(kUnitWidths)); +#endif + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using SelfGuidedFilterTest10bpp = SelfGuidedFilterTest<10, uint16_t>; + +TEST_P(SelfGuidedFilterTest10bpp, Correctness) { + TestFixedValues(0, 0); + TestFixedValues(1, 1); + TestFixedValues(2, 512); + TestFixedValues(3, 1023); + TestRandomValues(false); +} + +TEST_P(SelfGuidedFilterTest10bpp, DISABLED_Speed) { TestRandomValues(true); } + +INSTANTIATE_TEST_SUITE_P(C, SelfGuidedFilterTest10bpp, + testing::ValuesIn(kUnitWidths)); + +#if LIBGAV1_ENABLE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, SelfGuidedFilterTest10bpp, + testing::ValuesIn(kUnitWidths)); +#endif +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, SelfGuidedFilterTest10bpp, + testing::ValuesIn(kUnitWidths)); +#endif +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, SelfGuidedFilterTest10bpp, + testing::ValuesIn(kUnitWidths)); +#endif + +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +template +class WienerFilterTest : public testing::TestWithParam, + public test_utils::MaxAlignedAllocable { + public: + WienerFilterTest() = default; + WienerFilterTest(const WienerFilterTest&) = delete; + WienerFilterTest& operator=(const WienerFilterTest&) = delete; + ~WienerFilterTest() override = default; + + void SetUp() override { + test_utils::ResetDspTable(bitdepth); + LoopRestorationInit_C(); + const Dsp* const dsp = GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + base_wiener_filter_func_ = dsp->loop_restorations[0]; + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); 
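+    // |base_wiener_filter_func_| was captured above, immediately after the
+    // C-only init, so it stays the C implementation; the arch-specific init
+    // below may overwrite the dsp entry later read into
+    // |target_wiener_filter_func_|. TestCompare2C() relies on this ordering.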
+ const char* const test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + } else if (absl::StartsWith(test_case, "AVX2/")) { + if ((GetCpuInfo() & kAVX2) != 0) { + LoopRestorationInit_AVX2(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + LoopRestorationInit10bpp_AVX2(); +#endif + } + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + LoopRestorationInit_SSE4_1(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + LoopRestorationInit10bpp_SSE4_1(); +#endif + } + } else if (absl::StartsWith(test_case, "NEON/")) { + LoopRestorationInit_NEON(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + LoopRestorationInit10bpp_NEON(); +#endif + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + target_wiener_filter_func_ = dsp->loop_restorations[0]; + restoration_info_.type = kLoopRestorationTypeWiener; + memset(dst_, 0, sizeof(dst_)); + memset(tmp_, 0, sizeof(tmp_)); + memset(buffer_, 0, sizeof(buffer_)); + } + + static void CleanFilterByOrder(const int order, + int16_t filter[kWienerFilterTaps]) { + if (order <= 5) filter[0] = 0; + if (order <= 3) filter[1] = 0; + if (order <= 1) filter[2] = 0; + } + + void SetInputData(int type, Pixel value, int vertical_order, + int horizontal_order); + void TestFixedValues(int digest_id, Pixel value); + void TestRandomValues(bool speed); + void TestCompare2C(); + + protected: + const int unit_width_ = GetParam(); + const int unit_height_ = kRestorationUnitHeight; + + private: + alignas(kMaxAlignment) + uint16_t buffer_[(kRestorationUnitWidth + kWienerFilterTaps - 1) * + kRestorationUnitHeight]; + alignas(kMaxAlignment) Pixel src_[kMaxBlockSize]; + alignas(kMaxAlignment) Pixel dst_[kMaxBlockSize]; + alignas(kMaxAlignment) Pixel tmp_[kMaxBlockSize]; + RestorationUnitInfo restoration_info_; + RestorationBuffer restoration_buffer_; + LoopRestorationFunc base_wiener_filter_func_; + LoopRestorationFunc target_wiener_filter_func_; +}; + +template +void WienerFilterTest::SetInputData( + int type, Pixel value, const int vertical_order, + const int horizontal_order) { + const int mask = (1 << bitdepth) - 1; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + if (type == 0) { + for (auto& s : src_) s = value; + } else { + for (auto& s : src_) s = rnd.Rand16() & mask; + } + int order = vertical_order; + for (int i = WienerInfo::kVertical; i <= WienerInfo::kHorizontal; ++i) { + auto& filter = restoration_info_.wiener_info.filter[i]; + filter[3] = 128; + for (int j = 0; j < 3; ++j) { + filter[j] = kWienerTapsMin[j] + + rnd.PseudoUniform(kWienerTapsMax[j] - kWienerTapsMin[j] + 1); + } + CleanFilterByOrder(order, filter); + filter[3] -= 2 * (filter[0] + filter[1] + filter[2]); + restoration_info_.wiener_info.number_leading_zero_coefficients[i] = + (kWienerFilterTaps - order) / 2; + order = horizontal_order; + } +} + +template +void WienerFilterTest::TestFixedValues(int digest_id, + Pixel value) { + static const char* const kDigest[2][4] = { + {"74fc90760a14b13340cb718f200ba350", "5bacaca0128cd36f4805330b3787771d", + "1109e17545cc4fbd5810b8b77e19fc36", "e7f914ec9d065aba92338016e17a526c"}, + {"c8cc38790ceb0bea1eb989686755e1e5", "70f573b7e8875262c638a68d2f317916", + "193b19065899c835cb513149eb36d135", "f1dff65e3e53558b303ef0a2e3f3ba98"}}; + if (target_wiener_filter_func_ == nullptr) return; + ASSERT_LT(value, 1 << bitdepth); + constexpr int bd_index = (bitdepth == 8) ? 
0 : 1; + const Pixel* const src = src_ + kOffset; + Pixel* const dst = dst_ + kOffset; + for (const auto vertical_order : kWienerOrders) { + for (const auto horizontal_order : kWienerOrders) { + SetInputData(0, value, vertical_order, horizontal_order); + memset(dst_, 0, sizeof(dst_)); + const absl::Time start = absl::Now(); + for (int y = 0; y < kHeight; y += unit_height_) { + const int height = std::min(unit_height_, kHeight - y); + for (int x = 0; x < kWidth; x += unit_width_) { + const int width = std::min(unit_width_, kWidth - x); + const Pixel* const source = src + y * kStride + x; + target_wiener_filter_func_( + restoration_info_, source, kStride, + source - kRestorationVerticalBorder * kStride, kStride, + source + height * kStride, kStride, width, height, + &restoration_buffer_, dst + y * kStride + x); + } + } + const absl::Duration elapsed_time = absl::Now() - start; + test_utils::CheckMd5Digest( + "kLoopRestorationTypeWiener", std::to_string(GetParam()).c_str(), + kDigest[bd_index][digest_id], dst_, sizeof(dst_), elapsed_time); + } + } +} + +template +void WienerFilterTest::TestRandomValues(bool speed) { + static const char* const kDigest[2][kNumWienerOrders][kNumWienerOrders] = { + {{"40d0cf56d2ffb4f581e68b0fc97f547f", "5c04745209b684ba98004ebb0f64e70b", + "545ed7d3f7e7ca3b86b4ada31f7aaee7", "0d6b2967f1bd1d99b720e563fe0cf03f"}, + {"44b37076f0cf27f6eb506aca50c1d3e4", "e927d64dc9249e05a65e10ee75baa7d9", + "6136ecb4e29b17c9566504148943fd47", "c5ee2da81d44dc8cb2ac8021f724eb7a"}, + {"125cbb227313ec91a2683f26e6f049d1", "77671b6529c806d23b749f304b548f59", + "28d53a1b486881895b8f73fa64486df1", "f5e32165bafe575d7ee7a6fbae75f36d"}, + {"e832c41f2566ab542b32abba9d4f27bd", "ab1336ee6b85cba651f35ee5d3b3cc5c", + "52a673b6d14fbdca5ebdb1a34ee3326f", + "ebb42c7c9111f2e39f21e2158e801d9e"}}, + {{"8cd9c6bd9983bd49564a58ed4af9098a", "f71f333c9d71237ed4e46f0ef2283196", + "375b43abc1d6682d62f91c1841b8b0fc", "71e2444822ae9c697ddfc96e07c6e8a1"}, + {"d9ed3a66ceef405c08c87f6e91b71059", "c171fcff5fb7bb919f13ead7a4917a4c", + "8fbd1edb82fcd78d4d286886f65a700a", "fe14a143e6b261c5bb07b179d40be5a2"}, + {"1c995f4e7f117857de73211b81093bd0", "5ab1ee3bb14adcd66d66802d58bee068", + "d77430783e173ebd1b30e5d9336c8b69", "e159a3620747458dff7ed3d20da1a4b7"}, + {"5346fa07d195c257548a332753b057a3", "c77674bc0a638abc4d38d58e494fc7cf", + "7cbc1562a9dd08e1973b3b9ac1afc765", + "3c91bf1a34672cd40bf261c5820d3ec3"}}}; + if (target_wiener_filter_func_ == nullptr) return; + constexpr int bd_index = (bitdepth == 8) ? 0 : 1; +#if LIBGAV1_ENABLE_NEON + const int num_tests = speed ? 5000 : 1; +#else + const int num_tests = speed ? 
10000 : 1; +#endif + const Pixel* const src = src_ + kOffset; + Pixel* const dst = dst_ + kOffset; + for (const auto vertical_order : kWienerOrders) { + for (const auto horizontal_order : kWienerOrders) { + SetInputData(1, (1 << bitdepth) - 1, vertical_order, horizontal_order); + memset(dst_, 0, sizeof(dst_)); + const absl::Time start = absl::Now(); + for (int i = 0; i < num_tests; ++i) { + for (int y = 0; y < kHeight; y += unit_height_) { + const int height = std::min(unit_height_, kHeight - y); + for (int x = 0; x < kWidth; x += unit_width_) { + const int width = std::min(unit_width_, kWidth - x); + const Pixel* const source = src + y * kStride + x; + target_wiener_filter_func_( + restoration_info_, source, kStride, + source - kRestorationVerticalBorder * kStride, kStride, + source + height * kStride, kStride, width, height, + &restoration_buffer_, dst + y * kStride + x); + } + } + } + const absl::Duration elapsed_time = absl::Now() - start; + test_utils::CheckMd5Digest( + "kLoopRestorationTypeWiener", std::to_string(GetParam()).c_str(), + kDigest[bd_index][kWienerOrderIdLookup[vertical_order]] + [kWienerOrderIdLookup[horizontal_order]], + dst_, sizeof(dst_), elapsed_time); + } + } +} + +template +void WienerFilterTest::TestCompare2C() { + if (base_wiener_filter_func_ == nullptr) return; + if (target_wiener_filter_func_ == nullptr) return; + if (base_wiener_filter_func_ == target_wiener_filter_func_) return; + const Pixel* const src = src_ + kOffset; + Pixel* const dst = dst_ + kOffset; + Pixel* const tmp = tmp_ + kOffset; + for (const auto vertical_order : kWienerOrders) { + for (const auto horizontal_order : kWienerOrders) { + SetInputData(1, (1 << bitdepth) - 1, vertical_order, horizontal_order); + for (int x = 0; x < 2; ++x) { + // Prepare min/max filter coefficients. + int order = vertical_order; + for (int i = WienerInfo::kVertical; i <= WienerInfo::kHorizontal; ++i) { + auto& filter = restoration_info_.wiener_info.filter[i]; + for (int j = 0; j < 3; ++j) { + filter[j] = (x == 0) ? 
kWienerTapsMin[j] : kWienerTapsMax[j]; + } + CleanFilterByOrder(order, filter); + filter[3] = 128 - 2 * (filter[0] + filter[1] + filter[2]); + restoration_info_.wiener_info.number_leading_zero_coefficients[i] = + (kWienerFilterTaps - order) / 2; + order = horizontal_order; + } + base_wiener_filter_func_(restoration_info_, src, kStride, + src - kRestorationVerticalBorder * kStride, + kStride, src + unit_height_ * kStride, kStride, + unit_width_, unit_height_, + &restoration_buffer_, dst); + target_wiener_filter_func_(restoration_info_, src, kStride, + src - kRestorationVerticalBorder * kStride, + kStride, src + unit_height_ * kStride, + kStride, unit_width_, unit_height_, + &restoration_buffer_, tmp); + if (!test_utils::CompareBlocks(dst, tmp, unit_width_, unit_height_, + kStride, kStride, false, true)) { + ADD_FAILURE() << "Mismatch -- wiener taps min/max"; + } + } + } + } +} + +using WienerFilterTest8bpp = WienerFilterTest<8, uint8_t>; + +TEST_P(WienerFilterTest8bpp, Correctness) { + TestFixedValues(0, 0); + TestFixedValues(1, 1); + TestFixedValues(2, 128); + TestFixedValues(3, 255); + TestRandomValues(false); +} + +TEST_P(WienerFilterTest8bpp, DISABLED_Speed) { TestRandomValues(true); } + +TEST_P(WienerFilterTest8bpp, TestCompare2C) { TestCompare2C(); } + +INSTANTIATE_TEST_SUITE_P(C, WienerFilterTest8bpp, + testing::ValuesIn(kUnitWidths)); +#if LIBGAV1_ENABLE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, WienerFilterTest8bpp, + testing::ValuesIn(kUnitWidths)); +#endif +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, WienerFilterTest8bpp, + testing::ValuesIn(kUnitWidths)); +#endif +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, WienerFilterTest8bpp, + testing::ValuesIn(kUnitWidths)); +#endif + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using WienerFilterTest10bpp = WienerFilterTest<10, uint16_t>; + +TEST_P(WienerFilterTest10bpp, Correctness) { + TestFixedValues(0, 0); + TestFixedValues(1, 1); + TestFixedValues(2, 512); + TestFixedValues(3, 1023); + TestRandomValues(false); +} + +TEST_P(WienerFilterTest10bpp, DISABLED_Speed) { TestRandomValues(true); } + +TEST_P(WienerFilterTest10bpp, TestCompare2C) { TestCompare2C(); } + +INSTANTIATE_TEST_SUITE_P(C, WienerFilterTest10bpp, + testing::ValuesIn(kUnitWidths)); + +#if LIBGAV1_ENABLE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, WienerFilterTest10bpp, + testing::ValuesIn(kUnitWidths)); +#endif +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, WienerFilterTest10bpp, + testing::ValuesIn(kUnitWidths)); +#endif +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, WienerFilterTest10bpp, + testing::ValuesIn(kUnitWidths)); +#endif + +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/mask_blend.cc b/src/dsp/mask_blend.cc new file mode 100644 index 0000000..207fde0 --- /dev/null +++ b/src/dsp/mask_blend.cc @@ -0,0 +1,212 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
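+
+// Mask blending forms a 64-weighted average of two predictions with a mask
+// value m in [0, 64]: roughly out = (m * p0 + (64 - m) * p1 + 32) >> 6, so
+// e.g. m = 40, p0 = 100, p1 = 50 gives (4000 + 1200 + 32) >> 6 = 81. When
+// the plane is subsampled, GetMaskValue() first averages the 2 or 4
+// co-located mask samples.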
+ +#include "src/dsp/mask_blend.h" + +#include +#include +#include + +#include "src/dsp/dsp.h" +#include "src/utils/common.h" + +namespace libgav1 { +namespace dsp { +namespace { + +uint8_t GetMaskValue(const uint8_t* LIBGAV1_RESTRICT mask, + const uint8_t* LIBGAV1_RESTRICT mask_next_row, int x, + int subsampling_x, int subsampling_y) { + if ((subsampling_x | subsampling_y) == 0) { + return mask[x]; + } + if (subsampling_x == 1 && subsampling_y == 0) { + return static_cast(RightShiftWithRounding( + mask[MultiplyBy2(x)] + mask[MultiplyBy2(x) + 1], 1)); + } + assert(subsampling_x == 1 && subsampling_y == 1); + return static_cast(RightShiftWithRounding( + mask[MultiplyBy2(x)] + mask[MultiplyBy2(x) + 1] + + mask_next_row[MultiplyBy2(x)] + mask_next_row[MultiplyBy2(x) + 1], + 2)); +} + +template +void MaskBlend_C(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + const ptrdiff_t prediction_stride_1, + const uint8_t* LIBGAV1_RESTRICT mask, + const ptrdiff_t mask_stride, const int width, const int height, + void* LIBGAV1_RESTRICT dest, const ptrdiff_t dest_stride) { + static_assert(!(bitdepth == 8 && is_inter_intra), ""); + assert(mask != nullptr); + using PredType = + typename std::conditional::type; + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + auto* dst = static_cast(dest); + const ptrdiff_t dst_stride = dest_stride / sizeof(Pixel); + constexpr int step_y = subsampling_y ? 2 : 1; + const uint8_t* mask_next_row = mask + mask_stride; + // 7.11.3.2 Rounding variables derivation process + // 2 * FILTER_BITS(7) - (InterRound0(3|5) + InterRound1(7)) + constexpr int inter_post_round_bits = (bitdepth == 12) ? 2 : 4; + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + const uint8_t mask_value = + GetMaskValue(mask, mask_next_row, x, subsampling_x, subsampling_y); + if (is_inter_intra) { + dst[x] = static_cast(RightShiftWithRounding( + mask_value * pred_1[x] + (64 - mask_value) * pred_0[x], 6)); + } else { + assert(prediction_stride_1 == width); + int res = (mask_value * pred_0[x] + (64 - mask_value) * pred_1[x]) >> 6; + res -= (bitdepth == 8) ? 0 : kCompoundOffset; + dst[x] = static_cast( + Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0, + (1 << bitdepth) - 1)); + } + } + dst += dst_stride; + mask += mask_stride * step_y; + mask_next_row += mask_stride * step_y; + pred_0 += width; + pred_1 += prediction_stride_1; + } +} + +template +void InterIntraMaskBlend8bpp_C(const uint8_t* LIBGAV1_RESTRICT prediction_0, + uint8_t* LIBGAV1_RESTRICT prediction_1, + const ptrdiff_t prediction_stride_1, + const uint8_t* LIBGAV1_RESTRICT mask, + const ptrdiff_t mask_stride, const int width, + const int height) { + assert(mask != nullptr); + constexpr int step_y = subsampling_y ? 
2 : 1; + const uint8_t* mask_next_row = mask + mask_stride; + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + const uint8_t mask_value = + GetMaskValue(mask, mask_next_row, x, subsampling_x, subsampling_y); + prediction_1[x] = static_cast(RightShiftWithRounding( + mask_value * prediction_1[x] + (64 - mask_value) * prediction_0[x], + 6)); + } + mask += mask_stride * step_y; + mask_next_row += mask_stride * step_y; + prediction_0 += width; + prediction_1 += prediction_stride_1; + } +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(8); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->mask_blend[0][0] = MaskBlend_C<8, uint8_t, false, 0, 0>; + dsp->mask_blend[1][0] = MaskBlend_C<8, uint8_t, false, 1, 0>; + dsp->mask_blend[2][0] = MaskBlend_C<8, uint8_t, false, 1, 1>; + // The is_inter_intra index of mask_blend[][] is replaced by + // inter_intra_mask_blend_8bpp[] in 8-bit. + dsp->mask_blend[0][1] = nullptr; + dsp->mask_blend[1][1] = nullptr; + dsp->mask_blend[2][1] = nullptr; + dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_C<0, 0>; + dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_C<1, 0>; + dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_C<1, 1>; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast(dsp); +#ifndef LIBGAV1_Dsp8bpp_MaskBlend444 + dsp->mask_blend[0][0] = MaskBlend_C<8, uint8_t, false, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_MaskBlend422 + dsp->mask_blend[1][0] = MaskBlend_C<8, uint8_t, false, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_MaskBlend420 + dsp->mask_blend[2][0] = MaskBlend_C<8, uint8_t, false, 1, 1>; +#endif + // The is_inter_intra index of mask_blend[][] is replaced by + // inter_intra_mask_blend_8bpp[] in 8-bit. + dsp->mask_blend[0][1] = nullptr; + dsp->mask_blend[1][1] = nullptr; + dsp->mask_blend[2][1] = nullptr; +#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp444 + dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_C<0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp422 + dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_C<1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp420 + dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_C<1, 1>; +#endif + static_cast(GetMaskValue); +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(10); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->mask_blend[0][0] = MaskBlend_C<10, uint16_t, false, 0, 0>; + dsp->mask_blend[1][0] = MaskBlend_C<10, uint16_t, false, 1, 0>; + dsp->mask_blend[2][0] = MaskBlend_C<10, uint16_t, false, 1, 1>; + dsp->mask_blend[0][1] = MaskBlend_C<10, uint16_t, true, 0, 0>; + dsp->mask_blend[1][1] = MaskBlend_C<10, uint16_t, true, 1, 0>; + dsp->mask_blend[2][1] = MaskBlend_C<10, uint16_t, true, 1, 1>; + // These are only used with 8-bit. 
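+  // (The 8-bit inter-intra blend has dedicated uint8_t entry points -- see
+  // Init8bpp() -- so the 10-bit table leaves these null.)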
+ dsp->inter_intra_mask_blend_8bpp[0] = nullptr; + dsp->inter_intra_mask_blend_8bpp[1] = nullptr; + dsp->inter_intra_mask_blend_8bpp[2] = nullptr; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast(dsp); +#ifndef LIBGAV1_Dsp10bpp_MaskBlend444 + dsp->mask_blend[0][0] = MaskBlend_C<10, uint16_t, false, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_MaskBlend422 + dsp->mask_blend[1][0] = MaskBlend_C<10, uint16_t, false, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_MaskBlend420 + dsp->mask_blend[2][0] = MaskBlend_C<10, uint16_t, false, 1, 1>; +#endif +#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra444 + dsp->mask_blend[0][1] = MaskBlend_C<10, uint16_t, true, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra422 + dsp->mask_blend[1][1] = MaskBlend_C<10, uint16_t, true, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra420 + dsp->mask_blend[2][1] = MaskBlend_C<10, uint16_t, true, 1, 1>; +#endif + // These are only used with 8-bit. + dsp->inter_intra_mask_blend_8bpp[0] = nullptr; + dsp->inter_intra_mask_blend_8bpp[1] = nullptr; + dsp->inter_intra_mask_blend_8bpp[2] = nullptr; +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif + +} // namespace + +void MaskBlendInit_C() { + Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/mask_blend.h b/src/dsp/mask_blend.h new file mode 100644 index 0000000..41f5e5b --- /dev/null +++ b/src/dsp/mask_blend.h @@ -0,0 +1,49 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_MASK_BLEND_H_ +#define LIBGAV1_SRC_DSP_MASK_BLEND_H_ + +// Pull in LIBGAV1_DspXXX defines representing the implementation status +// of each function. The resulting value of each can be used by each module to +// determine whether an implementation is needed at compile time. +// IWYU pragma: begin_exports + +// ARM: +#include "src/dsp/arm/mask_blend_neon.h" + +// x86: +// Note includes should be sorted in logical order avx2/avx/sse4, etc. +// The order of includes is important as each tests for a superior version +// before setting the base. +// clang-format off +// SSE4_1 +#include "src/dsp/x86/mask_blend_sse4.h" +// clang-format on + +// IWYU pragma: end_exports + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::mask_blend and Dsp::inter_intra_mask_blend_8bpp. This +// function is not thread-safe. +void MaskBlendInit_C(); + +} // namespace dsp +} // namespace libgav1 + +#endif // LIBGAV1_SRC_DSP_MASK_BLEND_H_ diff --git a/src/dsp/mask_blend_test.cc b/src/dsp/mask_blend_test.cc new file mode 100644 index 0000000..be80b11 --- /dev/null +++ b/src/dsp/mask_blend_test.cc @@ -0,0 +1,525 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/mask_blend.h" + +#include +#include +#include +#include +#include +#include + +#include "absl/strings/match.h" +#include "absl/strings/string_view.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr int kNumSpeedTests = 50000; +// mask_blend is applied to compound prediction values when is_inter_intra is +// false. This implies a range far exceeding that of pixel values. The ranges +// include kCompoundOffset in 10bpp and 12bpp. +// see: src/dsp/convolve.cc & src/dsp/warp.cc. +constexpr int kCompoundPredictionRange[3][2] = { + // 8bpp + {-5132, 9212}, + // 10bpp + {3988, 61532}, + // 12bpp + {3974, 61559}, +}; + +const char* GetDigest8bpp(int id) { + static const char* const kDigest[] = { + "4b70d5ef5ac7554b4b2660a4abe14a41", "64adb36f07e4a2c4ea4f05cfd715ff58", + "2cd162cebf99724a3fc22d501bd8c8e4", "c490478208374a43765900ef7115c264", + "b98f222eb70ef8589da2d6c839ca22b8", "54752ca05f67b5af571bc311aa4e3de3", + "5ae48814dd285bfca4f5ee8e339dca99", "383f3f4f47563f065d1b6068e5931a24", + "344b2dab7accd8bd0a255bee16207336", "0b2f6f755d1547eea7e0172f8133ea01", + "310dc6364fdacba186c01f0e8ac4fcb7", "c2ee4673078d34971319c77ca77b23d1", + "b0c9f08b73d9e5c16eaf5abdbca1fdc0", "eaad805999d949fa1e1bbbb63b4b7827", + "6eb2a80d212df89403efb50db7a81b08", "c30730aa799dba78a2ebd3f729af82c7", + "4346c2860b23f0072b6b288f14c1df36", "1cdace53543063e129a125c4084ca5d7", + "1ae5328e0c0f4f2bec640d1af03b2978", "3860e040fbee0c5f68f0b4af769209b3", + "e9480ded15d9c38ee19bf5fa816dd296", "4e17c222b64f428df29938a8120ca256", + "2a943bc6de9b29c8bcae189ad3bec276", "b5a6bc02c76fa61040678fb2c6c112d2", + "2c11bb9bd29c5577194edb77cfd1c614", "31ed1832810ae385f4ad8f57795dde1e", + "eb87d647839c33984dfb25bac0e7cdb3", "f652ec2b1478e35acb19cf28042ee849", + "0cfb18ac0cb94af1447bcac32ac20c36", "e152bbbf5ee4b40b7b41ec1f2e901aaa", + "f17f78fd485f7beafa8126c1cda801d7", "9f9fbee0cc9d99435efd3dff644be273", + "9b498843d66440c1e68dc7ab04f57d42", "2f2b0beceb31b79ccb9179991629e4b8", + "e06a6ebb6791529bb23fe5b0a9914220", "2b3d1ff19812a17c17b1be1f1727815e", + "d0bbdecec414950ed63a8a35c2bae397", "8e53906c6513058d7f17013fe0d32bf1", + "be0690efd31f0bf3c2adcd27ca011ed5", "c2b26243c5f147fdeadf52735aa68fb5", + "94bb83e774d9189c5ee04fb361855e19", "dad6441e723791a91f31a56b2136fd33", + "10ccac76a2debb842a0685a527b6a659", "346fb0a4914b64dda3ca0f521412b999", + "d7e400b855502bbb4f2b8294e207bb96", "3487503f2d73ec52f25b5e8d06c81da4", + "3f49c096acfcf46d44ce18b48debca7c", "8ed6a745a2b5457ac7f3ac145ce57e72", + "21f9dda5ef934a5ee6274b22cc22f93b", "507b60611afeb373384d9b7606f7ea46", + "ac766fadcdb85a47ad14a6846b9e5c36", "fde149bc2162e02bbc5fa85cc41641a5", + "f5f094b5742d0a920ba734b017452d24", "c90d06b0c76a0983bd1428df2a1b64b3", + "3649e6a6ed9f69e3f78e0b75160fb82a", 
"1d44b7649497e651216db50d325e3073", + "948fa112e90e3ca4d15f3d2f2acfab9a", "9bb54c0f7d07c0b44c44ba09379a04ff", + "228261ab6f098f489a8968cff1e1f7ae", "5e128db7462164f7327d1d8feeb2e4c7", + "9e8b97f6d9d482d5770b138bd1077747", "81563d505a4e8dd779a089abf2a28b77", + "b7157451de7cfa161dff1afd7f9b8622", "6a25cc0a4aaf8a315d1158dbb0ec2966", + "303867ee010ba51da485ee10149c6f9b", "63b64b7527d2476e9ae5139b8166e8c9", + "cfa93c2aeeb27a1190a445a6fee61e15", "804bcff8709665eed6830e24346101be", + "829947ed3e90776cda4ae82918461497", "1df10a1cb80c1a81f521e7e0f80b4f99", + "3c9593e42ac574f3555bb8511d438a54", "eecef71492c0626685815e646f728f79", + "0c43d59f456ddca2449e016ae4e34be7", "207d4ac2579f1271fc9eca8d743917b3", + "3c472bb0b1c891ffda19077ebb659e48", "a4ae7a0d25113bc0238fa27409f9c0dd", + "e8ad037ca81f46774bb01d20f46671ce", "b22741e4fe0e4062e40a2decec102ffd", + "c72f9e7bc0170163cb94da0faa0d3ffb", "accaf5d475d155cbd3a8c113f90718bc", + "2fd31e72444ea258380c16881580de81", "8a6a2a253f6f5b0ff75ba39488e6b082", + "c5e8159c0f3ebb7536e84ab3dadac1b3", "ef7ec20b46c7dcf16591835642bd68ef", + "0c3425399dc64870d726c2837666a55e", "0365029ffbfc4cedf3bf2d757ea5b9df", + "836aa403254af2e04d4b7a7c4db8bfc5", "7f2f3f9c91677b233795169f9a88b2b2", + "9fc8bbe787244dac638c367b9c611d13", "f66ef45fae8e163ab0f0f393531dad26", + "beb984e88b6f9b96ae6efe5da23ad16b", "1083b829ea766b1d4eb0bb96e9fb3bff", + "be8abad1da69e4d238a45fc02a0061cf", + }; + return kDigest[id]; +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +const char* GetDigest10bpp(int id) { + static const char* const kDigest[] = { + "1af3cbd1616941b59e6a3f6a417b6312", "1d8b3f4b9d5d2f4ff5be8e81b7243121", + "e767350f150a84ac5a06dc348e815d62", "53a3a76bf2bcd5761cd15fc739a4f4e1", + "7597f69dc19a584280be0d67911db6a6", "e1221c172843dc6c1b345bcd370771cc", + "1a640c71ff9bb45505d89761f19efa8f", "e192f64322e0edb250b52f63aaa4de97", + "2ccbe012ca167114b14c3ba70befa960", "0f68632d7e5faddb4554ca430d1df822", + "8caa0061a26e142b783951d5abd7bf5d", "b01eeed3ec549e4a593100d9c5ba587a", + "1cce6acdbd8ca8d2546ba937584730bf", "022913e87a3c1a86aaefe2c2d4f89882", + "48f8ab636ba15a06731d869b603cbe58", "ba1616c990d224c20de123c3ccf19952", + "346a797b7cb4de10759e329f8b49e077", "d4929154275255f2d786d6fc42c7c5d3", + "18a6af6f36ca1ea4ab6f5a76505de040", "0c43e68414bfc02f9b20e796506f643b", + "9f483f543f6b1d58e23abf9337ed6fe6", "e114860c2538b63f1be4a23560420cdc", + "da8680798f96572c46155c7838b452c3", "20b47a27617297231843c0f2ed7b559b", + "16fa4a4f33a32e28c79da83dca63fd41", "76e2c1d3c323777a3c478e11e1ba6bf2", + "dccdfd52a71855cc4da18af52bda4c03", "121befbd6c246e85a34225241b8bcaf1", + "5780757555fd87ca1ff3f1b498a1d6e9", "6b0be2256285694b1edc0201608e1326", + "b7ef338c58d17f69426b5a99170c7295", "b92b84b5b3d01afac02fb9c092e84b06", + "e6ef7fea8b183f871c4306c4f49370c5", "c1bf95c05774d8471504e57a3efa66e4", + "bbacdbdafc625a139361ec22fe2cf003", "5fbbb2d6ca8fc6d07ca8d4105fda4a01", + "c1cbb295d9f00aa865d91a95e96f99b2", "1490e4f2c874a76ecc2bbf35dce446c3", + "c3bd73daaeec39895a8b64812773c93c", "6d385068ef3afbd821183d36851f709b", + "a34c52ef7f2fd04d1cd420238641ef48", "45d10029358c6835cf968a30605659ea", + "a72c1bb18cf9312c5713ce0de370743d", "df7368db2a7515a1c06a4c9dd9e32ebf", + "52782632271caccfa9a35ed7533e2052", "6f0ef9b62d2b9956a6464694b7a86b79", + "814dbc176f7201725a1cfd1cf668b4b9", "065ffbee984f4b9343c8acb0eb04fcbe", + "0915d76ce458d5164e3c90c1ce150795", "bf2b431d9bfa7a9925ea6f6509267ae9", + "d3df8c0c940a01b7bf3c3afb80b6dcd4", "15ab86216c9856a8427a51fe599258a3", + "2cb078484472c88e26b7401c9f11cf51", "7c5f68cc098c8adabc9e26f9cd549151", + 
"a8e47da1fcc91c2bc74d030892621576", "71af422ba2d86a401f8278591c0ef540", + "964c902bb4698ce82f4aa0a1edc80cd6", "78271c37d62af86576dab72ed59746b3", + "7247c3a7534a41137027e7d3f255f5ef", "8e529ab964f5f9d0f7c3ced98239cfc8", + "2481ed50bff6b36a3cac6dca2aca5ae5", "78a1ff18bf217d45f5170675dee26948", + "00fc534119c13aa7af4b818cad9218a2", "67501a83c93f2f9debfa86955bdffde5", + "2a512ef738e33a4d8476f72654deffb4", "f4eef28078bbc12de9cfb5bc2fef6238", + "b7ac3a35205a978bed587356155bae0e", "51ea101f09c4de2f754b61ab5aff1526", + "2bd689d7ec964ee8c8f6f0682f93f5ca", "eecac8dbdaa73b8b3c2234892c444147", + "cb7086f44ef70ef919086a3d200d8c13", "0abe35e3c796c2de1e550426b2b19441", + "0eb140561e1ea3843464a5247d8ecb18", "d908f7317f00daacbe3dd43495db64ad", + "d4d677c4b347de0a13ccab7bc16b8e6e", "26523c2c2df7f31896a3ae5aa24d5ada", + "0ebb9f816684769816b2ae0b1f94e3a4", "fd938d0577e3687b0a810e199f69f0bb", + "eb8fb832e72030e2aa214936ae0effe4", "56631887763f7daf6e1e73783e5ff656", + "590a25cc722c2aa4d885eede5ef09f20", "80944a218ed9b9b0374cde72914449eb", + "d9cbc2f1e0e56cdd6722310932db1981", "a88eb213b7a6767bbe639cda120a4ab6", + "9972ecbadfdf3ed0b3fedf435c5a804f", "01fdf7e22405a1b17a8d275b7451094f", + "6a7824e10406fade0d032e886bbc76b6", "76fefadd793ec3928e915d92782bc7e1", + "0fbd6b076752c9f5c926ca5c1df892ac", "aac9457239f07ad633fcd45c1465af2a", + "56823ef9a8e21c9c7441cc9ed870d648", "52f4c7a0b7177175302652cbc482f442", + "f4a4f4d7c8b93c0486cf3cbaa26fbc19", + }; + return kDigest[id]; +} +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +struct MaskBlendTestParam { + MaskBlendTestParam(BlockSize block_size, int subsampling_x, int subsampling_y, + bool is_inter_intra, bool is_wedge_inter_intra) + : block_size(block_size), + width(kBlockWidthPixels[block_size]), + height(kBlockHeightPixels[block_size]), + subsampling_x(subsampling_x), + subsampling_y(subsampling_y), + is_inter_intra(is_inter_intra), + is_wedge_inter_intra(is_wedge_inter_intra) {} + BlockSize block_size; + int width; + int height; + int subsampling_x; + int subsampling_y; + bool is_inter_intra; + bool is_wedge_inter_intra; +}; + +std::ostream& operator<<(std::ostream& os, const MaskBlendTestParam& param) { + return os << ToString(param.block_size) + << ", subsampling(x/y): " << param.subsampling_x << "/" + << param.subsampling_y + << ", is_inter_intra: " << param.is_inter_intra + << ", is_wedge_inter_intra: " << param.is_wedge_inter_intra; +} + +template +class MaskBlendTest : public testing::TestWithParam, + public test_utils::MaxAlignedAllocable { + public: + MaskBlendTest() = default; + ~MaskBlendTest() override = default; + + void SetUp() override { + test_utils::ResetDspTable(bitdepth); + MaskBlendInit_C(); + const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const absl::string_view test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + } else if (absl::StartsWith(test_case, "NEON/")) { + MaskBlendInit_NEON(); + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + MaskBlendInit_SSE4_1(); + } + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + func_ = (param_.is_inter_intra && !param_.is_wedge_inter_intra) + ? dsp->mask_blend[0][param_.is_inter_intra] + : dsp->mask_blend[param_.subsampling_x + param_.subsampling_y] + [param_.is_inter_intra]; + func_8bpp_ = dsp->inter_intra_mask_blend_8bpp[param_.is_wedge_inter_intra + ? 
param_.subsampling_x + + param_.subsampling_y + : 0]; + } + + protected: + int GetDigestIdOffset() const { + // id is for retrieving the corresponding digest from the lookup table given + // the set of input parameters. id can be figured out by the block size and + // an offset (id_offset). + // For example, in kMaskBlendTestParam, this set of parameters + // (8, 8, 0, 0, false, false) corresponds to the first entry in the + // digest lookup table, where id == 0. + // (8, 8, 1, 0, false, false) corresponds to id == 17. + // (8, 8, 1, 1, false, false) corresponds to id == 34. + // (8, 8, 0, 0, true, false) corresponds to id == 51. + // id_offset denotes the offset for different modes (is_inter_intra, + // is_wedge_inter_intra). + // ... + if (!param_.is_inter_intra && !param_.is_wedge_inter_intra) { + return param_.subsampling_x * 17 + param_.subsampling_y * 17; + } + if (param_.is_inter_intra && !param_.is_wedge_inter_intra) { + return 51 + param_.subsampling_x * 7 + param_.subsampling_y * 7; + } + if (param_.is_inter_intra && param_.is_wedge_inter_intra) { + return 72 + param_.subsampling_x * 7 + param_.subsampling_y * 7; + } + return 0; + } + + int GetDigestId() const { + // Only 8x8 and larger blocks are tested. + int block_size_adjustment = + static_cast<int>(param_.block_size > kBlock16x4); + if (param_.is_inter_intra || param_.is_wedge_inter_intra) { + // 4:1/1:4 blocks are invalid for these modes. + block_size_adjustment += static_cast<int>(param_.block_size > kBlock8x32); + block_size_adjustment += + static_cast<int>(param_.block_size > kBlock16x64); + block_size_adjustment += static_cast<int>(param_.block_size > kBlock32x8); + block_size_adjustment += + static_cast<int>(param_.block_size > kBlock64x16); + } + return GetDigestIdOffset() + param_.block_size - kBlock8x8 - + block_size_adjustment; + } + + void Test(const char* digest, int num_runs); + + private: + using PredType = + typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type; + static constexpr int kStride = kMaxSuperBlockSizeInPixels; + static constexpr int kDestStride = kMaxSuperBlockSizeInPixels * sizeof(Pixel); + const MaskBlendTestParam param_ = GetParam(); + alignas(kMaxAlignment) PredType + source1_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels] = {}; + uint8_t source1_8bpp_[kMaxSuperBlockSizeInPixels * + kMaxSuperBlockSizeInPixels] = {}; + alignas(kMaxAlignment) PredType + source2_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels] = {}; + uint8_t source2_8bpp_[kMaxSuperBlockSizeInPixels * + kMaxSuperBlockSizeInPixels] = {}; + uint8_t source2_8bpp_cache_[kMaxSuperBlockSizeInPixels * + kMaxSuperBlockSizeInPixels] = {}; + uint8_t mask_[kMaxSuperBlockSizeInPixels * kMaxSuperBlockSizeInPixels]; + uint8_t dest_[sizeof(Pixel) * kMaxSuperBlockSizeInPixels * + kMaxSuperBlockSizeInPixels] = {}; + dsp::MaskBlendFunc func_; + dsp::InterIntraMaskBlendFunc8bpp func_8bpp_; +}; + +template <int bitdepth, typename Pixel> +void MaskBlendTest<bitdepth, Pixel>::Test(const char* const digest, + const int num_runs) { + if (func_ == nullptr && func_8bpp_ == nullptr) return; + const int width = param_.width >> param_.subsampling_x; + const int height = param_.height >> param_.subsampling_y; + + // Add id offset to seed just to add more randomness to input blocks. + // If we use the same seed for different block sizes, the generated input + // blocks are repeated. For example, if input size is 8x8, the generated + // block is exactly the upper left half of the generated 16x16 block.
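+ // As a concrete illustration of the seeding above (derived from + // GetDigestIdOffset(), not part of the upstream comment): for the compound + // path the offset is subsampling_x * 17 + subsampling_y * 17, so the 4:4:4 + // cases effectively use + //   ACMRandom(ACMRandom::DeterministicSeed() + 0) + // while the 4:2:0 cases use + //   ACMRandom(ACMRandom::DeterministicSeed() + 34).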
+ libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed() + + GetDigestIdOffset()); + PredType* src_1 = source1_; + uint8_t* src_1_8bpp = source1_8bpp_; + PredType* src_2 = source2_; + uint8_t* src_2_8bpp = source2_8bpp_; + const ptrdiff_t src_2_stride = param_.is_inter_intra ? kStride : width; + uint8_t* mask_row = mask_; + const int range_mask = (1 << (bitdepth)) - 1; + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + src_1[x] = static_cast<PredType>(rnd.Rand16() & range_mask); + src_2[x] = static_cast<PredType>(rnd.Rand16() & range_mask); + if (param_.is_inter_intra && bitdepth == 8) { + src_1_8bpp[x] = src_1[x]; + src_2_8bpp[x] = src_2[x]; + } + if (!param_.is_inter_intra) { + // Implies isCompound == true. + constexpr int bitdepth_index = (bitdepth - 8) >> 1; + const int min_val = kCompoundPredictionRange[bitdepth_index][0]; + const int max_val = kCompoundPredictionRange[bitdepth_index][1]; + src_1[x] = static_cast<PredType>(rnd(max_val - min_val) + min_val); + src_2[x] = static_cast<PredType>(rnd(max_val - min_val) + min_val); + } + } + src_1 += width; + src_1_8bpp += width; + src_2 += src_2_stride; + src_2_8bpp += src_2_stride; + } + // Mask should be setup regardless of subsampling. + for (int y = 0; y < param_.height; ++y) { + for (int x = 0; x < param_.width; ++x) { + mask_row[x] = rnd.Rand8() & 63; + mask_row[x] += rnd.Rand8() & 1; // Range of mask is [0, 64]. + } + mask_row += kStride; + } + + absl::Duration elapsed_time; + for (int i = 0; i < num_runs; ++i) { + const absl::Time start = absl::Now(); + if (param_.is_inter_intra && bitdepth == 8) { + ASSERT_EQ(func_, nullptr); + static_assert(sizeof(source2_8bpp_cache_) == sizeof(source2_8bpp_), ""); + // source2_8bpp_ is modified in the call. + memcpy(source2_8bpp_cache_, source2_8bpp_, sizeof(source2_8bpp_)); + func_8bpp_(source1_8bpp_, source2_8bpp_, src_2_stride, mask_, kStride, + width, height); + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + dest_[y * kDestStride + x] = source2_8bpp_[y * src_2_stride + x]; + } + } + memcpy(source2_8bpp_, source2_8bpp_cache_, sizeof(source2_8bpp_)); + } else { + if (bitdepth != 8) { + ASSERT_EQ(func_8bpp_, nullptr); + } + func_(source1_, source2_, src_2_stride, mask_, kStride, width, height, + dest_, kDestStride); + } + elapsed_time += absl::Now() - start; + } + + test_utils::CheckMd5Digest("MaskBlend", ToString(param_.block_size), digest, + dest_, sizeof(dest_), elapsed_time); +} + +const MaskBlendTestParam kMaskBlendTestParam[] = { + // is_inter_intra = false, is_wedge_inter_intra = false. + // block size range is from 8x8 to 128x128.
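+ // (Each subsampling group below repeats the same 17 block sizes, which is + // why GetDigestIdOffset() above advances the digest index in multiples of + // 17 for this compound path.)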
+ MaskBlendTestParam(kBlock8x8, 0, 0, false, false), + MaskBlendTestParam(kBlock8x16, 0, 0, false, false), + MaskBlendTestParam(kBlock8x32, 0, 0, false, false), + MaskBlendTestParam(kBlock16x8, 0, 0, false, false), + MaskBlendTestParam(kBlock16x16, 0, 0, false, false), + MaskBlendTestParam(kBlock16x32, 0, 0, false, false), + MaskBlendTestParam(kBlock16x64, 0, 0, false, false), + MaskBlendTestParam(kBlock32x8, 0, 0, false, false), + MaskBlendTestParam(kBlock32x16, 0, 0, false, false), + MaskBlendTestParam(kBlock32x32, 0, 0, false, false), + MaskBlendTestParam(kBlock32x64, 0, 0, false, false), + MaskBlendTestParam(kBlock64x16, 0, 0, false, false), + MaskBlendTestParam(kBlock64x32, 0, 0, false, false), + MaskBlendTestParam(kBlock64x64, 0, 0, false, false), + MaskBlendTestParam(kBlock64x128, 0, 0, false, false), + MaskBlendTestParam(kBlock128x64, 0, 0, false, false), + MaskBlendTestParam(kBlock128x128, 0, 0, false, false), + MaskBlendTestParam(kBlock8x8, 1, 0, false, false), + MaskBlendTestParam(kBlock8x16, 1, 0, false, false), + MaskBlendTestParam(kBlock8x32, 1, 0, false, false), + MaskBlendTestParam(kBlock16x8, 1, 0, false, false), + MaskBlendTestParam(kBlock16x16, 1, 0, false, false), + MaskBlendTestParam(kBlock16x32, 1, 0, false, false), + MaskBlendTestParam(kBlock16x64, 1, 0, false, false), + MaskBlendTestParam(kBlock32x8, 1, 0, false, false), + MaskBlendTestParam(kBlock32x16, 1, 0, false, false), + MaskBlendTestParam(kBlock32x32, 1, 0, false, false), + MaskBlendTestParam(kBlock32x64, 1, 0, false, false), + MaskBlendTestParam(kBlock64x16, 1, 0, false, false), + MaskBlendTestParam(kBlock64x32, 1, 0, false, false), + MaskBlendTestParam(kBlock64x64, 1, 0, false, false), + MaskBlendTestParam(kBlock64x128, 1, 0, false, false), + MaskBlendTestParam(kBlock128x64, 1, 0, false, false), + MaskBlendTestParam(kBlock128x128, 1, 0, false, false), + MaskBlendTestParam(kBlock8x8, 1, 1, false, false), + MaskBlendTestParam(kBlock8x16, 1, 1, false, false), + MaskBlendTestParam(kBlock8x32, 1, 1, false, false), + MaskBlendTestParam(kBlock16x8, 1, 1, false, false), + MaskBlendTestParam(kBlock16x16, 1, 1, false, false), + MaskBlendTestParam(kBlock16x32, 1, 1, false, false), + MaskBlendTestParam(kBlock16x64, 1, 1, false, false), + MaskBlendTestParam(kBlock32x8, 1, 1, false, false), + MaskBlendTestParam(kBlock32x16, 1, 1, false, false), + MaskBlendTestParam(kBlock32x32, 1, 1, false, false), + MaskBlendTestParam(kBlock32x64, 1, 1, false, false), + MaskBlendTestParam(kBlock64x16, 1, 1, false, false), + MaskBlendTestParam(kBlock64x32, 1, 1, false, false), + MaskBlendTestParam(kBlock64x64, 1, 1, false, false), + MaskBlendTestParam(kBlock64x128, 1, 1, false, false), + MaskBlendTestParam(kBlock128x64, 1, 1, false, false), + MaskBlendTestParam(kBlock128x128, 1, 1, false, false), + // is_inter_intra = true, is_wedge_inter_intra = false. + // block size range is from 8x8 to 32x32 (no 4:1/1:4 blocks, Section 5.11.28 + // Read inter intra syntax). 
+ MaskBlendTestParam(kBlock8x8, 0, 0, true, false), + MaskBlendTestParam(kBlock8x16, 0, 0, true, false), + MaskBlendTestParam(kBlock16x8, 0, 0, true, false), + MaskBlendTestParam(kBlock16x16, 0, 0, true, false), + MaskBlendTestParam(kBlock16x32, 0, 0, true, false), + MaskBlendTestParam(kBlock32x16, 0, 0, true, false), + MaskBlendTestParam(kBlock32x32, 0, 0, true, false), + MaskBlendTestParam(kBlock8x8, 1, 0, true, false), + MaskBlendTestParam(kBlock8x16, 1, 0, true, false), + MaskBlendTestParam(kBlock16x8, 1, 0, true, false), + MaskBlendTestParam(kBlock16x16, 1, 0, true, false), + MaskBlendTestParam(kBlock16x32, 1, 0, true, false), + MaskBlendTestParam(kBlock32x16, 1, 0, true, false), + MaskBlendTestParam(kBlock32x32, 1, 0, true, false), + MaskBlendTestParam(kBlock8x8, 1, 1, true, false), + MaskBlendTestParam(kBlock8x16, 1, 1, true, false), + MaskBlendTestParam(kBlock16x8, 1, 1, true, false), + MaskBlendTestParam(kBlock16x16, 1, 1, true, false), + MaskBlendTestParam(kBlock16x32, 1, 1, true, false), + MaskBlendTestParam(kBlock32x16, 1, 1, true, false), + MaskBlendTestParam(kBlock32x32, 1, 1, true, false), + // is_inter_intra = true, is_wedge_inter_intra = true. + // block size range is from 8x8 to 32x32 (no 4:1/1:4 blocks, Section 5.11.28 + // Read inter intra syntax). + MaskBlendTestParam(kBlock8x8, 0, 0, true, true), + MaskBlendTestParam(kBlock8x16, 0, 0, true, true), + MaskBlendTestParam(kBlock16x8, 0, 0, true, true), + MaskBlendTestParam(kBlock16x16, 0, 0, true, true), + MaskBlendTestParam(kBlock16x32, 0, 0, true, true), + MaskBlendTestParam(kBlock32x16, 0, 0, true, true), + MaskBlendTestParam(kBlock32x32, 0, 0, true, true), + MaskBlendTestParam(kBlock8x8, 1, 0, true, true), + MaskBlendTestParam(kBlock8x16, 1, 0, true, true), + MaskBlendTestParam(kBlock16x8, 1, 0, true, true), + MaskBlendTestParam(kBlock16x16, 1, 0, true, true), + MaskBlendTestParam(kBlock16x32, 1, 0, true, true), + MaskBlendTestParam(kBlock32x16, 1, 0, true, true), + MaskBlendTestParam(kBlock32x32, 1, 0, true, true), + MaskBlendTestParam(kBlock8x8, 1, 1, true, true), + MaskBlendTestParam(kBlock8x16, 1, 1, true, true), + MaskBlendTestParam(kBlock16x8, 1, 1, true, true), + MaskBlendTestParam(kBlock16x16, 1, 1, true, true), + MaskBlendTestParam(kBlock16x32, 1, 1, true, true), + MaskBlendTestParam(kBlock32x16, 1, 1, true, true), + MaskBlendTestParam(kBlock32x32, 1, 1, true, true), +}; + +using MaskBlendTest8bpp = MaskBlendTest<8, uint8_t>; + +TEST_P(MaskBlendTest8bpp, Blending) { Test(GetDigest8bpp(GetDigestId()), 1); } + +TEST_P(MaskBlendTest8bpp, DISABLED_Speed) { + Test(GetDigest8bpp(GetDigestId()), kNumSpeedTests); +} + +INSTANTIATE_TEST_SUITE_P(C, MaskBlendTest8bpp, + testing::ValuesIn(kMaskBlendTestParam)); + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, MaskBlendTest8bpp, + testing::ValuesIn(kMaskBlendTestParam)); +#endif + +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, MaskBlendTest8bpp, + testing::ValuesIn(kMaskBlendTestParam)); +#endif + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using MaskBlendTest10bpp = MaskBlendTest<10, uint16_t>; + +TEST_P(MaskBlendTest10bpp, Blending) { Test(GetDigest10bpp(GetDigestId()), 1); } + +TEST_P(MaskBlendTest10bpp, DISABLED_Speed) { + Test(GetDigest10bpp(GetDigestId()), kNumSpeedTests); +} + +INSTANTIATE_TEST_SUITE_P(C, MaskBlendTest10bpp, + testing::ValuesIn(kMaskBlendTestParam)); + +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, MaskBlendTest10bpp, + testing::ValuesIn(kMaskBlendTestParam)); +#endif +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, 
MaskBlendTest10bpp, + testing::ValuesIn(kMaskBlendTestParam)); +#endif +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/motion_field_projection.cc b/src/dsp/motion_field_projection.cc new file mode 100644 index 0000000..7c17b8e --- /dev/null +++ b/src/dsp/motion_field_projection.cc @@ -0,0 +1,116 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/motion_field_projection.h" + +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> + +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/reference_info.h" +#include "src/utils/types.h" + +namespace libgav1 { +namespace dsp { +namespace { + +// Silence unused function warnings when MotionFieldProjectionKernel_C is +// not used. +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \ + !defined(LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel) + +// 7.9.2. +void MotionFieldProjectionKernel_C(const ReferenceInfo& reference_info, + int reference_to_current_with_sign, + int dst_sign, int y8_start, int y8_end, + int x8_start, int x8_end, + TemporalMotionField* motion_field) { + const ptrdiff_t stride = motion_field->mv.columns(); + // The column range has to be offset by kProjectionMvMaxHorizontalOffset since + // coordinates in that range could end up being position_x8 because of + // projection. + const int adjusted_x8_start = + std::max(x8_start - kProjectionMvMaxHorizontalOffset, 0); + const int adjusted_x8_end = std::min( + x8_end + kProjectionMvMaxHorizontalOffset, static_cast<int>(stride)); + const int8_t* const reference_offsets = + reference_info.relative_distance_to.data(); + const bool* const skip_references = reference_info.skip_references.data(); + const int16_t* const projection_divisions = + reference_info.projection_divisions.data(); + const ReferenceFrameType* source_reference_types = + &reference_info.motion_field_reference_frame[y8_start][0]; + const MotionVector* mv = &reference_info.motion_field_mv[y8_start][0]; + int8_t* dst_reference_offset = motion_field->reference_offset[y8_start]; + MotionVector* dst_mv = motion_field->mv[y8_start]; + assert(stride == motion_field->reference_offset.columns()); + assert((y8_start & 7) == 0); + + int y8 = y8_start; + do { + const int y8_floor = (y8 & ~7) - y8; + const int y8_ceiling = std::min(y8_end - y8, y8_floor + 8); + int x8 = adjusted_x8_start; + do { + const int source_reference_type = source_reference_types[x8]; + if (skip_references[source_reference_type]) continue; + MotionVector projection_mv; + // reference_to_current_with_sign could be 0. + GetMvProjection(mv[x8], reference_to_current_with_sign, + projection_divisions[source_reference_type], + &projection_mv); + // Do not update the motion vector if the block position is not valid or + // if position_x8 is outside the current range of x8_start and x8_end. + // Note that position_y8 will always be within the range of y8_start and + // y8_end.
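+ // (y8_floor and y8_ceiling above are expressed relative to the current + // y8, so the bounds check below keeps position_y8 inside the 8-row band + // containing y8 for either value of dst_sign.)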
+ const int position_y8 = Project(0, projection_mv.mv[0], dst_sign); + if (position_y8 < y8_floor || position_y8 >= y8_ceiling) continue; + const int x8_base = x8 & ~7; + const int x8_floor = + std::max(x8_start, x8_base - kProjectionMvMaxHorizontalOffset); + const int x8_ceiling = + std::min(x8_end, x8_base + 8 + kProjectionMvMaxHorizontalOffset); + const int position_x8 = Project(x8, projection_mv.mv[1], dst_sign); + if (position_x8 < x8_floor || position_x8 >= x8_ceiling) continue; + dst_mv[position_y8 * stride + position_x8] = mv[x8]; + dst_reference_offset[position_y8 * stride + position_x8] = + reference_offsets[source_reference_type]; + } while (++x8 < adjusted_x8_end); + source_reference_types += stride; + mv += stride; + dst_reference_offset += stride; + dst_mv += stride; + } while (++y8 < y8_end); +} + +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || + // !defined(LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel) + +} // namespace + +void MotionFieldProjectionInit_C() { +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \ + !defined(LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel) + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_C; +#endif +} + +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/motion_field_projection.h b/src/dsp/motion_field_projection.h new file mode 100644 index 0000000..36de459 --- /dev/null +++ b/src/dsp/motion_field_projection.h @@ -0,0 +1,48 @@ +/* + * Copyright 2020 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_MOTION_FIELD_PROJECTION_H_ +#define LIBGAV1_SRC_DSP_MOTION_FIELD_PROJECTION_H_ + +// Pull in LIBGAV1_DspXXX defines representing the implementation status +// of each function. The resulting value of each can be used by each module to +// determine whether an implementation is needed at compile time. +// IWYU pragma: begin_exports + +// ARM: +#include "src/dsp/arm/motion_field_projection_neon.h" +// x86: +// Note includes should be sorted in logical order avx2/avx/sse4, etc. +// The order of includes is important as each tests for a superior version +// before setting the base. +// clang-format off +// SSE4_1 +#include "src/dsp/x86/motion_field_projection_sse4.h" +// clang-format on + +// IWYU pragma: end_exports + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::motion_field_projection_kernel. This function is not +// thread-safe. +void MotionFieldProjectionInit_C(); + +} // namespace dsp +} // namespace libgav1 + +#endif // LIBGAV1_SRC_DSP_MOTION_FIELD_PROJECTION_H_ diff --git a/src/dsp/motion_field_projection_test.cc b/src/dsp/motion_field_projection_test.cc new file mode 100644 index 0000000..3a47cc7 --- /dev/null +++ b/src/dsp/motion_field_projection_test.cc @@ -0,0 +1,213 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/motion_field_projection.h" + +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <cstdlib> +#include <string> + +#include "absl/strings/match.h" +#include "absl/strings/str_format.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/dsp.h" +#include "src/utils/array_2d.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/reference_info.h" +#include "src/utils/types.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr int kMotionFieldWidth = 160; +constexpr int kMotionFieldHight = 120; + +// The 'int' parameter is unused but required to allow for instantiations of C, +// NEON, etc. +class MotionFieldProjectionTest : public testing::TestWithParam<int> { + public: + MotionFieldProjectionTest() = default; + MotionFieldProjectionTest(const MotionFieldProjectionTest&) = delete; + MotionFieldProjectionTest& operator=(const MotionFieldProjectionTest&) = + delete; + ~MotionFieldProjectionTest() override = default; + + void SetUp() override { + test_utils::ResetDspTable(8); + MotionFieldProjectionInit_C(); + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const char* const test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + } else if (absl::StartsWith(test_case, "NEON/")) { + MotionFieldProjectionInit_NEON(); + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + MotionFieldProjectionInit_SSE4_1(); + } + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + const Dsp* const dsp = GetDspTable(8); + ASSERT_NE(dsp, nullptr); + target_motion_field_projection_kernel_func_ = + dsp->motion_field_projection_kernel; + } + + void SetInputData(int motion_field_width, libvpx_test::ACMRandom* rnd); + void TestRandomValues(bool speed); + + private: + MotionFieldProjectionKernelFunc target_motion_field_projection_kernel_func_; + ReferenceInfo reference_info_; + TemporalMotionField motion_field_; +}; + +void MotionFieldProjectionTest::SetInputData( + const int motion_field_width, libvpx_test::ACMRandom* const rnd) { + ASSERT_TRUE(reference_info_.Reset(kMotionFieldHight, motion_field_width)); + ASSERT_TRUE(motion_field_.mv.Reset(kMotionFieldHight, motion_field_width, + /*zero_initialize=*/false)); + ASSERT_TRUE(motion_field_.reference_offset.Reset(kMotionFieldHight, + motion_field_width, + /*zero_initialize=*/false)); + constexpr int order_hint_bits = 6; + const unsigned int order_hint_shift_bits = Mod32(32 - order_hint_bits); + const unsigned int current_frame_order_hint = + rnd->Rand8() & ((1 << order_hint_bits) - 1); // [0, 63] + uint8_t reference_frame_order_hint = 0; + reference_info_.relative_distance_to[0] = 0; + reference_info_.skip_references[kReferenceFrameIntra] = true; + reference_info_.projection_divisions[kReferenceFrameIntra] = 0; + for (int i = kReferenceFrameLast; i < kNumReferenceFrameTypes; ++i) { +
reference_frame_order_hint = + rnd->Rand8() & ((1 << order_hint_bits) - 1); // [0, 63] + const int relative_distance_to = + GetRelativeDistance(current_frame_order_hint, + reference_frame_order_hint, order_hint_shift_bits); + reference_info_.relative_distance_to[i] = relative_distance_to; + reference_info_.skip_references[i] = + relative_distance_to > kMaxFrameDistance || relative_distance_to <= 0; + reference_info_.projection_divisions[i] = + reference_info_.skip_references[i] + ? 0 + : kProjectionMvDivisionLookup[relative_distance_to]; + } + for (int y = 0; y < kMotionFieldHight; ++y) { + for (int x = 0; x < motion_field_width; ++x) { + reference_info_.motion_field_reference_frame[y][x] = + static_cast<ReferenceFrameType>(rnd->Rand16() & + kReferenceFrameAlternate); + reference_info_.motion_field_mv[y][x].mv[0] = rnd->Rand16Signed() / 512; + reference_info_.motion_field_mv[y][x].mv[1] = rnd->Rand16Signed() / 512; + } + } + MotionVector invalid_mv; + invalid_mv.mv[0] = kInvalidMvValue; + invalid_mv.mv[1] = kInvalidMvValue; + MotionVector* const motion_field_mv = &motion_field_.mv[0][0]; + int8_t* const motion_field_reference_offset = + &motion_field_.reference_offset[0][0]; + std::fill(motion_field_mv, motion_field_mv + motion_field_.mv.size(), + invalid_mv); + std::fill( + motion_field_reference_offset, + motion_field_reference_offset + motion_field_.reference_offset.size(), + -128); +} + +void MotionFieldProjectionTest::TestRandomValues(bool speed) { + static const char* const kDigestMv[8] = { + "87c2a74538f5c015809492ac2e521075", "ba7b4a5d82c6083b13a5b02eb7655ab7", + "8c37d96bf1744d5553860bf44a4f60a3", "720aa644f85e48995db9785e87cd02e3", + "9289c0c66524bb77a605870d78285f35", "f0326509885c2b2c89feeac53698cd47", + "6b9ad1d672dec825cb1803063d35badc", "dfe06c57cc9c70d27246df7fd0afa0b2"}; + static const char* const kDigestReferenceOffset[8] = { + "d8d1384268d7cf5c4514b39c329f94fb", "7f30e79ceb064befbad64a20d206a540", + "61e2eb5644edbd3a91b939403edc891e", "7a018f1bf88193e86934241af445dc36", + "2d6166bf8bbe1db77baf687ecf71d028", "95fee61f0219e06076d6f0e1073b1a4e", + "64d0a63751267bdc573cab761f1fe685", "906a99e0e791dbcb9183c9b68ecc4ea3"}; + const int num_tests = speed ? 2000 : 1; + if (target_motion_field_projection_kernel_func_ == nullptr) return; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + for (int width_idx = 0; width_idx < 8; ++width_idx) { + const int motion_field_width = kMotionFieldWidth + width_idx; + SetInputData(motion_field_width, &rnd); + const int dst_sign = ((rnd.Rand16() & 1) != 0) ? 0 : -1; + const int reference_to_current_with_sign = + rnd.PseudoUniform(2 * kMaxFrameDistance + 1) - kMaxFrameDistance; + assert(std::abs(reference_to_current_with_sign) <= kMaxFrameDistance); + // Step of y8 and x8 is at least 16 except the last hop.
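+ // For example (illustrative, derived from the loop below): with + // kMotionFieldHight == 120 and step == 80 the row loop visits y8 == 0 and + // y8 == 80, clamping y8_end to 120 on the final hop; the digests above + // cover every step in {16, 32, 48, 64, 80}.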
+ for (int step = 16; step <= 80; step += 16) { + const absl::Time start = absl::Now(); + for (int k = 0; k < num_tests; ++k) { + for (int y8 = 0; y8 < kMotionFieldHight; y8 += step) { + const int y8_end = std::min(y8 + step, kMotionFieldHight); + for (int x8 = 0; x8 < motion_field_width; x8 += step) { + const int x8_end = std::min(x8 + step, motion_field_width); + target_motion_field_projection_kernel_func_( + reference_info_, reference_to_current_with_sign, dst_sign, y8, + y8_end, x8, x8_end, &motion_field_); + } + } + } + const absl::Duration elapsed_time = absl::Now() - start; + test_utils::CheckMd5Digest( + "MotionFieldProjectionKernel", + absl::StrFormat("(mv) width %d step %d", motion_field_width, step) + .c_str(), + kDigestMv[width_idx], motion_field_.mv[0], + sizeof(motion_field_.mv[0][0]) * motion_field_.mv.size(), + elapsed_time); + test_utils::CheckMd5Digest( + "MotionFieldProjectionKernel", + absl::StrFormat("(ref offset) width %d step %d", motion_field_width, + step) + .c_str(), + kDigestReferenceOffset[width_idx], motion_field_.reference_offset[0], + sizeof(motion_field_.reference_offset[0][0]) * + motion_field_.reference_offset.size(), + elapsed_time); + } + } +} + +TEST_P(MotionFieldProjectionTest, Correctness) { TestRandomValues(false); } + +TEST_P(MotionFieldProjectionTest, DISABLED_Speed) { TestRandomValues(true); } + +INSTANTIATE_TEST_SUITE_P(C, MotionFieldProjectionTest, testing::Values(0)); + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, MotionFieldProjectionTest, testing::Values(0)); +#endif + +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, MotionFieldProjectionTest, testing::Values(0)); +#endif + +} // namespace +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/motion_vector_search.cc b/src/dsp/motion_vector_search.cc new file mode 100644 index 0000000..205a1b6 --- /dev/null +++ b/src/dsp/motion_vector_search.cc @@ -0,0 +1,187 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/motion_vector_search.h" + +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <cstring> + +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/types.h" + +namespace libgav1 { +namespace dsp { +namespace { + +// Silence unused function warnings when the C functions are not used. +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \ + !defined(LIBGAV1_Dsp8bpp_MotionVectorSearch) + +void MvProjectionCompoundLowPrecision_C( + const MotionVector* LIBGAV1_RESTRICT const temporal_mvs, + const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets, + const int reference_offsets[2], const int count, + CompoundMotionVector* LIBGAV1_RESTRICT const candidate_mvs) { + // To facilitate the compilers, make a local copy of |reference_offsets|. + const int offsets[2] = {reference_offsets[0], reference_offsets[1]}; + int index = 0; + do { + candidate_mvs[index].mv64 = 0; + for (int i = 0; i < 2; ++i) { + // |offsets| non-zero check usually equals true and could be ignored.
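+ // (With a zero offset the projected MV would itself be zero, which the + // mv64 = 0 initialization above already provides, so the branch only + // skips redundant work.)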
+ if (offsets[i] != 0) { + GetMvProjection( + temporal_mvs[index], offsets[i], + kProjectionMvDivisionLookup[temporal_reference_offsets[index]], + &candidate_mvs[index].mv[i]); + for (auto& mv : candidate_mvs[index].mv[i].mv) { + // The next line is equivalent to: + // if ((mv & 1) != 0) mv += (mv > 0) ? -1 : 1; + mv = (mv - (mv >> 15)) & ~1; + } + } + } + } while (++index < count); +} + +void MvProjectionCompoundForceInteger_C( + const MotionVector* LIBGAV1_RESTRICT const temporal_mvs, + const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets, + const int reference_offsets[2], const int count, + CompoundMotionVector* LIBGAV1_RESTRICT const candidate_mvs) { + // To facilitate the compilers, make a local copy of |reference_offsets|. + const int offsets[2] = {reference_offsets[0], reference_offsets[1]}; + int index = 0; + do { + candidate_mvs[index].mv64 = 0; + for (int i = 0; i < 2; ++i) { + // |offsets| non-zero check usually equals true and could be ignored. + if (offsets[i] != 0) { + GetMvProjection( + temporal_mvs[index], offsets[i], + kProjectionMvDivisionLookup[temporal_reference_offsets[index]], + &candidate_mvs[index].mv[i]); + for (auto& mv : candidate_mvs[index].mv[i].mv) { + // The next line is equivalent to: + // const int value = (std::abs(static_cast<int>(mv)) + 3) & ~7; + // const int sign = mv >> 15; + // mv = ApplySign(value, sign); + mv = (mv + 3 - (mv >> 15)) & ~7; + } + } + } + } while (++index < count); +} + +void MvProjectionCompoundHighPrecision_C( + const MotionVector* LIBGAV1_RESTRICT const temporal_mvs, + const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets, + const int reference_offsets[2], const int count, + CompoundMotionVector* LIBGAV1_RESTRICT const candidate_mvs) { + // To facilitate the compilers, make a local copy of |reference_offsets|. + const int offsets[2] = {reference_offsets[0], reference_offsets[1]}; + int index = 0; + do { + candidate_mvs[index].mv64 = 0; + for (int i = 0; i < 2; ++i) { + // |offsets| non-zero check usually equals true and could be ignored. + if (offsets[i] != 0) { + GetMvProjection( + temporal_mvs[index], offsets[i], + kProjectionMvDivisionLookup[temporal_reference_offsets[index]], + &candidate_mvs[index].mv[i]); + } + } + } while (++index < count); +} + +void MvProjectionSingleLowPrecision_C( + const MotionVector* LIBGAV1_RESTRICT const temporal_mvs, + const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets, + const int reference_offset, const int count, + MotionVector* LIBGAV1_RESTRICT const candidate_mvs) { + int index = 0; + do { + GetMvProjection( + temporal_mvs[index], reference_offset, + kProjectionMvDivisionLookup[temporal_reference_offsets[index]], + &candidate_mvs[index]); + for (auto& mv : candidate_mvs[index].mv) { + // The next line is equivalent to: + // if ((mv & 1) != 0) mv += (mv > 0) ?
-1 : 1; + mv = (mv - (mv >> 15)) & ~1; + } + } while (++index < count); +} + +void MvProjectionSingleForceInteger_C( + const MotionVector* LIBGAV1_RESTRICT const temporal_mvs, + const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets, + const int reference_offset, const int count, + MotionVector* LIBGAV1_RESTRICT const candidate_mvs) { + int index = 0; + do { + GetMvProjection( + temporal_mvs[index], reference_offset, + kProjectionMvDivisionLookup[temporal_reference_offsets[index]], + &candidate_mvs[index]); + for (auto& mv : candidate_mvs[index].mv) { + // The next line is equivalent to: + // const int value = (std::abs(static_cast<int>(mv)) + 3) & ~7; + // const int sign = mv >> 15; + // mv = ApplySign(value, sign); + mv = (mv + 3 - (mv >> 15)) & ~7; + } + } while (++index < count); +} + +void MvProjectionSingleHighPrecision_C( + const MotionVector* LIBGAV1_RESTRICT const temporal_mvs, + const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets, + const int reference_offset, const int count, + MotionVector* LIBGAV1_RESTRICT const candidate_mvs) { + int index = 0; + do { + GetMvProjection( + temporal_mvs[index], reference_offset, + kProjectionMvDivisionLookup[temporal_reference_offsets[index]], + &candidate_mvs[index]); + } while (++index < count); +} + +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || + // !defined(LIBGAV1_Dsp8bpp_MotionVectorSearch) + +} // namespace + +void MotionVectorSearchInit_C() { +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \ + !defined(LIBGAV1_Dsp8bpp_MotionVectorSearch) + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_C; + dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_C; + dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_C; + dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_C; + dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_C; + dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_C; +#endif +} + +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/motion_vector_search.h b/src/dsp/motion_vector_search.h new file mode 100644 index 0000000..ae16726 --- /dev/null +++ b/src/dsp/motion_vector_search.h @@ -0,0 +1,49 @@ +/* + * Copyright 2020 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_MOTION_VECTOR_SEARCH_H_ +#define LIBGAV1_SRC_DSP_MOTION_VECTOR_SEARCH_H_ + +// Pull in LIBGAV1_DspXXX defines representing the implementation status +// of each function. The resulting value of each can be used by each module to +// determine whether an implementation is needed at compile time. +// IWYU pragma: begin_exports + +// ARM: +#include "src/dsp/arm/motion_vector_search_neon.h" + +// x86: +// Note includes should be sorted in logical order avx2/avx/sse4, etc. +// The order of includes is important as each tests for a superior version +// before setting the base.
+// clang-format off +// SSE4_1 +#include "src/dsp/x86/motion_vector_search_sse4.h" +// clang-format on + +// IWYU pragma: end_exports + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::mv_projection_compound and Dsp::mv_projection_single. This +// function is not thread-safe. +void MotionVectorSearchInit_C(); + +} // namespace dsp +} // namespace libgav1 + +#endif // LIBGAV1_SRC_DSP_MOTION_VECTOR_SEARCH_H_ diff --git a/src/dsp/motion_vector_search_test.cc b/src/dsp/motion_vector_search_test.cc new file mode 100644 index 0000000..a7b2ec8 --- /dev/null +++ b/src/dsp/motion_vector_search_test.cc @@ -0,0 +1,197 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/motion_vector_search.h" + +#include <cstdint> +#include <cstring> + +#include "absl/strings/match.h" +#include "absl/strings/str_format.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "src/utils/types.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +// The 'int' parameter is unused but required to allow for instantiations of C, +// NEON, etc.
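+// (That dummy parameter is what allows INSTANTIATE_TEST_SUITE_P(C, ..., +// testing::Values(0)) and the NEON/SSE41 instantiations at the bottom of +// this file to share a single fixture.)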
+class MotionVectorSearchTest : public testing::TestWithParam<int>, + public test_utils::MaxAlignedAllocable { + public: + MotionVectorSearchTest() = default; + MotionVectorSearchTest(const MotionVectorSearchTest&) = delete; + MotionVectorSearchTest& operator=(const MotionVectorSearchTest&) = delete; + ~MotionVectorSearchTest() override = default; + + void SetUp() override { + test_utils::ResetDspTable(8); + MotionVectorSearchInit_C(); + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const char* const test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + } else if (absl::StartsWith(test_case, "NEON/")) { + MotionVectorSearchInit_NEON(); + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + MotionVectorSearchInit_SSE4_1(); + } + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + const Dsp* const dsp = GetDspTable(8); + ASSERT_NE(dsp, nullptr); + mv_projection_compound_[0] = dsp->mv_projection_compound[0]; + mv_projection_compound_[1] = dsp->mv_projection_compound[1]; + mv_projection_compound_[2] = dsp->mv_projection_compound[2]; + mv_projection_single_[0] = dsp->mv_projection_single[0]; + mv_projection_single_[1] = dsp->mv_projection_single[1]; + mv_projection_single_[2] = dsp->mv_projection_single[2]; + } + + void SetInputData(libvpx_test::ACMRandom* rnd); + void TestRandomValues(bool speed); + + private: + MvProjectionCompoundFunc mv_projection_compound_[3]; + MvProjectionSingleFunc mv_projection_single_[3]; + int reference_offsets_[2]; + alignas(kMaxAlignment) + MotionVector temporal_mvs_[kMaxTemporalMvCandidatesWithPadding]; + int8_t temporal_reference_offsets_[kMaxTemporalMvCandidatesWithPadding]; + CompoundMotionVector compound_mv_org_[kMaxTemporalMvCandidates + 1] + [kMaxTemporalMvCandidatesWithPadding]; + alignas(kMaxAlignment) + CompoundMotionVector compound_mv_[kMaxTemporalMvCandidates + 1] + [kMaxTemporalMvCandidatesWithPadding]; + MotionVector single_mv_org_[kMaxTemporalMvCandidates + 1] + [kMaxTemporalMvCandidatesWithPadding]; + alignas(kMaxAlignment) + MotionVector single_mv_[kMaxTemporalMvCandidates + 1] + [kMaxTemporalMvCandidatesWithPadding]; +}; + +void MotionVectorSearchTest::SetInputData(libvpx_test::ACMRandom* const rnd) { + reference_offsets_[0] = + Clip3(rnd->Rand16(), -kMaxFrameDistance, kMaxFrameDistance); + reference_offsets_[1] = + Clip3(rnd->Rand16(), -kMaxFrameDistance, kMaxFrameDistance); + for (int i = 0; i < kMaxTemporalMvCandidatesWithPadding; ++i) { + temporal_reference_offsets_[i] = rnd->RandRange(kMaxFrameDistance); + for (auto& mv : temporal_mvs_[i].mv) { + mv = rnd->Rand16Signed() / 8; + } + } + for (int i = 0; i <= kMaxTemporalMvCandidates; ++i) { + for (int j = 0; j < kMaxTemporalMvCandidatesWithPadding; ++j) { + for (int k = 0; k < 2; ++k) { + single_mv_[i][j].mv[k] = rnd->Rand16Signed(); + for (auto& mv : compound_mv_[i][j].mv[k].mv) { + mv = rnd->Rand16Signed(); + } + } + compound_mv_org_[i][j] = compound_mv_[i][j]; + single_mv_org_[i][j] = single_mv_[i][j]; + } + } +} + +void MotionVectorSearchTest::TestRandomValues(bool speed) { + static const char* const kDigestCompound[3] = { + "74c055b06c3701b2e50f2c964a6130b9", "cab21dd54f0a1bf6e80b58cdcf1fe0a9", + "e42de30cd84fa4e7b8581a330ed08a8b"}; + static const char* const kDigestSingle[3] = { + "265ffbb59d0895183f8e2d90b6652c71", "5068d980c4ce42ed3f11963b8aece6cc", + "7e699d58df3954a38ff11c8e34151e66"}; + const int num_tests = speed ?
1000000 : 1; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + for (int function_index = 0; function_index < 3; ++function_index) { + SetInputData(&rnd); + if (mv_projection_compound_[function_index] == nullptr) continue; + const absl::Time start = absl::Now(); + for (int count = 1; count <= kMaxTemporalMvCandidates; ++count) { + const int total_count = count + (count & 1); + for (int i = 0; i < num_tests; ++i) { + mv_projection_compound_[function_index]( + temporal_mvs_, temporal_reference_offsets_, reference_offsets_, + count, compound_mv_[count]); + } + // One more element could be calculated in SIMD implementations. + // Restore the original values if any. + for (int i = count; i < total_count; ++i) { + compound_mv_[count][i] = compound_mv_org_[count][i]; + } + } + const absl::Duration elapsed_time = absl::Now() - start; + test_utils::CheckMd5Digest( + "MvProjectionCompound", + absl::StrFormat("function_index %d", function_index).c_str(), + kDigestCompound[function_index], compound_mv_, sizeof(compound_mv_), + elapsed_time); + } + for (int function_index = 0; function_index < 3; ++function_index) { + SetInputData(&rnd); + if (mv_projection_single_[function_index] == nullptr) continue; + const absl::Time start = absl::Now(); + for (int count = 1; count <= kMaxTemporalMvCandidates; ++count) { + const int total_count = (count + 3) & ~3; + for (int i = 0; i < num_tests; ++i) { + mv_projection_single_[function_index]( + temporal_mvs_, temporal_reference_offsets_, reference_offsets_[0], + count, single_mv_[count]); + } + // Up to three more elements could be calculated in SIMD implementations. + // Restore the original values if any. + for (int i = count; i < total_count; ++i) { + single_mv_[count][i] = single_mv_org_[count][i]; + } + } + const absl::Duration elapsed_time = absl::Now() - start; + test_utils::CheckMd5Digest( + "MvProjectionSingle", + absl::StrFormat("function_index %d", function_index).c_str(), + kDigestSingle[function_index], single_mv_, sizeof(single_mv_), + elapsed_time); + } +} + +TEST_P(MotionVectorSearchTest, Correctness) { TestRandomValues(false); } + +TEST_P(MotionVectorSearchTest, DISABLED_Speed) { TestRandomValues(true); } + +INSTANTIATE_TEST_SUITE_P(C, MotionVectorSearchTest, testing::Values(0)); + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, MotionVectorSearchTest, testing::Values(0)); +#endif + +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, MotionVectorSearchTest, testing::Values(0)); +#endif + +} // namespace +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/obmc.cc b/src/dsp/obmc.cc new file mode 100644 index 0000000..6b5c6e3 --- /dev/null +++ b/src/dsp/obmc.cc @@ -0,0 +1,131 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "src/dsp/obmc.h" + +#include +#include +#include + +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" + +namespace libgav1 { +namespace dsp { +namespace { + +#include "src/dsp/obmc.inc" + +// 7.11.3.10 (from top samples). +template +void OverlapBlendVertical_C(void* LIBGAV1_RESTRICT const prediction, + const ptrdiff_t prediction_stride, const int width, + const int height, + const void* LIBGAV1_RESTRICT const obmc_prediction, + const ptrdiff_t obmc_prediction_stride) { + auto* pred = static_cast(prediction); + const ptrdiff_t pred_stride = prediction_stride / sizeof(Pixel); + const auto* obmc_pred = static_cast(obmc_prediction); + const ptrdiff_t obmc_pred_stride = obmc_prediction_stride / sizeof(Pixel); + const uint8_t* const mask = kObmcMask + height - 2; + assert(width >= 4); + assert(height >= 2); + + for (int y = 0; y < height; ++y) { + const uint8_t mask_value = mask[y]; + for (int x = 0; x < width; ++x) { + pred[x] = static_cast(RightShiftWithRounding( + mask_value * pred[x] + (64 - mask_value) * obmc_pred[x], 6)); + } + pred += pred_stride; + obmc_pred += obmc_pred_stride; + } +} + +// 7.11.3.10 (from left samples). +template +void OverlapBlendHorizontal_C( + void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride, + const int width, const int height, + const void* LIBGAV1_RESTRICT const obmc_prediction, + const ptrdiff_t obmc_prediction_stride) { + auto* pred = static_cast(prediction); + const ptrdiff_t pred_stride = prediction_stride / sizeof(Pixel); + const auto* obmc_pred = static_cast(obmc_prediction); + const ptrdiff_t obmc_pred_stride = obmc_prediction_stride / sizeof(Pixel); + const uint8_t* const mask = kObmcMask + width - 2; + assert(width >= 2); + assert(height >= 4); + + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + const uint8_t mask_value = mask[x]; + pred[x] = static_cast(RightShiftWithRounding( + mask_value * pred[x] + (64 - mask_value) * obmc_pred[x], 6)); + } + pred += pred_stride; + obmc_pred += obmc_pred_stride; + } +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(8); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendVertical_C; + dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendHorizontal_C; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast(dsp); +#ifndef LIBGAV1_Dsp8bpp_ObmcVertical + dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendVertical_C; +#endif +#ifndef LIBGAV1_Dsp8bpp_ObmcHorizontal + dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendHorizontal_C; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(10); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendVertical_C; + dsp->obmc_blend[kObmcDirectionHorizontal] = + OverlapBlendHorizontal_C; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast(dsp); +#ifndef LIBGAV1_Dsp10bpp_ObmcVertical + dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendVertical_C; +#endif +#ifndef LIBGAV1_Dsp10bpp_ObmcHorizontal + dsp->obmc_blend[kObmcDirectionHorizontal] = + OverlapBlendHorizontal_C; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif + +} // namespace + +void ObmcInit_C() { + Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 diff --git 
a/src/dsp/obmc.h b/src/dsp/obmc.h new file mode 100644 index 0000000..3b826c7 --- /dev/null +++ b/src/dsp/obmc.h @@ -0,0 +1,47 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_OBMC_H_ +#define LIBGAV1_SRC_DSP_OBMC_H_ + +// Pull in LIBGAV1_DspXXX defines representing the implementation status +// of each function. The resulting value of each can be used by each module to +// determine whether an implementation is needed at compile time. +// IWYU pragma: begin_exports + +// ARM: +#include "src/dsp/arm/obmc_neon.h" + +// x86: +// Note includes should be sorted in logical order avx2/avx/sse4, etc. +// The order of includes is important as each tests for a superior version +// before setting the base. +// clang-format off +#include "src/dsp/x86/obmc_sse4.h" +// clang-format on + +// IWYU pragma: end_exports + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::obmc_blend. This function is not thread-safe. +void ObmcInit_C(); + +} // namespace dsp +} // namespace libgav1 + +#endif // LIBGAV1_SRC_DSP_OBMC_H_ diff --git a/src/dsp/obmc.inc b/src/dsp/obmc.inc new file mode 100644 index 0000000..001c6ee --- /dev/null +++ b/src/dsp/obmc.inc @@ -0,0 +1,32 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Constants and utility functions used for overlap blend implementations. +// This will be included inside an anonymous namespace on files where these are +// necessary. + +// This is a flat array of masks for each block dimension from 2 to 32. The +// starting index for each length is length-2. +constexpr uint8_t kObmcMask[62] = { + // Obmc Mask 2 + 45, 64, + // Obmc Mask 4 + 39, 50, 59, 64, + // Obmc Mask 8 + 36, 42, 48, 53, 57, 61, 64, 64, + // Obmc Mask 16 + 34, 37, 40, 43, 46, 49, 52, 54, 56, 58, 60, 61, 64, 64, 64, 64, + // Obmc Mask 32 + 33, 35, 36, 38, 40, 41, 43, 44, 45, 47, 48, 50, 51, 52, 53, 55, 56, 57, 58, + 59, 60, 60, 61, 62, 64, 64, 64, 64, 64, 64, 64, 64}; diff --git a/src/dsp/obmc_test.cc b/src/dsp/obmc_test.cc new file mode 100644 index 0000000..3672e12 --- /dev/null +++ b/src/dsp/obmc_test.cc @@ -0,0 +1,343 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/obmc.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <ostream>
+#include <string>
+
+#include "absl/strings/match.h"
+#include "absl/strings/str_format.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "tests/block_utils.h"
+#include "tests/third_party/libvpx/acm_random.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+#include "src/dsp/obmc.inc"
+
+constexpr int kMaxBlendingBlockSize = 64;
+constexpr int kNumSpeedTests = 2e8;
+
+const char* GetDigest8bpp(int id) {
+  static const char* const kDigest[] = {
+      "c8659acd1e8ecdab06be73f0954fa1ae", "e785f31f2723a193fefd534bd6f6c18f",
+      "751fcd8a345fef1c38a25293c9b528c0", "69af412dfa5e96ad43b79c178cb1c58b",
+      "2766a64622e183bb4614f2018f14fa85", "8d98589a5cef6e68ee8fadf19d420e3c",
+      "19eccf31dd8cf1abcee9414128fe4141", "35019f98e30bcbc6ab624682a0628519",
+      "199c551164e73c100045d7ab033ffdcc", "ad5a5eb2906265690c22741b0715f37b",
+      "e2152dea159249149ff4151111b73ed6", "1edd570bec7e63780d83588f6aacda25",
+      "b24ad192e151b1e0f74d1493004cb1b6", "6c1ce7ed3463cc60870e336f990d4f14",
+      "2e6b7a06da21512dfdd9a517d2988655", "971ba1c41ab13bb341c04f936760f546",
+      "55b803239d9f12888c666c5320450937", "3d0838963f8c95dafbfb8e5e25c865d2",
+      "98a9be6245720d4e0da18115c1a1dbd7", "7e7afe3136ad681b5ea05664fe916548",
+      "33971753243f09106173199b7bae1ef5", "65413f33c19a42c112d395121aa4b3b4",
+  };
+  return kDigest[id];
+}
+
+const char* GetDigestSpeed8bpp(int id) {
+  static const char* const kDigest[] = {
+      "5ea519b616cd2998fbb9b25b4c2660cb", "f23d18197a96de48901738d130a147d9",
+      "07b4140c693947a63865f835089766c4", "62547d29bc4dfb2e201e9d907c09e345",
+      "c3988da521be50aeb9944564001b282b", "d5a8ff9ca1bd49f4260bb497c489b06c",
+      "b3e94f1e33c316759ebf47620327168c", "c5e64a34ca7e55f4daed19cbe4c27049",
+      "3b234eb729e8e79db8692c4cbe1b6667", "f9f3060a44c3a575470f9700b3c3a75b",
+      "e3a1960b0a7238db1184a3f9d8e9a4b2", "ba9938553703d520bc0ade427c397140",
+      "31bf64a6ed1e8002d488c0b9dcffb80a", "9ab1f3ae2e7f70cd27452f30cecfd18e",
+      "eaf25ac79ad70fc17ca96d8fcdf0f939", "9aaa88cb5e6b8757e37c3430bd664e70",
+      "8293874b2794df8fd22f5a35c3de7bee", "e9d6ee9106227c2c67ea9e6a4652e4ad",
+      "29f8a6fc2a650f3945a4ea6d3b975b6d", "8f300a257e913a42666b4921b2b0b5c5",
+      "a526265c4b3c8593736a82ddc1fd1603", "76e248f6756ac96343204b0e48d72a9e",
+  };
+  return kDigest[id];
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+const char* GetDigest10bpp(int id) {
+  static const char* const kDigest[] = {
+      "6f922e4142b644ca3f1eb0f363a1c34e", "84e7c098a9335b36082fec0bc7203075",
+      "40f00ea6884fea23a3b7fae59e3b02c3", "70cb92d08b4fdb6dd9c7d418cb1455d3",
+      "ed550798b56e70439a93cb48c359e873", "55e0d927b984e78cd51a1961e58a431d",
+      "482a6856b87265a82e4ea3fdadb2d95b", "0be46226ff87d74ff2ce68a83eaf9cca",
+      "bb4461f0131a1693a0a76f21d92a480b", "ea24f78d74c7864fb247c9a98c9b97b6",
+      "d2e70b81882aeb3d9fccef89e7552a9d", "f5d882ee6d9ae6f7dfa467ca99301424",
+      "824ddb98eb4129b3d254c0bc7a64cd73",
"5eaaafa8ef9b7ba5e2856a947e5b33df", + "071de1494e0f1b2f99266b90bdc43ddd", "c33227a96dad506adc32dacfb371ab78", + "e8a632f9fff240c439d4ae6e86795046", "26b90d74f18f9df4427b6180d48db1fc", + "e4a01e492ddc0398b5c5b60c81468242", "f1b4f7ab5c8b949e51db104f2e33565a", + "b1fb9ecc6a552e2b23ee92e2f3e4122a", "a683d20129a91bb20b904aa20c0499b1", + }; + return kDigest[id]; +} + +const char* GetDigestSpeed10bpp(int id) { + static const char* const kDigest[] = { + "80557576299708005111029cef04da53", "24f84f07f53f61cd46bdcfe1e05ff9b5", + "4dd6bc62145baa5357a4cbf6d7a6ef15", "0b7aa27cee43b8ae0c02d07887eaa225", + "9e28cdae73ca97433499c31ca79e1d07", "1cacd6466a143f88e736fffaf21e2246", + "9c7699626660d8965e06a54282a408f3", "eef893efef62b2eb4aaad06fc462819c", + "4965d0a3ff750813df85c0082b21bd4b", "ec10fd79fbf552abc595def392e9a863", + "a148bbafdc4466fbb700b31acccca8ac", "5da9d960988549f53b817003b93e4d01", + "b4c4f88d1fb54869ce7ff452ca7786a6", "d607f785fce62bad85102054539e7089", + "b441761ea2817e4618c594aaa11d670a", "1cc5e08e6d5f9315dbc0369b97af941d", + "568cc1a3a67ba4e6e77f54602d0ed3e3", "522f14c068f788bc284a7d1e47d623ed", + "b543855cbe384b88861c881853c28192", "5faaafc124e94eedc69dc0f5d33dacac", + "13ca4d01bd20085459e6126555e1f7b5", "46d46fae3c8a7d9e4725154d8d2b76d8", + }; + return kDigest[id]; +} +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +struct ObmcTestParam { + ObmcTestParam(int width, int height, ObmcDirection blending_direction) + : width(width), height(height), blending_direction(blending_direction) {} + int width; + int height; + ObmcDirection blending_direction; +}; + +std::ostream& operator<<(std::ostream& os, const ObmcTestParam& param) { + return os << "BlockSize" << param.width << "x" << param.height + << ", blending_direction: " << ToString(param.blending_direction); +} + +template +class ObmcBlendTest : public testing::TestWithParam { + public: + ObmcBlendTest() = default; + ~ObmcBlendTest() override = default; + + void SetUp() override { + test_utils::ResetDspTable(bitdepth); + ObmcInit_C(); + const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const absl::string_view test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + } else if (absl::StartsWith(test_case, "SSE41/")) { + if ((GetCpuInfo() & kSSE4_1) != 0) { + ObmcInit_SSE4_1(); + } + } else if (absl::StartsWith(test_case, "NEON/")) { + ObmcInit_NEON(); + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + func_ = dsp->obmc_blend[blending_direction_]; + } + + protected: + int GetDigestId() const { + // blending_direction_ == kObmcDirectionVertical: + // (width, height): + // (4, 2), id = 0. (4, 4), id = 1. (4, 8), id = 2. (8, 4), id = 3. + // ... + // blending_direction_ == kObmcDirectionHorizontal: id starts from 11. + // Vertical skips (2, 4) while horizontal skips (4, 2) creating a gap after + // (2, 4). + const int id = (blending_direction_ == kObmcDirectionVertical) ? 0 + : (width_ == 2) ? 12 + : 11; + if (width_ == height_) return id + 3 * (FloorLog2(width_) - 1) - 2; + if (width_ < height_) return id + 3 * (FloorLog2(width_) - 1) - 1; + return id + 3 * (FloorLog2(height_) - 1); + } + + // Note |digest| is only used when |use_fixed_values| is false. 
+
+  // Note |digest| is only used when |use_fixed_values| is false.
+  void Test(const char* digest, bool use_fixed_values, int value);
+  void TestSpeed(const char* digest, int num_runs);
+
+ private:
+  const int width_ = GetParam().width;
+  const int height_ = GetParam().height;
+  const ObmcDirection blending_direction_ = GetParam().blending_direction;
+  Pixel source1_[kMaxBlendingBlockSize * kMaxBlendingBlockSize] = {};
+  Pixel source2_[kMaxBlendingBlockSize * kMaxBlendingBlockSize] = {};
+  dsp::ObmcBlendFunc func_;
+};
+
+template <int bitdepth, typename Pixel>
+void ObmcBlendTest<bitdepth, Pixel>::Test(const char* const digest,
+                                          const bool use_fixed_values,
+                                          const int value) {
+  if (func_ == nullptr) return;
+  if (use_fixed_values) {
+    std::fill(source1_,
+              source1_ + kMaxBlendingBlockSize * kMaxBlendingBlockSize, value);
+    std::fill(source2_,
+              source2_ + kMaxBlendingBlockSize * kMaxBlendingBlockSize, value);
+  } else {
+    libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+    Pixel* src_1 = source1_;
+    Pixel* src_2 = source2_;
+    const int mask = (1 << bitdepth) - 1;
+    for (int y = 0; y < height_; ++y) {
+      for (int x = 0; x < width_; ++x) {
+        src_1[x] = rnd.Rand16() & mask;
+        src_2[x] = rnd.Rand16() & mask;
+      }
+      src_1 += kMaxBlendingBlockSize;
+      src_2 += kMaxBlendingBlockSize;
+    }
+  }
+  const ptrdiff_t stride = kMaxBlendingBlockSize * sizeof(Pixel);
+  func_(source1_, stride, width_, height_, source2_, stride);
+  if (use_fixed_values) {
+    const bool success = test_utils::CompareBlocks(
+        source1_, source2_, width_, height_, kMaxBlendingBlockSize,
+        kMaxBlendingBlockSize, false);
+    EXPECT_TRUE(success);
+  } else {
+    test_utils::CheckMd5Digest(
+        ToString(blending_direction_),
+        absl::StrFormat("%dx%d", width_, height_).c_str(), digest, source1_,
+        sizeof(source1_), absl::Duration());
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void ObmcBlendTest<bitdepth, Pixel>::TestSpeed(const char* const digest,
+                                               const int num_runs) {
+  if (func_ == nullptr) return;
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+  Pixel* src_1 = source1_;
+  Pixel* src_2 = source2_;
+  const int mask = (1 << bitdepth) - 1;
+  for (int y = 0; y < height_; ++y) {
+    for (int x = 0; x < width_; ++x) {
+      src_1[x] = rnd.Rand16() & mask;
+      src_2[x] = rnd.Rand16() & mask;
+    }
+    src_1 += kMaxBlendingBlockSize;
+    src_2 += kMaxBlendingBlockSize;
+  }
+  const ptrdiff_t stride = kMaxBlendingBlockSize * sizeof(Pixel);
+  uint8_t dest[sizeof(Pixel) * kMaxBlendingBlockSize * kMaxBlendingBlockSize];
+  absl::Duration elapsed_time;
+  for (int i = 0; i < num_runs; ++i) {
+    memcpy(dest, source1_,
+           sizeof(Pixel) * kMaxBlendingBlockSize * kMaxBlendingBlockSize);
+    const absl::Time start = absl::Now();
+    func_(dest, stride, width_, height_, source2_, stride);
+    elapsed_time += absl::Now() - start;
+  }
+  memcpy(source1_, dest,
+         sizeof(Pixel) * kMaxBlendingBlockSize * kMaxBlendingBlockSize);
+  test_utils::CheckMd5Digest(ToString(blending_direction_),
+                             absl::StrFormat("%dx%d", width_, height_).c_str(),
+                             digest, source1_, sizeof(source1_), elapsed_time);
+}
+
+const ObmcTestParam kObmcTestParam[] = {
+    ObmcTestParam(4, 2, kObmcDirectionVertical),
+    ObmcTestParam(4, 4, kObmcDirectionVertical),
+    ObmcTestParam(4, 8, kObmcDirectionVertical),
+    ObmcTestParam(8, 4, kObmcDirectionVertical),
+    ObmcTestParam(8, 8, kObmcDirectionVertical),
+    ObmcTestParam(8, 16, kObmcDirectionVertical),
+    ObmcTestParam(16, 8, kObmcDirectionVertical),
+    ObmcTestParam(16, 16, kObmcDirectionVertical),
+    ObmcTestParam(16, 32, kObmcDirectionVertical),
+    ObmcTestParam(32, 16, kObmcDirectionVertical),
+    ObmcTestParam(32, 32, kObmcDirectionVertical),
+    ObmcTestParam(2, 4,
kObmcDirectionHorizontal), + ObmcTestParam(4, 4, kObmcDirectionHorizontal), + ObmcTestParam(4, 8, kObmcDirectionHorizontal), + ObmcTestParam(8, 4, kObmcDirectionHorizontal), + ObmcTestParam(8, 8, kObmcDirectionHorizontal), + ObmcTestParam(8, 16, kObmcDirectionHorizontal), + ObmcTestParam(16, 8, kObmcDirectionHorizontal), + ObmcTestParam(16, 16, kObmcDirectionHorizontal), + ObmcTestParam(16, 32, kObmcDirectionHorizontal), + ObmcTestParam(32, 16, kObmcDirectionHorizontal), + ObmcTestParam(32, 32, kObmcDirectionHorizontal), +}; + +using ObmcBlendTest8bpp = ObmcBlendTest<8, uint8_t>; + +TEST_P(ObmcBlendTest8bpp, Blending) { + Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 0); + Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 1); + Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 128); + Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 255); + Test(GetDigest8bpp(GetDigestId()), /*use_fixed_values=*/false, -1); +} + +TEST_P(ObmcBlendTest8bpp, DISABLED_Speed) { + TestSpeed(GetDigestSpeed8bpp(GetDigestId()), + kNumSpeedTests / (GetParam().height * GetParam().width)); +} + +INSTANTIATE_TEST_SUITE_P(C, ObmcBlendTest8bpp, + testing::ValuesIn(kObmcTestParam)); + +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, ObmcBlendTest8bpp, + testing::ValuesIn(kObmcTestParam)); +#endif + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, ObmcBlendTest8bpp, + testing::ValuesIn(kObmcTestParam)); +#endif + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using ObmcBlendTest10bpp = ObmcBlendTest<10, uint16_t>; + +TEST_P(ObmcBlendTest10bpp, Blending) { + Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 0); + Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 1); + Test(/*digest=*/nullptr, /*use_fixed_values=*/true, 128); + Test(/*digest=*/nullptr, /*use_fixed_values=*/true, (1 << 10) - 1); + Test(GetDigest10bpp(GetDigestId()), /*use_fixed_values=*/false, -1); +} + +TEST_P(ObmcBlendTest10bpp, DISABLED_Speed) { + TestSpeed(GetDigestSpeed10bpp(GetDigestId()), + kNumSpeedTests / (GetParam().height * GetParam().width)); +} + +INSTANTIATE_TEST_SUITE_P(C, ObmcBlendTest10bpp, + testing::ValuesIn(kObmcTestParam)); +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, ObmcBlendTest10bpp, + testing::ValuesIn(kObmcTestParam)); +#endif +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, ObmcBlendTest10bpp, + testing::ValuesIn(kObmcTestParam)); +#endif +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/smooth_weights.inc b/src/dsp/smooth_weights.inc new file mode 100644 index 0000000..d4ee8a6 --- /dev/null +++ b/src/dsp/smooth_weights.inc @@ -0,0 +1,35 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Each row below contains weights used for a corresponding block size. Because +// they are adjacent powers of 2, the index of each row is the sum of the sizes +// of preceding rows, minus 4. 
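+// As a worked instance of that indexing: the dimension-16 row starts after
+// the dimension-4 and dimension-8 rows, at index 4 + 8 = 12 = 16 - 4, so a
+// row can be located as (table base + block_dimension - 4).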
+// The weights need to be declared as uint8_t or uint16_t, depending on the
+// bitdepth, so the values are held in a single canonical place.
+// clang-format off
+    // block dimension = 4
+    255, 149, 85, 64,
+    // block dimension = 8
+    255, 197, 146, 105, 73, 50, 37, 32,
+    // block dimension = 16
+    255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
+    // block dimension = 32
+    255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83,
+    74, 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
+    // block dimension = 64
+    255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
+    150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73,
+    69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16,
+    15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4
+// clang-format on
diff --git a/src/dsp/super_res.cc b/src/dsp/super_res.cc
new file mode 100644
index 0000000..570ba73
--- /dev/null
+++ b/src/dsp/super_res.cc
@@ -0,0 +1,110 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/super_res.h"
+
+#include <cassert>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+template <int bitdepth, typename Pixel>
+void SuperRes_C(const void* /*coefficients*/,
+                void* LIBGAV1_RESTRICT const source,
+                const ptrdiff_t source_stride, const int height,
+                const int downscaled_width, const int upscaled_width,
+                const int initial_subpixel_x, const int step,
+                void* LIBGAV1_RESTRICT const dest, ptrdiff_t dest_stride) {
+  assert(step <= 1 << kSuperResScaleBits);
+  auto* src = static_cast<Pixel*>(source) - DivideBy2(kSuperResFilterTaps);
+  auto* dst = static_cast<Pixel*>(dest);
+  int y = height;
+  do {
+    ExtendLine<Pixel>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+                      kSuperResHorizontalBorder, kSuperResHorizontalBorder);
+    // If (original) upscaled_width is <= 9, the downscaled_width may be
+    // upscaled_width - 1 (i.e. 8, 9), and become the same (i.e. 4) when
+    // subsampled via RightShiftWithRounding. This leads to an edge case where
+    // |step| == 1 << 14.
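+    // Illustrative numbers for the Q14 stepping below (example values, not
+    // taken from a real frame): upscaling a 4-pixel row to 8 pixels gives
+    // step = ((4 << 14) + 4) / 8 = 8192, i.e. each output pixel advances
+    // half a source pixel; |subpixel_x| >> 14 selects the source sample and
+    // the masked low bits select the filter phase.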
+    int subpixel_x = initial_subpixel_x;
+    int x = 0;
+    do {
+      int sum = 0;
+      const Pixel* const src_x = &src[subpixel_x >> kSuperResScaleBits];
+      const int src_x_subpixel =
+          (subpixel_x & kSuperResScaleMask) >> kSuperResExtraBits;
+      // The sign of each tap is: - + - + + - + -
+      sum -= src_x[0] * kUpscaleFilterUnsigned[src_x_subpixel][0];
+      sum += src_x[1] * kUpscaleFilterUnsigned[src_x_subpixel][1];
+      sum -= src_x[2] * kUpscaleFilterUnsigned[src_x_subpixel][2];
+      sum += src_x[3] * kUpscaleFilterUnsigned[src_x_subpixel][3];
+      sum += src_x[4] * kUpscaleFilterUnsigned[src_x_subpixel][4];
+      sum -= src_x[5] * kUpscaleFilterUnsigned[src_x_subpixel][5];
+      sum += src_x[6] * kUpscaleFilterUnsigned[src_x_subpixel][6];
+      sum -= src_x[7] * kUpscaleFilterUnsigned[src_x_subpixel][7];
+      dst[x] = Clip3(RightShiftWithRounding(sum, kFilterBits), 0,
+                     (1 << bitdepth) - 1);
+      subpixel_x += step;
+    } while (++x < upscaled_width);
+    src += source_stride;
+    dst += dest_stride;
+  } while (--y != 0);
+}
+
+void Init8bpp() {
+  Dsp* dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+  dsp->super_res_coefficients = nullptr;
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->super_res = SuperRes_C<8, uint8_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_SuperRes
+  dsp->super_res = SuperRes_C<8, uint8_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+  Dsp* dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+  dsp->super_res_coefficients = nullptr;
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->super_res = SuperRes_C<10, uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_SuperRes
+  dsp->super_res = SuperRes_C<10, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
+}  // namespace
+
+void SuperResInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/super_res.h b/src/dsp/super_res.h
new file mode 100644
index 0000000..2ca9d2b
--- /dev/null
+++ b/src/dsp/super_res.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_SUPER_RES_H_
+#define LIBGAV1_SRC_DSP_SUPER_RES_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/super_res_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
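+// For example, when the NEON header defines LIBGAV1_Dsp8bpp_SuperRes, the
+// #ifndef LIBGAV1_Dsp8bpp_SuperRes block in super_res.cc above is skipped
+// and dsp->super_res is left for SuperResInit_NEON() to set instead of the
+// C fallback.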
+// clang-format off +#include "src/dsp/x86/super_res_sse4.h" +// clang-format on + +// IWYU pragma: end_exports + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::super_res. This function is not thread-safe. +void SuperResInit_C(); + +} // namespace dsp +} // namespace libgav1 + +#endif // LIBGAV1_SRC_DSP_SUPER_RES_H_ diff --git a/src/dsp/super_res_test.cc b/src/dsp/super_res_test.cc new file mode 100644 index 0000000..a93fc31 --- /dev/null +++ b/src/dsp/super_res_test.cc @@ -0,0 +1,264 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/super_res.h" + +#include +#include +#include +#include +#include + +#include "absl/strings/match.h" +#include "absl/strings/numbers.h" +#include "absl/strings/str_format.h" +#include "absl/strings/str_split.h" +#include "absl/strings/string_view.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr int kNumSpeedTests = 5e5; + +const char* GetDigest8bpp(int id) { + static const char* const kDigestSuperRes[] = { + "52eb4eac1df0c51599d57696405b69d0", "ccb07cc8295fd1440ff2e3b9199ec4f9", + "baef34cca795b95f3d1fd81d609da679", "03f1579c2773c8ba9c867316a22b94a3"}; + return kDigestSuperRes[id]; +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +const char* GetDigest10bpp(int id) { + static const char* const kDigestSuperRes[] = { + "8fd78e05d944aeb11fac278b47ee60ba", "948eaecb70fa5614ce1c1c95e9942dc3", + "126cd7727e787e0625ec3f5ce97f8fa0", "85c806c41d40b841764bcb54f6d3a712"}; + return kDigestSuperRes[id]; +} +#endif + +struct SuperResTestParam { + SuperResTestParam(int downscaled_width, int upscaled_width) + : downscaled_width(downscaled_width), upscaled_width(upscaled_width) {} + int downscaled_width; + int upscaled_width; +}; + +template +class SuperResTest : public testing::TestWithParam, + public test_utils::MaxAlignedAllocable { + public: + SuperResTest() = default; + void SetUp() override { + test_utils::ResetDspTable(bitdepth); + SuperResInit_C(); + const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const std::vector split_test_name = + absl::StrSplit(test_info->name(), '/'); + ASSERT_TRUE(absl::SimpleAtoi(split_test_name[1], &test_id_)); + const absl::string_view test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + } else if (absl::StartsWith(test_case, "NEON/")) { + SuperResInit_NEON(); + } else if (absl::StartsWith(test_case, "SSE41/")) { + SuperResInit_SSE4_1(); + } else { + FAIL() << "Unrecognized architecture prefix in test case name: " + << test_case; + } + super_res_coefficients_ = 
dsp->super_res_coefficients; + func_ = dsp->super_res; + } + + void TestComputeSuperRes(int fixed_value, int num_runs); + + private: + static constexpr int kHeight = 127; + // The maximum width that must be allocated. + static constexpr int kUpscaledBufferWidth = 192; + // Allow room for the filter taps. + static constexpr int kStride = + ((kUpscaledBufferWidth + 2 * kSuperResHorizontalBorder + 15) & ~15); + const int kDownscaledWidth = GetParam().downscaled_width; + const int kUpscaledWidth = GetParam().upscaled_width; + int test_id_; + SuperResCoefficientsFunc super_res_coefficients_; + SuperResFunc func_; + Pixel source_buffer_[kHeight][kStride]; + alignas(kMaxAlignment) Pixel dest_buffer_[kHeight][kStride]; + alignas(kMaxAlignment) Coefficient + superres_coefficients_[kSuperResFilterTaps * kUpscaledBufferWidth]; +}; + +template +void SuperResTest::TestComputeSuperRes( + int fixed_value, int num_runs) { + if (func_ == nullptr) return; + const int superres_width = kDownscaledWidth << kSuperResScaleBits; + const int step = (superres_width + kUpscaledWidth / 2) / kUpscaledWidth; + const int error = step * kUpscaledWidth - superres_width; + const int initial_subpixel_x = + ((-((kUpscaledWidth - kDownscaledWidth) << (kSuperResScaleBits - 1)) + + DivideBy2(kUpscaledWidth)) / + kUpscaledWidth + + (1 << (kSuperResExtraBits - 1)) - error / 2) & + kSuperResScaleMask; + if (super_res_coefficients_ != nullptr) { + super_res_coefficients_(kUpscaledWidth, initial_subpixel_x, step, + superres_coefficients_); + } + memset(dest_buffer_, 0, sizeof(dest_buffer_)); + if (fixed_value != 0) { + SetBlock(kHeight, kStride, fixed_value, source_buffer_[0], kStride); + } else { + // Random values. + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + const int bitdepth_mask = (1 << bitdepth) - 1; + for (int y = 0; y < kHeight; ++y) { + for (int x = 0; x < kStride; ++x) { + source_buffer_[y][x] = rnd.Rand16() & bitdepth_mask; + } + } + } + // Offset starting point in the buffer to accommodate line extension. + Pixel* src_ptr = source_buffer_[0] + kSuperResHorizontalBorder; + + const absl::Time start = absl::Now(); + for (int i = 0; i < num_runs; ++i) { + func_(superres_coefficients_, src_ptr, kStride, kHeight, kDownscaledWidth, + kUpscaledWidth, initial_subpixel_x, step, dest_buffer_, kStride); + } + const absl::Duration elapsed_time = absl::Now() - start; + + if (fixed_value != 0) { + for (int y = 0; y < kHeight; ++y) { + for (int x = 0; x < kUpscaledWidth; ++x) { + EXPECT_TRUE(dest_buffer_[y][x] == fixed_value) + << "At location [" << y << ", " << x + << "]\nexpected: " << fixed_value + << "\nactual: " << dest_buffer_[y][x]; + } + } + } else if (num_runs == 1) { + // Random values. + if ((kUpscaledWidth & 15) != 0) { + // The SIMD functions overwrite up to 15 pixels in each row. Reset them. + for (int y = 0; y < kHeight; ++y) { + for (int x = kUpscaledWidth; x < Align(kUpscaledWidth, 16); ++x) { + dest_buffer_[y][x] = 0; + } + } + } + const char* expected_digest; + if (bitdepth == 8) { + expected_digest = GetDigest8bpp(test_id_); + } else { +#if LIBGAV1_MAX_BITDEPTH >= 10 + expected_digest = GetDigest10bpp(test_id_); +#endif + } + test_utils::CheckMd5Digest( + "SuperRes", + absl::StrFormat("width %d, step %d, start %d", kUpscaledWidth, step, + initial_subpixel_x) + .c_str(), + expected_digest, dest_buffer_, sizeof(dest_buffer_), elapsed_time); + } else { + // Speed test. 
+ printf("Mode SuperRes [width %d, step %d, start %d]: %d us\n", + kUpscaledWidth, step, initial_subpixel_x, + static_cast(absl::ToInt64Microseconds(elapsed_time))); + } +} + +using SuperResTest8bpp = SuperResTest<8, uint8_t, int8_t>; + +TEST_P(SuperResTest8bpp, FixedValues) { + TestComputeSuperRes(100, 1); + TestComputeSuperRes(255, 1); + TestComputeSuperRes(1, 1); +} + +TEST_P(SuperResTest8bpp, RandomValues) { TestComputeSuperRes(0, 1); } + +TEST_P(SuperResTest8bpp, DISABLED_Speed) { + TestComputeSuperRes(0, kNumSpeedTests); +} + +const SuperResTestParam kSuperResTestParams[] = { + SuperResTestParam(96, 192), + SuperResTestParam(171, 192), + SuperResTestParam(102, 128), + SuperResTestParam(61, 121), +}; + +INSTANTIATE_TEST_SUITE_P(C, SuperResTest8bpp, + testing::ValuesIn(kSuperResTestParams)); + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, SuperResTest8bpp, + testing::ValuesIn(kSuperResTestParams)); +#endif + +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, SuperResTest8bpp, + testing::ValuesIn(kSuperResTestParams)); +#endif + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using SuperResTest10bpp = SuperResTest<10, uint16_t, int16_t>; + +TEST_P(SuperResTest10bpp, FixedValues) { + TestComputeSuperRes(100, 1); + TestComputeSuperRes(511, 1); + TestComputeSuperRes(1, 1); +} + +TEST_P(SuperResTest10bpp, RandomValues) { TestComputeSuperRes(0, 1); } + +TEST_P(SuperResTest10bpp, DISABLED_Speed) { + TestComputeSuperRes(0, kNumSpeedTests); +} + +INSTANTIATE_TEST_SUITE_P(C, SuperResTest10bpp, + testing::ValuesIn(kSuperResTestParams)); + +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, SuperResTest10bpp, + testing::ValuesIn(kSuperResTestParams)); +#endif + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, SuperResTest10bpp, + testing::ValuesIn(kSuperResTestParams)); +#endif +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/warp.cc b/src/dsp/warp.cc new file mode 100644 index 0000000..dd467ea --- /dev/null +++ b/src/dsp/warp.cc @@ -0,0 +1,475 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/warp.h" + +#include +#include +#include +#include +#include +#include + +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/memory.h" + +namespace libgav1 { +namespace dsp { +namespace { + +// Number of extra bits of precision in warped filtering. +constexpr int kWarpedDiffPrecisionBits = 10; + +// Warp prediction output ranges from WarpTest.ShowRange. +// Bitdepth: 8 Input range: [ 0, 255] +// 8bpp intermediate offset: 16384. +// intermediate range: [ 4399, 61009] +// first pass output range: [ 550, 7626] +// 8bpp intermediate offset removal: 262144. 
+// intermediate range: [ -620566, 1072406] +// second pass output range: [ 0, 255] +// compound second pass output range: [ -4848, 8378] +// +// Bitdepth: 10 Input range: [ 0, 1023] +// intermediate range: [ -48081, 179025] +// first pass output range: [ -6010, 22378] +// intermediate range: [-2103516, 4198620] +// second pass output range: [ 0, 1023] +// compound second pass output range: [ 8142, 57378] +// +// Bitdepth: 12 Input range: [ 0, 4095] +// intermediate range: [ -192465, 716625] +// first pass output range: [ -6015, 22395] +// intermediate range: [-2105190, 4201830] +// second pass output range: [ 0, 4095] +// compound second pass output range: [ 8129, 57403] + +template +void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride, + const int source_width, const int source_height, + const int* LIBGAV1_RESTRICT const warp_params, + const int subsampling_x, const int subsampling_y, + const int block_start_x, const int block_start_y, + const int block_width, const int block_height, const int16_t alpha, + const int16_t beta, const int16_t gamma, const int16_t delta, + void* LIBGAV1_RESTRICT dest, ptrdiff_t dest_stride) { + assert(block_width >= 8 && block_height >= 8); + if (is_compound) { + assert(dest_stride == block_width); + } + constexpr int kRoundBitsHorizontal = (bitdepth == 12) + ? kInterRoundBitsHorizontal12bpp + : kInterRoundBitsHorizontal; + constexpr int kRoundBitsVertical = + is_compound ? kInterRoundBitsCompoundVertical + : (bitdepth == 12) ? kInterRoundBitsVertical12bpp + : kInterRoundBitsVertical; + + // Only used for 8bpp. Allows for keeping the first pass intermediates within + // uint16_t. With 10/12bpp the intermediate value will always require int32_t. + constexpr int first_pass_offset = (bitdepth == 8) ? 1 << 14 : 0; + constexpr int offset_removal = + (first_pass_offset >> kRoundBitsHorizontal) * 128; + + constexpr int kMaxPixel = (1 << bitdepth) - 1; + union { + // |intermediate_result| is the output of the horizontal filtering and + // rounding. The range is within int16_t. + int16_t intermediate_result[15][8]; // 15 rows, 8 columns. + // In the simple special cases where the samples in each row are all the + // same, store one sample per row in a column vector. + int16_t intermediate_result_column[15]; + }; + const auto* const src = static_cast(source); + source_stride /= sizeof(Pixel); + using DestType = + typename std::conditional::type; + auto* dst = static_cast(dest); + if (!is_compound) dest_stride /= sizeof(dst[0]); + + assert(block_width >= 8); + assert(block_height >= 8); + + // Warp process applies for each 8x8 block (or smaller). + for (int start_y = block_start_y; start_y < block_start_y + block_height; + start_y += 8) { + for (int start_x = block_start_x; start_x < block_start_x + block_width; + start_x += 8) { + const int src_x = (start_x + 4) << subsampling_x; + const int src_y = (start_y + 4) << subsampling_y; + const int dst_x = + src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0]; + const int dst_y = + src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1]; + const int x4 = dst_x >> subsampling_x; + const int y4 = dst_y >> subsampling_y; + const int ix4 = x4 >> kWarpedModelPrecisionBits; + const int iy4 = y4 >> kWarpedModelPrecisionBits; + + // A prediction block may fall outside the frame's boundaries. If a + // prediction block is calculated using only samples outside the frame's + // boundary, the filtering can be simplified. 
We can divide the plane + // into several regions and handle them differently. + // + // | | + // 1 | 3 | 1 + // | | + // -------+-----------+------- + // |***********| + // 2 |*****4*****| 2 + // |***********| + // -------+-----------+------- + // | | + // 1 | 3 | 1 + // | | + // + // At the center, region 4 represents the frame and is the general case. + // + // In regions 1 and 2, the prediction block is outside the frame's + // boundary horizontally. Therefore the horizontal filtering can be + // simplified. Furthermore, in the region 1 (at the four corners), the + // prediction is outside the frame's boundary both horizontally and + // vertically, so we get a constant prediction block. + // + // In region 3, the prediction block is outside the frame's boundary + // vertically. Unfortunately because we apply the horizontal filters + // first, by the time we apply the vertical filters, they no longer see + // simple inputs. So the only simplification is that all the rows are + // the same, but we still need to apply all the horizontal and vertical + // filters. + + // Check for two simple special cases, where the horizontal filter can + // be significantly simplified. + // + // In general, for each row, the horizontal filter is calculated as + // follows: + // for (int x = -4; x < 4; ++x) { + // const int offset = ...; + // int sum = first_pass_offset; + // for (int k = 0; k < 8; ++k) { + // const int column = Clip3(ix4 + x + k - 3, 0, source_width - 1); + // sum += kWarpedFilters[offset][k] * src_row[column]; + // } + // ... + // } + // The column index before clipping, ix4 + x + k - 3, varies in the range + // ix4 - 7 <= ix4 + x + k - 3 <= ix4 + 7. If ix4 - 7 >= source_width - 1 + // or ix4 + 7 <= 0, then all the column indexes are clipped to the same + // border index (source_width - 1 or 0, respectively). Then for each x, + // the inner for loop of the horizontal filter is reduced to multiplying + // the border pixel by the sum of the filter coefficients. + if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) { + // Regions 1 and 2. + // Points to the left or right border of the first row of |src|. + const Pixel* first_row_border = + (ix4 + 7 <= 0) ? src : src + source_width - 1; + // In general, for y in [-7, 8), the row number iy4 + y is clipped: + // const int row = Clip3(iy4 + y, 0, source_height - 1); + // In two special cases, iy4 + y is clipped to either 0 or + // source_height - 1 for all y. In the rest of the cases, iy4 + y is + // bounded and we can avoid clipping iy4 + y by relying on a reference + // frame's boundary extension on the top and bottom. + if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) { + // Region 1. + // Every sample used to calculate the prediction block has the same + // value. So the whole prediction block has the same value. + const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1; + const Pixel row_border_pixel = first_row_border[row * source_stride]; + DestType* dst_row = dst + start_x - block_start_x; + if (is_compound) { + int sum = row_border_pixel + << ((14 - kRoundBitsHorizontal) - kRoundBitsVertical); + sum += (bitdepth == 8) ? 0 : kCompoundOffset; + Memset(dst_row, sum, 8); + } else { + Memset(dst_row, row_border_pixel, 8); + } + const DestType* const first_dst_row = dst_row; + dst_row += dest_stride; + for (int y = 1; y < 8; ++y) { + memcpy(dst_row, first_dst_row, 8 * sizeof(*dst_row)); + dst_row += dest_stride; + } + // End of region 1. Continue the |start_x| for loop. + continue; + } + + // Region 2. + // Horizontal filter. 
+ // The input values in this region are generated by extending the border + // which makes them identical in the horizontal direction. This + // computation could be inlined in the vertical pass but most + // implementations will need a transpose of some sort. + // It is not necessary to use the offset values here because the + // horizontal pass is a simple shift and the vertical pass will always + // require using 32 bits. + for (int y = -7; y < 8; ++y) { + // We may over-read up to 13 pixels above the top source row, or up + // to 13 pixels below the bottom source row. This is proved below. + const int row = iy4 + y; + int sum = first_row_border[row * source_stride]; + sum <<= kFilterBits - kRoundBitsHorizontal; + intermediate_result_column[y + 7] = sum; + } + // Vertical filter. + DestType* dst_row = dst + start_x - block_start_x; + int sy4 = + (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); + for (int y = 0; y < 8; ++y) { + int sy = sy4 - MultiplyBy4(gamma); + for (int x = 0; x < 8; ++x) { + const int offset = + RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) + + kWarpedPixelPrecisionShifts; + assert(offset >= 0); + assert(offset < 3 * kWarpedPixelPrecisionShifts + 1); + int sum = 0; + for (int k = 0; k < 8; ++k) { + sum += + kWarpedFilters[offset][k] * intermediate_result_column[y + k]; + } + sum = RightShiftWithRounding(sum, kRoundBitsVertical); + if (is_compound) { + sum += (bitdepth == 8) ? 0 : kCompoundOffset; + dst_row[x] = static_cast(sum); + } else { + dst_row[x] = static_cast(Clip3(sum, 0, kMaxPixel)); + } + sy += gamma; + } + dst_row += dest_stride; + sy4 += delta; + } + // End of region 2. Continue the |start_x| for loop. + continue; + } + + // Regions 3 and 4. + // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0. + // It follows that -6 <= ix4 <= source_width + 5. This inequality is + // used below. + + // In general, for y in [-7, 8), the row number iy4 + y is clipped: + // const int row = Clip3(iy4 + y, 0, source_height - 1); + // In two special cases, iy4 + y is clipped to either 0 or + // source_height - 1 for all y. In the rest of the cases, iy4 + y is + // bounded and we can avoid clipping iy4 + y by relying on a reference + // frame's boundary extension on the top and bottom. + if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) { + // Region 3. + // Horizontal filter. + const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1; + const Pixel* const src_row = src + row * source_stride; + int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7; + for (int y = -7; y < 8; ++y) { + int sx = sx4 - MultiplyBy4(alpha); + for (int x = -4; x < 4; ++x) { + const int offset = + RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) + + kWarpedPixelPrecisionShifts; + // Since alpha and beta have been validated by SetupShear(), one + // can prove that 0 <= offset <= 3 * 2^6. + assert(offset >= 0); + assert(offset < 3 * kWarpedPixelPrecisionShifts + 1); + // For SIMD optimization: + // |first_pass_offset| guarantees the sum fits in uint16_t for 8bpp. + // For 10/12 bit, the range of sum requires 32 bits. + int sum = first_pass_offset; + for (int k = 0; k < 8; ++k) { + // We assume the source frame has left and right borders of at + // least 13 pixels that extend the frame boundary pixels. + // + // Since -4 <= x <= 3 and 0 <= k <= 7, using the inequality on + // ix4 above, we have + // -13 <= ix4 + x + k - 3 <= source_width + 12, + // or + // -13 <= column <= (source_width - 1) + 13. 
+ // Therefore we may over-read up to 13 pixels before the source + // row, or up to 13 pixels after the source row. + const int column = ix4 + x + k - 3; + sum += kWarpedFilters[offset][k] * src_row[column]; + } + intermediate_result[y + 7][x + 4] = + RightShiftWithRounding(sum, kRoundBitsHorizontal); + sx += alpha; + } + sx4 += beta; + } + } else { + // Region 4. + // Horizontal filter. + // At this point, we know iy4 - 7 < source_height - 1 and iy4 + 7 > 0. + // It follows that -6 <= iy4 <= source_height + 5. This inequality is + // used below. + int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7; + for (int y = -7; y < 8; ++y) { + // We assume the source frame has top and bottom borders of at least + // 13 pixels that extend the frame boundary pixels. + // + // Since -7 <= y <= 7, using the inequality on iy4 above, we have + // -13 <= iy4 + y <= source_height + 12, + // or + // -13 <= row <= (source_height - 1) + 13. + // Therefore we may over-read up to 13 pixels above the top source + // row, or up to 13 pixels below the bottom source row. + const int row = iy4 + y; + const Pixel* const src_row = src + row * source_stride; + int sx = sx4 - MultiplyBy4(alpha); + for (int x = -4; x < 4; ++x) { + const int offset = + RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) + + kWarpedPixelPrecisionShifts; + // Since alpha and beta have been validated by SetupShear(), one + // can prove that 0 <= offset <= 3 * 2^6. + assert(offset >= 0); + assert(offset < 3 * kWarpedPixelPrecisionShifts + 1); + // For SIMD optimization: + // |first_pass_offset| guarantees the sum fits in uint16_t for 8bpp. + // For 10/12 bit, the range of sum requires 32 bits. + int sum = first_pass_offset; + for (int k = 0; k < 8; ++k) { + // We assume the source frame has left and right borders of at + // least 13 pixels that extend the frame boundary pixels. + // + // Since -4 <= x <= 3 and 0 <= k <= 7, using the inequality on + // ix4 above, we have + // -13 <= ix4 + x + k - 3 <= source_width + 12, + // or + // -13 <= column <= (source_width - 1) + 13. + // Therefore we may over-read up to 13 pixels before the source + // row, or up to 13 pixels after the source row. + const int column = ix4 + x + k - 3; + sum += kWarpedFilters[offset][k] * src_row[column]; + } + intermediate_result[y + 7][x + 4] = + RightShiftWithRounding(sum, kRoundBitsHorizontal) - + offset_removal; + sx += alpha; + } + sx4 += beta; + } + } + + // Regions 3 and 4. + // Vertical filter. + DestType* dst_row = dst + start_x - block_start_x; + int sy4 = + (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); + // The spec says we should use the following loop condition: + // y < std::min(4, block_start_y + block_height - start_y - 4); + // We can prove that block_start_y + block_height - start_y >= 8, which + // implies std::min(4, block_start_y + block_height - start_y - 4) = 4. + // So the loop condition is simply y < 4. + // + // Proof: + // start_y < block_start_y + block_height + // => block_start_y + block_height - start_y > 0 + // => block_height - (start_y - block_start_y) > 0 + // + // Since block_height >= 8 and is a power of 2, it follows that + // block_height is a multiple of 8. start_y - block_start_y is also a + // multiple of 8. Therefore their difference is a multiple of 8. Since + // their difference is > 0, their difference must be >= 8. + // + // We then add an offset of 4 to y so that the loop starts with y = 0 + // and continues if y < 8. 
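+      // As a concrete instance of the proof above: for the smallest block,
+      // block_height = 8 and start_y = block_start_y, so the spec bound is
+      // std::min(4, 8 - 4) = 4, and with the offset of 4 the loop below
+      // simply runs y = 0..7.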
+      for (int y = 0; y < 8; ++y) {
+        int sy = sy4 - MultiplyBy4(gamma);
+        // The spec says we should use the following loop condition:
+        // x < std::min(4, block_start_x + block_width - start_x - 4);
+        // Similar to the above, we can prove that the loop condition can be
+        // simplified to x < 4.
+        //
+        // We then add an offset of 4 to x so that the loop starts with x = 0
+        // and continues if x < 8.
+        for (int x = 0; x < 8; ++x) {
+          const int offset =
+              RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+              kWarpedPixelPrecisionShifts;
+          // Since gamma and delta have been validated by SetupShear(), one can
+          // prove that 0 <= offset <= 3 * 2^6.
+          assert(offset >= 0);
+          assert(offset < 3 * kWarpedPixelPrecisionShifts + 1);
+          int sum = 0;
+          for (int k = 0; k < 8; ++k) {
+            sum += kWarpedFilters[offset][k] * intermediate_result[y + k][x];
+          }
+          sum -= offset_removal;
+          sum = RightShiftWithRounding(sum, kRoundBitsVertical);
+          if (is_compound) {
+            sum += (bitdepth == 8) ? 0 : kCompoundOffset;
+            dst_row[x] = static_cast<DestType>(sum);
+          } else {
+            dst_row[x] = static_cast<DestType>(Clip3(sum, 0, kMaxPixel));
+          }
+          sy += gamma;
+        }
+        dst_row += dest_stride;
+        sy4 += delta;
+      }
+    }
+    dst += 8 * dest_stride;
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->warp = Warp_C</*is_compound=*/false, 8, uint8_t>;
+  dsp->warp_compound = Warp_C</*is_compound=*/true, 8, uint8_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_Warp
+  dsp->warp = Warp_C</*is_compound=*/false, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WarpCompound
+  dsp->warp_compound = Warp_C</*is_compound=*/true, 8, uint8_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->warp = Warp_C</*is_compound=*/false, 10, uint16_t>;
+  dsp->warp_compound = Warp_C</*is_compound=*/true, 10, uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_Warp
+  dsp->warp = Warp_C</*is_compound=*/false, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WarpCompound
+  dsp->warp_compound = Warp_C</*is_compound=*/true, 10, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
+}  // namespace
+
+void WarpInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/warp.h b/src/dsp/warp.h
new file mode 100644
index 0000000..7367a9b
--- /dev/null
+++ b/src/dsp/warp.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_WARP_H_
+#define LIBGAV1_SRC_DSP_WARP_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/warp_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version +// before setting the base. +// clang-format off +#include "src/dsp/x86/warp_sse4.h" +// clang-format on + +// IWYU pragma: end_exports + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::warp. This function is not thread-safe. +void WarpInit_C(); + +} // namespace dsp +} // namespace libgav1 + +#endif // LIBGAV1_SRC_DSP_WARP_H_ diff --git a/src/dsp/warp_test.cc b/src/dsp/warp_test.cc new file mode 100644 index 0000000..4d13051 --- /dev/null +++ b/src/dsp/warp_test.cc @@ -0,0 +1,654 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/warp.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "absl/base/macros.h" +#include "absl/strings/match.h" +#include "absl/strings/str_format.h" +#include "absl/strings/string_view.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/post_filter.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "tests/block_utils.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr int kSourceBorderHorizontal = 16; +constexpr int kSourceBorderVertical = 13; + +constexpr int kMaxSourceBlockWidth = + kMaxSuperBlockSizeInPixels + kSourceBorderHorizontal * 2; +constexpr int kMaxSourceBlockHeight = + kMaxSuperBlockSizeInPixels + kSourceBorderVertical * 2; +constexpr int kMaxDestBlockWidth = + kMaxSuperBlockSizeInPixels + kConvolveBorderLeftTop * 2; +constexpr int kMaxDestBlockHeight = + kMaxSuperBlockSizeInPixels + kConvolveBorderLeftTop * 2; + +constexpr uint16_t kDivisorLookup[257] = { + 16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768, + 15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142, + 15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564, + 14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028, + 13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530, + 13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066, + 13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633, + 12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228, + 12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848, + 11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491, + 11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155, + 11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838, + 10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538, + 10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255, + 10230, 
10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986, + 9963, 9939, 9916, 9892, 9869, 9846, 9823, 9800, 9777, 9754, 9732, + 9709, 9687, 9664, 9642, 9620, 9598, 9576, 9554, 9533, 9511, 9489, + 9468, 9447, 9425, 9404, 9383, 9362, 9341, 9321, 9300, 9279, 9259, + 9239, 9218, 9198, 9178, 9158, 9138, 9118, 9098, 9079, 9059, 9039, + 9020, 9001, 8981, 8962, 8943, 8924, 8905, 8886, 8867, 8849, 8830, + 8812, 8793, 8775, 8756, 8738, 8720, 8702, 8684, 8666, 8648, 8630, + 8613, 8595, 8577, 8560, 8542, 8525, 8508, 8490, 8473, 8456, 8439, + 8422, 8405, 8389, 8372, 8355, 8339, 8322, 8306, 8289, 8273, 8257, + 8240, 8224, 8208, 8192}; + +template +const char* GetDigest8bpp(int id) { + static const char* const kDigest[] = { + "77ba358a0f5e19a8e69fa0a95712578e", "141b23d13a04e0b84d26d514de76d6b0", + "b0265858454b979852ffadae323f0fb7", "9cf38e3579265b656f1f2100ba15b0e9", + "ab51d05cc255ef8e37921182df1d89b1", "e3e96f90a4b07ca733e40f057dc01c41", + "4eee8c1a52a62a266db9b1c9338e124c", "901a87d8f88f6324dbc0960a6de861ac", + "da9cb6faf6adaeeae12b6784f39186c5", "14450ab05536cdb0d2f499716ccb559d", + "566b396cbf008bbb869b364fdc81860d", "681a872baf2de4e58d73ea9ab8643a72", + "7f17d290d513a7416761b3a01f10fd2f", + }; + static const char* const kCompoundDigest[] = { + "7e9339d265b7beac7bbe32fe7bb0fccb", "f747d663b427bb38a3ff36b0815a394c", + "858cf54d2253281a919fbdb48fe91c53", "4721dd97a212c6068bd488f400259afc", + "36878c7906492bc740112abdea77616f", "89deb68aa35764bbf3024b501a6bed50", + "8ac5b08f9b2afd38143c357646af0f82", "bf6e2a64835ea0c9d7467394253d0eb2", + "7b0a539acd2a27eff398dd084abad933", "61c8d81b397c1cf727ff8a9fabab90af", + "4d412349a25a832c1fb3fb29e3f0e2b3", "2c6dd2a9a4ede9fa00adb567ba646f30", + "b2a0ce68db3cadd207299f73112bed74", + }; + return is_compound ? kCompoundDigest[id] : kDigest[id]; +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +template +const char* GetDigest10bpp(int id) { + static const char* const kDigest[] = { + "1fef54f56a0bafccf7f8da1ac3b18b76", "8a65c72f171feafa2f393d31d6b7fe1b", + "808019346f2f1f45f8cf2e9fc9a49320", "c28e2f2c6c830a29bcc2452166cba521", + "f040674d6f54e8910d655f0d11fd8cdd", "473af9bb1c6023965c2284b716feef97", + "e4f6d7babd0813d5afb0f575ebfa8166", "58f96ef8a880963a213624bb0d06d47c", + "1ec0995fa4490628b679d03683233388", "9526fb102fde7dc1a7e160e65af6da33", + "f0457427d0c0e31d82ea4f612f7f86f1", "ddc82ae298cccebad493ba9de0f69fbd", + "5ed615091e2f62df26de7e91a985cb81", + }; + static const char* const kCompoundDigest[] = { + "8e6986ae143260e0b8b4887f15a141a1", "0a7f0db8316b8c3569f08834dd0c6f50", + "90705b2e7dbe083e8a1f70f29d6f257e", "e428a75bea77d769d21f3f7a1d2b0b38", + "a570b13d790c085c4ab50d71dd085d56", "e5d043c6cd6ff6dbab6e38a8877e93bd", + "12ea96991e46e3e9aa78ab812ffa0525", "84293a94a53f1cf814fa25e793c3fe27", + "b98a7502c84ac8437266f702dcc0a92e", "d8db5d52e9b0a5be0ad2d517d5bd16e9", + "f3be504bbb609ce4cc71c5539252638a", "fcde83b54e14e9de23460644f244b047", + "42eb66e752e9ef289b47053b5c73fdd6", + }; + return is_compound ? kCompoundDigest[id] : kDigest[id]; +} +#endif + +int RandomWarpedParam(int seed_offset, int bits) { + libvpx_test::ACMRandom rnd(seed_offset + + libvpx_test::ACMRandom::DeterministicSeed()); + // 1 in 8 chance of generating zero (arbitrary). + const bool zero = (rnd.Rand16() & 7) == 0; + if (zero) return 0; + // Generate uniform values in the range [-(1 << bits), 1] U [1, 1 << + // bits]. + const int mask = (1 << bits) - 1; + const int value = 1 + (rnd.RandRange(1u << 31) & mask); + const bool sign = (rnd.Rand16() & 1) != 0; + return sign ? 
value : -value; +} + +// This function is a copy from warp_prediction.cc. +template +void GenerateApproximateDivisor(T value, int16_t* division_factor, + int16_t* division_shift) { + const int n = FloorLog2(std::abs(value)); + const T e = std::abs(value) - (static_cast(1) << n); + const int entry = (n > kDivisorLookupBits) + ? RightShiftWithRounding(e, n - kDivisorLookupBits) + : static_cast(e << (kDivisorLookupBits - n)); + *division_shift = n + kDivisorLookupPrecisionBits; + *division_factor = + (value < 0) ? -kDivisorLookup[entry] : kDivisorLookup[entry]; +} + +// This function is a copy from warp_prediction.cc. +int16_t GetShearParameter(int value) { + return static_cast( + LeftShift(RightShiftWithRoundingSigned(value, kWarpParamRoundingBits), + kWarpParamRoundingBits)); +} + +// This function is a copy from warp_prediction.cc. +// This function is used here to help generate valid warp parameters. +bool SetupShear(const int* params, int16_t* alpha, int16_t* beta, + int16_t* gamma, int16_t* delta) { + int16_t division_shift; + int16_t division_factor; + GenerateApproximateDivisor(params[2], &division_factor, + &division_shift); + const int alpha0 = + Clip3(params[2] - (1 << kWarpedModelPrecisionBits), INT16_MIN, INT16_MAX); + const int beta0 = Clip3(params[3], INT16_MIN, INT16_MAX); + const int64_t v = LeftShift(params[4], kWarpedModelPrecisionBits); + const int gamma0 = + Clip3(RightShiftWithRoundingSigned(v * division_factor, division_shift), + INT16_MIN, INT16_MAX); + const int64_t w = static_cast(params[3]) * params[4]; + const int delta0 = Clip3( + params[5] - + RightShiftWithRoundingSigned(w * division_factor, division_shift) - + (1 << kWarpedModelPrecisionBits), + INT16_MIN, INT16_MAX); + + *alpha = GetShearParameter(alpha0); + *beta = GetShearParameter(beta0); + *gamma = GetShearParameter(gamma0); + *delta = GetShearParameter(delta0); + if ((4 * std::abs(*alpha) + 7 * std::abs(*beta) >= + (1 << kWarpedModelPrecisionBits)) || + (4 * std::abs(*gamma) + 4 * std::abs(*delta) >= + (1 << kWarpedModelPrecisionBits))) { + return false; // NOLINT (easier condition to understand). 
+  }
+
+  return true;
+}
+
+void GenerateWarpedModel(int* params, int16_t* alpha, int16_t* beta,
+                         int16_t* gamma, int16_t* delta, int seed) {
+  do {
+    params[0] = RandomWarpedParam(seed, kWarpedModelPrecisionBits + 6);
+    params[1] = RandomWarpedParam(seed, kWarpedModelPrecisionBits + 6);
+    params[2] = RandomWarpedParam(seed, kWarpedModelPrecisionBits - 3) +
+                (1 << kWarpedModelPrecisionBits);
+    params[3] = RandomWarpedParam(seed, kWarpedModelPrecisionBits - 3);
+    params[4] = RandomWarpedParam(seed, kWarpedModelPrecisionBits - 3);
+    params[5] = RandomWarpedParam(seed, kWarpedModelPrecisionBits - 3) +
+                (1 << kWarpedModelPrecisionBits);
+    ++seed;
+  } while (params[2] == 0 || !SetupShear(params, alpha, beta, gamma, delta));
+}
+
+struct WarpTestParam {
+  WarpTestParam(int width, int height) : width(width), height(height) {}
+  int width;
+  int height;
+};
+
+template <int bitdepth, typename Pixel, bool is_compound>
+class WarpTest : public testing::TestWithParam<WarpTestParam> {
+ public:
+  WarpTest() = default;
+  ~WarpTest() override = default;
+
+  void SetUp() override {
+    test_utils::ResetDspTable(bitdepth);
+    WarpInit_C();
+    const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+    ASSERT_NE(dsp, nullptr);
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const absl::string_view test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+      WarpInit_NEON();
+    } else if (absl::StartsWith(test_case, "SSE41/")) {
+      WarpInit_SSE4_1();
+    } else {
+      FAIL() << "Unrecognized architecture prefix in test case name: "
+             << test_case;
+    }
+    func_ = is_compound ? dsp->warp_compound : dsp->warp;
+  }
+
+ protected:
+  using DestType =
+      typename std::conditional<is_compound, uint16_t, Pixel>::type;
+
+  void SetInputData(bool use_fixed_values, int value);
+  void Test(bool use_fixed_values, int value, int num_runs = 1);
+  void TestFixedValues();
+  void TestRandomValues();
+  void TestSpeed();
+
+  const WarpTestParam param_ = GetParam();
+
+ private:
+  int warp_params_[8];
+  dsp::WarpFunc func_;
+  // Warp filters are 7-tap, which needs 3 pixels (kConvolveBorderLeftTop)
+  // padding. Destination buffer indices are based on subsampling values
+  // (x + y): 0: (4:4:4), 1: (4:2:2), 2: (4:2:0).
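+  // For example, the 4:2:0 pass (subsampling_x == subsampling_y == 1)
+  // writes to dest_[2], so each subsampling combination is verified against
+  // its own buffer.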
+  Pixel source_[kMaxSourceBlockHeight * kMaxSourceBlockWidth] = {};
+  DestType dest_[3][kMaxDestBlockHeight * kMaxDestBlockWidth] = {};
+};
+
+template <int bitdepth, typename Pixel, bool is_compound>
+void WarpTest<bitdepth, Pixel, is_compound>::SetInputData(
+    bool use_fixed_values, int value) {
+  if (use_fixed_values) {
+    for (int y = 0; y < param_.height; ++y) {
+      const int row = kSourceBorderVertical + y;
+      Memset(source_ + row * kMaxSourceBlockWidth + kSourceBorderHorizontal,
+             value, param_.width);
+    }
+  } else {
+    const int mask = (1 << bitdepth) - 1;
+    libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+    for (int y = 0; y < param_.height; ++y) {
+      const int row = kSourceBorderVertical + y;
+      for (int x = 0; x < param_.width; ++x) {
+        const int column = kSourceBorderHorizontal + x;
+        source_[row * kMaxSourceBlockWidth + column] = rnd.Rand16() & mask;
+      }
+    }
+  }
+  PostFilter::ExtendFrame<Pixel>(
+      &source_[kSourceBorderVertical * kMaxSourceBlockWidth +
+               kSourceBorderHorizontal],
+      param_.width, param_.height, kMaxSourceBlockWidth,
+      kSourceBorderHorizontal, kSourceBorderHorizontal, kSourceBorderVertical,
+      kSourceBorderVertical);
+}
+
+template <int bitdepth, typename Pixel, bool is_compound>
+void WarpTest<bitdepth, Pixel, is_compound>::Test(bool use_fixed_values,
+                                                  int value,
+                                                  int num_runs /*= 1*/) {
+  if (func_ == nullptr) return;
+  SetInputData(use_fixed_values, value);
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+  const int source_offset =
+      kSourceBorderVertical * kMaxSourceBlockWidth + kSourceBorderHorizontal;
+  const int dest_offset =
+      kConvolveBorderLeftTop * kMaxDestBlockWidth + kConvolveBorderLeftTop;
+  const Pixel* const src = source_ + source_offset;
+  const ptrdiff_t src_stride = kMaxSourceBlockWidth * sizeof(Pixel);
+  const ptrdiff_t dst_stride =
+      is_compound ? kMaxDestBlockWidth : kMaxDestBlockWidth * sizeof(Pixel);
+
+  absl::Duration elapsed_time;
+  for (int subsampling_x = 0; subsampling_x <= 1; ++subsampling_x) {
+    for (int subsampling_y = 0; subsampling_y <= 1; ++subsampling_y) {
+      if (subsampling_x == 0 && subsampling_y == 1) {
+        // When both are 0: 4:4:4
+        // When both are 1: 4:2:0
+        // When only |subsampling_x| is 1: 4:2:2
+        // Having only |subsampling_y| == 1 is unsupported.
+        continue;
+      }
+      int params[8];
+      int16_t alpha;
+      int16_t beta;
+      int16_t gamma;
+      int16_t delta;
+      GenerateWarpedModel(params, &alpha, &beta, &gamma, &delta, rnd.Rand8());
+
+      const int dest_id = subsampling_x + subsampling_y;
+      DestType* const dst = dest_[dest_id] + dest_offset;
+      const absl::Time start = absl::Now();
+      for (int n = 0; n < num_runs; ++n) {
+        func_(src, src_stride, param_.width, param_.height, params,
+              subsampling_x, subsampling_y, 0, 0, param_.width, param_.height,
+              alpha, beta, gamma, delta, dst, dst_stride);
+      }
+      elapsed_time += absl::Now() - start;
+    }
+  }
+
+  if (use_fixed_values) {
+    // For fixed values, input and output are identical.
+    for (size_t i = 0; i < ABSL_ARRAYSIZE(dest_); ++i) {
+      // |is_compound| holds a few more bits of precision and an offset value.
+      Pixel compensated_dest[kMaxDestBlockWidth * kMaxDestBlockHeight];
+      const int compound_offset = (bitdepth == 8) ? 0 : kCompoundOffset;
+      if (is_compound) {
+        for (int y = 0; y < param_.height; ++y) {
+          for (int x = 0; x < param_.width; ++x) {
+            const int compound_value =
+                dest_[i][dest_offset + y * kMaxDestBlockWidth + x];
+            const int remove_offset = compound_value - compound_offset;
+            const int full_shift =
+                remove_offset >>
+                (kInterRoundBitsVertical - kInterRoundBitsCompoundVertical);
+            compensated_dest[y * kMaxDestBlockWidth + x] =
+                Clip3(full_shift, 0, (1 << bitdepth) - 1);
+          }
+        }
+      }
+      Pixel* pixel_dest =
+          is_compound ? compensated_dest
+                      : reinterpret_cast<Pixel*>(dest_[i] + dest_offset);
+      const bool success = test_utils::CompareBlocks(
+          src, pixel_dest, param_.width, param_.height, kMaxSourceBlockWidth,
+          kMaxDestBlockWidth, false);
+      EXPECT_TRUE(success) << "subsampling_x + subsampling_y: " << i;
+    }
+  } else {
+    // (width, height):
+    // (8, 8), id = 0. (8, 16), id = 1. (16, 8), id = 2.
+    // (16, 16), id = 3. (16, 32), id = 4. (32, 16), id = 5.
+    // ...
+    // (128, 128), id = 12.
+    int id;
+    if (param_.width == param_.height) {
+      id = 3 * static_cast<int>(FloorLog2(param_.width) - 3);
+    } else if (param_.width < param_.height) {
+      id = 1 + 3 * static_cast<int>(FloorLog2(param_.width) - 3);
+    } else {
+      id = 2 + 3 * static_cast<int>(FloorLog2(param_.height) - 3);
+    }
+
+    const char* expected_digest;
+    if (bitdepth == 8) {
+      expected_digest = GetDigest8bpp<is_compound>(id);
+    } else {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      expected_digest = GetDigest10bpp<is_compound>(id);
+#endif
+    }
+    test_utils::CheckMd5Digest(
+        "Warp", absl::StrFormat("%dx%d", param_.width, param_.height).c_str(),
+        expected_digest, dest_, sizeof(dest_), elapsed_time);
+  }
+}
+
+template <int bitdepth, typename Pixel, bool is_compound>
+void WarpTest<bitdepth, Pixel, is_compound>::TestFixedValues() {
+  Test(true, 0);
+  Test(true, 1);
+  Test(true, 128);
+  Test(true, (1 << bitdepth) - 1);
+}
+
+template <int bitdepth, typename Pixel, bool is_compound>
+void WarpTest<bitdepth, Pixel, is_compound>::TestRandomValues() {
+  Test(false, 0);
+}
+
+template <int bitdepth, typename Pixel, bool is_compound>
+void WarpTest<bitdepth, Pixel, is_compound>::TestSpeed() {
+  const int num_runs =
+      static_cast<int>(1.0e7 / (param_.width * param_.height));
+  Test(false, 0, num_runs);
+}
+
+void ApplyFilterToSignedInput(const int min_input, const int max_input,
+                              const int8_t filter[kSubPixelTaps],
+                              int* min_output, int* max_output) {
+  int min = 0, max = 0;
+  for (int i = 0; i < kSubPixelTaps; ++i) {
+    const int tap = filter[i];
+    if (tap > 0) {
+      max += max_input * tap;
+      min += min_input * tap;
+    } else {
+      min += max_input * tap;
+      max += min_input * tap;
+    }
+  }
+  *min_output = min;
+  *max_output = max;
+}
+
+void ApplyFilterToUnsignedInput(const int max_input,
+                                const int8_t filter[kSubPixelTaps],
+                                int* min_output, int* max_output) {
+  ApplyFilterToSignedInput(0, max_input, filter, min_output, max_output);
+}
+
+// Validate the maximum ranges for different parts of the Warp process.
+template <int bitdepth>
+void ShowRange() {
+  constexpr int horizontal_bits = (bitdepth == kBitdepth12)
+                                      ? kInterRoundBitsHorizontal12bpp
+                                      : kInterRoundBitsHorizontal;
+  constexpr int vertical_bits = (bitdepth == kBitdepth12)
+                                    ? kInterRoundBitsVertical12bpp
+                                    : kInterRoundBitsVertical;
+  constexpr int compound_vertical_bits = kInterRoundBitsCompoundVertical;
+
+  constexpr int compound_offset = (bitdepth == 8) ? 0 : kCompoundOffset;
+
+  constexpr int max_input = (1 << bitdepth) - 1;
+
+  const int8_t* worst_warp_filter = kWarpedFilters8[93];
+
+  // First pass.
+  printf("Bitdepth: %2d Input range: [%8d, %8d]\n", bitdepth, 0, max_input);
+
+  int min = 0, max = 0;
+  ApplyFilterToUnsignedInput(max_input, worst_warp_filter, &min, &max);
+
+  int first_pass_offset;
+  if (bitdepth == 8) {
+    // Derive an offset for 8 bit.
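+    // A worked example (values follow from the coefficient-sum test below):
+    // the 8bpp first pass range is [255 * -47, 255 * 175] = [-11985, 44625],
+    // so this loop stops at first_pass_offset = 16384, the first power of
+    // two whose negation is <= -11985, shifting the range to [4399, 61009],
+    // which fits in uint16_t.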
+    for (first_pass_offset = 1; -first_pass_offset > min;
+         first_pass_offset <<= 1) {
+    }
+    printf("  8bpp intermediate offset: %d.\n", first_pass_offset);
+    min += first_pass_offset;
+    max += first_pass_offset;
+    assert(min > 0);
+    assert(max < UINT16_MAX);
+  } else {
+    // 10bpp and 12bpp require int32_t for the intermediate values. Adding an
+    // offset is not required.
+    assert(min > INT32_MIN);
+    assert(max > INT16_MAX && max < INT32_MAX);
+  }
+
+  printf("  intermediate range: [%8d, %8d]\n", min, max);
+
+  const int first_pass_min = RightShiftWithRounding(min, horizontal_bits);
+  const int first_pass_max = RightShiftWithRounding(max, horizontal_bits);
+
+  printf("  first pass output range: [%8d, %8d]\n", first_pass_min,
+         first_pass_max);
+
+  // Second pass.
+  if (bitdepth == 8) {
+    ApplyFilterToUnsignedInput(first_pass_max, worst_warp_filter, &min, &max);
+  } else {
+    ApplyFilterToSignedInput(first_pass_min, first_pass_max, worst_warp_filter,
+                             &min, &max);
+  }
+
+  if (bitdepth == 8) {
+    // Remove the offset that was applied in the first pass since we must use
+    // int32_t for this phase anyway. 128 is the sum of the filter taps.
+    const int offset_removal = (first_pass_offset >> horizontal_bits) * 128;
+    printf("  8bpp intermediate offset removal: %d.\n", offset_removal);
+    max -= offset_removal;
+    min -= offset_removal;
+    assert(min < INT16_MIN && min > INT32_MIN);
+    assert(max > INT16_MAX && max < INT32_MAX);
+  } else {
+    // 10bpp and 12bpp require int32_t for the intermediate values. Adding an
+    // offset is not required.
+    assert(min > INT32_MIN);
+    assert(max > INT16_MAX && max < INT32_MAX);
+  }
+
+  printf("  intermediate range: [%8d, %8d]\n", min, max);
+
+  // Second pass non-compound output is clipped to Pixel values.
+  const int second_pass_min =
+      Clip3(RightShiftWithRounding(min, vertical_bits), 0, max_input);
+  const int second_pass_max =
+      Clip3(RightShiftWithRounding(max, vertical_bits), 0, max_input);
+  printf("  second pass output range: [%8d, %8d]\n", second_pass_min,
+         second_pass_max);
+
+  // Output is Pixel so matches Pixel values.
+  assert(second_pass_min == 0);
+  assert(second_pass_max == max_input);
+
+  const int compound_second_pass_min =
+      RightShiftWithRounding(min, compound_vertical_bits) + compound_offset;
+  const int compound_second_pass_max =
+      RightShiftWithRounding(max, compound_vertical_bits) + compound_offset;
+
+  printf("  compound second pass output range: [%8d, %8d]\n",
+         compound_second_pass_min, compound_second_pass_max);
+
+  if (bitdepth == 8) {
+    // 8bpp output is int16_t without an offset.
+    assert(compound_second_pass_min > INT16_MIN);
+    assert(compound_second_pass_max < INT16_MAX);
+  } else {
+    // 10bpp and 12bpp use the offset to fit inside uint16_t.
+    assert(compound_second_pass_min > 0);
+    assert(compound_second_pass_max < UINT16_MAX);
+  }
+
+  printf("\n");
+}
+
+TEST(WarpTest, ShowRange) {
+  ShowRange<kBitdepth8>();
+  ShowRange<kBitdepth10>();
+  ShowRange<kBitdepth12>();
+}
+
+using WarpTest8bpp = WarpTest<8, uint8_t, false>;
+// TODO(jzern): Coverage could be added for kInterRoundBitsCompoundVertical
+// via WarpCompoundTest.
+// using WarpCompoundTest8bpp = WarpTest<8, uint8_t, true>;
+
+// Verifies the sum of the warped filter coefficients is 128 for every filter.
+//
+// Verifies the properties used in the calculation of ranges of variables in
+// the block warp process:
+// * The maximum sum of the positive warped filter coefficients is 175.
+// * The minimum (i.e., most negative) sum of the negative warped filter
+//   coefficients is -47.
+//
+// NOTE: This test is independent of the bitdepth and the implementation of
+// the block warp function, so it just needs to be a test in the WarpTest8bpp
+// class and does not need to be defined with TEST_P.
+TEST(WarpTest8bpp, WarpedFilterCoefficientSums) {
+  int max_positive_sum = 0;
+  int min_negative_sum = 0;
+  for (const auto& filter : kWarpedFilters) {
+    int sum = 0;
+    int positive_sum = 0;
+    int negative_sum = 0;
+    for (const auto coefficient : filter) {
+      sum += coefficient;
+      if (coefficient > 0) {
+        positive_sum += coefficient;
+      } else {
+        negative_sum += coefficient;
+      }
+    }
+    EXPECT_EQ(sum, 128);
+    max_positive_sum = std::max(positive_sum, max_positive_sum);
+    min_negative_sum = std::min(negative_sum, min_negative_sum);
+  }
+  EXPECT_EQ(max_positive_sum, 175);
+  EXPECT_EQ(min_negative_sum, -47);
+}
+
+TEST_P(WarpTest8bpp, FixedValues) { TestFixedValues(); }
+
+TEST_P(WarpTest8bpp, RandomValues) { TestRandomValues(); }
+
+TEST_P(WarpTest8bpp, DISABLED_Speed) { TestSpeed(); }
+
+const WarpTestParam warp_test_param[] = {
+    WarpTestParam(8, 8),    WarpTestParam(8, 16),   WarpTestParam(16, 8),
+    WarpTestParam(16, 16),  WarpTestParam(16, 32),  WarpTestParam(32, 16),
+    WarpTestParam(32, 32),  WarpTestParam(32, 64),  WarpTestParam(64, 32),
+    WarpTestParam(64, 64),  WarpTestParam(64, 128), WarpTestParam(128, 64),
+    WarpTestParam(128, 128),
+};
+
+INSTANTIATE_TEST_SUITE_P(C, WarpTest8bpp, testing::ValuesIn(warp_test_param));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, WarpTest8bpp,
+                         testing::ValuesIn(warp_test_param));
+#endif
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, WarpTest8bpp,
+                         testing::ValuesIn(warp_test_param));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using WarpTest10bpp = WarpTest<10, uint16_t, false>;
+// TODO(jzern): Coverage could be added for kInterRoundBitsCompoundVertical
+// via WarpCompoundTest.
+// using WarpCompoundTest10bpp = WarpTest<10, uint16_t, true>;
+
+TEST_P(WarpTest10bpp, FixedValues) { TestFixedValues(); }
+
+TEST_P(WarpTest10bpp, RandomValues) { TestRandomValues(); }
+
+TEST_P(WarpTest10bpp, DISABLED_Speed) { TestSpeed(); }
+
+INSTANTIATE_TEST_SUITE_P(C, WarpTest10bpp, testing::ValuesIn(warp_test_param));
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, WarpTest10bpp,
+                         testing::ValuesIn(warp_test_param));
+#endif
+#endif
+
+std::ostream& operator<<(std::ostream& os, const WarpTestParam& warp_param) {
+  return os << "BlockSize" << warp_param.width << "x" << warp_param.height;
+}
+
+}  // namespace
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/weight_mask.cc b/src/dsp/weight_mask.cc
new file mode 100644
index 0000000..41f4c70
--- /dev/null
+++ b/src/dsp/weight_mask.cc
@@ -0,0 +1,228 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
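+
+// Worked example of the mask computation in WeightMask_C below (8bpp, a
+// hypothetical predictor difference): |p0 - p1| = 512 gives
+// RightShiftWithRounding(512, 4) = 32, DivideBy16(32) = 2, and a mask value
+// of std::min(2 + 38, 64) = 40; the inverse mask would be 64 - 40 = 24.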
+
+#include "src/dsp/weight_mask.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <type_traits>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+template <int width, int height, int bitdepth, bool mask_is_inverse>
+void WeightMask_C(const void* LIBGAV1_RESTRICT prediction_0,
+                  const void* LIBGAV1_RESTRICT prediction_1,
+                  uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) {
+  using PredType =
+      typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
+  const auto* pred_0 = static_cast<const PredType*>(prediction_0);
+  const auto* pred_1 = static_cast<const PredType*>(prediction_1);
+  static_assert(width >= 8, "");
+  static_assert(height >= 8, "");
+  constexpr int rounding_bits = bitdepth - 8 + ((bitdepth == 12) ? 2 : 4);
+  for (int y = 0; y < height; ++y) {
+    for (int x = 0; x < width; ++x) {
+      const int difference = RightShiftWithRounding(
+          std::abs(pred_0[x] - pred_1[x]), rounding_bits);
+      const auto mask_value =
+          static_cast<uint8_t>(std::min(DivideBy16(difference) + 38, 64));
+      mask[x] = mask_is_inverse ? 64 - mask_value : mask_value;
+    }
+    pred_0 += width;
+    pred_1 += width;
+    mask += mask_stride;
+  }
+}
+
+#define INIT_WEIGHT_MASK(width, height, bitdepth, w_index, h_index) \
+  dsp->weight_mask[w_index][h_index][0] =                           \
+      WeightMask_C<width, height, bitdepth, 0>;                     \
+  dsp->weight_mask[w_index][h_index][1] =                           \
+      WeightMask_C<width, height, bitdepth, 1>
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  INIT_WEIGHT_MASK(8, 8, 8, 0, 0);
+  INIT_WEIGHT_MASK(8, 16, 8, 0, 1);
+  INIT_WEIGHT_MASK(8, 32, 8, 0, 2);
+  INIT_WEIGHT_MASK(16, 8, 8, 1, 0);
+  INIT_WEIGHT_MASK(16, 16, 8, 1, 1);
+  INIT_WEIGHT_MASK(16, 32, 8, 1, 2);
+  INIT_WEIGHT_MASK(16, 64, 8, 1, 3);
+  INIT_WEIGHT_MASK(32, 8, 8, 2, 0);
+  INIT_WEIGHT_MASK(32, 16, 8, 2, 1);
+  INIT_WEIGHT_MASK(32, 32, 8, 2, 2);
+  INIT_WEIGHT_MASK(32, 64, 8, 2, 3);
+  INIT_WEIGHT_MASK(64, 16, 8, 3, 1);
+  INIT_WEIGHT_MASK(64, 32, 8, 3, 2);
+  INIT_WEIGHT_MASK(64, 64, 8, 3, 3);
+  INIT_WEIGHT_MASK(64, 128, 8, 3, 4);
+  INIT_WEIGHT_MASK(128, 64, 8, 4, 3);
+  INIT_WEIGHT_MASK(128, 128, 8, 4, 4);
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x8
+  INIT_WEIGHT_MASK(8, 8, 8, 0, 0);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x16
+  INIT_WEIGHT_MASK(8, 16, 8, 0, 1);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x32
+  INIT_WEIGHT_MASK(8, 32, 8, 0, 2);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x8
+  INIT_WEIGHT_MASK(16, 8, 8, 1, 0);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x16
+  INIT_WEIGHT_MASK(16, 16, 8, 1, 1);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x32
+  INIT_WEIGHT_MASK(16, 32, 8, 1, 2);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x64
+  INIT_WEIGHT_MASK(16, 64, 8, 1, 3);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x8
+  INIT_WEIGHT_MASK(32, 8, 8, 2, 0);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x16
+  INIT_WEIGHT_MASK(32, 16, 8, 2, 1);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x32
+  INIT_WEIGHT_MASK(32, 32, 8, 2, 2);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x64
+  INIT_WEIGHT_MASK(32, 64, 8, 2, 3);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x16
+  INIT_WEIGHT_MASK(64, 16, 8, 3, 1);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x32
+  INIT_WEIGHT_MASK(64, 32, 8, 3, 2);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x64
+  INIT_WEIGHT_MASK(64, 64, 8, 3, 3);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x128
+  INIT_WEIGHT_MASK(64, 128, 8, 3, 4);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_128x64
+  INIT_WEIGHT_MASK(128, 64, 8, 4, 3);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_128x128
+  INIT_WEIGHT_MASK(128, 128, 8, 4, 4);
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  INIT_WEIGHT_MASK(8, 8, 10, 0, 0);
+  INIT_WEIGHT_MASK(8, 16, 10, 0, 1);
+  INIT_WEIGHT_MASK(8, 32, 10, 0, 2);
+  INIT_WEIGHT_MASK(16, 8, 10, 1, 0);
+  INIT_WEIGHT_MASK(16, 16, 10, 1, 1);
+  INIT_WEIGHT_MASK(16, 32, 10, 1, 2);
+  INIT_WEIGHT_MASK(16, 64, 10, 1, 3);
+  INIT_WEIGHT_MASK(32, 8, 10, 2, 0);
+  INIT_WEIGHT_MASK(32, 16, 10, 2, 1);
+  INIT_WEIGHT_MASK(32, 32, 10, 2, 2);
+  INIT_WEIGHT_MASK(32, 64, 10, 2, 3);
+  INIT_WEIGHT_MASK(64, 16, 10, 3, 1);
+  INIT_WEIGHT_MASK(64, 32, 10, 3, 2);
+  INIT_WEIGHT_MASK(64, 64, 10, 3, 3);
+  INIT_WEIGHT_MASK(64, 128, 10, 3, 4);
+  INIT_WEIGHT_MASK(128, 64, 10, 4, 3);
+  INIT_WEIGHT_MASK(128, 128, 10, 4, 4);
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x8
+  INIT_WEIGHT_MASK(8, 8, 10, 0, 0);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x16
+  INIT_WEIGHT_MASK(8, 16, 10, 0, 1);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x32
+  INIT_WEIGHT_MASK(8, 32, 10, 0, 2);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x8
+  INIT_WEIGHT_MASK(16, 8, 10, 1, 0);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x16
+  INIT_WEIGHT_MASK(16, 16, 10, 1, 1);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x32
+  INIT_WEIGHT_MASK(16, 32, 10, 1, 2);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x64
+  INIT_WEIGHT_MASK(16, 64, 10, 1, 3);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x8
+  INIT_WEIGHT_MASK(32, 8, 10, 2, 0);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x16
+  INIT_WEIGHT_MASK(32, 16, 10, 2, 1);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x32
+  INIT_WEIGHT_MASK(32, 32, 10, 2, 2);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x64
+  INIT_WEIGHT_MASK(32, 64, 10, 2, 3);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x16
+  INIT_WEIGHT_MASK(64, 16, 10, 3, 1);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x32
+  INIT_WEIGHT_MASK(64, 32, 10, 3, 2);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x64
+  INIT_WEIGHT_MASK(64, 64, 10, 3, 3);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x128
+  INIT_WEIGHT_MASK(64, 128, 10, 3, 4);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_128x64
+  INIT_WEIGHT_MASK(128, 64, 10, 4, 3);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_128x128
+  INIT_WEIGHT_MASK(128, 128, 10, 4, 4);
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
+}  // namespace
+
+void WeightMaskInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/weight_mask.h b/src/dsp/weight_mask.h
new file mode 100644
index 0000000..43bef05
--- /dev/null
+++ b/src/dsp/weight_mask.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#ifndef LIBGAV1_SRC_DSP_WEIGHT_MASK_H_ +#define LIBGAV1_SRC_DSP_WEIGHT_MASK_H_ + +// Pull in LIBGAV1_DspXXX defines representing the implementation status +// of each function. The resulting value of each can be used by each module to +// determine whether an implementation is needed at compile time. +// IWYU pragma: begin_exports + +// ARM: +#include "src/dsp/arm/weight_mask_neon.h" + +// x86: +// Note includes should be sorted in logical order avx2/avx/sse4, etc. +// The order of includes is important as each tests for a superior version +// before setting the base. +// clang-format off +#include "src/dsp/x86/weight_mask_sse4.h" +// clang-format on + +// IWYU pragma: end_exports + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::weight_mask. This function is not thread-safe. +void WeightMaskInit_C(); + +} // namespace dsp +} // namespace libgav1 + +#endif // LIBGAV1_SRC_DSP_WEIGHT_MASK_H_ diff --git a/src/dsp/weight_mask_test.cc b/src/dsp/weight_mask_test.cc new file mode 100644 index 0000000..77b608e --- /dev/null +++ b/src/dsp/weight_mask_test.cc @@ -0,0 +1,390 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/weight_mask.h" + +#include +#include +#include +#include +#include + +#include "absl/strings/match.h" +#include "absl/strings/str_format.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr int kNumSpeedTests = 50000; +constexpr int kMaxPredictionSize = 128; +// weight_mask is only used with kCompoundPredictionTypeDiffWeighted with +// convolve producing the most extreme ranges. +// This includes kCompoundOffset in 10bpp and 12bpp. +// see: src/dsp/convolve.cc & src/dsp/warp.cc. +constexpr int kCompoundPredictionRange[3][2] = { + // 8bpp + {-5132, 9212}, + // 10bpp + {3988, 61532}, + // 12bpp + {3974, 61559}, +}; + +const char* GetDigest8bpp(int id) { + static const char* const kDigest[] = { + "035267cb2ac5a0f8ff50c2d30ad52226", + "3231f4972dd858b734e0cc48c4cd001e", + "7e163b69721a13ec9f75b5cd74ffee3f", + "" /*kBlock4x16*/, + "b75e90abc224acca8754c82039b3ba93", + "9f555f3a2c1a933a663d6103b8118dea", + "8539e54f34cd6668ff6e6606210be201", + "20f85c9db7c878c21fbf2052936f269e", + "620ec166de57b0639260b2d72eebfc3e", + "be666394b5a894d78f4097b6cca272fe", + "57a96816e84cdb381f596c23827b5922", + "f2e0d348f608f246b6d8d799b66c189e", + "161ac051f38372d9339d36728b9926ba", + "d5fad48aaf132a81cb62bba4f07bbebb", + "e10be2dca2f7dae38dae75150fc1612d", + "7f744481eb551bbc224b5236c82cbade", + "0d99bbf31ecddc1c2d5063a68c0e9375", + "5fb8ec5f582f0ebfe519ed55860f67c4", + + // mask_is_inverse = true. 
+ "a4250ca39daa700836138371d36d465f", + "abe9a9a1c3a5accda9bfefd4d6e81ccb", + "e95b08878d0bb5f2293c27c3a6fe0253", + "" /*kBlock4x16*/, + "e1c52be02ce9ab2800015bb08b866c31", + "eea1dc73811f73866edfeb4555865f20", + "3178e64085645bd819256a8ab43c7b0a", + "ee83884e4d5cd2c9ac04879116bab681", + "d107eff7d5ae9ba14d2c6b3b8d9fca49", + "400aeea7d299626fc336c46b1ad7a9d8", + "e9e26a400f67f3ad36350fe4171fc613", + "4c31ad714f470f34127febaf1bac714b", + "bbdcb1097c66d561dd4ea16b3fb73f97", + "3a21dfbf53e4c964e303a75a3308ce15", + "3416dab4512fd0dc61d788b433cd624e", + "68ace8f01fdd74aec3fee528c8167738", + "9fabe05a6523da81a45150e19f75acff", + "7c0643e4d02421d06d7ca71822a94e1d", + }; + return kDigest[id]; +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +const char* GetDigest10bpp(int id) { + static const char* const kDigest[] = { + "1dc9bdd042e5228705b857b42798e364", + "c054c8644bd482ce78a139d8e063e013", + "bbe4ac48f013f34c84779da05b0bcbe0", + "" /*kBlock4x16*/, + "13d4759277637a607f25439182553708", + "f089667610561a47d50f9f930ad7c454", + "46715e6f7819f59725bdb083f4403255", + "3774541c339ae3af920ef2b1d6abf6a1", + "94913b01d226cb5eb273dfee84b51f65", + "be0c0847629dfff8e0e991ed67697a7d", + "716b5398b77d7459274d4ea9c91ebd8e", + "f5c1b0b461df4182529949472242b421", + "5e9576ea4cf107249ce4ae89a72b9c95", + "da021bcdf7936f7bd9a2399c69e4d37c", + "b3a310a39c1900e00f992839ff188656", + "9f3a15351af5945615f296242ec56a38", + "b6e0bd03c521c5f00e90530daa7d4432", + "3270d7f621d488aec5b76bcf121debd0", + + // mask_is_inverse = true. + "33df96dd246683133eefe4caea6e3f7d", + "73e0ccc5d42806548a4b59f856256c1e", + "3561a0358cf831aee9477d07feafae2d", + "" /*kBlock4x16*/, + "c5a2e633c0cd6925e68f21f47f0e2d84", + "8755a2d3840dde5fd6a0cce6bd6642c5", + "85ec538b72cecd6ea1fddab5ce3b4e64", + "a53e0dec84c675c4c6b1f5792b0232ff", + "86180da325f9727670a98cf2dbf7410e", + "a5fdc95104948047e179b2bc3d47f51d", + "9b95b3858187838e4669180e2ddb295e", + "6e40ca55608f6bf2f8cd91c8dbf3ddbf", + "d3a092672e921b588279d57e50b31888", + "9883eb19b733ee9f1cb6a6b6a1a00bb5", + "dd34764e068b228b7820321b06864e63", + "6c743dc9c8c87c7044151d29993e5042", + "44925dab01011a98b8ab1f0308fa852a", + "6d984b2ccfa056278e2130771127a943", + }; + return kDigest[id]; +} +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +struct WeightMaskTestParam { + WeightMaskTestParam(int width, int height, bool mask_is_inverse) + : width(width), height(height), mask_is_inverse(mask_is_inverse) {} + int width; + int height; + bool mask_is_inverse; +}; + +std::ostream& operator<<(std::ostream& os, const WeightMaskTestParam& param) { + return os << param.width << "x" << param.height + << ", mask_is_inverse: " << param.mask_is_inverse; +} + +template +class WeightMaskTest : public testing::TestWithParam, + public test_utils::MaxAlignedAllocable { + public: + WeightMaskTest() = default; + ~WeightMaskTest() override = default; + + void SetUp() override { + test_utils::ResetDspTable(bitdepth); + WeightMaskInit_C(); + const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth); + ASSERT_NE(dsp, nullptr); + const int width_index = FloorLog2(width_) - 3; + const int height_index = FloorLog2(height_) - 3; + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const char* const test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "C/")) { + } else if (absl::StartsWith(test_case, "NEON/")) { + WeightMaskInit_NEON(); + } else if (absl::StartsWith(test_case, "SSE41/")) { + WeightMaskInit_SSE4_1(); + } + func_ = 
dsp->weight_mask[width_index][height_index][mask_is_inverse_]; + } + + protected: + void SetInputData(bool use_fixed_values, int value_1, int value_2); + void Test(int num_runs, bool use_fixed_values, int value_1, int value_2); + + private: + const int width_ = GetParam().width; + const int height_ = GetParam().height; + const bool mask_is_inverse_ = GetParam().mask_is_inverse; + using PredType = + typename std::conditional::type; + alignas( + kMaxAlignment) PredType block_1_[kMaxPredictionSize * kMaxPredictionSize]; + alignas( + kMaxAlignment) PredType block_2_[kMaxPredictionSize * kMaxPredictionSize]; + uint8_t mask_[kMaxPredictionSize * kMaxPredictionSize] = {}; + dsp::WeightMaskFunc func_; +}; + +template +void WeightMaskTest::SetInputData(const bool use_fixed_values, + const int value_1, + const int value_2) { + if (use_fixed_values) { + std::fill(block_1_, block_1_ + kMaxPredictionSize * kMaxPredictionSize, + value_1); + std::fill(block_2_, block_2_ + kMaxPredictionSize * kMaxPredictionSize, + value_2); + } else { + constexpr int bitdepth_index = (bitdepth - 8) >> 1; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + for (int y = 0; y < height_; ++y) { + for (int x = 0; x < width_; ++x) { + const int min_val = kCompoundPredictionRange[bitdepth_index][0]; + const int max_val = kCompoundPredictionRange[bitdepth_index][1]; + block_1_[y * width_ + x] = + static_cast(rnd(max_val - min_val) + min_val); + block_2_[y * width_ + x] = + static_cast(rnd(max_val - min_val) + min_val); + } + } + } +} + +BlockSize DimensionsToBlockSize(int width, int height) { + if (width == 4) { + if (height == 4) return kBlock4x4; + if (height == 8) return kBlock4x8; + if (height == 16) return kBlock4x16; + return kBlockInvalid; + } + if (width == 8) { + if (height == 4) return kBlock8x4; + if (height == 8) return kBlock8x8; + if (height == 16) return kBlock8x16; + if (height == 32) return kBlock8x32; + return kBlockInvalid; + } + if (width == 16) { + if (height == 4) return kBlock16x4; + if (height == 8) return kBlock16x8; + if (height == 16) return kBlock16x16; + if (height == 32) return kBlock16x32; + if (height == 64) return kBlock16x64; + return kBlockInvalid; + } + if (width == 32) { + if (height == 8) return kBlock32x8; + if (height == 16) return kBlock32x16; + if (height == 32) return kBlock32x32; + if (height == 64) return kBlock32x64; + return kBlockInvalid; + } + if (width == 64) { + if (height == 16) return kBlock64x16; + if (height == 32) return kBlock64x32; + if (height == 64) return kBlock64x64; + if (height == 128) return kBlock64x128; + return kBlockInvalid; + } + if (width == 128) { + if (height == 64) return kBlock128x64; + if (height == 128) return kBlock128x128; + return kBlockInvalid; + } + return kBlockInvalid; +} + +template +void WeightMaskTest::Test(const int num_runs, + const bool use_fixed_values, + const int value_1, const int value_2) { + if (func_ == nullptr) return; + SetInputData(use_fixed_values, value_1, value_2); + const absl::Time start = absl::Now(); + for (int i = 0; i < num_runs; ++i) { + func_(block_1_, block_2_, mask_, kMaxPredictionSize); + } + const absl::Duration elapsed_time = absl::Now() - start; + if (use_fixed_values) { + int fixed_value = (value_1 - value_2 == 0) ? 
38 : 64; + if (mask_is_inverse_) fixed_value = 64 - fixed_value; + for (int y = 0; y < height_; ++y) { + for (int x = 0; x < width_; ++x) { + ASSERT_EQ(static_cast(mask_[y * kMaxPredictionSize + x]), + fixed_value) + << "x: " << x << " y: " << y; + } + } + } else { + const int id_offset = mask_is_inverse_ ? kMaxBlockSizes - 4 : 0; + const int id = id_offset + + static_cast(DimensionsToBlockSize(width_, height_)) - 4; + if (bitdepth == 8) { + test_utils::CheckMd5Digest( + absl::StrFormat("BlockSize %dx%d", width_, height_).c_str(), + "WeightMask", GetDigest8bpp(id), mask_, sizeof(mask_), elapsed_time); +#if LIBGAV1_MAX_BITDEPTH >= 10 + } else { + test_utils::CheckMd5Digest( + absl::StrFormat("BlockSize %dx%d", width_, height_).c_str(), + "WeightMask", GetDigest10bpp(id), mask_, sizeof(mask_), elapsed_time); +#endif + } + } +} + +const WeightMaskTestParam weight_mask_test_param[] = { + WeightMaskTestParam(8, 8, false), WeightMaskTestParam(8, 16, false), + WeightMaskTestParam(8, 32, false), WeightMaskTestParam(16, 8, false), + WeightMaskTestParam(16, 16, false), WeightMaskTestParam(16, 32, false), + WeightMaskTestParam(16, 64, false), WeightMaskTestParam(32, 8, false), + WeightMaskTestParam(32, 16, false), WeightMaskTestParam(32, 32, false), + WeightMaskTestParam(32, 64, false), WeightMaskTestParam(64, 16, false), + WeightMaskTestParam(64, 32, false), WeightMaskTestParam(64, 64, false), + WeightMaskTestParam(64, 128, false), WeightMaskTestParam(128, 64, false), + WeightMaskTestParam(128, 128, false), WeightMaskTestParam(8, 8, true), + WeightMaskTestParam(8, 16, true), WeightMaskTestParam(8, 32, true), + WeightMaskTestParam(16, 8, true), WeightMaskTestParam(16, 16, true), + WeightMaskTestParam(16, 32, true), WeightMaskTestParam(16, 64, true), + WeightMaskTestParam(32, 8, true), WeightMaskTestParam(32, 16, true), + WeightMaskTestParam(32, 32, true), WeightMaskTestParam(32, 64, true), + WeightMaskTestParam(64, 16, true), WeightMaskTestParam(64, 32, true), + WeightMaskTestParam(64, 64, true), WeightMaskTestParam(64, 128, true), + WeightMaskTestParam(128, 64, true), WeightMaskTestParam(128, 128, true), +}; + +using WeightMaskTest8bpp = WeightMaskTest<8>; + +TEST_P(WeightMaskTest8bpp, FixedValues) { + const int min = kCompoundPredictionRange[0][0]; + const int max = kCompoundPredictionRange[0][1]; + Test(1, true, min, min); + Test(1, true, min, max); + Test(1, true, max, min); + Test(1, true, max, max); +} + +TEST_P(WeightMaskTest8bpp, RandomValues) { Test(1, false, -1, -1); } + +TEST_P(WeightMaskTest8bpp, DISABLED_Speed) { + Test(kNumSpeedTests, false, -1, -1); +} + +INSTANTIATE_TEST_SUITE_P(C, WeightMaskTest8bpp, + testing::ValuesIn(weight_mask_test_param)); +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, WeightMaskTest8bpp, + testing::ValuesIn(weight_mask_test_param)); +#endif +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, WeightMaskTest8bpp, + testing::ValuesIn(weight_mask_test_param)); +#endif + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using WeightMaskTest10bpp = WeightMaskTest<10>; + +TEST_P(WeightMaskTest10bpp, FixedValues) { + const int min = kCompoundPredictionRange[1][0]; + const int max = kCompoundPredictionRange[1][1]; + Test(1, true, min, min); + Test(1, true, min, max); + Test(1, true, max, min); + Test(1, true, max, max); +} + +TEST_P(WeightMaskTest10bpp, RandomValues) { Test(1, false, -1, -1); } + +TEST_P(WeightMaskTest10bpp, DISABLED_Speed) { + Test(kNumSpeedTests, false, -1, -1); +} + +INSTANTIATE_TEST_SUITE_P(C, WeightMaskTest10bpp, + 
testing::ValuesIn(weight_mask_test_param)); +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, WeightMaskTest10bpp, + testing::ValuesIn(weight_mask_test_param)); +#endif +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, WeightMaskTest10bpp, + testing::ValuesIn(weight_mask_test_param)); +#endif +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/x86/average_blend_sse4.cc b/src/dsp/x86/average_blend_sse4.cc new file mode 100644 index 0000000..911c5a9 --- /dev/null +++ b/src/dsp/x86/average_blend_sse4.cc @@ -0,0 +1,382 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/average_blend.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_TARGETING_SSE4_1 + +#include + +#include +#include +#include + +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/dsp/x86/common_sse4.h" +#include "src/utils/common.h" + +namespace libgav1 { +namespace dsp { +namespace low_bitdepth { +namespace { + +constexpr int kInterPostRoundBit = 4; + +inline void AverageBlend4Row(const int16_t* LIBGAV1_RESTRICT prediction_0, + const int16_t* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT dest) { + const __m128i pred_0 = LoadLo8(prediction_0); + const __m128i pred_1 = LoadLo8(prediction_1); + __m128i res = _mm_add_epi16(pred_0, pred_1); + res = RightShiftWithRounding_S16(res, kInterPostRoundBit + 1); + Store4(dest, _mm_packus_epi16(res, res)); +} + +inline void AverageBlend8Row(const int16_t* LIBGAV1_RESTRICT prediction_0, + const int16_t* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT dest) { + const __m128i pred_0 = LoadAligned16(prediction_0); + const __m128i pred_1 = LoadAligned16(prediction_1); + __m128i res = _mm_add_epi16(pred_0, pred_1); + res = RightShiftWithRounding_S16(res, kInterPostRoundBit + 1); + StoreLo8(dest, _mm_packus_epi16(res, res)); +} + +inline void AverageBlendLargeRow(const int16_t* LIBGAV1_RESTRICT prediction_0, + const int16_t* LIBGAV1_RESTRICT prediction_1, + const int width, + uint8_t* LIBGAV1_RESTRICT dest) { + int x = 0; + do { + const __m128i pred_00 = LoadAligned16(&prediction_0[x]); + const __m128i pred_01 = LoadAligned16(&prediction_1[x]); + __m128i res0 = _mm_add_epi16(pred_00, pred_01); + res0 = RightShiftWithRounding_S16(res0, kInterPostRoundBit + 1); + const __m128i pred_10 = LoadAligned16(&prediction_0[x + 8]); + const __m128i pred_11 = LoadAligned16(&prediction_1[x + 8]); + __m128i res1 = _mm_add_epi16(pred_10, pred_11); + res1 = RightShiftWithRounding_S16(res1, kInterPostRoundBit + 1); + StoreUnaligned16(dest + x, _mm_packus_epi16(res0, res1)); + x += 16; + } while (x < width); +} + +void AverageBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + const int width, const int height, + void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t dest_stride) { + auto* dst = static_cast(dest); + const auto* pred_0 = static_cast(prediction_0); + const auto* 
pred_1 = static_cast(prediction_1); + int y = height; + + if (width == 4) { + do { + // TODO(b/150326556): |prediction_[01]| values are packed. It is possible + // to load 8 values at a time. + AverageBlend4Row(pred_0, pred_1, dst); + dst += dest_stride; + pred_0 += width; + pred_1 += width; + + AverageBlend4Row(pred_0, pred_1, dst); + dst += dest_stride; + pred_0 += width; + pred_1 += width; + + y -= 2; + } while (y != 0); + return; + } + + if (width == 8) { + do { + AverageBlend8Row(pred_0, pred_1, dst); + dst += dest_stride; + pred_0 += width; + pred_1 += width; + + AverageBlend8Row(pred_0, pred_1, dst); + dst += dest_stride; + pred_0 += width; + pred_1 += width; + + y -= 2; + } while (y != 0); + return; + } + + do { + AverageBlendLargeRow(pred_0, pred_1, width, dst); + dst += dest_stride; + pred_0 += width; + pred_1 += width; + + AverageBlendLargeRow(pred_0, pred_1, width, dst); + dst += dest_stride; + pred_0 += width; + pred_1 += width; + + y -= 2; + } while (y != 0); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); +#if DSP_ENABLED_8BPP_SSE4_1(AverageBlend) + dsp->average_blend = AverageBlend_SSE4_1; +#endif +} + +} // namespace +} // namespace low_bitdepth + +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +constexpr int kInterPostRoundBitPlusOne = 5; + +template +inline void AverageBlendRow(const uint16_t* LIBGAV1_RESTRICT prediction_0, + const uint16_t* LIBGAV1_RESTRICT prediction_1, + const __m128i& compound_offset, + const __m128i& round_offset, const __m128i& max, + const __m128i& zero, uint16_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t dest_stride) { + // pred_0/1 max range is 16b. + const __m128i pred_0 = LoadUnaligned16(prediction_0 + offset); + const __m128i pred_1 = LoadUnaligned16(prediction_1 + offset); + const __m128i pred_00 = _mm_cvtepu16_epi32(pred_0); + const __m128i pred_01 = _mm_unpackhi_epi16(pred_0, zero); + const __m128i pred_10 = _mm_cvtepu16_epi32(pred_1); + const __m128i pred_11 = _mm_unpackhi_epi16(pred_1, zero); + + const __m128i pred_add_0 = _mm_add_epi32(pred_00, pred_10); + const __m128i pred_add_1 = _mm_add_epi32(pred_01, pred_11); + const __m128i compound_offset_0 = _mm_sub_epi32(pred_add_0, compound_offset); + const __m128i compound_offset_1 = _mm_sub_epi32(pred_add_1, compound_offset); + // RightShiftWithRounding and Clip3. + const __m128i round_0 = _mm_add_epi32(compound_offset_0, round_offset); + const __m128i round_1 = _mm_add_epi32(compound_offset_1, round_offset); + const __m128i res_0 = _mm_srai_epi32(round_0, kInterPostRoundBitPlusOne); + const __m128i res_1 = _mm_srai_epi32(round_1, kInterPostRoundBitPlusOne); + const __m128i result = _mm_min_epi16(_mm_packus_epi32(res_0, res_1), max); + if (width != 4) { + // Store width=8/16/32/64/128. 
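+  // (A single 16-byte store below covers eight uint16_t pixels; the
+  // width == 4 case instead splits the same register across two rows.)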
+ StoreUnaligned16(dst + offset, result); + return; + } + assert(width == 4); + StoreLo8(dst, result); + StoreHi8(dst + dest_stride, result); +} + +void AverageBlend10bpp_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + const int width, const int height, + void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t dst_stride) { + auto* dst = static_cast(dest); + const ptrdiff_t dest_stride = dst_stride / sizeof(dst[0]); + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + const __m128i compound_offset = + _mm_set1_epi32(kCompoundOffset + kCompoundOffset); + const __m128i round_offset = + _mm_set1_epi32((1 << kInterPostRoundBitPlusOne) >> 1); + const __m128i max = _mm_set1_epi16((1 << kBitdepth10) - 1); + const __m128i zero = _mm_setzero_si128(); + int y = height; + + if (width == 4) { + const ptrdiff_t dest_stride2 = dest_stride << 1; + const ptrdiff_t width2 = width << 1; + do { + // row0,1 + AverageBlendRow<4, 0>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + dst += dest_stride2; + pred_0 += width2; + pred_1 += width2; + y -= 2; + } while (y != 0); + return; + } + if (width == 8) { + const ptrdiff_t dest_stride2 = dest_stride << 1; + const ptrdiff_t width2 = width << 1; + do { + // row0. + AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + // row1. + AverageBlendRow<8, 0>(pred_0 + width, pred_1 + width, compound_offset, + round_offset, max, zero, dst + dest_stride, + dest_stride); + dst += dest_stride2; + pred_0 += width2; + pred_1 += width2; + y -= 2; + } while (y != 0); + return; + } + if (width == 16) { + const ptrdiff_t dest_stride2 = dest_stride << 1; + const ptrdiff_t width2 = width << 1; + do { + // row0. + AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + // row1. + AverageBlendRow<8, 0>(pred_0 + width, pred_1 + width, compound_offset, + round_offset, max, zero, dst + dest_stride, + dest_stride); + AverageBlendRow<8, 8>(pred_0 + width, pred_1 + width, compound_offset, + round_offset, max, zero, dst + dest_stride, + dest_stride); + dst += dest_stride2; + pred_0 += width2; + pred_1 += width2; + y -= 2; + } while (y != 0); + return; + } + if (width == 32) { + do { + // pred [0 - 15]. + AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + // pred [16 - 31]. + AverageBlendRow<8, 16>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 24>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + dst += dest_stride; + pred_0 += width; + pred_1 += width; + } while (--y != 0); + return; + } + if (width == 64) { + do { + // pred [0 - 31]. + AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 16>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 24>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + // pred [31 - 63]. 
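+      // (i.e. the offsets 32, 40, 48 and 56 below cover pixels 32 - 63 of
+      // the row.)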
+ AverageBlendRow<8, 32>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 40>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 48>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 56>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + dst += dest_stride; + pred_0 += width; + pred_1 += width; + } while (--y != 0); + return; + } + assert(width == 128); + do { + // pred [0 - 31]. + AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 16>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 24>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + // pred [31 - 63]. + AverageBlendRow<8, 32>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 40>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 48>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 56>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + + // pred [64 - 95]. + AverageBlendRow<8, 64>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 72>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 80>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 88>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + // pred [96 - 127]. + AverageBlendRow<8, 96>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 104>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 112>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + AverageBlendRow<8, 120>(pred_0, pred_1, compound_offset, round_offset, max, + zero, dst, dest_stride); + dst += dest_stride; + pred_0 += width; + pred_1 += width; + } while (--y != 0); +} + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); +#if DSP_ENABLED_10BPP_SSE4_1(AverageBlend) + dsp->average_blend = AverageBlend10bpp_SSE4_1; +#endif +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void AverageBlendInit_SSE4_1() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 +} + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_TARGETING_SSE4_1 + +namespace libgav1 { +namespace dsp { + +void AverageBlendInit_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_SSE4_1 diff --git a/src/dsp/x86/average_blend_sse4.h b/src/dsp/x86/average_blend_sse4.h new file mode 100644 index 0000000..cd07112 --- /dev/null +++ b/src/dsp/x86/average_blend_sse4.h @@ -0,0 +1,45 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_AVERAGE_BLEND_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_AVERAGE_BLEND_SSE4_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::average_blend. This function is not thread-safe. +void AverageBlendInit_SSE4_1(); + +} // namespace dsp +} // namespace libgav1 + +// If sse4 is enabled and the baseline isn't set due to a higher level of +// optimization being enabled, signal the sse4 implementation should be used. +#if LIBGAV1_TARGETING_SSE4_1 + +#ifndef LIBGAV1_Dsp8bpp_AverageBlend +#define LIBGAV1_Dsp8bpp_AverageBlend LIBGAV1_CPU_SSE4_1 +#endif +#ifndef LIBGAV1_Dsp10bpp_AverageBlend +#define LIBGAV1_Dsp10bpp_AverageBlend LIBGAV1_CPU_SSE4_1 +#endif + +#endif // LIBGAV1_TARGETING_SSE4_1 + +#endif // LIBGAV1_SRC_DSP_X86_AVERAGE_BLEND_SSE4_H_ diff --git a/src/dsp/x86/cdef_avx2.cc b/src/dsp/x86/cdef_avx2.cc new file mode 100644 index 0000000..01a2b9f --- /dev/null +++ b/src/dsp/x86/cdef_avx2.cc @@ -0,0 +1,788 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/cdef.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_TARGETING_AVX2 +#include + +#include +#include +#include +#include +#include + +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/dsp/x86/common_avx2.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" + +namespace libgav1 { +namespace dsp { +namespace low_bitdepth { +namespace { + +#include "src/dsp/cdef.inc" + +// Used when calculating odd |cost[x]| values. +// Holds elements 1 3 5 7 7 7 7 7 +alignas(32) constexpr uint32_t kCdefDivisionTableOddPairsPadded[] = { + 420, 210, 140, 105, 420, 210, 140, 105, + 105, 105, 105, 105, 105, 105, 105, 105}; + +// ---------------------------------------------------------------------------- +// Refer to CdefDirection_C(). +// +// int32_t partial[8][15] = {}; +// for (int i = 0; i < 8; ++i) { +// for (int j = 0; j < 8; ++j) { +// const int x = 1; +// partial[0][i + j] += x; +// partial[1][i + j / 2] += x; +// partial[2][i] += x; +// partial[3][3 + i - j / 2] += x; +// partial[4][7 + i - j] += x; +// partial[5][3 - i / 2 + j] += x; +// partial[6][j] += x; +// partial[7][i / 2 + j] += x; +// } +// } +// +// Using the code above, generate the position count for partial[8][15]. 
+// +// partial[0]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1 +// partial[1]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0 +// partial[2]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0 +// partial[3]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0 +// partial[4]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1 +// partial[5]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0 +// partial[6]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0 +// partial[7]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0 +// +// The SIMD code shifts the input horizontally, then adds vertically to get the +// correct partial value for the given position. +// ---------------------------------------------------------------------------- + +// ---------------------------------------------------------------------------- +// partial[0][i + j] += x; +// +// 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00 +// 00 10 11 12 13 14 15 16 17 00 00 00 00 00 00 +// 00 00 20 21 22 23 24 25 26 27 00 00 00 00 00 +// 00 00 00 30 31 32 33 34 35 36 37 00 00 00 00 +// 00 00 00 00 40 41 42 43 44 45 46 47 00 00 00 +// 00 00 00 00 00 50 51 52 53 54 55 56 57 00 00 +// 00 00 00 00 00 00 60 61 62 63 64 65 66 67 00 +// 00 00 00 00 00 00 00 70 71 72 73 74 75 76 77 +// +// partial[4] is the same except the source is reversed. +LIBGAV1_ALWAYS_INLINE void AddPartial_D0_D4(__m256i* v_src_16, + __m256i* partial_lo, + __m256i* partial_hi) { + // 00 01 02 03 04 05 06 07 + *partial_lo = v_src_16[0]; + // 00 00 00 00 00 00 00 00 + *partial_hi = _mm256_setzero_si256(); + + // 00 10 11 12 13 14 15 16 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[1], 2)); + // 17 00 00 00 00 00 00 00 + *partial_hi = + _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[1], 14)); + + // 00 00 20 21 22 23 24 25 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[2], 4)); + // 26 27 00 00 00 00 00 00 + *partial_hi = + _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[2], 12)); + + // 00 00 00 30 31 32 33 34 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[3], 6)); + // 35 36 37 00 00 00 00 00 + *partial_hi = + _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[3], 10)); + + // 00 00 00 00 40 41 42 43 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[4], 8)); + // 44 45 46 47 00 00 00 00 + *partial_hi = + _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[4], 8)); + + // 00 00 00 00 00 50 51 52 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[5], 10)); + // 53 54 55 56 57 00 00 00 + *partial_hi = + _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[5], 6)); + + // 00 00 00 00 00 00 60 61 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[6], 12)); + // 62 63 64 65 66 67 00 00 + *partial_hi = + _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[6], 4)); + + // 00 00 00 00 00 00 00 70 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[7], 14)); + // 71 72 73 74 75 76 77 00 + *partial_hi = + _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[7], 2)); +} + +// ---------------------------------------------------------------------------- +// partial[1][i + j / 2] += x; +// +// A0 = src[0] + src[1], A1 = src[2] + src[3], ... 
+// +// A0 A1 A2 A3 00 00 00 00 00 00 00 00 00 00 00 +// 00 B0 B1 B2 B3 00 00 00 00 00 00 00 00 00 00 +// 00 00 C0 C1 C2 C3 00 00 00 00 00 00 00 00 00 +// 00 00 00 D0 D1 D2 D3 00 00 00 00 00 00 00 00 +// 00 00 00 00 E0 E1 E2 E3 00 00 00 00 00 00 00 +// 00 00 00 00 00 F0 F1 F2 F3 00 00 00 00 00 00 +// 00 00 00 00 00 00 G0 G1 G2 G3 00 00 00 00 00 +// 00 00 00 00 00 00 00 H0 H1 H2 H3 00 00 00 00 +// +// partial[3] is the same except the source is reversed. +LIBGAV1_ALWAYS_INLINE void AddPartial_D1_D3(__m256i* v_src_16, + __m256i* partial_lo, + __m256i* partial_hi) { + __m256i v_d1_temp[8]; + const __m256i v_zero = _mm256_setzero_si256(); + + for (int i = 0; i < 8; ++i) { + v_d1_temp[i] = _mm256_hadd_epi16(v_src_16[i], v_zero); + } + + *partial_lo = *partial_hi = v_zero; + // A0 A1 A2 A3 00 00 00 00 + *partial_lo = _mm256_add_epi16(*partial_lo, v_d1_temp[0]); + + // 00 B0 B1 B2 B3 00 00 00 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[1], 2)); + + // 00 00 C0 C1 C2 C3 00 00 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[2], 4)); + // 00 00 00 D0 D1 D2 D3 00 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[3], 6)); + // 00 00 00 00 E0 E1 E2 E3 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[4], 8)); + + // 00 00 00 00 00 F0 F1 F2 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[5], 10)); + // F3 00 00 00 00 00 00 00 + *partial_hi = + _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_d1_temp[5], 6)); + + // 00 00 00 00 00 00 G0 G1 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[6], 12)); + // G2 G3 00 00 00 00 00 00 + *partial_hi = + _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_d1_temp[6], 4)); + + // 00 00 00 00 00 00 00 H0 + *partial_lo = + _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[7], 14)); + // H1 H2 H3 00 00 00 00 00 + *partial_hi = + _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_d1_temp[7], 2)); +} + +// ---------------------------------------------------------------------------- +// partial[7][i / 2 + j] += x; +// +// 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00 +// 10 11 12 13 14 15 16 17 00 00 00 00 00 00 00 +// 00 20 21 22 23 24 25 26 27 00 00 00 00 00 00 +// 00 30 31 32 33 34 35 36 37 00 00 00 00 00 00 +// 00 00 40 41 42 43 44 45 46 47 00 00 00 00 00 +// 00 00 50 51 52 53 54 55 56 57 00 00 00 00 00 +// 00 00 00 60 61 62 63 64 65 66 67 00 00 00 00 +// 00 00 00 70 71 72 73 74 75 76 77 00 00 00 00 +// +// partial[5] is the same except the source is reversed. +LIBGAV1_ALWAYS_INLINE void AddPartial_D7_D5(__m256i* v_src, __m256i* partial_lo, + __m256i* partial_hi) { + __m256i v_pair_add[4]; + // Add vertical source pairs. 
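+  // Rows are summed in pairs because partial[7][i / 2 + j] sends rows 2k and
+  // 2k + 1 to the same output diagonal, as the diagram above shows.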
+  v_pair_add[0] = _mm256_add_epi16(v_src[0], v_src[1]);
+  v_pair_add[1] = _mm256_add_epi16(v_src[2], v_src[3]);
+  v_pair_add[2] = _mm256_add_epi16(v_src[4], v_src[5]);
+  v_pair_add[3] = _mm256_add_epi16(v_src[6], v_src[7]);
+
+  // 00 01 02 03 04 05 06 07
+  // 10 11 12 13 14 15 16 17
+  *partial_lo = v_pair_add[0];
+  // 00 00 00 00 00 00 00 00
+  // 00 00 00 00 00 00 00 00
+  *partial_hi = _mm256_setzero_si256();
+
+  // 00 20 21 22 23 24 25 26
+  // 00 30 31 32 33 34 35 36
+  *partial_lo =
+      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_pair_add[1], 2));
+  // 27 00 00 00 00 00 00 00
+  // 37 00 00 00 00 00 00 00
+  *partial_hi =
+      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_pair_add[1], 14));
+
+  // 00 00 40 41 42 43 44 45
+  // 00 00 50 51 52 53 54 55
+  *partial_lo =
+      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_pair_add[2], 4));
+  // 46 47 00 00 00 00 00 00
+  // 56 57 00 00 00 00 00 00
+  *partial_hi =
+      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_pair_add[2], 12));
+
+  // 00 00 00 60 61 62 63 64
+  // 00 00 00 70 71 72 73 74
+  *partial_lo =
+      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_pair_add[3], 6));
+  // 65 66 67 00 00 00 00 00
+  // 75 76 77 00 00 00 00 00
+  *partial_hi =
+      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_pair_add[3], 10));
+}
+
+LIBGAV1_ALWAYS_INLINE void AddPartial(const uint8_t* LIBGAV1_RESTRICT src,
+                                      ptrdiff_t stride, __m256i* partial) {
+  // 8x8 input
+  // 00 01 02 03 04 05 06 07
+  // 10 11 12 13 14 15 16 17
+  // 20 21 22 23 24 25 26 27
+  // 30 31 32 33 34 35 36 37
+  // 40 41 42 43 44 45 46 47
+  // 50 51 52 53 54 55 56 57
+  // 60 61 62 63 64 65 66 67
+  // 70 71 72 73 74 75 76 77
+  __m256i v_src[8];
+  for (auto& i : v_src) {
+    i = _mm256_castsi128_si256(LoadLo8(src));
+    // Dup lower lane.
+    i = _mm256_permute2x128_si256(i, i, 0x0);
+    src += stride;
+  }
+
+  const __m256i v_zero = _mm256_setzero_si256();
+  // partial for direction 2
+  // --------------------------------------------------------------------------
+  // partial[2][i] += x;
+  // 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx
+  // 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
+  // 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx
+  // 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx
+  // 04 14 24 34 44 54 64 74 xx xx xx xx xx xx xx xx
+  // 05 15 25 35 45 55 65 75 xx xx xx xx xx xx xx xx
+  // 06 16 26 36 46 56 66 76 xx xx xx xx xx xx xx xx
+  // 07 17 27 37 47 57 67 77 xx xx xx xx xx xx xx xx
+  const __m256i v_src_4_0 = _mm256_unpacklo_epi64(v_src[0], v_src[4]);
+  const __m256i v_src_5_1 = _mm256_unpacklo_epi64(v_src[1], v_src[5]);
+  const __m256i v_src_6_2 = _mm256_unpacklo_epi64(v_src[2], v_src[6]);
+  const __m256i v_src_7_3 = _mm256_unpacklo_epi64(v_src[3], v_src[7]);
+  const __m256i v_hsum_4_0 = _mm256_sad_epu8(v_src_4_0, v_zero);
+  const __m256i v_hsum_5_1 = _mm256_sad_epu8(v_src_5_1, v_zero);
+  const __m256i v_hsum_6_2 = _mm256_sad_epu8(v_src_6_2, v_zero);
+  const __m256i v_hsum_7_3 = _mm256_sad_epu8(v_src_7_3, v_zero);
+  const __m256i v_hsum_1_0 = _mm256_unpacklo_epi16(v_hsum_4_0, v_hsum_5_1);
+  const __m256i v_hsum_3_2 = _mm256_unpacklo_epi16(v_hsum_6_2, v_hsum_7_3);
+  const __m256i v_hsum_5_4 = _mm256_unpackhi_epi16(v_hsum_4_0, v_hsum_5_1);
+  const __m256i v_hsum_7_6 = _mm256_unpackhi_epi16(v_hsum_6_2, v_hsum_7_3);
+  partial[2] =
+      _mm256_unpacklo_epi64(_mm256_unpacklo_epi32(v_hsum_1_0, v_hsum_3_2),
+                            _mm256_unpacklo_epi32(v_hsum_5_4, v_hsum_7_6));
+
+  const __m256i extend_reverse = SetrM128i(
+      _mm_set_epi32(static_cast<int>(0x80078006), static_cast<int>(0x80058004),
+                    static_cast<int>(0x80038002),
+                    static_cast<int>(0x80018000)),
+      _mm_set_epi32(static_cast<int>(0x80008001), static_cast<int>(0x80028003),
+                    static_cast<int>(0x80048005),
+                    static_cast<int>(0x80068007)));
+
+  for (auto& i : v_src) {
+    // Zero extend unsigned 8 to 16. The upper lane is reversed.
+    i = _mm256_shuffle_epi8(i, extend_reverse);
+  }
+
+  // partial for direction 6
+  // --------------------------------------------------------------------------
+  // partial[6][j] += x;
+  // 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
+  // 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
+  // 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
+  // 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
+  // 40 41 42 43 44 45 46 47 xx xx xx xx xx xx xx xx
+  // 50 51 52 53 54 55 56 57 xx xx xx xx xx xx xx xx
+  // 60 61 62 63 64 65 66 67 xx xx xx xx xx xx xx xx
+  // 70 71 72 73 74 75 76 77 xx xx xx xx xx xx xx xx
+  partial[6] = v_src[0];
+  for (int i = 1; i < 8; ++i) {
+    partial[6] = _mm256_add_epi16(partial[6], v_src[i]);
+  }
+
+  AddPartial_D0_D4(v_src, &partial[0], &partial[4]);
+  AddPartial_D1_D3(v_src, &partial[1], &partial[3]);
+  AddPartial_D7_D5(v_src, &partial[7], &partial[5]);
+}
+
+inline __m256i SumVectorPair_S32(__m256i a) {
+  a = _mm256_hadd_epi32(a, a);
+  a = _mm256_add_epi32(a, _mm256_srli_si256(a, 4));
+  return a;
+}
+
+// |cost[0]| and |cost[4]| square the input and sum it with the corresponding
+// element from the other end of the vector, then scale by the matching
+// |kCdefDivisionTable[]| element:
+// cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) *
+//            kCdefDivisionTable[i + 1];
+// cost[0] += Square(partial[0][7]) * kCdefDivisionTable[8];
+inline void Cost0Or4_Pair(uint32_t* cost, const __m256i partial_0,
+                          const __m256i partial_4,
+                          const __m256i division_table) {
+  const __m256i division_table_0 =
+      _mm256_permute2x128_si256(division_table, division_table, 0x0);
+  const __m256i division_table_1 =
+      _mm256_permute2x128_si256(division_table, division_table, 0x11);
+
+  // partial_lo
+  const __m256i a = partial_0;
+  // partial_hi
+  const __m256i b = partial_4;
+
+  // Reverse and clear upper 2 bytes.
+  const __m256i reverser = _mm256_broadcastsi128_si256(_mm_set_epi32(
+      static_cast<int>(0x80800100), 0x03020504, 0x07060908, 0x0b0a0d0c));
+
+  // 14 13 12 11 10 09 08 ZZ
+  const __m256i b_reversed = _mm256_shuffle_epi8(b, reverser);
+  // 00 14 01 13 02 12 03 11
+  const __m256i ab_lo = _mm256_unpacklo_epi16(a, b_reversed);
+  // 04 10 05 09 06 08 07 ZZ
+  const __m256i ab_hi = _mm256_unpackhi_epi16(a, b_reversed);
+
+  // Square(partial[0][i]) + Square(partial[0][14 - i])
+  const __m256i square_lo = _mm256_madd_epi16(ab_lo, ab_lo);
+  const __m256i square_hi = _mm256_madd_epi16(ab_hi, ab_hi);
+
+  const __m256i c = _mm256_mullo_epi32(square_lo, division_table_0);
+  const __m256i d = _mm256_mullo_epi32(square_hi, division_table_1);
+  const __m256i e = SumVectorPair_S32(_mm256_add_epi32(c, d));
+  // Copy upper 32bit sum to lower lane.
+  const __m128i sums =
+      _mm256_castsi256_si128(_mm256_permute4x64_epi64(e, 0x08));
+  cost[0] = _mm_cvtsi128_si32(sums);
+  cost[4] = _mm_cvtsi128_si32(_mm_srli_si128(sums, 8));
+}
+
+template <int index_a, int index_b>
+inline void CostOdd_Pair(uint32_t* cost, const __m256i partial_a,
+                         const __m256i partial_b,
+                         const __m256i division_table[2]) {
+  // partial_lo
+  const __m256i a = partial_a;
+  // partial_hi
+  const __m256i b = partial_b;
+
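+  // Only 11 of the 15 positions are populated for the odd partials (see the
+  // position counts above), so only elements 8-10 of |partial_hi| need to be
+  // mirrored back onto elements 0-2; e.g. cost[1] pairs partial[1][2] with
+  // partial[1][8].
+  // Reverse and clear upper 10 bytes.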
+  const __m256i reverser = _mm256_broadcastsi128_si256(
+      _mm_set_epi32(static_cast<int>(0x80808080), static_cast<int>(0x80808080),
+                    static_cast<int>(0x80800100), 0x03020504));
+
+  // 10 09 08 ZZ ZZ ZZ ZZ ZZ
+  const __m256i b_reversed = _mm256_shuffle_epi8(b, reverser);
+  // 00 10 01 09 02 08 03 ZZ
+  const __m256i ab_lo = _mm256_unpacklo_epi16(a, b_reversed);
+  // 04 ZZ 05 ZZ 06 ZZ 07 ZZ
+  const __m256i ab_hi = _mm256_unpackhi_epi16(a, b_reversed);
+
+  // Square(partial[0][i]) + Square(partial[0][10 - i])
+  const __m256i square_lo = _mm256_madd_epi16(ab_lo, ab_lo);
+  const __m256i square_hi = _mm256_madd_epi16(ab_hi, ab_hi);
+
+  const __m256i c = _mm256_mullo_epi32(square_lo, division_table[0]);
+  const __m256i d = _mm256_mullo_epi32(square_hi, division_table[1]);
+  const __m256i e = SumVectorPair_S32(_mm256_add_epi32(c, d));
+  // Copy upper 32bit sum to lower lane.
+  const __m128i sums =
+      _mm256_castsi256_si128(_mm256_permute4x64_epi64(e, 0x08));
+  cost[index_a] = _mm_cvtsi128_si32(sums);
+  cost[index_b] = _mm_cvtsi128_si32(_mm_srli_si128(sums, 8));
+}
+
+inline void Cost2And6_Pair(uint32_t* cost, const __m256i partial_a,
+                           const __m256i partial_b,
+                           const __m256i division_table) {
+  // The upper lane is a "don't care", so only use the lower lane for
+  // calculating cost.
+  const __m256i a = _mm256_permute2x128_si256(partial_a, partial_b, 0x20);
+
+  const __m256i square_a = _mm256_madd_epi16(a, a);
+  const __m256i b = _mm256_mullo_epi32(square_a, division_table);
+  const __m256i c = SumVectorPair_S32(b);
+  // Copy upper 32bit sum to lower lane.
+  const __m128i sums =
+      _mm256_castsi256_si128(_mm256_permute4x64_epi64(c, 0x08));
+  cost[2] = _mm_cvtsi128_si32(sums);
+  cost[6] = _mm_cvtsi128_si32(_mm_srli_si128(sums, 8));
+}
+
+void CdefDirection_AVX2(const void* LIBGAV1_RESTRICT const source,
+                        ptrdiff_t stride,
+                        uint8_t* LIBGAV1_RESTRICT const direction,
+                        int* LIBGAV1_RESTRICT const variance) {
+  assert(direction != nullptr);
+  assert(variance != nullptr);
+  const auto* src = static_cast<const uint8_t*>(source);
+  uint32_t cost[8];
+
+  // partial[0] = add partial 0,4 low
+  // partial[1] = add partial 1,3 low
+  // partial[2] = add partial 2 low
+  // partial[3] = add partial 1,3 high
+  // partial[4] = add partial 0,4 high
+  // partial[5] = add partial 7,5 high
+  // partial[6] = add partial 6 low
+  // partial[7] = add partial 7,5 low
+  __m256i partial[8];
+
+  AddPartial(src, stride, partial);
+
+  const __m256i division_table = LoadUnaligned32(kCdefDivisionTable);
+  const __m256i division_table_7 =
+      _mm256_broadcastd_epi32(_mm_cvtsi32_si128(kCdefDivisionTable[7]));
+
+  Cost2And6_Pair(cost, partial[2], partial[6], division_table_7);
+
+  Cost0Or4_Pair(cost, partial[0], partial[4], division_table);
+
+  const __m256i division_table_odd[2] = {
+      LoadUnaligned32(kCdefDivisionTableOddPairsPadded),
+      LoadUnaligned32(kCdefDivisionTableOddPairsPadded + 8)};
+
+  CostOdd_Pair<1, 3>(cost, partial[1], partial[3], division_table_odd);
+  CostOdd_Pair<7, 5>(cost, partial[7], partial[5], division_table_odd);
+
+  uint32_t best_cost = 0;
+  *direction = 0;
+  for (int i = 0; i < 8; ++i) {
+    if (cost[i] > best_cost) {
+      best_cost = cost[i];
+      *direction = i;
+    }
+  }
+  *variance = (best_cost - cost[(*direction + 4) & 7]) >> 10;
+}
+
+// -------------------------------------------------------------------------
+// CdefFilter
+
+// Load 4 vectors based on the given |direction|.
+inline void LoadDirection(const uint16_t* LIBGAV1_RESTRICT const src,
+                          const ptrdiff_t stride, __m128i* output,
+                          const int direction) {
+  // Each |direction| describes a different set of source values. Expand this
+  // set by negating each offset pair. For |direction| == 0 this gives a
+  // diagonal line from top right to bottom left. The first value is y, the
+  // second x. Negative y values move up.
+  //  a       b         c       d
+  // {-1, 1}, {1, -1}, {-2, 2}, {2, -2}
+  //         c
+  //       a
+  //     0
+  //   b
+  // d
+  const int y_0 = kCdefDirections[direction][0][0];
+  const int x_0 = kCdefDirections[direction][0][1];
+  const int y_1 = kCdefDirections[direction][1][0];
+  const int x_1 = kCdefDirections[direction][1][1];
+  output[0] = LoadUnaligned16(src - y_0 * stride - x_0);
+  output[1] = LoadUnaligned16(src + y_0 * stride + x_0);
+  output[2] = LoadUnaligned16(src - y_1 * stride - x_1);
+  output[3] = LoadUnaligned16(src + y_1 * stride + x_1);
+}
+
+// Load 4 vectors based on the given |direction|. Use when |block_width| == 4
+// to do 2 rows at a time.
+void LoadDirection4(const uint16_t* LIBGAV1_RESTRICT const src,
+                    const ptrdiff_t stride, __m128i* output,
+                    const int direction) {
+  const int y_0 = kCdefDirections[direction][0][0];
+  const int x_0 = kCdefDirections[direction][0][1];
+  const int y_1 = kCdefDirections[direction][1][0];
+  const int x_1 = kCdefDirections[direction][1][1];
+  output[0] = LoadHi8(LoadLo8(src - y_0 * stride - x_0),
+                      src - y_0 * stride + stride - x_0);
+  output[1] = LoadHi8(LoadLo8(src + y_0 * stride + x_0),
+                      src + y_0 * stride + stride + x_0);
+  output[2] = LoadHi8(LoadLo8(src - y_1 * stride - x_1),
+                      src - y_1 * stride + stride - x_1);
+  output[3] = LoadHi8(LoadLo8(src + y_1 * stride + x_1),
+                      src + y_1 * stride + stride + x_1);
+}
+
+inline __m256i Constrain(const __m256i& pixel, const __m256i& reference,
+                         const __m128i& damping, const __m256i& threshold) {
+  const __m256i diff = _mm256_sub_epi16(pixel, reference);
+  const __m256i abs_diff = _mm256_abs_epi16(diff);
+  // sign(diff) * Clip3(threshold - (std::abs(diff) >> damping),
+  //                    0, std::abs(diff))
+  const __m256i shifted_diff = _mm256_srl_epi16(abs_diff, damping);
+  // For bitdepth == 8, the threshold range is [0, 15] and the damping range
+  // is [3, 6]. If pixel == kCdefLargeValue (0x4000), shifted_diff will always
+  // be larger than threshold, so subtracting with saturation returns 0 when
+  // pixel == kCdefLargeValue.
+  static_assert(kCdefLargeValue == 0x4000, "Invalid kCdefLargeValue");
+  const __m256i thresh_minus_shifted_diff =
+      _mm256_subs_epu16(threshold, shifted_diff);
+  const __m256i clamp_abs_diff =
+      _mm256_min_epi16(thresh_minus_shifted_diff, abs_diff);
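+  // Worked example with illustrative numbers: pixel = 70, reference = 60,
+  // threshold = 4, damping shift = 3: diff = 10, shifted_diff = 10 >> 3 = 1,
+  // Clip3(4 - 1, 0, 10) = 3, so the constrained value is +3.
+  // Restore the sign.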
+  return _mm256_sign_epi16(clamp_abs_diff, diff);
+}
+
+inline __m256i ApplyConstrainAndTap(const __m256i& pixel, const __m256i& val,
+                                    const __m256i& tap, const __m128i& damping,
+                                    const __m256i& threshold) {
+  const __m256i constrained = Constrain(val, pixel, damping, threshold);
+  return _mm256_mullo_epi16(constrained, tap);
+}
+
+template <int width, bool enable_primary = true, bool enable_secondary = true>
+void CdefFilter_AVX2(const uint16_t* LIBGAV1_RESTRICT src,
+                     const ptrdiff_t src_stride, const int height,
+                     const int primary_strength, const int secondary_strength,
+                     const int damping, const int direction,
+                     void* LIBGAV1_RESTRICT dest, const ptrdiff_t dst_stride) {
+  static_assert(width == 8 || width == 4, "Invalid CDEF width.");
+  static_assert(enable_primary || enable_secondary, "");
+  constexpr bool clipping_required = enable_primary && enable_secondary;
+  auto* dst = static_cast<uint8_t*>(dest);
+  __m128i primary_damping_shift, secondary_damping_shift;
+
+  // FloorLog2() requires input to be > 0.
+  // 8-bit damping range: Y: [3, 6], UV: [2, 5].
+  if (enable_primary) {
+    // primary_strength: [0, 15] -> FloorLog2: [0, 3] so a clamp is necessary
+    // for UV filtering.
+    primary_damping_shift =
+        _mm_cvtsi32_si128(std::max(0, damping - FloorLog2(primary_strength)));
+  }
+  if (enable_secondary) {
+    // secondary_strength: [0, 4] -> FloorLog2: [0, 2] so no clamp to 0 is
+    // necessary.
+    assert(damping - FloorLog2(secondary_strength) >= 0);
+    secondary_damping_shift =
+        _mm_cvtsi32_si128(damping - FloorLog2(secondary_strength));
+  }
+  const __m256i primary_tap_0 = _mm256_broadcastw_epi16(
+      _mm_cvtsi32_si128(kCdefPrimaryTaps[primary_strength & 1][0]));
+  const __m256i primary_tap_1 = _mm256_broadcastw_epi16(
+      _mm_cvtsi32_si128(kCdefPrimaryTaps[primary_strength & 1][1]));
+  const __m256i secondary_tap_0 =
+      _mm256_broadcastw_epi16(_mm_cvtsi32_si128(kCdefSecondaryTap0));
+  const __m256i secondary_tap_1 =
+      _mm256_broadcastw_epi16(_mm_cvtsi32_si128(kCdefSecondaryTap1));
+  const __m256i cdef_large_value_mask = _mm256_broadcastw_epi16(
+      _mm_cvtsi32_si128(static_cast<int16_t>(~kCdefLargeValue)));
+  const __m256i primary_threshold =
+      _mm256_broadcastw_epi16(_mm_cvtsi32_si128(primary_strength));
+  const __m256i secondary_threshold =
+      _mm256_broadcastw_epi16(_mm_cvtsi32_si128(secondary_strength));
+
+  int y = height;
+  do {
+    __m128i pixel_128;
+    if (width == 8) {
+      pixel_128 = LoadUnaligned16(src);
+    } else {
+      pixel_128 = LoadHi8(LoadLo8(src), src + src_stride);
+    }
+
+    __m256i pixel = SetrM128i(pixel_128, pixel_128);
+
+    __m256i min = pixel;
+    __m256i max = pixel;
+    __m256i sum_pair;
+
+    if (enable_primary) {
+      // Primary |direction|.
+      __m128i primary_val_128[4];
+      if (width == 8) {
+        LoadDirection(src, src_stride, primary_val_128, direction);
+      } else {
+        LoadDirection4(src, src_stride, primary_val_128, direction);
+      }
+
+      __m256i primary_val[2];
+      primary_val[0] = SetrM128i(primary_val_128[0], primary_val_128[1]);
+      primary_val[1] = SetrM128i(primary_val_128[2], primary_val_128[3]);
+
+      if (clipping_required) {
+        min = _mm256_min_epu16(min, primary_val[0]);
+        min = _mm256_min_epu16(min, primary_val[1]);
+
+        // The source is 16 bits; however, we only really care about the lower
+        // 8 bits. The upper 8 bits contain the "large" flag. After the final
+        // primary max has been calculated, zero out the upper 8 bits. Use
+        // this to find the "16 bit" max.
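+        // Worked example with illustrative values: a real sample 0x00ab and
+        // a padded sample kCdefLargeValue (0x4000) give a per-byte max of
+        // 0x40ab; masking with ~kCdefLargeValue (0xbfff) restores 0x00ab,
+        // while a lane holding only padding collapses to 0.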
+ const __m256i max_p01 = _mm256_max_epu8(primary_val[0], primary_val[1]); + max = _mm256_max_epu16( + max, _mm256_and_si256(max_p01, cdef_large_value_mask)); + } + + sum_pair = ApplyConstrainAndTap(pixel, primary_val[0], primary_tap_0, + primary_damping_shift, primary_threshold); + sum_pair = _mm256_add_epi16( + sum_pair, + ApplyConstrainAndTap(pixel, primary_val[1], primary_tap_1, + primary_damping_shift, primary_threshold)); + } else { + sum_pair = _mm256_setzero_si256(); + } + + if (enable_secondary) { + // Secondary |direction| values (+/- 2). Clamp |direction|. + __m128i secondary_val_128[8]; + if (width == 8) { + LoadDirection(src, src_stride, secondary_val_128, direction + 2); + LoadDirection(src, src_stride, secondary_val_128 + 4, direction - 2); + } else { + LoadDirection4(src, src_stride, secondary_val_128, direction + 2); + LoadDirection4(src, src_stride, secondary_val_128 + 4, direction - 2); + } + + __m256i secondary_val[4]; + secondary_val[0] = SetrM128i(secondary_val_128[0], secondary_val_128[1]); + secondary_val[1] = SetrM128i(secondary_val_128[2], secondary_val_128[3]); + secondary_val[2] = SetrM128i(secondary_val_128[4], secondary_val_128[5]); + secondary_val[3] = SetrM128i(secondary_val_128[6], secondary_val_128[7]); + + if (clipping_required) { + min = _mm256_min_epu16(min, secondary_val[0]); + min = _mm256_min_epu16(min, secondary_val[1]); + min = _mm256_min_epu16(min, secondary_val[2]); + min = _mm256_min_epu16(min, secondary_val[3]); + + const __m256i max_s01 = + _mm256_max_epu8(secondary_val[0], secondary_val[1]); + const __m256i max_s23 = + _mm256_max_epu8(secondary_val[2], secondary_val[3]); + const __m256i max_s = _mm256_max_epu8(max_s01, max_s23); + max = _mm256_max_epu8(max, + _mm256_and_si256(max_s, cdef_large_value_mask)); + } + + sum_pair = _mm256_add_epi16( + sum_pair, + ApplyConstrainAndTap(pixel, secondary_val[0], secondary_tap_0, + secondary_damping_shift, secondary_threshold)); + sum_pair = _mm256_add_epi16( + sum_pair, + ApplyConstrainAndTap(pixel, secondary_val[1], secondary_tap_1, + secondary_damping_shift, secondary_threshold)); + sum_pair = _mm256_add_epi16( + sum_pair, + ApplyConstrainAndTap(pixel, secondary_val[2], secondary_tap_0, + secondary_damping_shift, secondary_threshold)); + sum_pair = _mm256_add_epi16( + sum_pair, + ApplyConstrainAndTap(pixel, secondary_val[3], secondary_tap_1, + secondary_damping_shift, secondary_threshold)); + } + + __m128i sum = _mm_add_epi16(_mm256_castsi256_si128(sum_pair), + _mm256_extracti128_si256(sum_pair, 1)); + + // Clip3(pixel + ((8 + sum - (sum < 0)) >> 4), min, max)) + const __m128i sum_lt_0 = _mm_srai_epi16(sum, 15); + // 8 + sum + sum = _mm_add_epi16(sum, _mm_set1_epi16(8)); + // (... - (sum < 0)) >> 4 + sum = _mm_add_epi16(sum, sum_lt_0); + sum = _mm_srai_epi16(sum, 4); + // pixel + ... 
+ sum = _mm_add_epi16(sum, _mm256_castsi256_si128(pixel)); + if (clipping_required) { + const __m128i min_128 = _mm_min_epu16(_mm256_castsi256_si128(min), + _mm256_extracti128_si256(min, 1)); + + const __m128i max_128 = _mm_max_epu16(_mm256_castsi256_si128(max), + _mm256_extracti128_si256(max, 1)); + // Clip3 + sum = _mm_min_epi16(sum, max_128); + sum = _mm_max_epi16(sum, min_128); + } + + const __m128i result = _mm_packus_epi16(sum, sum); + if (width == 8) { + src += src_stride; + StoreLo8(dst, result); + dst += dst_stride; + --y; + } else { + src += src_stride << 1; + Store4(dst, result); + dst += dst_stride; + Store4(dst, _mm_srli_si128(result, 4)); + dst += dst_stride; + y -= 2; + } + } while (y != 0); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(8); + assert(dsp != nullptr); + dsp->cdef_direction = CdefDirection_AVX2; + + dsp->cdef_filters[0][0] = CdefFilter_AVX2<4>; + dsp->cdef_filters[0][1] = + CdefFilter_AVX2<4, /*enable_primary=*/true, /*enable_secondary=*/false>; + dsp->cdef_filters[0][2] = CdefFilter_AVX2<4, /*enable_primary=*/false>; + dsp->cdef_filters[1][0] = CdefFilter_AVX2<8>; + dsp->cdef_filters[1][1] = + CdefFilter_AVX2<8, /*enable_primary=*/true, /*enable_secondary=*/false>; + dsp->cdef_filters[1][2] = CdefFilter_AVX2<8, /*enable_primary=*/false>; +} + +} // namespace +} // namespace low_bitdepth + +void CdefInit_AVX2() { low_bitdepth::Init8bpp(); } + +} // namespace dsp +} // namespace libgav1 +#else // !LIBGAV1_TARGETING_AVX2 +namespace libgav1 { +namespace dsp { + +void CdefInit_AVX2() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_AVX2 diff --git a/src/dsp/x86/cdef_avx2.h b/src/dsp/x86/cdef_avx2.h new file mode 100644 index 0000000..41f2d3f --- /dev/null +++ b/src/dsp/x86/cdef_avx2.h @@ -0,0 +1,45 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_CDEF_AVX2_H_ +#define LIBGAV1_SRC_DSP_X86_CDEF_AVX2_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::cdef_direction and Dsp::cdef_filters. This function is not +// thread-safe. +void CdefInit_AVX2(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_TARGETING_AVX2 + +#ifndef LIBGAV1_Dsp8bpp_CdefDirection +#define LIBGAV1_Dsp8bpp_CdefDirection LIBGAV1_CPU_AVX2 +#endif + +#ifndef LIBGAV1_Dsp8bpp_CdefFilters +#define LIBGAV1_Dsp8bpp_CdefFilters LIBGAV1_CPU_AVX2 +#endif + +#endif // LIBGAV1_TARGETING_AVX2 + +#endif // LIBGAV1_SRC_DSP_X86_CDEF_AVX2_H_ diff --git a/src/dsp/x86/cdef_sse4.cc b/src/dsp/x86/cdef_sse4.cc new file mode 100644 index 0000000..6c48844 --- /dev/null +++ b/src/dsp/x86/cdef_sse4.cc @@ -0,0 +1,734 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/cdef.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+#include "src/dsp/cdef.inc"
+
+// Used when calculating odd |cost[x]| values.
+// Holds elements 1 3 5 7 7 7 7 7
+alignas(16) constexpr uint32_t kCdefDivisionTableOddPadded[] = {
+    420, 210, 140, 105, 105, 105, 105, 105};
+
+// ----------------------------------------------------------------------------
+// Refer to CdefDirection_C().
+//
+// int32_t partial[8][15] = {};
+// for (int i = 0; i < 8; ++i) {
+//   for (int j = 0; j < 8; ++j) {
+//     const int x = 1;
+//     partial[0][i + j] += x;
+//     partial[1][i + j / 2] += x;
+//     partial[2][i] += x;
+//     partial[3][3 + i - j / 2] += x;
+//     partial[4][7 + i - j] += x;
+//     partial[5][3 - i / 2 + j] += x;
+//     partial[6][j] += x;
+//     partial[7][i / 2 + j] += x;
+//   }
+// }
+//
+// Using the code above, generate the position count for partial[8][15].
+//
+// partial[0]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
+// partial[1]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[2]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
+// partial[3]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[4]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
+// partial[5]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[6]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
+// partial[7]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+//
+// The SIMD code shifts the input horizontally, then adds vertically to get
+// the correct partial value for the given position.
+// ----------------------------------------------------------------------------
+
+// ----------------------------------------------------------------------------
+// partial[0][i + j] += x;
+//
+// 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00
+// 00 10 11 12 13 14 15 16 17 00 00 00 00 00 00
+// 00 00 20 21 22 23 24 25 26 27 00 00 00 00 00
+// 00 00 00 30 31 32 33 34 35 36 37 00 00 00 00
+// 00 00 00 00 40 41 42 43 44 45 46 47 00 00 00
+// 00 00 00 00 00 50 51 52 53 54 55 56 57 00 00
+// 00 00 00 00 00 00 60 61 62 63 64 65 66 67 00
+// 00 00 00 00 00 00 00 70 71 72 73 74 75 76 77
+//
+// partial[4] is the same except the source is reversed.
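+//
+// A worked instance of the diagram above: row 3 is shifted left by 6 bytes
+// (_mm_slli_si128(v_src_16[3], 6)), placing 30-34 in lanes 3-7 of
+// partial_lo, and right by 10 bytes (_mm_srli_si128(v_src_16[3], 10)),
+// placing 35-37 in lanes 0-2 of partial_hi.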
+LIBGAV1_ALWAYS_INLINE void AddPartial_D0_D4(__m128i* v_src_16, + __m128i* partial_lo, + __m128i* partial_hi) { + // 00 01 02 03 04 05 06 07 + *partial_lo = v_src_16[0]; + // 00 00 00 00 00 00 00 00 + *partial_hi = _mm_setzero_si128(); + + // 00 10 11 12 13 14 15 16 + *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[1], 2)); + // 17 00 00 00 00 00 00 00 + *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[1], 14)); + + // 00 00 20 21 22 23 24 25 + *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[2], 4)); + // 26 27 00 00 00 00 00 00 + *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[2], 12)); + + // 00 00 00 30 31 32 33 34 + *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[3], 6)); + // 35 36 37 00 00 00 00 00 + *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[3], 10)); + + // 00 00 00 00 40 41 42 43 + *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[4], 8)); + // 44 45 46 47 00 00 00 00 + *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[4], 8)); + + // 00 00 00 00 00 50 51 52 + *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[5], 10)); + // 53 54 55 56 57 00 00 00 + *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[5], 6)); + + // 00 00 00 00 00 00 60 61 + *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[6], 12)); + // 62 63 64 65 66 67 00 00 + *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[6], 4)); + + // 00 00 00 00 00 00 00 70 + *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[7], 14)); + // 71 72 73 74 75 76 77 00 + *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[7], 2)); +} + +// ---------------------------------------------------------------------------- +// partial[1][i + j / 2] += x; +// +// A0 = src[0] + src[1], A1 = src[2] + src[3], ... +// +// A0 A1 A2 A3 00 00 00 00 00 00 00 00 00 00 00 +// 00 B0 B1 B2 B3 00 00 00 00 00 00 00 00 00 00 +// 00 00 C0 C1 C2 C3 00 00 00 00 00 00 00 00 00 +// 00 00 00 D0 D1 D2 D3 00 00 00 00 00 00 00 00 +// 00 00 00 00 E0 E1 E2 E3 00 00 00 00 00 00 00 +// 00 00 00 00 00 F0 F1 F2 F3 00 00 00 00 00 00 +// 00 00 00 00 00 00 G0 G1 G2 G3 00 00 00 00 00 +// 00 00 00 00 00 00 00 H0 H1 H2 H3 00 00 00 00 +// +// partial[3] is the same except the source is reversed. 
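+//
+// For example, _mm_hadd_epi16(v_src_16[1], v_zero) below produces
+// B0 = 10 + 11, B1 = 12 + 13, B2 = 14 + 15, B3 = 16 + 17 in lanes 0-3; the
+// 2-byte shift then aligns B0 with lane i + j / 2 == 1.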
+LIBGAV1_ALWAYS_INLINE void AddPartial_D1_D3(__m128i* v_src_16, + __m128i* partial_lo, + __m128i* partial_hi) { + __m128i v_d1_temp[8]; + const __m128i v_zero = _mm_setzero_si128(); + + for (int i = 0; i < 8; ++i) { + v_d1_temp[i] = _mm_hadd_epi16(v_src_16[i], v_zero); + } + + *partial_lo = *partial_hi = v_zero; + // A0 A1 A2 A3 00 00 00 00 + *partial_lo = _mm_add_epi16(*partial_lo, v_d1_temp[0]); + + // 00 B0 B1 B2 B3 00 00 00 + *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[1], 2)); + + // 00 00 C0 C1 C2 C3 00 00 + *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[2], 4)); + // 00 00 00 D0 D1 D2 D3 00 + *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[3], 6)); + // 00 00 00 00 E0 E1 E2 E3 + *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[4], 8)); + + // 00 00 00 00 00 F0 F1 F2 + *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[5], 10)); + // F3 00 00 00 00 00 00 00 + *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_d1_temp[5], 6)); + + // 00 00 00 00 00 00 G0 G1 + *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[6], 12)); + // G2 G3 00 00 00 00 00 00 + *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_d1_temp[6], 4)); + + // 00 00 00 00 00 00 00 H0 + *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[7], 14)); + // H1 H2 H3 00 00 00 00 00 + *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_d1_temp[7], 2)); +} + +// ---------------------------------------------------------------------------- +// partial[7][i / 2 + j] += x; +// +// 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00 +// 10 11 12 13 14 15 16 17 00 00 00 00 00 00 00 +// 00 20 21 22 23 24 25 26 27 00 00 00 00 00 00 +// 00 30 31 32 33 34 35 36 37 00 00 00 00 00 00 +// 00 00 40 41 42 43 44 45 46 47 00 00 00 00 00 +// 00 00 50 51 52 53 54 55 56 57 00 00 00 00 00 +// 00 00 00 60 61 62 63 64 65 66 67 00 00 00 00 +// 00 00 00 70 71 72 73 74 75 76 77 00 00 00 00 +// +// partial[5] is the same except the source is reversed. +LIBGAV1_ALWAYS_INLINE void AddPartial_D5_D7(__m128i* v_src, __m128i* partial_lo, + __m128i* partial_hi) { + __m128i v_pair_add[4]; + // Add vertical source pairs. 
+  v_pair_add[0] = _mm_add_epi16(v_src[0], v_src[1]);
+  v_pair_add[1] = _mm_add_epi16(v_src[2], v_src[3]);
+  v_pair_add[2] = _mm_add_epi16(v_src[4], v_src[5]);
+  v_pair_add[3] = _mm_add_epi16(v_src[6], v_src[7]);
+
+  // 00 01 02 03 04 05 06 07
+  // 10 11 12 13 14 15 16 17
+  *partial_lo = v_pair_add[0];
+  // 00 00 00 00 00 00 00 00
+  // 00 00 00 00 00 00 00 00
+  *partial_hi = _mm_setzero_si128();
+
+  // 00 20 21 22 23 24 25 26
+  // 00 30 31 32 33 34 35 36
+  *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_pair_add[1], 2));
+  // 27 00 00 00 00 00 00 00
+  // 37 00 00 00 00 00 00 00
+  *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_pair_add[1], 14));
+
+  // 00 00 40 41 42 43 44 45
+  // 00 00 50 51 52 53 54 55
+  *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_pair_add[2], 4));
+  // 46 47 00 00 00 00 00 00
+  // 56 57 00 00 00 00 00 00
+  *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_pair_add[2], 12));
+
+  // 00 00 00 60 61 62 63 64
+  // 00 00 00 70 71 72 73 74
+  *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_pair_add[3], 6));
+  // 65 66 67 00 00 00 00 00
+  // 75 76 77 00 00 00 00 00
+  *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_pair_add[3], 10));
+}
+
+LIBGAV1_ALWAYS_INLINE void AddPartial(const uint8_t* LIBGAV1_RESTRICT src,
+                                      ptrdiff_t stride, __m128i* partial_lo,
+                                      __m128i* partial_hi) {
+  // 8x8 input
+  // 00 01 02 03 04 05 06 07
+  // 10 11 12 13 14 15 16 17
+  // 20 21 22 23 24 25 26 27
+  // 30 31 32 33 34 35 36 37
+  // 40 41 42 43 44 45 46 47
+  // 50 51 52 53 54 55 56 57
+  // 60 61 62 63 64 65 66 67
+  // 70 71 72 73 74 75 76 77
+  __m128i v_src[8];
+  for (auto& i : v_src) {
+    i = LoadLo8(src);
+    src += stride;
+  }
+
+  const __m128i v_zero = _mm_setzero_si128();
+  // partial for direction 2
+  // --------------------------------------------------------------------------
+  // partial[2][i] += x;
+  // 00 10 20 30 40 50 60 70 00 00 00 00 00 00 00 00
+  // 01 11 21 31 41 51 61 71 00 00 00 00 00 00 00 00
+  // 02 12 22 32 42 52 62 72 00 00 00 00 00 00 00 00
+  // 03 13 23 33 43 53 63 73 00 00 00 00 00 00 00 00
+  // 04 14 24 34 44 54 64 74 00 00 00 00 00 00 00 00
+  // 05 15 25 35 45 55 65 75 00 00 00 00 00 00 00 00
+  // 06 16 26 36 46 56 66 76 00 00 00 00 00 00 00 00
+  // 07 17 27 37 47 57 67 77 00 00 00 00 00 00 00 00
+  const __m128i v_src_4_0 = _mm_unpacklo_epi64(v_src[0], v_src[4]);
+  const __m128i v_src_5_1 = _mm_unpacklo_epi64(v_src[1], v_src[5]);
+  const __m128i v_src_6_2 = _mm_unpacklo_epi64(v_src[2], v_src[6]);
+  const __m128i v_src_7_3 = _mm_unpacklo_epi64(v_src[3], v_src[7]);
+  const __m128i v_hsum_4_0 = _mm_sad_epu8(v_src_4_0, v_zero);
+  const __m128i v_hsum_5_1 = _mm_sad_epu8(v_src_5_1, v_zero);
+  const __m128i v_hsum_6_2 = _mm_sad_epu8(v_src_6_2, v_zero);
+  const __m128i v_hsum_7_3 = _mm_sad_epu8(v_src_7_3, v_zero);
+  const __m128i v_hsum_1_0 = _mm_unpacklo_epi16(v_hsum_4_0, v_hsum_5_1);
+  const __m128i v_hsum_3_2 = _mm_unpacklo_epi16(v_hsum_6_2, v_hsum_7_3);
+  const __m128i v_hsum_5_4 = _mm_unpackhi_epi16(v_hsum_4_0, v_hsum_5_1);
+  const __m128i v_hsum_7_6 = _mm_unpackhi_epi16(v_hsum_6_2, v_hsum_7_3);
+  partial_lo[2] =
+      _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_hsum_1_0, v_hsum_3_2),
+                         _mm_unpacklo_epi32(v_hsum_5_4, v_hsum_7_6));
+
+  __m128i v_src_16[8];
+  for (int i = 0; i < 8; ++i) {
+    v_src_16[i] = _mm_cvtepu8_epi16(v_src[i]);
+  }
+
+  // partial for direction 6
+  // --------------------------------------------------------------------------
+  // partial[6][j] += x;
+  // 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00 00
+  // 10 11 12 13 14 15 16 17 00 00 00 00 00 00 00 00
+  // 20 21 22 23 24 25 26 27 00 00 00 00 00 00 00 00
+  // 30 31 32 33 34 35 36 37 00 00 00 00 00 00 00 00
+  // 40 41 42 43 44 45 46 47 00 00 00 00 00 00 00 00
+  // 50 51 52 53 54 55 56 57 00 00 00 00 00 00 00 00
+  // 60 61 62 63 64 65 66 67 00 00 00 00 00 00 00 00
+  // 70 71 72 73 74 75 76 77 00 00 00 00 00 00 00 00
+  partial_lo[6] = v_src_16[0];
+  for (int i = 1; i < 8; ++i) {
+    partial_lo[6] = _mm_add_epi16(partial_lo[6], v_src_16[i]);
+  }
+
+  // partial for direction 0
+  AddPartial_D0_D4(v_src_16, &partial_lo[0], &partial_hi[0]);
+
+  // partial for direction 1
+  AddPartial_D1_D3(v_src_16, &partial_lo[1], &partial_hi[1]);
+
+  // partial for direction 7
+  AddPartial_D5_D7(v_src_16, &partial_lo[7], &partial_hi[7]);
+
+  __m128i v_src_reverse[8];
+  const __m128i reverser =
+      _mm_set_epi32(0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e);
+  for (int i = 0; i < 8; ++i) {
+    v_src_reverse[i] = _mm_shuffle_epi8(v_src_16[i], reverser);
+  }
+
+  // partial for direction 4
+  AddPartial_D0_D4(v_src_reverse, &partial_lo[4], &partial_hi[4]);
+
+  // partial for direction 3
+  AddPartial_D1_D3(v_src_reverse, &partial_lo[3], &partial_hi[3]);
+
+  // partial for direction 5
+  AddPartial_D5_D7(v_src_reverse, &partial_lo[5], &partial_hi[5]);
+}
+
+inline uint32_t SumVector_S32(__m128i a) {
+  a = _mm_hadd_epi32(a, a);
+  a = _mm_add_epi32(a, _mm_srli_si128(a, 4));
+  return _mm_cvtsi128_si32(a);
+}
+
+// |cost[0]| and |cost[4]| square the input and sum it with the corresponding
+// element from the other end of the vector, then scale by the matching
+// |kCdefDivisionTable[]| element:
+// cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) *
+//            kCdefDivisionTable[i + 1];
+// cost[0] += Square(partial[0][7]) * kCdefDivisionTable[8];
+inline uint32_t Cost0Or4(const __m128i a, const __m128i b,
+                         const __m128i division_table[2]) {
+  // Reverse and clear upper 2 bytes.
+  const __m128i reverser = _mm_set_epi32(static_cast<int>(0x80800100),
+                                         0x03020504, 0x07060908, 0x0b0a0d0c);
+  // 14 13 12 11 10 09 08 ZZ
+  const __m128i b_reversed = _mm_shuffle_epi8(b, reverser);
+  // 00 14 01 13 02 12 03 11
+  const __m128i ab_lo = _mm_unpacklo_epi16(a, b_reversed);
+  // 04 10 05 09 06 08 07 ZZ
+  const __m128i ab_hi = _mm_unpackhi_epi16(a, b_reversed);
+
+  // Square(partial[0][i]) + Square(partial[0][14 - i])
+  const __m128i square_lo = _mm_madd_epi16(ab_lo, ab_lo);
+  const __m128i square_hi = _mm_madd_epi16(ab_hi, ab_hi);
+
+  const __m128i c = _mm_mullo_epi32(square_lo, division_table[0]);
+  const __m128i d = _mm_mullo_epi32(square_hi, division_table[1]);
+  return SumVector_S32(_mm_add_epi32(c, d));
+}
+
+inline uint32_t CostOdd(const __m128i a, const __m128i b,
+                        const __m128i division_table[2]) {
+  // Reverse and clear upper 10 bytes.
+  const __m128i reverser =
+      _mm_set_epi32(static_cast<int>(0x80808080), static_cast<int>(0x80808080),
+                    static_cast<int>(0x80800100), 0x03020504);
+  // 10 09 08 ZZ ZZ ZZ ZZ ZZ
+  const __m128i b_reversed = _mm_shuffle_epi8(b, reverser);
+  // 00 10 01 09 02 08 03 ZZ
+  const __m128i ab_lo = _mm_unpacklo_epi16(a, b_reversed);
+  // 04 ZZ 05 ZZ 06 ZZ 07 ZZ
+  const __m128i ab_hi = _mm_unpackhi_epi16(a, b_reversed);
+
+  // Square(partial[0][i]) + Square(partial[0][10 - i])
+  const __m128i square_lo = _mm_madd_epi16(ab_lo, ab_lo);
+  const __m128i square_hi = _mm_madd_epi16(ab_hi, ab_hi);
+
+  const __m128i c = _mm_mullo_epi32(square_lo, division_table[0]);
+  const __m128i d = _mm_mullo_epi32(square_hi, division_table[1]);
+  return SumVector_S32(_mm_add_epi32(c, d));
+}
+
+// Sum of squared elements.
+inline uint32_t SquareSum_S16(const __m128i a) {
+  const __m128i square = _mm_madd_epi16(a, a);
+  return SumVector_S32(square);
+}
+
+void CdefDirection_SSE4_1(const void* LIBGAV1_RESTRICT const source,
+                          ptrdiff_t stride,
+                          uint8_t* LIBGAV1_RESTRICT const direction,
+                          int* LIBGAV1_RESTRICT const variance) {
+  assert(direction != nullptr);
+  assert(variance != nullptr);
+  const auto* src = static_cast<const uint8_t*>(source);
+  uint32_t cost[8];
+  __m128i partial_lo[8], partial_hi[8];
+
+  AddPartial(src, stride, partial_lo, partial_hi);
+
+  cost[2] = kCdefDivisionTable[7] * SquareSum_S16(partial_lo[2]);
+  cost[6] = kCdefDivisionTable[7] * SquareSum_S16(partial_lo[6]);
+
+  const __m128i division_table[2] = {LoadUnaligned16(kCdefDivisionTable),
+                                     LoadUnaligned16(kCdefDivisionTable + 4)};
+
+  cost[0] = Cost0Or4(partial_lo[0], partial_hi[0], division_table);
+  cost[4] = Cost0Or4(partial_lo[4], partial_hi[4], division_table);
+
+  const __m128i division_table_odd[2] = {
+      LoadAligned16(kCdefDivisionTableOddPadded),
+      LoadAligned16(kCdefDivisionTableOddPadded + 4)};
+
+  cost[1] = CostOdd(partial_lo[1], partial_hi[1], division_table_odd);
+  cost[3] = CostOdd(partial_lo[3], partial_hi[3], division_table_odd);
+  cost[5] = CostOdd(partial_lo[5], partial_hi[5], division_table_odd);
+  cost[7] = CostOdd(partial_lo[7], partial_hi[7], division_table_odd);
+
+  uint32_t best_cost = 0;
+  *direction = 0;
+  for (int i = 0; i < 8; ++i) {
+    if (cost[i] > best_cost) {
+      best_cost = cost[i];
+      *direction = i;
+    }
+  }
+  *variance = (best_cost - cost[(*direction + 4) & 7]) >> 10;
+}
+
+// -------------------------------------------------------------------------
+// CdefFilter
+
+// Load 4 vectors based on the given |direction|.
+inline void LoadDirection(const uint16_t* LIBGAV1_RESTRICT const src,
+                          const ptrdiff_t stride, __m128i* output,
+                          const int direction) {
+  // Each |direction| describes a different set of source values. Expand this
+  // set by negating each offset pair. For |direction| == 0 this gives a
+  // diagonal line from top right to bottom left. The first value is y, the
+  // second x. Negative y values move up.
+  //  a       b         c       d
+  // {-1, 1}, {1, -1}, {-2, 2}, {2, -2}
+  //         c
+  //       a
+  //     0
+  //   b
+  // d
+  const int y_0 = kCdefDirections[direction][0][0];
+  const int x_0 = kCdefDirections[direction][0][1];
+  const int y_1 = kCdefDirections[direction][1][0];
+  const int x_1 = kCdefDirections[direction][1][1];
+  output[0] = LoadUnaligned16(src - y_0 * stride - x_0);
+  output[1] = LoadUnaligned16(src + y_0 * stride + x_0);
+  output[2] = LoadUnaligned16(src - y_1 * stride - x_1);
+  output[3] = LoadUnaligned16(src + y_1 * stride + x_1);
+}
+
+// Load 4 vectors based on the given |direction|. Use when |block_width| == 4
+// to do 2 rows at a time.
+void LoadDirection4(const uint16_t* LIBGAV1_RESTRICT const src,
+                    const ptrdiff_t stride, __m128i* output,
+                    const int direction) {
+  const int y_0 = kCdefDirections[direction][0][0];
+  const int x_0 = kCdefDirections[direction][0][1];
+  const int y_1 = kCdefDirections[direction][1][0];
+  const int x_1 = kCdefDirections[direction][1][1];
+  output[0] = LoadHi8(LoadLo8(src - y_0 * stride - x_0),
+                      src - y_0 * stride + stride - x_0);
+  output[1] = LoadHi8(LoadLo8(src + y_0 * stride + x_0),
+                      src + y_0 * stride + stride + x_0);
+  output[2] = LoadHi8(LoadLo8(src - y_1 * stride - x_1),
+                      src - y_1 * stride + stride - x_1);
+  output[3] = LoadHi8(LoadLo8(src + y_1 * stride + x_1),
+                      src + y_1 * stride + stride + x_1);
+}
+
+inline __m128i Constrain(const __m128i& pixel, const __m128i& reference,
+                         const __m128i& damping, const __m128i& threshold) {
+  const __m128i diff = _mm_sub_epi16(pixel, reference);
+  const __m128i abs_diff = _mm_abs_epi16(diff);
+  // sign(diff) * Clip3(threshold - (std::abs(diff) >> damping),
+  //                    0, std::abs(diff))
+  const __m128i shifted_diff = _mm_srl_epi16(abs_diff, damping);
+  // For bitdepth == 8, the threshold range is [0, 15] and the damping range
+  // is [3, 6]. If pixel == kCdefLargeValue (0x4000), shifted_diff will always
+  // be larger than threshold, so subtracting with saturation returns 0 when
+  // pixel == kCdefLargeValue.
+  static_assert(kCdefLargeValue == 0x4000, "Invalid kCdefLargeValue");
+  const __m128i thresh_minus_shifted_diff =
+      _mm_subs_epu16(threshold, shifted_diff);
+  const __m128i clamp_abs_diff =
+      _mm_min_epi16(thresh_minus_shifted_diff, abs_diff);
+  // Restore the sign.
+  return _mm_sign_epi16(clamp_abs_diff, diff);
+}
+
+inline __m128i ApplyConstrainAndTap(const __m128i& pixel, const __m128i& val,
+                                    const __m128i& tap, const __m128i& damping,
+                                    const __m128i& threshold) {
+  const __m128i constrained = Constrain(val, pixel, damping, threshold);
+  return _mm_mullo_epi16(constrained, tap);
+}
+
+template <int width, bool enable_primary = true, bool enable_secondary = true>
+void CdefFilter_SSE4_1(const uint16_t* LIBGAV1_RESTRICT src,
+                       const ptrdiff_t src_stride, const int height,
+                       const int primary_strength,
+                       const int secondary_strength, const int damping,
+                       const int direction, void* LIBGAV1_RESTRICT dest,
+                       const ptrdiff_t dst_stride) {
+  static_assert(width == 8 || width == 4, "Invalid CDEF width.");
+  static_assert(enable_primary || enable_secondary, "");
+  constexpr bool clipping_required = enable_primary && enable_secondary;
+  auto* dst = static_cast<uint8_t*>(dest);
+  __m128i primary_damping_shift, secondary_damping_shift;
+
+  // FloorLog2() requires input to be > 0.
+  // 8-bit damping range: Y: [3, 6], UV: [2, 5].
+  if (enable_primary) {
+    // primary_strength: [0, 15] -> FloorLog2: [0, 3] so a clamp is necessary
+    // for UV filtering.
+    primary_damping_shift =
+        _mm_cvtsi32_si128(std::max(0, damping - FloorLog2(primary_strength)));
+  }
+  if (enable_secondary) {
+    // secondary_strength: [0, 4] -> FloorLog2: [0, 2] so no clamp to 0 is
+    // necessary.
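+    // For example, with illustrative values damping == 3 and
+    // secondary_strength == 4, the shift is 3 - FloorLog2(4) == 1.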
+    assert(damping - FloorLog2(secondary_strength) >= 0);
+    secondary_damping_shift =
+        _mm_cvtsi32_si128(damping - FloorLog2(secondary_strength));
+  }
+
+  const __m128i primary_tap_0 =
+      _mm_set1_epi16(kCdefPrimaryTaps[primary_strength & 1][0]);
+  const __m128i primary_tap_1 =
+      _mm_set1_epi16(kCdefPrimaryTaps[primary_strength & 1][1]);
+  const __m128i secondary_tap_0 = _mm_set1_epi16(kCdefSecondaryTap0);
+  const __m128i secondary_tap_1 = _mm_set1_epi16(kCdefSecondaryTap1);
+  const __m128i cdef_large_value_mask =
+      _mm_set1_epi16(static_cast<int16_t>(~kCdefLargeValue));
+  const __m128i primary_threshold = _mm_set1_epi16(primary_strength);
+  const __m128i secondary_threshold = _mm_set1_epi16(secondary_strength);
+
+  int y = height;
+  do {
+    __m128i pixel;
+    if (width == 8) {
+      pixel = LoadUnaligned16(src);
+    } else {
+      pixel = LoadHi8(LoadLo8(src), src + src_stride);
+    }
+
+    __m128i min = pixel;
+    __m128i max = pixel;
+    __m128i sum;
+
+    if (enable_primary) {
+      // Primary |direction|.
+      __m128i primary_val[4];
+      if (width == 8) {
+        LoadDirection(src, src_stride, primary_val, direction);
+      } else {
+        LoadDirection4(src, src_stride, primary_val, direction);
+      }
+
+      if (clipping_required) {
+        min = _mm_min_epu16(min, primary_val[0]);
+        min = _mm_min_epu16(min, primary_val[1]);
+        min = _mm_min_epu16(min, primary_val[2]);
+        min = _mm_min_epu16(min, primary_val[3]);
+
+        // The source is 16 bits; however, we only really care about the lower
+        // 8 bits. The upper 8 bits contain the "large" flag. After the final
+        // primary max has been calculated, zero out the upper 8 bits. Use
+        // this to find the "16 bit" max.
+        const __m128i max_p01 = _mm_max_epu8(primary_val[0], primary_val[1]);
+        const __m128i max_p23 = _mm_max_epu8(primary_val[2], primary_val[3]);
+        const __m128i max_p = _mm_max_epu8(max_p01, max_p23);
+        max = _mm_max_epu16(max, _mm_and_si128(max_p, cdef_large_value_mask));
+      }
+
+      sum = ApplyConstrainAndTap(pixel, primary_val[0], primary_tap_0,
+                                 primary_damping_shift, primary_threshold);
+      sum = _mm_add_epi16(
+          sum, ApplyConstrainAndTap(pixel, primary_val[1], primary_tap_0,
+                                    primary_damping_shift, primary_threshold));
+      sum = _mm_add_epi16(
+          sum, ApplyConstrainAndTap(pixel, primary_val[2], primary_tap_1,
+                                    primary_damping_shift, primary_threshold));
+      sum = _mm_add_epi16(
+          sum, ApplyConstrainAndTap(pixel, primary_val[3], primary_tap_1,
+                                    primary_damping_shift, primary_threshold));
+    } else {
+      sum = _mm_setzero_si128();
+    }
+
+    if (enable_secondary) {
+      // Secondary |direction| values (+/- 2). Clamp |direction|.
+ __m128i secondary_val[8]; + if (width == 8) { + LoadDirection(src, src_stride, secondary_val, direction + 2); + LoadDirection(src, src_stride, secondary_val + 4, direction - 2); + } else { + LoadDirection4(src, src_stride, secondary_val, direction + 2); + LoadDirection4(src, src_stride, secondary_val + 4, direction - 2); + } + + if (clipping_required) { + min = _mm_min_epu16(min, secondary_val[0]); + min = _mm_min_epu16(min, secondary_val[1]); + min = _mm_min_epu16(min, secondary_val[2]); + min = _mm_min_epu16(min, secondary_val[3]); + min = _mm_min_epu16(min, secondary_val[4]); + min = _mm_min_epu16(min, secondary_val[5]); + min = _mm_min_epu16(min, secondary_val[6]); + min = _mm_min_epu16(min, secondary_val[7]); + + const __m128i max_s01 = + _mm_max_epu8(secondary_val[0], secondary_val[1]); + const __m128i max_s23 = + _mm_max_epu8(secondary_val[2], secondary_val[3]); + const __m128i max_s45 = + _mm_max_epu8(secondary_val[4], secondary_val[5]); + const __m128i max_s67 = + _mm_max_epu8(secondary_val[6], secondary_val[7]); + const __m128i max_s = _mm_max_epu8(_mm_max_epu8(max_s01, max_s23), + _mm_max_epu8(max_s45, max_s67)); + max = _mm_max_epu16(max, _mm_and_si128(max_s, cdef_large_value_mask)); + } + + sum = _mm_add_epi16( + sum, + ApplyConstrainAndTap(pixel, secondary_val[0], secondary_tap_0, + secondary_damping_shift, secondary_threshold)); + sum = _mm_add_epi16( + sum, + ApplyConstrainAndTap(pixel, secondary_val[1], secondary_tap_0, + secondary_damping_shift, secondary_threshold)); + sum = _mm_add_epi16( + sum, + ApplyConstrainAndTap(pixel, secondary_val[2], secondary_tap_1, + secondary_damping_shift, secondary_threshold)); + sum = _mm_add_epi16( + sum, + ApplyConstrainAndTap(pixel, secondary_val[3], secondary_tap_1, + secondary_damping_shift, secondary_threshold)); + sum = _mm_add_epi16( + sum, + ApplyConstrainAndTap(pixel, secondary_val[4], secondary_tap_0, + secondary_damping_shift, secondary_threshold)); + sum = _mm_add_epi16( + sum, + ApplyConstrainAndTap(pixel, secondary_val[5], secondary_tap_0, + secondary_damping_shift, secondary_threshold)); + sum = _mm_add_epi16( + sum, + ApplyConstrainAndTap(pixel, secondary_val[6], secondary_tap_1, + secondary_damping_shift, secondary_threshold)); + sum = _mm_add_epi16( + sum, + ApplyConstrainAndTap(pixel, secondary_val[7], secondary_tap_1, + secondary_damping_shift, secondary_threshold)); + } + // Clip3(pixel + ((8 + sum - (sum < 0)) >> 4), min, max)) + const __m128i sum_lt_0 = _mm_srai_epi16(sum, 15); + // 8 + sum + sum = _mm_add_epi16(sum, _mm_set1_epi16(8)); + // (... - (sum < 0)) >> 4 + sum = _mm_add_epi16(sum, sum_lt_0); + sum = _mm_srai_epi16(sum, 4); + // pixel + ... 
+ sum = _mm_add_epi16(sum, pixel); + if (clipping_required) { + // Clip3 + sum = _mm_min_epi16(sum, max); + sum = _mm_max_epi16(sum, min); + } + + const __m128i result = _mm_packus_epi16(sum, sum); + if (width == 8) { + src += src_stride; + StoreLo8(dst, result); + dst += dst_stride; + --y; + } else { + src += src_stride << 1; + Store4(dst, result); + dst += dst_stride; + Store4(dst, _mm_srli_si128(result, 4)); + dst += dst_stride; + y -= 2; + } + } while (y != 0); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(8); + assert(dsp != nullptr); + dsp->cdef_direction = CdefDirection_SSE4_1; + dsp->cdef_filters[0][0] = CdefFilter_SSE4_1<4>; + dsp->cdef_filters[0][1] = + CdefFilter_SSE4_1<4, /*enable_primary=*/true, /*enable_secondary=*/false>; + dsp->cdef_filters[0][2] = CdefFilter_SSE4_1<4, /*enable_primary=*/false>; + dsp->cdef_filters[1][0] = CdefFilter_SSE4_1<8>; + dsp->cdef_filters[1][1] = + CdefFilter_SSE4_1<8, /*enable_primary=*/true, /*enable_secondary=*/false>; + dsp->cdef_filters[1][2] = CdefFilter_SSE4_1<8, /*enable_primary=*/false>; +} + +} // namespace +} // namespace low_bitdepth + +void CdefInit_SSE4_1() { low_bitdepth::Init8bpp(); } + +} // namespace dsp +} // namespace libgav1 +#else // !LIBGAV1_TARGETING_SSE4_1 +namespace libgav1 { +namespace dsp { + +void CdefInit_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_SSE4_1 diff --git a/src/dsp/x86/cdef_sse4.h b/src/dsp/x86/cdef_sse4.h new file mode 100644 index 0000000..6631eb7 --- /dev/null +++ b/src/dsp/x86/cdef_sse4.h @@ -0,0 +1,45 @@ +/* + * Copyright 2020 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_CDEF_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_CDEF_SSE4_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::cdef_direction and Dsp::cdef_filters. This function is not +// thread-safe. +void CdefInit_SSE4_1(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_TARGETING_SSE4_1 + +#ifndef LIBGAV1_Dsp8bpp_CdefDirection +#define LIBGAV1_Dsp8bpp_CdefDirection LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_CdefFilters +#define LIBGAV1_Dsp8bpp_CdefFilters LIBGAV1_CPU_SSE4_1 +#endif + +#endif // LIBGAV1_TARGETING_SSE4_1 + +#endif // LIBGAV1_SRC_DSP_X86_CDEF_SSE4_H_ diff --git a/src/dsp/x86/common_avx2.h b/src/dsp/x86/common_avx2.h new file mode 100644 index 0000000..373116a --- /dev/null +++ b/src/dsp/x86/common_avx2.h @@ -0,0 +1,89 @@ +/* + * Copyright 2020 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_COMMON_AVX2_H_
+#define LIBGAV1_SRC_DSP_X86_COMMON_AVX2_H_
+
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2
+
+#include <immintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+namespace libgav1 {
+namespace dsp {
+namespace avx2 {
+
+#include "src/dsp/x86/common_avx2.inc"
+#include "src/dsp/x86/common_sse4.inc"
+
+}  // namespace avx2
+
+// NOLINTBEGIN(misc-unused-using-decls)
+// These function aliases shall not be visible to external code. They are
+// restricted to x86/*_avx2.cc files only. This scheme exists to distinguish
+// two possible implementations of common functions, which may differ based on
+// whether the compiler is permitted to use avx2 instructions.
+
+// common_sse4.inc
+using avx2::Load2;
+using avx2::Load2x2;
+using avx2::Load4;
+using avx2::Load4x2;
+using avx2::LoadAligned16;
+using avx2::LoadAligned16Msan;
+using avx2::LoadHi8;
+using avx2::LoadHi8Msan;
+using avx2::LoadLo8;
+using avx2::LoadLo8Msan;
+using avx2::LoadUnaligned16;
+using avx2::LoadUnaligned16Msan;
+using avx2::MaskHighNBytes;
+using avx2::RightShiftWithRounding_S16;
+using avx2::RightShiftWithRounding_S32;
+using avx2::RightShiftWithRounding_U16;
+using avx2::RightShiftWithRounding_U32;
+using avx2::Store2;
+using avx2::Store4;
+using avx2::StoreAligned16;
+using avx2::StoreHi8;
+using avx2::StoreLo8;
+using avx2::StoreUnaligned16;
+
+// common_avx2.inc
+using avx2::LoadAligned32;
+using avx2::LoadAligned32Msan;
+using avx2::LoadAligned64;
+using avx2::LoadAligned64Msan;
+using avx2::LoadUnaligned32;
+using avx2::LoadUnaligned32Msan;
+using avx2::SetrM128i;
+using avx2::StoreAligned32;
+using avx2::StoreAligned64;
+using avx2::StoreUnaligned32;
+// NOLINTEND
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_TARGETING_AVX2
+#endif  // LIBGAV1_SRC_DSP_X86_COMMON_AVX2_H_
diff --git a/src/dsp/x86/common_avx2.inc b/src/dsp/x86/common_avx2.inc
new file mode 100644
index 0000000..53b4e2e
--- /dev/null
+++ b/src/dsp/x86/common_avx2.inc
@@ -0,0 +1,121 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+//------------------------------------------------------------------------------
+// Compatibility functions.
+
+inline __m256i SetrM128i(const __m128i lo, const __m128i hi) {
+  // For compatibility with older gcc toolchains (< 8) use
+  // _mm256_inserti128_si256 over _mm256_setr_m128i.
+  // Newer gcc versions implement it similarly to the following; clang uses a
+  // different method, but no differences in the generated assembly have been
+  // observed.
+  return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
+}
+
+//------------------------------------------------------------------------------
+// Load functions.
+
+inline __m256i LoadAligned32(const void* a) {
+  assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+  return _mm256_load_si256(static_cast<const __m256i*>(a));
+}
+
+inline void LoadAligned64(const void* a, __m256i dst[2]) {
+  assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+  dst[0] = _mm256_load_si256(static_cast<const __m256i*>(a) + 0);
+  dst[1] = _mm256_load_si256(static_cast<const __m256i*>(a) + 1);
+}
+
+inline __m256i LoadUnaligned32(const void* a) {
+  return _mm256_loadu_si256(static_cast<const __m256i*>(a));
+}
+
+//------------------------------------------------------------------------------
+// Load functions to avoid MemorySanitizer's use-of-uninitialized-value
+// warning.
+
+inline __m256i MaskOverreads(const __m256i source,
+                             const ptrdiff_t over_read_in_bytes) {
+  __m256i dst = source;
+#if LIBGAV1_MSAN
+  if (over_read_in_bytes >= 32) return _mm256_setzero_si256();
+  if (over_read_in_bytes > 0) {
+    __m128i m = _mm_set1_epi8(-1);
+    for (ptrdiff_t i = 0; i < over_read_in_bytes % 16; ++i) {
+      m = _mm_srli_si128(m, 1);
+    }
+    const __m256i mask = (over_read_in_bytes < 16)
+                             ? SetrM128i(_mm_set1_epi8(-1), m)
+                             : SetrM128i(m, _mm_setzero_si128());
+    dst = _mm256_and_si256(dst, mask);
+  }
+#else
+  static_cast<void>(over_read_in_bytes);
+#endif
+  return dst;
+}
+
+inline __m256i LoadAligned32Msan(const void* const source,
+                                 const ptrdiff_t over_read_in_bytes) {
+  return MaskOverreads(LoadAligned32(source), over_read_in_bytes);
+}
+
+inline void LoadAligned64Msan(const void* const source,
+                              const ptrdiff_t over_read_in_bytes,
+                              __m256i dst[2]) {
+  dst[0] = MaskOverreads(LoadAligned32(source), over_read_in_bytes);
+  dst[1] = MaskOverreads(
+      LoadAligned32(static_cast<const __m256i*>(source) + 1),
+      over_read_in_bytes);
+}
+
+inline __m256i LoadUnaligned32Msan(const void* const source,
+                                   const ptrdiff_t over_read_in_bytes) {
+  return MaskOverreads(LoadUnaligned32(source), over_read_in_bytes);
+}
+
+//------------------------------------------------------------------------------
+// Store functions.
+
+inline void StoreAligned32(void* a, const __m256i v) {
+  assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+  _mm256_store_si256(static_cast<__m256i*>(a), v);
+}
+
+inline void StoreAligned64(void* a, const __m256i v[2]) {
+  assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+  _mm256_store_si256(static_cast<__m256i*>(a) + 0, v[0]);
+  _mm256_store_si256(static_cast<__m256i*>(a) + 1, v[1]);
+}
+
+inline void StoreUnaligned32(void* a, const __m256i v) {
+  _mm256_storeu_si256(static_cast<__m256i*>(a), v);
+}
+
+//------------------------------------------------------------------------------
+// Arithmetic utilities.
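+// Both helpers below compute (v + ((1 << bits) >> 1)) >> bits per lane; for
+// example, with bits == 4 the bias is 8, so 7 rounds to (7 + 8) >> 4 == 0 and
+// 8 rounds to (8 + 8) >> 4 == 1.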
+
+inline __m256i RightShiftWithRounding_S32(const __m256i v_val_d, int bits) {
+  const __m256i v_bias_d = _mm256_set1_epi32((1 << bits) >> 1);
+  const __m256i v_tmp_d = _mm256_add_epi32(v_val_d, v_bias_d);
+  return _mm256_srai_epi32(v_tmp_d, bits);
+}
diff --git a/src/dsp/x86/common_avx2_test.cc b/src/dsp/x86/common_avx2_test.cc
new file mode 100644
index 0000000..2062683
--- /dev/null
+++ b/src/dsp/x86/common_avx2_test.cc
@@ -0,0 +1,67 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/x86/common_avx2.h"
+
+#include "gtest/gtest.h"
+
+#if LIBGAV1_TARGETING_AVX2
+
+#include <cstdint>
+
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Show that RightShiftWithRounding_S16() is equal to
+// RightShiftWithRounding() only for values less than or equal to
+// INT16_MAX - ((1 << bits) >> 1). In particular, if bits == 16, then
+// RightShiftWithRounding_S16() is equal to RightShiftWithRounding() only for
+// negative values.
+TEST(CommonDspTest, AVX2RightShiftWithRoundingS16) {
+  for (int bits = 0; bits < 16; ++bits) {
+    const int bias = (1 << bits) >> 1;
+    for (int32_t value = INT16_MIN; value <= INT16_MAX; ++value) {
+      const __m256i v_val_d = _mm256_set1_epi16(value);
+      const __m256i v_result_d = RightShiftWithRounding_S16(v_val_d, bits);
+      // Note _mm256_extract_epi16 is avoided for compatibility with Visual
+      // Studio < 2017.
+      const int16_t result =
+          _mm_extract_epi16(_mm256_extracti128_si256(v_result_d, 0), 0);
+      const int32_t expected = RightShiftWithRounding(value, bits);
+      if (value <= INT16_MAX - bias) {
+        EXPECT_EQ(result, expected) << "value: " << value << ", bits: " << bits;
+      } else {
+        EXPECT_EQ(expected, 1 << (15 - bits));
+        EXPECT_EQ(result, -expected)
+            << "value: " << value << ", bits: " << bits;
+      }
+    }
+  }
+}
+
+}  // namespace
+}  // namespace dsp
+}  // namespace libgav1
+
+#else  // !LIBGAV1_TARGETING_AVX2
+
+TEST(CommonDspTest, AVX2) {
+  GTEST_SKIP() << "Build this module for x86(-64) with AVX2 enabled to enable "
+                  "the tests.";
+}
+
+#endif  // LIBGAV1_TARGETING_AVX2
diff --git a/src/dsp/x86/common_sse4.h b/src/dsp/x86/common_sse4.h
new file mode 100644
index 0000000..41a3a68
--- /dev/null
+++ b/src/dsp/x86/common_sse4.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_COMMON_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_COMMON_SSE4_H_
+
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#if 0
+#include <cinttypes>
+#include <cstdio>
+
+// Quite useful macro for debugging. Left here for convenience.
+inline void PrintReg(const __m128i r, const char* const name, int size) {
+  int n;
+  union {
+    __m128i r;
+    uint8_t i8[16];
+    uint16_t i16[8];
+    uint32_t i32[4];
+    uint64_t i64[2];
+  } tmp;
+  tmp.r = r;
+  fprintf(stderr, "%s\t: ", name);
+  if (size == 8) {
+    for (n = 0; n < 16; ++n) fprintf(stderr, "%.2x ", tmp.i8[n]);
+  } else if (size == 16) {
+    for (n = 0; n < 8; ++n) fprintf(stderr, "%.4x ", tmp.i16[n]);
+  } else if (size == 32) {
+    for (n = 0; n < 4; ++n) fprintf(stderr, "%.8x ", tmp.i32[n]);
+  } else {
+    for (n = 0; n < 2; ++n)
+      fprintf(stderr, "%.16" PRIx64 " ", static_cast<uint64_t>(tmp.i64[n]));
+  }
+  fprintf(stderr, "\n");
+}
+
+inline void PrintReg(const int r, const char* const name) {
+  fprintf(stderr, "%s: %d\n", name, r);
+}
+
+inline void PrintRegX(const int r, const char* const name) {
+  fprintf(stderr, "%s: %.8x\n", name, r);
+}
+
+#define PR(var, N) PrintReg(var, #var, N)
+#define PD(var) PrintReg(var, #var);
+#define PX(var) PrintRegX(var, #var);
+
+#if LIBGAV1_MSAN
+#include <sanitizer/msan_interface.h>
+
+inline void PrintShadow(const void* r, const char* const name,
+                        const size_t size) {
+  fprintf(stderr, "Shadow for %s:\n", name);
+  __msan_print_shadow(r, size);
+}
+#define PS(var, N) PrintShadow(var, #var, N)
+
+#endif  // LIBGAV1_MSAN
+
+#endif  // 0
+
+namespace libgav1 {
+namespace dsp {
+namespace sse4 {
+
+#include "src/dsp/x86/common_sse4.inc"
+
+}  // namespace sse4
+
+// NOLINTBEGIN(misc-unused-using-decls)
+// These function aliases shall not be visible to external code. They are
+// restricted to x86/*_sse4.cc files only. This scheme exists to distinguish two
+// possible implementations of common functions, which may differ based on
+// whether the compiler is permitted to use avx2 instructions.
+using sse4::Load2;
+using sse4::Load2x2;
+using sse4::Load4;
+using sse4::Load4x2;
+using sse4::LoadAligned16;
+using sse4::LoadAligned16Msan;
+using sse4::LoadHi8;
+using sse4::LoadHi8Msan;
+using sse4::LoadLo8;
+using sse4::LoadLo8Msan;
+using sse4::LoadUnaligned16;
+using sse4::LoadUnaligned16Msan;
+using sse4::MaskHighNBytes;
+using sse4::RightShiftWithRounding_S16;
+using sse4::RightShiftWithRounding_S32;
+using sse4::RightShiftWithRounding_U16;
+using sse4::RightShiftWithRounding_U32;
+using sse4::Store2;
+using sse4::Store4;
+using sse4::StoreAligned16;
+using sse4::StoreHi8;
+using sse4::StoreLo8;
+using sse4::StoreUnaligned16;
+// NOLINTEND
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
+#endif  // LIBGAV1_SRC_DSP_X86_COMMON_SSE4_H_
diff --git a/src/dsp/x86/common_sse4.inc b/src/dsp/x86/common_sse4.inc
new file mode 100644
index 0000000..35c56b8
--- /dev/null
+++ b/src/dsp/x86/common_sse4.inc
@@ -0,0 +1,206 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+//------------------------------------------------------------------------------
+// Load functions.
+
+inline __m128i Load2(const void* src) {
+  int16_t val;
+  memcpy(&val, src, sizeof(val));
+  return _mm_cvtsi32_si128(val);
+}
+
+inline __m128i Load2x2(const void* src1, const void* src2) {
+  uint16_t val1;
+  uint16_t val2;
+  memcpy(&val1, src1, sizeof(val1));
+  memcpy(&val2, src2, sizeof(val2));
+  return _mm_cvtsi32_si128(val1 | (val2 << 16));
+}
+
+// Load 2 uint8_t values into |lane| * 2 and |lane| * 2 + 1.
+template <int lane>
+inline __m128i Load2(const void* const buf, __m128i val) {
+  int16_t temp;
+  memcpy(&temp, buf, 2);
+  return _mm_insert_epi16(val, temp, lane);
+}
+
+inline __m128i Load4(const void* src) {
+  // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
+  // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
+  // movss instruction.
+  //
+  // Until compiler support of _mm_loadu_si32 is widespread, use of
+  // _mm_loadu_si32 is banned.
+  int val;
+  memcpy(&val, src, sizeof(val));
+  return _mm_cvtsi32_si128(val);
+}
+
+inline __m128i Load4x2(const void* src1, const void* src2) {
+  // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
+  // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
+  // movss instruction.
+  //
+  // Until compiler support of _mm_loadu_si32 is widespread, use of
+  // _mm_loadu_si32 is banned.
+  int val1, val2;
+  memcpy(&val1, src1, sizeof(val1));
+  memcpy(&val2, src2, sizeof(val2));
+  return _mm_insert_epi32(_mm_cvtsi32_si128(val1), val2, 1);
+}
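+
+// Note: the memcpy-based loads above are the portable way to express these
+// small unaligned reads; they avoid misaligned or type-punned pointer
+// dereferences, and compilers lower each 2- or 4-byte memcpy to a single
+// load instruction.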
+
+inline __m128i LoadLo8(const void* a) {
+  return _mm_loadl_epi64(static_cast<const __m128i*>(a));
+}
+
+inline __m128i LoadHi8(const __m128i v, const void* a) {
+  const __m128 x =
+      _mm_loadh_pi(_mm_castsi128_ps(v), static_cast<const __m64*>(a));
+  return _mm_castps_si128(x);
+}
+
+inline __m128i LoadUnaligned16(const void* a) {
+  return _mm_loadu_si128(static_cast<const __m128i*>(a));
+}
+
+inline __m128i LoadAligned16(const void* a) {
+  assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0);
+  return _mm_load_si128(static_cast<const __m128i*>(a));
+}
+
+//------------------------------------------------------------------------------
+// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning.
+
+inline __m128i MaskOverreads(const __m128i source,
+                             const ptrdiff_t over_read_in_bytes) {
+  __m128i dst = source;
+#if LIBGAV1_MSAN
+  if (over_read_in_bytes > 0) {
+    __m128i mask = _mm_set1_epi8(-1);
+    for (ptrdiff_t i = 0; i < over_read_in_bytes; ++i) {
+      mask = _mm_srli_si128(mask, 1);
+    }
+    dst = _mm_and_si128(dst, mask);
+  }
+#else
+  static_cast<void>(over_read_in_bytes);
+#endif
+  return dst;
+}
+
+inline __m128i LoadLo8Msan(const void* const source,
+                           const ptrdiff_t over_read_in_bytes) {
+  return MaskOverreads(LoadLo8(source), over_read_in_bytes + 8);
+}
+
+inline __m128i LoadHi8Msan(const __m128i v, const void* source,
+                           const ptrdiff_t over_read_in_bytes) {
+  return MaskOverreads(LoadHi8(v, source), over_read_in_bytes);
+}
+
+inline __m128i LoadAligned16Msan(const void* const source,
+                                 const ptrdiff_t over_read_in_bytes) {
+  return MaskOverreads(LoadAligned16(source), over_read_in_bytes);
+}
+
+inline __m128i LoadUnaligned16Msan(const void* const source,
+                                   const ptrdiff_t over_read_in_bytes) {
+  return MaskOverreads(LoadUnaligned16(source), over_read_in_bytes);
+}
+
+//------------------------------------------------------------------------------
+// Store functions.
+
+inline void Store2(void* dst, const __m128i x) {
+  const int val = _mm_cvtsi128_si32(x);
+  memcpy(dst, &val, 2);
+}
+
+inline void Store4(void* dst, const __m128i x) {
+  const int val = _mm_cvtsi128_si32(x);
+  memcpy(dst, &val, sizeof(val));
+}
+
+inline void StoreLo8(void* a, const __m128i v) {
+  _mm_storel_epi64(static_cast<__m128i*>(a), v);
+}
+
+inline void StoreHi8(void* a, const __m128i v) {
+  _mm_storeh_pi(static_cast<__m64*>(a), _mm_castsi128_ps(v));
+}
+
+inline void StoreAligned16(void* a, const __m128i v) {
+  assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0);
+  _mm_store_si128(static_cast<__m128i*>(a), v);
+}
+
+inline void StoreUnaligned16(void* a, const __m128i v) {
+  _mm_storeu_si128(static_cast<__m128i*>(a), v);
+}
+
+//------------------------------------------------------------------------------
+// Arithmetic utilities.
+
+inline __m128i RightShiftWithRounding_U16(const __m128i v_val_d, int bits) {
+  assert(bits <= 16);
+  // Shift out all but the last bit.
+  const __m128i v_tmp_d = _mm_srli_epi16(v_val_d, bits - 1);
+  // Avg with zero will shift by 1 and round.
+  return _mm_avg_epu16(v_tmp_d, _mm_setzero_si128());
+}
+
+inline __m128i RightShiftWithRounding_S16(const __m128i v_val_d, int bits) {
+  assert(bits < 16);
+  const __m128i v_bias_d =
+      _mm_set1_epi16(static_cast<int16_t>((1 << bits) >> 1));
+  const __m128i v_tmp_d = _mm_add_epi16(v_val_d, v_bias_d);
+  return _mm_srai_epi16(v_tmp_d, bits);
+}
+
+inline __m128i RightShiftWithRounding_U32(const __m128i v_val_d, int bits) {
+  const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+  return _mm_srli_epi32(v_tmp_d, bits);
+}
+
+inline __m128i RightShiftWithRounding_S32(const __m128i v_val_d, int bits) {
+  const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+  return _mm_srai_epi32(v_tmp_d, bits);
+}
+
+// Use this when |bits| is not an immediate value.
+inline __m128i VariableRightShiftWithRounding_S32(const __m128i v_val_d,
+                                                  int bits) {
+  const __m128i v_bias_d =
+      _mm_set1_epi32(static_cast<int>((1 << bits) >> 1));
+  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+  return _mm_sra_epi32(v_tmp_d, _mm_cvtsi32_si128(bits));
+}
+
+//------------------------------------------------------------------------------
+// Masking utilities
+inline __m128i MaskHighNBytes(int n) {
+  static constexpr uint8_t kMask[32] = {
+      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+      0,   0,   0,   0,   0,   255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  };
+
+  return LoadUnaligned16(kMask + n);
+}
diff --git a/src/dsp/x86/common_sse4_test.cc b/src/dsp/x86/common_sse4_test.cc
new file mode 100644
index 0000000..4ea811a
--- /dev/null
+++ b/src/dsp/x86/common_sse4_test.cc
@@ -0,0 +1,64 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/x86/common_sse4.h"
+
+#include "gtest/gtest.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <cstdint>
+
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Show that RightShiftWithRounding_S16() is equal to
+// RightShiftWithRounding() only for values less than or equal to
+// INT16_MAX - ((1 << bits) >> 1). In particular, if bits == 16, then
+// RightShiftWithRounding_S16() is equal to RightShiftWithRounding() only for
+// negative values.
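+//
+// e.g. bits = 1 gives bias = 1: value = INT16_MAX = 32767 wraps to
+// 32767 + 1 = -32768 in a 16-bit lane, so the SIMD result is -16384, while
+// the scalar RightShiftWithRounding(32767, 1) returns 16384 = 1 << (15 - 1).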
+TEST(CommonDspTest, SSE4RightShiftWithRoundingS16) {
+  for (int bits = 0; bits < 16; ++bits) {
+    const int bias = (1 << bits) >> 1;
+    for (int32_t value = INT16_MIN; value <= INT16_MAX; ++value) {
+      const __m128i v_val_d = _mm_set1_epi16(value);
+      const __m128i v_result_d = RightShiftWithRounding_S16(v_val_d, bits);
+      const int16_t result = _mm_extract_epi16(v_result_d, 0);
+      const int32_t expected = RightShiftWithRounding(value, bits);
+      if (value <= INT16_MAX - bias) {
+        EXPECT_EQ(result, expected) << "value: " << value << ", bits: " << bits;
+      } else {
+        EXPECT_EQ(expected, 1 << (15 - bits));
+        EXPECT_EQ(result, -expected)
+            << "value: " << value << ", bits: " << bits;
+      }
+    }
+  }
+}
+
+}  // namespace
+}  // namespace dsp
+}  // namespace libgav1
+
+#else  // !LIBGAV1_TARGETING_SSE4_1
+
+TEST(CommonDspTest, SSE4) {
+  GTEST_SKIP() << "Build this module for x86(-64) with SSE4 enabled to enable "
+                  "the tests.";
+}
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/convolve_avx2.cc b/src/dsp/x86/convolve_avx2.cc
new file mode 100644
index 0000000..4126ca9
--- /dev/null
+++ b/src/dsp/x86/convolve_avx2.cc
@@ -0,0 +1,1549 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/convolve.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2
+#include <immintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_avx2.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+#include "src/dsp/x86/convolve_sse4.inc"
+
+// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
+// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
+// sum from outranging int16_t.
+template <int filter_index>
+__m256i SumOnePassTaps(const __m256i* const src, const __m256i* const taps) {
+  __m256i sum;
+  if (filter_index < 2) {
+    // 6 taps.
+    const __m256i v_madd_21 = _mm256_maddubs_epi16(src[0], taps[0]);  // k2k1
+    const __m256i v_madd_43 = _mm256_maddubs_epi16(src[1], taps[1]);  // k4k3
+    const __m256i v_madd_65 = _mm256_maddubs_epi16(src[2], taps[2]);  // k6k5
+    sum = _mm256_add_epi16(v_madd_21, v_madd_43);
+    sum = _mm256_add_epi16(sum, v_madd_65);
+  } else if (filter_index == 2) {
+    // 8 taps.
+    const __m256i v_madd_10 = _mm256_maddubs_epi16(src[0], taps[0]);  // k1k0
+    const __m256i v_madd_32 = _mm256_maddubs_epi16(src[1], taps[1]);  // k3k2
+    const __m256i v_madd_54 = _mm256_maddubs_epi16(src[2], taps[2]);  // k5k4
+    const __m256i v_madd_76 = _mm256_maddubs_epi16(src[3], taps[3]);  // k7k6
+    const __m256i v_sum_3210 = _mm256_add_epi16(v_madd_10, v_madd_32);
+    const __m256i v_sum_7654 = _mm256_add_epi16(v_madd_54, v_madd_76);
+    sum = _mm256_add_epi16(v_sum_7654, v_sum_3210);
+  } else if (filter_index == 3) {
+    // 2 taps.
+    sum = _mm256_maddubs_epi16(src[0], taps[0]);  // k4k3
+  } else {
+    // 4 taps.
+    const __m256i v_madd_32 = _mm256_maddubs_epi16(src[0], taps[0]);  // k3k2
+    const __m256i v_madd_54 = _mm256_maddubs_epi16(src[1], taps[1]);  // k5k4
+    sum = _mm256_add_epi16(v_madd_32, v_madd_54);
+  }
+  return sum;
+}
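+
+// Rough bound behind the comment above: the AV1 sub-pixel filters have
+// absolute tap sums of at most 256, so with the taps pre-shifted by 1 the
+// accumulated magnitude is at most 255 * 128 = 32640, within int16_t range.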
+
+template <int filter_index>
+__m256i SumHorizontalTaps(const __m256i* const src,
+                          const __m256i* const v_tap) {
+  __m256i v_src[4];
+  const __m256i src_long = *src;
+  const __m256i src_long_dup_lo = _mm256_unpacklo_epi8(src_long, src_long);
+  const __m256i src_long_dup_hi = _mm256_unpackhi_epi8(src_long, src_long);
+
+  if (filter_index < 2) {
+    // 6 taps.
+    v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 3);   // _21
+    v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7);   // _43
+    v_src[2] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 11);  // _65
+  } else if (filter_index == 2) {
+    // 8 taps.
+    v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 1);   // _10
+    v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5);   // _32
+    v_src[2] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9);   // _54
+    v_src[3] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 13);  // _76
+  } else if (filter_index == 3) {
+    // 2 taps.
+    v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7);  // _43
+  } else if (filter_index > 3) {
+    // 4 taps.
+    v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5);  // _32
+    v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9);  // _54
+  }
+  return SumOnePassTaps<filter_index>(v_src, v_tap);
+}
+
+template <int filter_index>
+__m256i SimpleHorizontalTaps(const __m256i* const src,
+                             const __m256i* const v_tap) {
+  __m256i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+
+  // Normally the Horizontal pass does the downshift in two passes:
+  // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+  // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+  // requires adding the rounding offset from the skipped shift.
+  constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
+  sum = _mm256_add_epi16(sum, _mm256_set1_epi16(first_shift_rounding_bit));
+  sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
+  return _mm256_packus_epi16(sum, sum);
+}
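+
+// With kInterRoundBitsHorizontal == 3 and kFilterBits == 7 this evaluates
+// (sum + 2 + 32) >> 6, the same rounding as the two-stage sequence
+// (((sum + 2) >> 2) + 8) >> 4.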
+
+template <int filter_index>
+__m256i HorizontalTaps8To16(const __m256i* const src,
+                            const __m256i* const v_tap) {
+  const __m256i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+
+  return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
+// Filter 2xh sizes.
+template <int num_taps, int filter_index, bool is_2d = false,
+          bool is_compound = false>
+void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
+                      const ptrdiff_t src_stride,
+                      void* LIBGAV1_RESTRICT const dest,
+                      const ptrdiff_t pred_stride, const int /*width*/,
+                      const int height, const __m128i* const v_tap) {
+  auto* dest8 = static_cast<uint8_t*>(dest);
+  auto* dest16 = static_cast<uint16_t*>(dest);
+
+  // Horizontal passes only need to account for |num_taps| 2 and 4 when
+  // |width| <= 4.
+  assert(num_taps <= 4);
+  if (num_taps <= 4) {
+    if (!is_compound) {
+      int y = height;
+      if (is_2d) y -= 1;
+      do {
+        if (is_2d) {
+          const __m128i sum =
+              HorizontalTaps8To16_2x2<num_taps>(src, src_stride, v_tap);
+          Store4(&dest16[0], sum);
+          dest16 += pred_stride;
+          Store4(&dest16[0], _mm_srli_si128(sum, 8));
+          dest16 += pred_stride;
+        } else {
+          const __m128i sum =
+              SimpleHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);
+          Store2(dest8, sum);
+          dest8 += pred_stride;
+          Store2(dest8, _mm_srli_si128(sum, 4));
+          dest8 += pred_stride;
+        }
+
+        src += src_stride << 1;
+        y -= 2;
+      } while (y != 0);
+
+      // The 2d filters have an odd |height| because the horizontal pass
+      // generates context for the vertical pass.
+      if (is_2d) {
+        assert(height % 2 == 1);
+        __m128i sum;
+        const __m128i input = LoadLo8(&src[2]);
+        if (filter_index == 3) {
+          // 03 04 04 05 05 06 06 07 ....
+          const __m128i v_src_43 =
+              _mm_srli_si128(_mm_unpacklo_epi8(input, input), 3);
+          sum = _mm_maddubs_epi16(v_src_43, v_tap[0]);  // k4k3
+        } else {
+          // 02 03 03 04 04 05 05 06 06 07 ....
+          const __m128i v_src_32 =
+              _mm_srli_si128(_mm_unpacklo_epi8(input, input), 1);
+          // 04 05 05 06 06 07 07 08 ...
+          const __m128i v_src_54 = _mm_srli_si128(v_src_32, 4);
+          const __m128i v_madd_32 =
+              _mm_maddubs_epi16(v_src_32, v_tap[0]);  // k3k2
+          const __m128i v_madd_54 =
+              _mm_maddubs_epi16(v_src_54, v_tap[1]);  // k5k4
+          sum = _mm_add_epi16(v_madd_54, v_madd_32);
+        }
+        sum = RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+        Store4(dest16, sum);
+      }
+    }
+  }
+}
+
+// Filter widths >= 4.
+template <int num_taps, int filter_index, bool is_2d = false,
+          bool is_compound = false>
+void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
+                      const ptrdiff_t src_stride,
+                      void* LIBGAV1_RESTRICT const dest,
+                      const ptrdiff_t pred_stride, const int width,
+                      const int height, const __m256i* const v_tap) {
+  auto* dest8 = static_cast<uint8_t*>(dest);
+  auto* dest16 = static_cast<uint16_t*>(dest);
+
+  if (width >= 32) {
+    int y = height;
+    do {
+      int x = 0;
+      do {
+        if (is_2d || is_compound) {
+          // Load into 2 128 bit lanes.
+          const __m256i src_long =
+              SetrM128i(LoadUnaligned16(&src[x]), LoadUnaligned16(&src[x + 8]));
+          const __m256i result =
+              HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+          const __m256i src_long2 = SetrM128i(LoadUnaligned16(&src[x + 16]),
+                                              LoadUnaligned16(&src[x + 24]));
+          const __m256i result2 =
+              HorizontalTaps8To16<filter_index>(&src_long2, v_tap);
+          if (is_2d) {
+            StoreAligned32(&dest16[x], result);
+            StoreAligned32(&dest16[x + 16], result2);
+          } else {
+            StoreUnaligned32(&dest16[x], result);
+            StoreUnaligned32(&dest16[x + 16], result2);
+          }
+        } else {
+          // Load src used to calculate dest8[7:0] and dest8[23:16].
+          const __m256i src_long = LoadUnaligned32(&src[x]);
+          const __m256i result =
+              SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+          // Load src used to calculate dest8[15:8] and dest8[31:24].
+          const __m256i src_long2 = LoadUnaligned32(&src[x + 8]);
+          const __m256i result2 =
+              SimpleHorizontalTaps<filter_index>(&src_long2, v_tap);
+          // Combine results and store.
+          StoreUnaligned32(&dest8[x], _mm256_unpacklo_epi64(result, result2));
+        }
+        x += 32;
+      } while (x < width);
+      src += src_stride;
+      dest8 += pred_stride;
+      dest16 += pred_stride;
+    } while (--y != 0);
+  } else if (width == 16) {
+    int y = height;
+    if (is_2d) y -= 1;
+    do {
+      if (is_2d || is_compound) {
+        // Load into 2 128 bit lanes.
+        const __m256i src_long =
+            SetrM128i(LoadUnaligned16(&src[0]), LoadUnaligned16(&src[8]));
+        const __m256i result =
+            HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+        const __m256i src_long2 =
+            SetrM128i(LoadUnaligned16(&src[src_stride]),
+                      LoadUnaligned16(&src[8 + src_stride]));
+        const __m256i result2 =
+            HorizontalTaps8To16<filter_index>(&src_long2, v_tap);
+        if (is_2d) {
+          StoreAligned32(&dest16[0], result);
+          StoreAligned32(&dest16[pred_stride], result2);
+        } else {
+          StoreUnaligned32(&dest16[0], result);
+          StoreUnaligned32(&dest16[pred_stride], result2);
+        }
+      } else {
+        // Load into 2 128 bit lanes.
+        const __m256i src_long = SetrM128i(LoadUnaligned16(&src[0]),
+                                           LoadUnaligned16(&src[src_stride]));
+        const __m256i result =
+            SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+        const __m256i src_long2 = SetrM128i(
+            LoadUnaligned16(&src[8]), LoadUnaligned16(&src[8 + src_stride]));
+        const __m256i result2 =
+            SimpleHorizontalTaps<filter_index>(&src_long2, v_tap);
+        const __m256i packed_result = _mm256_unpacklo_epi64(result, result2);
+        StoreUnaligned16(&dest8[0], _mm256_castsi256_si128(packed_result));
+        StoreUnaligned16(&dest8[pred_stride],
+                         _mm256_extracti128_si256(packed_result, 1));
+      }
+      src += src_stride * 2;
+      dest8 += pred_stride * 2;
+      dest16 += pred_stride * 2;
+      y -= 2;
+    } while (y != 0);
+
+    // The 2d filters have an odd |height| during the horizontal pass, so
+    // filter the remaining row.
+    if (is_2d) {
+      const __m256i src_long =
+          SetrM128i(LoadUnaligned16(&src[0]), LoadUnaligned16(&src[8]));
+      const __m256i result =
+          HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+      StoreAligned32(&dest16[0], result);
+    }
+
+  } else if (width == 8) {
+    int y = height;
+    if (is_2d) y -= 1;
+    do {
+      // Load into 2 128 bit lanes.
+      const __m128i this_row = LoadUnaligned16(&src[0]);
+      const __m128i next_row = LoadUnaligned16(&src[src_stride]);
+      const __m256i src_long = SetrM128i(this_row, next_row);
+      if (is_2d || is_compound) {
+        const __m256i result =
+            HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+        if (is_2d) {
+          StoreAligned16(&dest16[0], _mm256_castsi256_si128(result));
+          StoreAligned16(&dest16[pred_stride],
+                         _mm256_extracti128_si256(result, 1));
+        } else {
+          StoreUnaligned16(&dest16[0], _mm256_castsi256_si128(result));
+          StoreUnaligned16(&dest16[pred_stride],
+                           _mm256_extracti128_si256(result, 1));
+        }
+      } else {
+        const __m128i this_row = LoadUnaligned16(&src[0]);
+        const __m128i next_row = LoadUnaligned16(&src[src_stride]);
+        // Load into 2 128 bit lanes.
+        const __m256i src_long = SetrM128i(this_row, next_row);
+        const __m256i result =
+            SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+        StoreLo8(&dest8[0], _mm256_castsi256_si128(result));
+        StoreLo8(&dest8[pred_stride], _mm256_extracti128_si256(result, 1));
+      }
+      src += src_stride * 2;
+      dest8 += pred_stride * 2;
+      dest16 += pred_stride * 2;
+      y -= 2;
+    } while (y != 0);
+
+    // The 2d filters have an odd |height| during the horizontal pass, so
+    // filter the remaining row.
+    if (is_2d) {
+      const __m256i src_long = _mm256_castsi128_si256(LoadUnaligned16(&src[0]));
+      const __m256i result =
+          HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+      StoreAligned16(&dest16[0], _mm256_castsi256_si128(result));
+    }
+
+  } else {  // width == 4
+    int y = height;
+    if (is_2d) y -= 1;
+    do {
+      // Load into 2 128 bit lanes.
+      const __m128i this_row = LoadUnaligned16(&src[0]);
+      const __m128i next_row = LoadUnaligned16(&src[src_stride]);
+      const __m256i src_long = SetrM128i(this_row, next_row);
+      if (is_2d || is_compound) {
+        const __m256i result =
+            HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+        StoreLo8(&dest16[0], _mm256_castsi256_si128(result));
+        StoreLo8(&dest16[pred_stride], _mm256_extracti128_si256(result, 1));
+      } else {
+        const __m128i this_row = LoadUnaligned16(&src[0]);
+        const __m128i next_row = LoadUnaligned16(&src[src_stride]);
+        // Load into 2 128 bit lanes.
+        const __m256i src_long = SetrM128i(this_row, next_row);
+        const __m256i result =
+            SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+        Store4(&dest8[0], _mm256_castsi256_si128(result));
+        Store4(&dest8[pred_stride], _mm256_extracti128_si256(result, 1));
+      }
+      src += src_stride * 2;
+      dest8 += pred_stride * 2;
+      dest16 += pred_stride * 2;
+      y -= 2;
+    } while (y != 0);
+
+    // The 2d filters have an odd |height| during the horizontal pass, so
+    // filter the remaining row.
+    if (is_2d) {
+      const __m256i src_long = _mm256_castsi128_si256(LoadUnaligned16(&src[0]));
+      const __m256i result =
+          HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+      StoreLo8(&dest16[0], _mm256_castsi256_si128(result));
+    }
+  }
+}
+
+template <int num_taps, bool is_2d_vertical = false>
+LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter,
+                                     __m256i* v_tap) {
+  if (num_taps == 8) {
+    if (is_2d_vertical) {
+      v_tap[0] = _mm256_broadcastd_epi32(*filter);                      // k1k0
+      v_tap[1] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 4));   // k3k2
+      v_tap[2] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 8));   // k5k4
+      v_tap[3] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 12));  // k7k6
+    } else {
+      v_tap[0] = _mm256_broadcastw_epi16(*filter);                     // k1k0
+      v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 2));  // k3k2
+      v_tap[2] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 4));  // k5k4
+      v_tap[3] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 6));  // k7k6
+    }
+  } else if (num_taps == 6) {
+    if (is_2d_vertical) {
+      v_tap[0] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 2));   // k2k1
+      v_tap[1] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 6));   // k4k3
+      v_tap[2] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 10));  // k6k5
+    } else {
+      v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 1));  // k2k1
+      v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 3));  // k4k3
+      v_tap[2] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 5));  // k6k5
+    }
+  } else if (num_taps == 4) {
+    if (is_2d_vertical) {
+      v_tap[0] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 4));  // k3k2
+      v_tap[1] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 8));  // k5k4
+    } else {
+      v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 2));  // k3k2
+      v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 4));  // k5k4
+    }
+  } else {  // num_taps == 2
+    if (is_2d_vertical) {
+      v_tap[0] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 6));  // k4k3
+    } else {
+      v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 3));  // k4k3
+    }
+  }
+}
+
+template <int num_taps, bool is_compound = false>
+__m256i SimpleSum2DVerticalTaps(const __m256i* const src,
+                                const __m256i* const taps) {
+  __m256i sum_lo =
+      _mm256_madd_epi16(_mm256_unpacklo_epi16(src[0], src[1]), taps[0]);
+  __m256i sum_hi =
+      _mm256_madd_epi16(_mm256_unpackhi_epi16(src[0], src[1]), taps[0]);
+  if (num_taps >= 4) {
+    __m256i madd_lo =
+        _mm256_madd_epi16(_mm256_unpacklo_epi16(src[2], src[3]), taps[1]);
+    __m256i madd_hi =
+        _mm256_madd_epi16(_mm256_unpackhi_epi16(src[2], src[3]), taps[1]);
+    sum_lo = _mm256_add_epi32(sum_lo, madd_lo);
+    sum_hi = _mm256_add_epi32(sum_hi, madd_hi);
+    if (num_taps >= 6) {
+      madd_lo =
+          _mm256_madd_epi16(_mm256_unpacklo_epi16(src[4], src[5]), taps[2]);
+      madd_hi =
+          _mm256_madd_epi16(_mm256_unpackhi_epi16(src[4], src[5]), taps[2]);
+      sum_lo = _mm256_add_epi32(sum_lo, madd_lo);
+      sum_hi = _mm256_add_epi32(sum_hi, madd_hi);
+      if (num_taps == 8) {
+        madd_lo =
+            _mm256_madd_epi16(_mm256_unpacklo_epi16(src[6], src[7]), taps[3]);
+        madd_hi =
+            _mm256_madd_epi16(_mm256_unpackhi_epi16(src[6], src[7]), taps[3]);
+        sum_lo = _mm256_add_epi32(sum_lo, madd_lo);
+        sum_hi = _mm256_add_epi32(sum_hi, madd_hi);
+      }
+    }
+  }
+
+  if (is_compound) {
+    return _mm256_packs_epi32(
+        RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1),
+        RightShiftWithRounding_S32(sum_hi,
+                                   kInterRoundBitsCompoundVertical - 1));
+  }
+
+  return _mm256_packs_epi32(
+      RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
+      RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
+}
+
+template <int num_taps, bool is_compound = false>
+void Filter2DVertical16xH(const uint16_t* LIBGAV1_RESTRICT src,
+                          void* LIBGAV1_RESTRICT const dst,
+                          const ptrdiff_t dst_stride, const int width,
+                          const int height, const __m256i* const taps) {
+  assert(width >= 8);
+  constexpr int next_row = num_taps - 1;
+  // The Horizontal pass uses |width| as |stride| for the intermediate buffer.
+  const ptrdiff_t src_stride = width;
+
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+
+  int x = 0;
+  do {
+    __m256i srcs[8];
+    const uint16_t* src_x = src + x;
+    srcs[0] = LoadAligned32(src_x);
+    src_x += src_stride;
+    if (num_taps >= 4) {
+      srcs[1] = LoadAligned32(src_x);
+      src_x += src_stride;
+      srcs[2] = LoadAligned32(src_x);
+      src_x += src_stride;
+      if (num_taps >= 6) {
+        srcs[3] = LoadAligned32(src_x);
+        src_x += src_stride;
+        srcs[4] = LoadAligned32(src_x);
+        src_x += src_stride;
+        if (num_taps == 8) {
+          srcs[5] = LoadAligned32(src_x);
+          src_x += src_stride;
+          srcs[6] = LoadAligned32(src_x);
+          src_x += src_stride;
+        }
+      }
+    }
+
+    auto* dst8_x = dst8 + x;
+    auto* dst16_x = dst16 + x;
+    int y = height;
+    do {
+      srcs[next_row] = LoadAligned32(src_x);
+      src_x += src_stride;
+
+      const __m256i sum =
+          SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+      if (is_compound) {
+        StoreUnaligned32(dst16_x, sum);
+        dst16_x += dst_stride;
+      } else {
+        const __m128i packed_sum = _mm_packus_epi16(
+            _mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
+        StoreUnaligned16(dst8_x, packed_sum);
+        dst8_x += dst_stride;
+      }
+
+      srcs[0] = srcs[1];
+      if (num_taps >= 4) {
+        srcs[1] = srcs[2];
+        srcs[2] = srcs[3];
+        if (num_taps >= 6) {
+          srcs[3] = srcs[4];
+          srcs[4] = srcs[5];
+          if (num_taps == 8) {
+            srcs[5] = srcs[6];
+            srcs[6] = srcs[7];
+          }
+        }
+      }
+    } while (--y != 0);
+    x += 16;
+  } while (x < width);
+}
+
+template <bool is_2d = false, bool is_compound = false>
+LIBGAV1_ALWAYS_INLINE void DoHorizontalPass2xH(
+    const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride,
+    const int width, const int height, const int filter_id,
+    const int filter_index) {
+  assert(filter_id != 0);
+  __m128i v_tap[4];
+  const __m128i v_horizontal_filter =
+      LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
+
+  if (filter_index == 4) {  // 4 tap.
+    SetupTaps<4>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                               width, height, v_tap);
+  } else if (filter_index == 5) {  // 4 tap.
+    SetupTaps<4>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                               width, height, v_tap);
+  } else {  // 2 tap.
+    SetupTaps<2>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                               width, height, v_tap);
+  }
+}
+
+template <bool is_2d = false, bool is_compound = false>
+LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
+    const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride,
+    const int width, const int height, const int filter_id,
+    const int filter_index) {
+  assert(filter_id != 0);
+  __m256i v_tap[4];
+  const __m128i v_horizontal_filter =
+      LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
+
+  if (filter_index == 2) {  // 8 tap.
+    SetupTaps<8>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<8, 2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                               width, height, v_tap);
+  } else if (filter_index == 1) {  // 6 tap.
+    SetupTaps<6>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<6, 1, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                               width, height, v_tap);
+  } else if (filter_index == 0) {  // 6 tap.
+    SetupTaps<6>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<6, 0, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                               width, height, v_tap);
+  } else if (filter_index == 4) {  // 4 tap.
+    SetupTaps<4>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                               width, height, v_tap);
+  } else if (filter_index == 5) {  // 4 tap.
+    SetupTaps<4>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                               width, height, v_tap);
+  } else {  // 2 tap.
+    SetupTaps<2>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                               width, height, v_tap);
+  }
+}
+
+void Convolve2D_AVX2(const void* LIBGAV1_RESTRICT const reference,
+                     const ptrdiff_t reference_stride,
+                     const int horizontal_filter_index,
+                     const int vertical_filter_index,
+                     const int horizontal_filter_id,
+                     const int vertical_filter_id, const int width,
+                     const int height, void* LIBGAV1_RESTRICT prediction,
+                     const ptrdiff_t pred_stride) {
+  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+
+  // The output of the horizontal filter is guaranteed to fit in 16 bits.
+  alignas(32) uint16_t
+      intermediate_result[kMaxSuperBlockSizeInPixels *
+                          (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+  const int intermediate_height = height + vertical_taps - 1;
+
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* src = static_cast<const uint8_t*>(reference) -
+                    (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
+  if (width > 2) {
+    DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result,
+                                     width, width, intermediate_height,
+                                     horizontal_filter_id, horiz_filter_index);
+  } else {
+    // Use non avx2 version for smaller widths.
+    DoHorizontalPass2xH</*is_2d=*/true>(
+        src, src_stride, intermediate_result, width, width,
+        intermediate_height, horizontal_filter_id, horiz_filter_index);
+  }
+
+  // Vertical filter.
+  auto* dest = static_cast<uint8_t*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride;
+  assert(vertical_filter_id != 0);
+
+  const __m128i v_filter =
+      LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]);
+
+  // Use 256 bits for width > 8.
+  if (width > 8) {
+    __m256i taps_256[4];
+    const __m128i v_filter_ext = _mm_cvtepi8_epi16(v_filter);
+
+    if (vertical_taps == 8) {
+      SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+      Filter2DVertical16xH<8>(intermediate_result, dest, dest_stride, width,
+                              height, taps_256);
+    } else if (vertical_taps == 6) {
+      SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+      Filter2DVertical16xH<6>(intermediate_result, dest, dest_stride, width,
+                              height, taps_256);
+    } else if (vertical_taps == 4) {
+      SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+      Filter2DVertical16xH<4>(intermediate_result, dest, dest_stride, width,
+                              height, taps_256);
+    } else {  // |vertical_taps| == 2
+      SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+      Filter2DVertical16xH<2>(intermediate_result, dest, dest_stride, width,
+                              height, taps_256);
+    }
+  } else {  // width <= 8
+    __m128i taps[4];
+    // Use 128 bit code.
+    if (vertical_taps == 8) {
+      SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps);
+      if (width == 2) {
+        Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height,
+                               taps);
+      } else if (width == 4) {
+        Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height,
+                               taps);
+      } else {
+        Filter2DVertical<8>(intermediate_result, dest, dest_stride, width,
+                            height, taps);
+      }
+    } else if (vertical_taps == 6) {
+      SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps);
+      if (width == 2) {
+        Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height,
+                               taps);
+      } else if (width == 4) {
+        Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height,
+                               taps);
+      } else {
+        Filter2DVertical<6>(intermediate_result, dest, dest_stride, width,
+                            height, taps);
+      }
+    } else if (vertical_taps == 4) {
+      SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps);
+      if (width == 2) {
+        Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height,
+                               taps);
+      } else if (width == 4) {
+        Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height,
+                               taps);
+      } else {
+        Filter2DVertical<4>(intermediate_result, dest, dest_stride, width,
+                            height, taps);
+      }
+    } else {  // |vertical_taps| == 2
+      SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps);
+      if (width == 2) {
+        Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height,
+                               taps);
+      } else if (width == 4) {
+        Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height,
+                               taps);
+      } else {
+        Filter2DVertical<2>(intermediate_result, dest, dest_stride, width,
+                            height, taps);
+      }
+    }
+  }
+}
+
+// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
+// Vertical calculations.
+__m256i Compound1DShift(const __m256i sum) {
+  return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
+template <int filter_index, bool unpack_high = false>
+__m256i SumVerticalTaps(const __m256i* const srcs,
+                        const __m256i* const v_tap) {
+  __m256i v_src[4];
+
+  if (!unpack_high) {
+    if (filter_index < 2) {
+      // 6 taps.
+      v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
+      v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]);
+      v_src[2] = _mm256_unpacklo_epi8(srcs[4], srcs[5]);
+    } else if (filter_index == 2) {
+      // 8 taps.
+      v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
+      v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]);
+      v_src[2] = _mm256_unpacklo_epi8(srcs[4], srcs[5]);
+      v_src[3] = _mm256_unpacklo_epi8(srcs[6], srcs[7]);
+    } else if (filter_index == 3) {
+      // 2 taps.
+      v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
+    } else if (filter_index > 3) {
+      // 4 taps.
+      v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
+      v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]);
+    }
+  } else {
+    if (filter_index < 2) {
+      // 6 taps.
+      v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
+      v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]);
+      v_src[2] = _mm256_unpackhi_epi8(srcs[4], srcs[5]);
+    } else if (filter_index == 2) {
+      // 8 taps.
+      v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
+      v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]);
+      v_src[2] = _mm256_unpackhi_epi8(srcs[4], srcs[5]);
+      v_src[3] = _mm256_unpackhi_epi8(srcs[6], srcs[7]);
+    } else if (filter_index == 3) {
+      // 2 taps.
+      v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
+    } else if (filter_index > 3) {
+      // 4 taps.
+      v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
+      v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]);
+    }
+  }
+  return SumOnePassTaps<filter_index>(v_src, v_tap);
+}
+
+template <int filter_index, bool is_compound = false>
+void FilterVertical32xH(const uint8_t* LIBGAV1_RESTRICT src,
+                        const ptrdiff_t src_stride,
+                        void* LIBGAV1_RESTRICT const dst,
+                        const ptrdiff_t dst_stride, const int width,
+                        const int height, const __m256i* const v_tap) {
+  const int num_taps = GetNumTapsInFilter(filter_index);
+  const int next_row = num_taps - 1;
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+  assert(width >= 32);
+  int x = 0;
+  do {
+    const uint8_t* src_x = src + x;
+    __m256i srcs[8];
+    srcs[0] = LoadUnaligned32(src_x);
+    src_x += src_stride;
+    if (num_taps >= 4) {
+      srcs[1] = LoadUnaligned32(src_x);
+      src_x += src_stride;
+      srcs[2] = LoadUnaligned32(src_x);
+      src_x += src_stride;
+      if (num_taps >= 6) {
+        srcs[3] = LoadUnaligned32(src_x);
+        src_x += src_stride;
+        srcs[4] = LoadUnaligned32(src_x);
+        src_x += src_stride;
+        if (num_taps == 8) {
+          srcs[5] = LoadUnaligned32(src_x);
+          src_x += src_stride;
+          srcs[6] = LoadUnaligned32(src_x);
+          src_x += src_stride;
+        }
+      }
+    }
+
+    auto* dst8_x = dst8 + x;
+    auto* dst16_x = dst16 + x;
+    int y = height;
+    do {
+      srcs[next_row] = LoadUnaligned32(src_x);
+      src_x += src_stride;
+
+      const __m256i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+      const __m256i sums_hi =
+          SumVerticalTaps<filter_index, /*unpack_high=*/true>(srcs, v_tap);
+      if (is_compound) {
+        const __m256i results =
+            Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x20));
+        const __m256i results_hi =
+            Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x31));
+        StoreUnaligned32(dst16_x, results);
+        StoreUnaligned32(dst16_x + 16, results_hi);
+        dst16_x += dst_stride;
+      } else {
+        const __m256i results =
+            RightShiftWithRounding_S16(sums, kFilterBits - 1);
+        const __m256i results_hi =
+            RightShiftWithRounding_S16(sums_hi, kFilterBits - 1);
+        const __m256i packed_results =
+            _mm256_packus_epi16(results, results_hi);
+
+        StoreUnaligned32(dst8_x, packed_results);
+        dst8_x += dst_stride;
+      }
+
+      srcs[0] = srcs[1];
+      if (num_taps >= 4) {
+        srcs[1] = srcs[2];
+        srcs[2] = srcs[3];
+        if (num_taps >= 6) {
+          srcs[3] = srcs[4];
+          srcs[4] = srcs[5];
+          if (num_taps == 8) {
+            srcs[5] = srcs[6];
+            srcs[6] = srcs[7];
+          }
+        }
+      }
+    } while (--y != 0);
+    x += 32;
+  } while (x < width);
+}
+
+template <int filter_index, bool is_compound = false>
+void FilterVertical16xH(const uint8_t* LIBGAV1_RESTRICT src,
+                        const ptrdiff_t src_stride,
+                        void* LIBGAV1_RESTRICT const dst,
+                        const ptrdiff_t dst_stride, const int /*width*/,
+                        const int height, const __m256i* const v_tap) {
+  const int num_taps = GetNumTapsInFilter(filter_index);
+  const int next_row = num_taps;
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+
+  const uint8_t* src_x = src;
+  __m256i srcs[8 + 1];
+  // The upper 128 bits hold the filter data for the next row.
+  srcs[0] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+  src_x += src_stride;
+  if (num_taps >= 4) {
+    srcs[1] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+    src_x += src_stride;
+    srcs[0] =
+        _mm256_inserti128_si256(srcs[0], _mm256_castsi256_si128(srcs[1]), 1);
+    srcs[2] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+    src_x += src_stride;
+    srcs[1] =
+        _mm256_inserti128_si256(srcs[1], _mm256_castsi256_si128(srcs[2]), 1);
+    if (num_taps >= 6) {
+      srcs[3] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+      src_x += src_stride;
+      srcs[2] =
+          _mm256_inserti128_si256(srcs[2], _mm256_castsi256_si128(srcs[3]), 1);
+      srcs[4] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+      src_x += src_stride;
+      srcs[3] =
+          _mm256_inserti128_si256(srcs[3], _mm256_castsi256_si128(srcs[4]), 1);
+      if (num_taps == 8) {
+        srcs[5] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+        src_x += src_stride;
+        srcs[4] = _mm256_inserti128_si256(srcs[4],
+                                          _mm256_castsi256_si128(srcs[5]), 1);
+        srcs[6] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+        src_x += src_stride;
+        srcs[5] = _mm256_inserti128_si256(srcs[5],
+                                          _mm256_castsi256_si128(srcs[6]), 1);
+      }
+    }
+  }
+
+  int y = height;
+  do {
+    srcs[next_row - 1] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+    src_x += src_stride;
+
+    srcs[next_row - 2] = _mm256_inserti128_si256(
+        srcs[next_row - 2], _mm256_castsi256_si128(srcs[next_row - 1]), 1);
+
+    srcs[next_row] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+    src_x += src_stride;
+
+    srcs[next_row - 1] = _mm256_inserti128_si256(
+        srcs[next_row - 1], _mm256_castsi256_si128(srcs[next_row]), 1);
+
+    const __m256i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+    const __m256i sums_hi =
+        SumVerticalTaps<filter_index, /*unpack_high=*/true>(srcs, v_tap);
+    if (is_compound) {
+      const __m256i results =
+          Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x20));
+      const __m256i results_hi =
+          Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x31));
+
+      StoreUnaligned32(dst16, results);
+      StoreUnaligned32(dst16 + dst_stride, results_hi);
+      dst16 += dst_stride << 1;
+    } else {
+      const __m256i results =
+          RightShiftWithRounding_S16(sums, kFilterBits - 1);
+      const __m256i results_hi =
+          RightShiftWithRounding_S16(sums_hi, kFilterBits - 1);
+      const __m256i packed_results = _mm256_packus_epi16(results, results_hi);
+      const __m128i this_dst = _mm256_castsi256_si128(packed_results);
+      const auto next_dst = _mm256_extracti128_si256(packed_results, 1);
+
+      StoreUnaligned16(dst8, this_dst);
+      StoreUnaligned16(dst8 + dst_stride, next_dst);
+      dst8 += dst_stride << 1;
+    }
+
+    srcs[0] = srcs[2];
+    if (num_taps >= 4) {
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      if (num_taps >= 6) {
+        srcs[3] = srcs[5];
+        srcs[4] = srcs[6];
+        if (num_taps == 8) {
+          srcs[5] = srcs[7];
+          srcs[6] = srcs[8];
+        }
+      }
+    }
+    y -= 2;
+  } while (y != 0);
+}
+
+template <int filter_index, bool is_compound = false>
+void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
+                       const ptrdiff_t src_stride,
+                       void* LIBGAV1_RESTRICT const dst,
+                       const ptrdiff_t dst_stride, const int /*width*/,
+                       const int height, const __m256i* const v_tap) {
+  const int num_taps = GetNumTapsInFilter(filter_index);
+  const int next_row = num_taps;
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+
+  const uint8_t* src_x = src;
+  __m256i srcs[8 + 1];
+  // The upper 128 bits hold the filter data for the next row.
+  srcs[0] = _mm256_castsi128_si256(LoadLo8(src_x));
+  src_x += src_stride;
+  if (num_taps >= 4) {
+    srcs[1] = _mm256_castsi128_si256(LoadLo8(src_x));
+    src_x += src_stride;
+    srcs[0] =
+        _mm256_inserti128_si256(srcs[0], _mm256_castsi256_si128(srcs[1]), 1);
+    srcs[2] = _mm256_castsi128_si256(LoadLo8(src_x));
+    src_x += src_stride;
+    srcs[1] =
+        _mm256_inserti128_si256(srcs[1], _mm256_castsi256_si128(srcs[2]), 1);
+    if (num_taps >= 6) {
+      srcs[3] = _mm256_castsi128_si256(LoadLo8(src_x));
+      src_x += src_stride;
+      srcs[2] =
+          _mm256_inserti128_si256(srcs[2], _mm256_castsi256_si128(srcs[3]), 1);
+      srcs[4] = _mm256_castsi128_si256(LoadLo8(src_x));
+      src_x += src_stride;
+      srcs[3] =
+          _mm256_inserti128_si256(srcs[3], _mm256_castsi256_si128(srcs[4]), 1);
+      if (num_taps == 8) {
+        srcs[5] = _mm256_castsi128_si256(LoadLo8(src_x));
+        src_x += src_stride;
+        srcs[4] = _mm256_inserti128_si256(srcs[4],
+                                          _mm256_castsi256_si128(srcs[5]), 1);
+        srcs[6] = _mm256_castsi128_si256(LoadLo8(src_x));
+        src_x += src_stride;
+        srcs[5] = _mm256_inserti128_si256(srcs[5],
+                                          _mm256_castsi256_si128(srcs[6]), 1);
+      }
+    }
+  }
+
+  int y = height;
+  do {
+    srcs[next_row - 1] = _mm256_castsi128_si256(LoadLo8(src_x));
+    src_x += src_stride;
+
+    srcs[next_row - 2] = _mm256_inserti128_si256(
+        srcs[next_row - 2], _mm256_castsi256_si128(srcs[next_row - 1]), 1);
+
+    srcs[next_row] = _mm256_castsi128_si256(LoadLo8(src_x));
+    src_x += src_stride;
+
+    srcs[next_row - 1] = _mm256_inserti128_si256(
+        srcs[next_row - 1], _mm256_castsi256_si128(srcs[next_row]), 1);
+
+    const __m256i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+    if (is_compound) {
+      const __m256i results = Compound1DShift(sums);
+      const __m128i this_dst = _mm256_castsi256_si128(results);
+      const auto next_dst = _mm256_extracti128_si256(results, 1);
+
+      StoreUnaligned16(dst16, this_dst);
+      StoreUnaligned16(dst16 + dst_stride, next_dst);
+      dst16 += dst_stride << 1;
+    } else {
+      const __m256i results =
+          RightShiftWithRounding_S16(sums, kFilterBits - 1);
+      const __m256i packed_results = _mm256_packus_epi16(results, results);
+      const __m128i this_dst = _mm256_castsi256_si128(packed_results);
+      const auto next_dst = _mm256_extracti128_si256(packed_results, 1);
+
+      StoreLo8(dst8, this_dst);
+      StoreLo8(dst8 + dst_stride, next_dst);
+      dst8 += dst_stride << 1;
+    }
+
+    srcs[0] = srcs[2];
+    if (num_taps >= 4) {
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      if (num_taps >= 6) {
+        srcs[3] = srcs[5];
+        srcs[4] = srcs[6];
+        if (num_taps == 8) {
+          srcs[5] = srcs[7];
+          srcs[6] = srcs[8];
+        }
+      }
+    }
+    y -= 2;
+  } while (y != 0);
+}
+
+template <int filter_index, bool is_compound = false>
+void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
+                       const ptrdiff_t src_stride,
+                       void* LIBGAV1_RESTRICT const dst,
+                       const ptrdiff_t dst_stride, const int /*width*/,
+                       const int height, const __m128i* const v_tap) {
+  const int num_taps = GetNumTapsInFilter(filter_index);
+  const int next_row = num_taps - 1;
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+
+  const uint8_t* src_x = src;
+  __m128i srcs[8];
+  srcs[0] = LoadLo8(src_x);
+  src_x += src_stride;
+  if (num_taps >= 4) {
+    srcs[1] = LoadLo8(src_x);
+    src_x += src_stride;
+    srcs[2] = LoadLo8(src_x);
+    src_x += src_stride;
+    if (num_taps >= 6) {
+      srcs[3] = LoadLo8(src_x);
+      src_x += src_stride;
+      srcs[4] = LoadLo8(src_x);
+      src_x += src_stride;
+      if (num_taps == 8) {
+        srcs[5] = LoadLo8(src_x);
+        src_x += src_stride;
+        srcs[6] = LoadLo8(src_x);
+        src_x += src_stride;
+      }
+    }
+  }
+
+  int y = height;
+  do {
+    srcs[next_row] = LoadLo8(src_x);
+    src_x += src_stride;
+
+    const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+    if (is_compound) {
+      const __m128i results = Compound1DShift(sums);
+      StoreUnaligned16(dst16, results);
+      dst16 += dst_stride;
+    } else {
+      const __m128i results =
+          RightShiftWithRounding_S16(sums, kFilterBits - 1);
+      StoreLo8(dst8, _mm_packus_epi16(results, results));
+      dst8 += dst_stride;
+    }
+
+    srcs[0] = srcs[1];
+    if (num_taps >= 4) {
+      srcs[1] = srcs[2];
+      srcs[2] = srcs[3];
+      if (num_taps >= 6) {
+        srcs[3] = srcs[4];
+        srcs[4] = srcs[5];
+        if (num_taps == 8) {
+          srcs[5] = srcs[6];
+          srcs[6] = srcs[7];
+        }
+      }
+    }
+  } while (--y != 0);
+}
+
+void ConvolveVertical_AVX2(const void* LIBGAV1_RESTRICT const reference,
+                           const ptrdiff_t reference_stride,
+                           const int /*horizontal_filter_index*/,
+                           const int vertical_filter_index,
+                           const int /*horizontal_filter_id*/,
+                           const int vertical_filter_id, const int width,
+                           const int height, void* LIBGAV1_RESTRICT prediction,
+                           const ptrdiff_t pred_stride) {
+  const int filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps = GetNumTapsInFilter(filter_index);
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* src = static_cast<const uint8_t*>(reference) -
+                    (vertical_taps / 2 - 1) * src_stride;
+  auto* dest = static_cast<uint8_t*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride;
+  assert(vertical_filter_id != 0);
+
+  const __m128i v_filter =
+      LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
+
+  // Use 256 bits for width > 4.
+  if (width > 4) {
+    __m256i taps_256[4];
+    if (filter_index < 2) {  // 6 tap.
+      SetupTaps<6>(&v_filter, taps_256);
+      if (width == 8) {
+        FilterVertical8xH<0>(src, src_stride, dest, dest_stride, width, height,
+                             taps_256);
+      } else if (width == 16) {
+        FilterVertical16xH<0>(src, src_stride, dest, dest_stride, width,
+                              height, taps_256);
+      } else {
+        FilterVertical32xH<0>(src, src_stride, dest, dest_stride, width,
+                              height, taps_256);
+      }
+    } else if (filter_index == 2) {  // 8 tap.
+      SetupTaps<8>(&v_filter, taps_256);
+      if (width == 8) {
+        FilterVertical8xH<2>(src, src_stride, dest, dest_stride, width, height,
+                             taps_256);
+      } else if (width == 16) {
+        FilterVertical16xH<2>(src, src_stride, dest, dest_stride, width,
+                              height, taps_256);
+      } else {
+        FilterVertical32xH<2>(src, src_stride, dest, dest_stride, width,
+                              height, taps_256);
+      }
+    } else if (filter_index == 3) {  // 2 tap.
+      SetupTaps<2>(&v_filter, taps_256);
+      if (width == 8) {
+        FilterVertical8xH<3>(src, src_stride, dest, dest_stride, width, height,
+                             taps_256);
+      } else if (width == 16) {
+        FilterVertical16xH<3>(src, src_stride, dest, dest_stride, width,
+                              height, taps_256);
+      } else {
+        FilterVertical32xH<3>(src, src_stride, dest, dest_stride, width,
+                              height, taps_256);
+      }
+    } else if (filter_index == 4) {  // 4 tap.
+      SetupTaps<4>(&v_filter, taps_256);
+      if (width == 8) {
+        FilterVertical8xH<4>(src, src_stride, dest, dest_stride, width, height,
+                             taps_256);
+      } else if (width == 16) {
+        FilterVertical16xH<4>(src, src_stride, dest, dest_stride, width,
+                              height, taps_256);
+      } else {
+        FilterVertical32xH<4>(src, src_stride, dest, dest_stride, width,
+                              height, taps_256);
+      }
+    } else {
+      SetupTaps<4>(&v_filter, taps_256);
+      if (width == 8) {
+        FilterVertical8xH<5>(src, src_stride, dest, dest_stride, width, height,
+                             taps_256);
+      } else if (width == 16) {
+        FilterVertical16xH<5>(src, src_stride, dest, dest_stride, width,
+                              height, taps_256);
+      } else {
+        FilterVertical32xH<5>(src, src_stride, dest, dest_stride, width,
+                              height, taps_256);
+      }
+    }
+  } else {  // width <= 4
+    // Use 128 bit code.
+    __m128i taps[4];
+
+    if (filter_index < 2) {  // 6 tap.
+      SetupTaps<6>(&v_filter, taps);
+      if (width == 2) {
+        FilterVertical2xH<6, 0>(src, src_stride, dest, dest_stride, height,
+                                taps);
+      } else {
+        FilterVertical4xH<6, 0>(src, src_stride, dest, dest_stride, height,
+                                taps);
+      }
+    } else if (filter_index == 2) {  // 8 tap.
+      SetupTaps<8>(&v_filter, taps);
+      if (width == 2) {
+        FilterVertical2xH<8, 2>(src, src_stride, dest, dest_stride, height,
+                                taps);
+      } else {
+        FilterVertical4xH<8, 2>(src, src_stride, dest, dest_stride, height,
+                                taps);
+      }
+    } else if (filter_index == 3) {  // 2 tap.
+      SetupTaps<2>(&v_filter, taps);
+      if (width == 2) {
+        FilterVertical2xH<2, 3>(src, src_stride, dest, dest_stride, height,
+                                taps);
+      } else {
+        FilterVertical4xH<2, 3>(src, src_stride, dest, dest_stride, height,
+                                taps);
+      }
+    } else if (filter_index == 4) {  // 4 tap.
+      SetupTaps<4>(&v_filter, taps);
+      if (width == 2) {
+        FilterVertical2xH<4, 4>(src, src_stride, dest, dest_stride, height,
+                                taps);
+      } else {
+        FilterVertical4xH<4, 4>(src, src_stride, dest, dest_stride, height,
+                                taps);
+      }
+    } else {
+      SetupTaps<4>(&v_filter, taps);
+      if (width == 2) {
+        FilterVertical2xH<4, 5>(src, src_stride, dest, dest_stride, height,
+                                taps);
+      } else {
+        FilterVertical4xH<4, 5>(src, src_stride, dest, dest_stride, height,
+                                taps);
+      }
+    }
+  }
+}
+
+void ConvolveCompoundVertical_AVX2(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int vertical_filter_index, const int /*horizontal_filter_id*/,
+    const int vertical_filter_id, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
+  const int filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps = GetNumTapsInFilter(filter_index);
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* src = static_cast<const uint8_t*>(reference) -
+                    (vertical_taps / 2 - 1) * src_stride;
+  auto* dest = static_cast<uint16_t*>(prediction);
+  const ptrdiff_t dest_stride = width;
+  assert(vertical_filter_id != 0);
+
+  const __m128i v_filter =
+      LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
+
+  // Use 256 bits for width > 4.
+  if (width > 4) {
+    __m256i taps_256[4];
+    if (filter_index < 2) {  // 6 tap.
+ SetupTaps<6>(&v_filter, taps_256); + if (width == 8) { + FilterVertical8xH<0, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } else if (width == 16) { + FilterVertical16xH<0, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } else { + FilterVertical32xH<0, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } + } else if (filter_index == 2) { // 8 tap. + SetupTaps<8>(&v_filter, taps_256); + if (width == 8) { + FilterVertical8xH<2, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } else if (width == 16) { + FilterVertical16xH<2, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } else { + FilterVertical32xH<2, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } + } else if (filter_index == 3) { // 2 tap. + SetupTaps<2>(&v_filter, taps_256); + if (width == 8) { + FilterVertical8xH<3, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } else if (width == 16) { + FilterVertical16xH<3, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } else { + FilterVertical32xH<3, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } + } else if (filter_index == 4) { // 4 tap. + SetupTaps<4>(&v_filter, taps_256); + if (width == 8) { + FilterVertical8xH<4, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } else if (width == 16) { + FilterVertical16xH<4, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } else { + FilterVertical32xH<4, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } + } else { + SetupTaps<4>(&v_filter, taps_256); + if (width == 8) { + FilterVertical8xH<5, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } else if (width == 16) { + FilterVertical16xH<5, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } else { + FilterVertical32xH<5, /*is_compound=*/true>( + src, src_stride, dest, dest_stride, width, height, taps_256); + } + } + } else { // width <= 4 + // Use 128 bit code. + __m128i taps[4]; + + if (filter_index < 2) { // 6 tap. + SetupTaps<6>(&v_filter, taps); + FilterVertical4xH<6, 0, /*is_compound=*/true>(src, src_stride, dest, + dest_stride, height, taps); + } else if (filter_index == 2) { // 8 tap. + SetupTaps<8>(&v_filter, taps); + FilterVertical4xH<8, 2, /*is_compound=*/true>(src, src_stride, dest, + dest_stride, height, taps); + } else if (filter_index == 3) { // 2 tap. + SetupTaps<2>(&v_filter, taps); + FilterVertical4xH<2, 3, /*is_compound=*/true>(src, src_stride, dest, + dest_stride, height, taps); + } else if (filter_index == 4) { // 4 tap. 
+      SetupTaps<4>(&v_filter, taps);
+      FilterVertical4xH<4, 4, /*is_compound=*/true>(src, src_stride, dest,
+                                                    dest_stride, height, taps);
+    } else {
+      SetupTaps<4>(&v_filter, taps);
+      FilterVertical4xH<4, 5, /*is_compound=*/true>(src, src_stride, dest,
+                                                    dest_stride, height, taps);
+    }
+  }
+}
+
+void ConvolveHorizontal_AVX2(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int /*vertical_filter_index*/, const int horizontal_filter_id,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
+  const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  // Set |src| to the outermost tap.
+  const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+  auto* dest = static_cast<uint8_t*>(prediction);
+
+  if (width > 2) {
+    DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
+                     horizontal_filter_id, filter_index);
+  } else {
+    // Use the non-AVX2 version for smaller widths.
+    DoHorizontalPass2xH(src, reference_stride, dest, pred_stride, width,
+                        height, horizontal_filter_id, filter_index);
+  }
+}
+
+void ConvolveCompoundHorizontal_AVX2(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int /*vertical_filter_index*/, const int horizontal_filter_id,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
+  const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  // Set |src| to the outermost tap.
+  const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+  auto* dest = static_cast<uint16_t*>(prediction);
+  // All compound functions output to the predictor buffer with |pred_stride|
+  // equal to |width|.
+  assert(pred_stride == width);
+  // Compound functions start at 4x4.
+  assert(width >= 4 && height >= 4);
+
+#ifdef NDEBUG
+  // Quiet compiler error.
+  (void)pred_stride;
+#endif
+
+  DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>(
+      src, reference_stride, dest, width, width, height, horizontal_filter_id,
+      filter_index);
+}
+
+void ConvolveCompound2D_AVX2(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int vertical_filter_index, const int horizontal_filter_id,
+    const int vertical_filter_id, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
+  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+
+  // The output of the horizontal filter is guaranteed to fit in 16 bits.
+  alignas(32) uint16_t
+      intermediate_result[kMaxSuperBlockSizeInPixels *
+                          (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+  const int intermediate_height = height + vertical_taps - 1;
+
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* src = static_cast<const uint8_t*>(reference) -
+                    (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
+  DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
+      src, src_stride, intermediate_result, width, width, intermediate_height,
+      horizontal_filter_id, horiz_filter_index);
+
+  // Vertical filter.
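+  // Worked sizing example (values assumed from src/utils/constants.h): with
+  // kMaxSuperBlockSizeInPixels == 128 and kSubPixelTaps == 8,
+  // |intermediate_result| holds 128 * (128 + 8 - 1) = 17280 uint16_t values,
+  // i.e. the widest block plus the |vertical_taps| - 1 extra context rows
+  // implied by intermediate_height = height + vertical_taps - 1.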
+ auto* dest = static_cast(prediction); + const ptrdiff_t dest_stride = pred_stride; + assert(vertical_filter_id != 0); + + const __m128i v_filter = + LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]); + + // Use 256 bits for width > 8. + if (width > 8) { + __m256i taps_256[4]; + const __m128i v_filter_ext = _mm_cvtepi8_epi16(v_filter); + + if (vertical_taps == 8) { + SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256); + Filter2DVertical16xH<8, /*is_compound=*/true>( + intermediate_result, dest, dest_stride, width, height, taps_256); + } else if (vertical_taps == 6) { + SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256); + Filter2DVertical16xH<6, /*is_compound=*/true>( + intermediate_result, dest, dest_stride, width, height, taps_256); + } else if (vertical_taps == 4) { + SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256); + Filter2DVertical16xH<4, /*is_compound=*/true>( + intermediate_result, dest, dest_stride, width, height, taps_256); + } else { // |vertical_taps| == 2 + SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256); + Filter2DVertical16xH<2, /*is_compound=*/true>( + intermediate_result, dest, dest_stride, width, height, taps_256); + } + } else { // width <= 8 + __m128i taps[4]; + // Use 128 bit code. + if (vertical_taps == 8) { + SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps); + if (width == 4) { + Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest, + dest_stride, height, taps); + } else { + Filter2DVertical<8, /*is_compound=*/true>( + intermediate_result, dest, dest_stride, width, height, taps); + } + } else if (vertical_taps == 6) { + SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps); + if (width == 4) { + Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest, + dest_stride, height, taps); + } else { + Filter2DVertical<6, /*is_compound=*/true>( + intermediate_result, dest, dest_stride, width, height, taps); + } + } else if (vertical_taps == 4) { + SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps); + if (width == 4) { + Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest, + dest_stride, height, taps); + } else { + Filter2DVertical<4, /*is_compound=*/true>( + intermediate_result, dest, dest_stride, width, height, taps); + } + } else { // |vertical_taps| == 2 + SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps); + if (width == 4) { + Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest, + dest_stride, height, taps); + } else { + Filter2DVertical<2, /*is_compound=*/true>( + intermediate_result, dest, dest_stride, width, height, taps); + } + } + } +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + dsp->convolve[0][0][0][1] = ConvolveHorizontal_AVX2; + dsp->convolve[0][0][1][0] = ConvolveVertical_AVX2; + dsp->convolve[0][0][1][1] = Convolve2D_AVX2; + + dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_AVX2; + dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_AVX2; + dsp->convolve[0][1][1][1] = ConvolveCompound2D_AVX2; +} + +} // namespace +} // namespace low_bitdepth + +void ConvolveInit_AVX2() { low_bitdepth::Init8bpp(); } + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_TARGETING_AVX2 +namespace libgav1 { +namespace dsp { + +void ConvolveInit_AVX2() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_AVX2 diff --git a/src/dsp/x86/convolve_avx2.h b/src/dsp/x86/convolve_avx2.h new file mode 100644 index 
0000000..e509bc9
--- /dev/null
+++ b/src/dsp/x86/convolve_avx2.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_CONVOLVE_AVX2_H_
+#define LIBGAV1_SRC_DSP_X86_CONVOLVE_AVX2_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::convolve, see the defines below for specifics. This
+// function is not thread-safe.
+void ConvolveInit_AVX2();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+// If avx2 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the avx2 implementation should be used.
+#if LIBGAV1_TARGETING_AVX2
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveHorizontal
+#define LIBGAV1_Dsp8bpp_ConvolveHorizontal LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveVertical
+#define LIBGAV1_Dsp8bpp_ConvolveVertical LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Convolve2D
+#define LIBGAV1_Dsp8bpp_Convolve2D LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundVertical
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundVertical LIBGAV1_CPU_AVX2
+#endif
+
+#endif  // LIBGAV1_TARGETING_AVX2
+
+#endif  // LIBGAV1_SRC_DSP_X86_CONVOLVE_AVX2_H_
diff --git a/src/dsp/x86/convolve_sse4.cc b/src/dsp/x86/convolve_sse4.cc
new file mode 100644
index 0000000..f7e5a71
--- /dev/null
+++ b/src/dsp/x86/convolve_sse4.cc
@@ -0,0 +1,1923 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/convolve.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+#include "src/dsp/x86/convolve_sse4.inc"
+
+template <int filter_index>
+__m128i SumHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
+                          const __m128i* const v_tap) {
+  __m128i v_src[4];
+  const __m128i src_long = LoadUnaligned16(src);
+  const __m128i src_long_dup_lo = _mm_unpacklo_epi8(src_long, src_long);
+  const __m128i src_long_dup_hi = _mm_unpackhi_epi8(src_long, src_long);
+
+  if (filter_index < 2) {
+    // 6 taps.
+    v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 3);   // _21
+    v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7);   // _43
+    v_src[2] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 11);  // _65
+  } else if (filter_index == 2) {
+    // 8 taps.
+    v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 1);   // _10
+    v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5);   // _32
+    v_src[2] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9);   // _54
+    v_src[3] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 13);  // _76
+  } else if (filter_index == 3) {
+    // 2 taps.
+    v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7);  // _43
+  } else if (filter_index > 3) {
+    // 4 taps.
+    v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5);  // _32
+    v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9);  // _54
+  }
+  const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap);
+  return sum;
+}
+
+template <int filter_index>
+__m128i SimpleHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
+                             const __m128i* const v_tap) {
+  __m128i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+
+  // Normally the Horizontal pass does the downshift in two passes:
+  // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+  // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+  // requires adding the rounding offset from the skipped shift.
+  constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
+  sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit));
+  sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
+  return _mm_packus_epi16(sum, sum);
+}
+
+template <int filter_index>
+__m128i HorizontalTaps8To16(const uint8_t* LIBGAV1_RESTRICT const src,
+                            const __m128i* const v_tap) {
+  const __m128i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+
+  return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
+template <int num_taps, int filter_index, bool is_2d, bool is_compound>
+void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
+                      const ptrdiff_t src_stride,
+                      void* LIBGAV1_RESTRICT const dest,
+                      const ptrdiff_t pred_stride, const int width,
+                      const int height, const __m128i* const v_tap) {
+  auto* dest8 = static_cast<uint8_t*>(dest);
+  auto* dest16 = static_cast<uint16_t*>(dest);
+
+  // 4 tap filters are never used when width > 4.
+  if (num_taps != 4 && width > 4) {
+    int y = height;
+    do {
+      int x = 0;
+      do {
+        if (is_2d || is_compound) {
+          const __m128i v_sum =
+              HorizontalTaps8To16<filter_index>(&src[x], v_tap);
+          if (is_2d) {
+            StoreAligned16(&dest16[x], v_sum);
+          } else {
+            StoreUnaligned16(&dest16[x], v_sum);
+          }
+        } else {
+          const __m128i result =
+              SimpleHorizontalTaps<filter_index>(&src[x], v_tap);
+          StoreLo8(&dest8[x], result);
+        }
+        x += 8;
+      } while (x < width);
+      src += src_stride;
+      dest8 += pred_stride;
+      dest16 += pred_stride;
+    } while (--y != 0);
+    return;
+  }
+
+  // Horizontal passes only need to account for |num_taps| 2 and 4 when
+  // |width| <= 4.
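+  // (GetFilterIndex() maps blocks with width <= 4 onto filter indices 3-5,
+  // which use at most 4 taps; e.g. an 8-tap request for a 4-wide block is
+  // remapped to a 4-tap filter, so this narrow path never sees 6 or 8 taps.)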
+  assert(width <= 4);
+  assert(num_taps <= 4);
+  if (num_taps <= 4) {
+    if (width == 4) {
+      int y = height;
+      do {
+        if (is_2d || is_compound) {
+          const __m128i v_sum = HorizontalTaps8To16<filter_index>(src, v_tap);
+          StoreLo8(dest16, v_sum);
+        } else {
+          const __m128i result =
+              SimpleHorizontalTaps<filter_index>(src, v_tap);
+          Store4(&dest8[0], result);
+        }
+        src += src_stride;
+        dest8 += pred_stride;
+        dest16 += pred_stride;
+      } while (--y != 0);
+      return;
+    }
+
+    if (!is_compound) {
+      int y = height;
+      if (is_2d) y -= 1;
+      do {
+        if (is_2d) {
+          const __m128i sum =
+              HorizontalTaps8To16_2x2<num_taps>(src, src_stride, v_tap);
+          Store4(&dest16[0], sum);
+          dest16 += pred_stride;
+          Store4(&dest16[0], _mm_srli_si128(sum, 8));
+          dest16 += pred_stride;
+        } else {
+          const __m128i sum =
+              SimpleHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);
+          Store2(dest8, sum);
+          dest8 += pred_stride;
+          Store2(dest8, _mm_srli_si128(sum, 4));
+          dest8 += pred_stride;
+        }
+
+        src += src_stride << 1;
+        y -= 2;
+      } while (y != 0);
+
+      // The 2d filters have an odd |height| because the horizontal pass
+      // generates context for the vertical pass.
+      if (is_2d) {
+        assert(height % 2 == 1);
+        __m128i sum;
+        const __m128i input = LoadLo8(&src[2]);
+        if (filter_index == 3) {
+          // 03 04 04 05 05 06 06 07 ....
+          const __m128i v_src_43 =
+              _mm_srli_si128(_mm_unpacklo_epi8(input, input), 3);
+          sum = _mm_maddubs_epi16(v_src_43, v_tap[0]);  // k4k3
+        } else {
+          // 02 03 03 04 04 05 05 06 06 07 ....
+          const __m128i v_src_32 =
+              _mm_srli_si128(_mm_unpacklo_epi8(input, input), 1);
+          // 04 05 05 06 06 07 07 08 ...
+          const __m128i v_src_54 = _mm_srli_si128(v_src_32, 4);
+          const __m128i v_madd_32 =
+              _mm_maddubs_epi16(v_src_32, v_tap[0]);  // k3k2
+          const __m128i v_madd_54 =
+              _mm_maddubs_epi16(v_src_54, v_tap[1]);  // k5k4
+          sum = _mm_add_epi16(v_madd_54, v_madd_32);
+        }
+        sum = RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+        Store4(dest16, sum);
+      }
+    }
+  }
+}
+
+template <bool is_2d = false, bool is_compound = false>
+LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
+    const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+    void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride,
+    const int width, const int height, const int filter_id,
+    const int filter_index) {
+  assert(filter_id != 0);
+  __m128i v_tap[4];
+  const __m128i v_horizontal_filter =
+      LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
+
+  if (filter_index == 2) {  // 8 tap.
+    SetupTaps<8>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<8, 2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                               width, height, v_tap);
+  } else if (filter_index == 1) {  // 6 tap.
+    SetupTaps<6>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<6, 1, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                               width, height, v_tap);
+  } else if (filter_index == 0) {  // 6 tap.
+    SetupTaps<6>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<6, 0, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                               width, height, v_tap);
+  } else if (filter_index == 4) {  // 4 tap.
+    SetupTaps<4>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                               width, height, v_tap);
+  } else if (filter_index == 5) {  // 4 tap.
+    SetupTaps<4>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                               width, height, v_tap);
+  } else {  // 2 tap.
+    SetupTaps<2>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                               width, height, v_tap);
+  }
+}
+
+void Convolve2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
+                       const ptrdiff_t reference_stride,
+                       const int horizontal_filter_index,
+                       const int vertical_filter_index,
+                       const int horizontal_filter_id,
+                       const int vertical_filter_id, const int width,
+                       const int height, void* LIBGAV1_RESTRICT prediction,
+                       const ptrdiff_t pred_stride) {
+  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+
+  // The output of the horizontal filter is guaranteed to fit in 16 bits.
+  alignas(16) uint16_t
+      intermediate_result[kMaxSuperBlockSizeInPixels *
+                          (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+  const int intermediate_height = height + vertical_taps - 1;
+
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* src = static_cast<const uint8_t*>(reference) -
+                    (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
+
+  DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result, width,
+                                   width, intermediate_height,
+                                   horizontal_filter_id, horiz_filter_index);
+
+  // Vertical filter.
+  auto* dest = static_cast<uint8_t*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride;
+  assert(vertical_filter_id != 0);
+
+  __m128i taps[4];
+  const __m128i v_filter =
+      LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]);
+
+  if (vertical_taps == 8) {
+    SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps);
+    if (width == 2) {
+      Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else if (width == 4) {
+      Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else {
+      Filter2DVertical<8>(intermediate_result, dest, dest_stride, width,
+                          height, taps);
+    }
+  } else if (vertical_taps == 6) {
+    SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps);
+    if (width == 2) {
+      Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else if (width == 4) {
+      Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else {
+      Filter2DVertical<6>(intermediate_result, dest, dest_stride, width,
+                          height, taps);
+    }
+  } else if (vertical_taps == 4) {
+    SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps);
+    if (width == 2) {
+      Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else if (width == 4) {
+      Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else {
+      Filter2DVertical<4>(intermediate_result, dest, dest_stride, width,
+                          height, taps);
+    }
+  } else {  // |vertical_taps| == 2
+    SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps);
+    if (width == 2) {
+      Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else if (width == 4) {
+      Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else {
+      Filter2DVertical<2>(intermediate_result, dest, dest_stride, width,
+                          height, taps);
+    }
+  }
+}
+
+template <int filter_index, bool is_compound = false>
+void FilterVertical(const uint8_t* LIBGAV1_RESTRICT src,
+                    const ptrdiff_t src_stride,
+                    void* LIBGAV1_RESTRICT const dst,
+                    const ptrdiff_t dst_stride, const int width,
+                    const int height, const __m128i* const v_tap) {
+  const int num_taps = GetNumTapsInFilter(filter_index);
+  const int next_row = num_taps - 1;
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+  assert(width >= 8);
+
+  int x = 0;
+  do {
+    const uint8_t* src_x = src + x;
+    __m128i srcs[8];
+    srcs[0] = LoadLo8(src_x);
+    src_x += src_stride;
+    if (num_taps >= 4) {
+      srcs[1] = LoadLo8(src_x);
+      src_x += src_stride;
+      srcs[2] = LoadLo8(src_x);
+      src_x += src_stride;
+      if (num_taps >= 6) {
+        srcs[3] = LoadLo8(src_x);
+        src_x += src_stride;
+        srcs[4] = LoadLo8(src_x);
+        src_x += src_stride;
+        if (num_taps == 8) {
+          srcs[5] = LoadLo8(src_x);
+          src_x += src_stride;
+          srcs[6] = LoadLo8(src_x);
+          src_x += src_stride;
+        }
+      }
+    }
+
+    auto* dst8_x = dst8 + x;
+    auto* dst16_x = dst16 + x;
+    int y = height;
+    do {
+      srcs[next_row] = LoadLo8(src_x);
+      src_x += src_stride;
+
+      const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+      if (is_compound) {
+        const __m128i results = Compound1DShift(sums);
+        StoreUnaligned16(dst16_x, results);
+        dst16_x += dst_stride;
+      } else {
+        const __m128i results =
+            RightShiftWithRounding_S16(sums, kFilterBits - 1);
+        StoreLo8(dst8_x, _mm_packus_epi16(results, results));
+        dst8_x += dst_stride;
+      }
+
+      srcs[0] = srcs[1];
+      if (num_taps >= 4) {
+        srcs[1] = srcs[2];
+        srcs[2] = srcs[3];
+        if (num_taps >= 6) {
+          srcs[3] = srcs[4];
+          srcs[4] = srcs[5];
+          if (num_taps == 8) {
+            srcs[5] = srcs[6];
+            srcs[6] = srcs[7];
+          }
+        }
+      }
+    } while (--y != 0);
+    x += 8;
+  } while (x < width);
+}
+
+void ConvolveVertical_SSE4_1(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int vertical_filter_index, const int /*horizontal_filter_id*/,
+    const int vertical_filter_id, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
+  const int filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps = GetNumTapsInFilter(filter_index);
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* src = static_cast<const uint8_t*>(reference) -
+                    (vertical_taps / 2 - 1) * src_stride;
+  auto* dest = static_cast<uint8_t*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride;
+  assert(vertical_filter_id != 0);
+
+  __m128i taps[4];
+  const __m128i v_filter =
+      LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
+
+  if (filter_index < 2) {  // 6 tap.
+    SetupTaps<6>(&v_filter, taps);
+    if (width == 2) {
+      FilterVertical2xH<6, 0>(src, src_stride, dest, dest_stride, height, taps);
+    } else if (width == 4) {
+      FilterVertical4xH<6, 0>(src, src_stride, dest, dest_stride, height, taps);
+    } else {
+      FilterVertical<0>(src, src_stride, dest, dest_stride, width, height,
+                        taps);
+    }
+  } else if (filter_index == 2) {  // 8 tap.
+    SetupTaps<8>(&v_filter, taps);
+    if (width == 2) {
+      FilterVertical2xH<8, 2>(src, src_stride, dest, dest_stride, height, taps);
+    } else if (width == 4) {
+      FilterVertical4xH<8, 2>(src, src_stride, dest, dest_stride, height, taps);
+    } else {
+      FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
+                        taps);
+    }
+  } else if (filter_index == 3) {  // 2 tap.
+    SetupTaps<2>(&v_filter, taps);
+    if (width == 2) {
+      FilterVertical2xH<2, 3>(src, src_stride, dest, dest_stride, height, taps);
+    } else if (width == 4) {
+      FilterVertical4xH<2, 3>(src, src_stride, dest, dest_stride, height, taps);
+    } else {
+      FilterVertical<3>(src, src_stride, dest, dest_stride, width, height,
+                        taps);
+    }
+  } else if (filter_index == 4) {  // 4 tap.
+    SetupTaps<4>(&v_filter, taps);
+    if (width == 2) {
+      FilterVertical2xH<4, 4>(src, src_stride, dest, dest_stride, height, taps);
+    } else if (width == 4) {
+      FilterVertical4xH<4, 4>(src, src_stride, dest, dest_stride, height, taps);
+    } else {
+      FilterVertical<4>(src, src_stride, dest, dest_stride, width, height,
+                        taps);
+    }
+  } else {
+    // TODO(slavarnway): Investigate adding |filter_index| == 1 special cases.
+    // See convolve_neon.cc
+    SetupTaps<4>(&v_filter, taps);
+
+    if (width == 2) {
+      FilterVertical2xH<4, 5>(src, src_stride, dest, dest_stride, height, taps);
+    } else if (width == 4) {
+      FilterVertical4xH<4, 5>(src, src_stride, dest, dest_stride, height, taps);
+    } else {
+      FilterVertical<5>(src, src_stride, dest, dest_stride, width, height,
+                        taps);
+    }
+  }
+}
+
+void ConvolveCompoundCopy_SSE4(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+    const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
+  const auto* src = static_cast<const uint8_t*>(reference);
+  const ptrdiff_t src_stride = reference_stride;
+  auto* dest = static_cast<uint16_t*>(prediction);
+  constexpr int kRoundBitsVertical =
+      kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
+  if (width >= 16) {
+    int y = height;
+    do {
+      int x = 0;
+      do {
+        const __m128i v_src = LoadUnaligned16(&src[x]);
+        const __m128i v_src_ext_lo = _mm_cvtepu8_epi16(v_src);
+        const __m128i v_src_ext_hi =
+            _mm_cvtepu8_epi16(_mm_srli_si128(v_src, 8));
+        const __m128i v_dest_lo =
+            _mm_slli_epi16(v_src_ext_lo, kRoundBitsVertical);
+        const __m128i v_dest_hi =
+            _mm_slli_epi16(v_src_ext_hi, kRoundBitsVertical);
+        // TODO(slavarnway): Investigate using aligned stores.
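+        // Worked example of the shift above, assuming the 8bpp constants
+        // kInterRoundBitsVertical == 11 and
+        // kInterRoundBitsCompoundVertical == 7: kRoundBitsVertical == 4, so
+        // each source pixel is stored as pixel << 4, the same scale the
+        // filtering compound paths produce.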
+ StoreUnaligned16(&dest[x], v_dest_lo); + StoreUnaligned16(&dest[x + 8], v_dest_hi); + x += 16; + } while (x < width); + src += src_stride; + dest += pred_stride; + } while (--y != 0); + } else if (width == 8) { + int y = height; + do { + const __m128i v_src = LoadLo8(&src[0]); + const __m128i v_src_ext = _mm_cvtepu8_epi16(v_src); + const __m128i v_dest = _mm_slli_epi16(v_src_ext, kRoundBitsVertical); + StoreUnaligned16(&dest[0], v_dest); + src += src_stride; + dest += pred_stride; + } while (--y != 0); + } else { /* width == 4 */ + int y = height; + do { + const __m128i v_src0 = Load4(&src[0]); + const __m128i v_src1 = Load4(&src[src_stride]); + const __m128i v_src = _mm_unpacklo_epi32(v_src0, v_src1); + const __m128i v_src_ext = _mm_cvtepu8_epi16(v_src); + const __m128i v_dest = _mm_slli_epi16(v_src_ext, kRoundBitsVertical); + StoreLo8(&dest[0], v_dest); + StoreHi8(&dest[pred_stride], v_dest); + src += src_stride * 2; + dest += pred_stride * 2; + y -= 2; + } while (y != 0); + } +} + +void ConvolveCompoundVertical_SSE4_1( + const void* LIBGAV1_RESTRICT const reference, + const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, + const int vertical_filter_index, const int /*horizontal_filter_id*/, + const int vertical_filter_id, const int width, const int height, + void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) { + const int filter_index = GetFilterIndex(vertical_filter_index, height); + const int vertical_taps = GetNumTapsInFilter(filter_index); + const ptrdiff_t src_stride = reference_stride; + const auto* src = static_cast(reference) - + (vertical_taps / 2 - 1) * src_stride; + auto* dest = static_cast(prediction); + assert(vertical_filter_id != 0); + + __m128i taps[4]; + const __m128i v_filter = + LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]); + + if (filter_index < 2) { // 6 tap. + SetupTaps<6>(&v_filter, taps); + if (width == 4) { + FilterVertical4xH<6, 0, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); + } else { + FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width, + width, height, taps); + } + } else if (filter_index == 2) { // 8 tap. + SetupTaps<8>(&v_filter, taps); + + if (width == 4) { + FilterVertical4xH<8, 2, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); + } else { + FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width, + width, height, taps); + } + } else if (filter_index == 3) { // 2 tap. + SetupTaps<2>(&v_filter, taps); + + if (width == 4) { + FilterVertical4xH<2, 3, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); + } else { + FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width, + width, height, taps); + } + } else if (filter_index == 4) { // 4 tap. 
+    SetupTaps<4>(&v_filter, taps);
+
+    if (width == 4) {
+      FilterVertical4xH<4, 4, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                    height, taps);
+    } else {
+      FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps);
+    }
+  } else {
+    SetupTaps<4>(&v_filter, taps);
+
+    if (width == 4) {
+      FilterVertical4xH<4, 5, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                    height, taps);
+    } else {
+      FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps);
+    }
+  }
+}
+
+void ConvolveHorizontal_SSE4_1(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int /*vertical_filter_index*/, const int horizontal_filter_id,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
+  const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  // Set |src| to the outermost tap.
+  const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+  auto* dest = static_cast<uint8_t*>(prediction);
+
+  DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
+                   horizontal_filter_id, filter_index);
+}
+
+void ConvolveCompoundHorizontal_SSE4_1(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int /*vertical_filter_index*/, const int horizontal_filter_id,
+    const int /*vertical_filter_id*/, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
+  const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+  auto* dest = static_cast<uint16_t*>(prediction);
+
+  DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>(
+      src, reference_stride, dest, width, width, height, horizontal_filter_id,
+      filter_index);
+}
+
+void ConvolveCompound2D_SSE4_1(
+    const void* LIBGAV1_RESTRICT const reference,
+    const ptrdiff_t reference_stride, const int horizontal_filter_index,
+    const int vertical_filter_index, const int horizontal_filter_id,
+    const int vertical_filter_id, const int width, const int height,
+    void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
+  // The output of the horizontal filter, i.e. the intermediate_result, is
+  // guaranteed to fit in int16_t.
+  alignas(16) uint16_t
+      intermediate_result[kMaxSuperBlockSizeInPixels *
+                          (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+
+  // Horizontal filter.
+  // Filter types used for width <= 4 are different from those for width > 4.
+  // When width > 4, the valid filter index range is always [0, 3].
+  // When width <= 4, the valid filter index range is always [4, 5].
+  // Similarly for height.
+  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+  const int intermediate_height = height + vertical_taps - 1;
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* const src = static_cast<const uint8_t*>(reference) -
+                          (vertical_taps / 2 - 1) * src_stride -
+                          kHorizontalOffset;
+
+  DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
+      src, src_stride, intermediate_result, width, width, intermediate_height,
+      horizontal_filter_id, horiz_filter_index);
+
+  // Vertical filter.
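+  // Note on precision, assuming 8bpp constants: the horizontal pass above
+  // shifts its sums by kInterRoundBitsHorizontal - 1 (one bit less than usual
+  // because the taps are halved), leaving 16-bit intermediate rows; the
+  // vertical pass below then applies the compound rounding shift (see
+  // Filter2DVertical / Sum2DVerticalTaps).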
+  auto* dest = static_cast<uint16_t*>(prediction);
+  assert(vertical_filter_id != 0);
+
+  const ptrdiff_t dest_stride = width;
+  __m128i taps[4];
+  const __m128i v_filter =
+      LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]);
+
+  if (vertical_taps == 8) {
+    SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps);
+    if (width == 4) {
+      Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest,
+                                                   dest_stride, height, taps);
+    } else {
+      Filter2DVertical<8, /*is_compound=*/true>(
+          intermediate_result, dest, dest_stride, width, height, taps);
+    }
+  } else if (vertical_taps == 6) {
+    SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps);
+    if (width == 4) {
+      Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest,
+                                                   dest_stride, height, taps);
+    } else {
+      Filter2DVertical<6, /*is_compound=*/true>(
+          intermediate_result, dest, dest_stride, width, height, taps);
+    }
+  } else if (vertical_taps == 4) {
+    SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps);
+    if (width == 4) {
+      Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest,
+                                                   dest_stride, height, taps);
+    } else {
+      Filter2DVertical<4, /*is_compound=*/true>(
+          intermediate_result, dest, dest_stride, width, height, taps);
+    }
+  } else {  // |vertical_taps| == 2
+    SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps);
+    if (width == 4) {
+      Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest,
+                                                   dest_stride, height, taps);
+    } else {
+      Filter2DVertical<2, /*is_compound=*/true>(
+          intermediate_result, dest, dest_stride, width, height, taps);
+    }
+  }
+}
+
+// Pre-transposed filters.
+template <int filter_index>
+inline void GetHalfSubPixelFilter(__m128i* output) {
+  // Filter 0
+  alignas(
+      16) static constexpr int8_t kHalfSubPixel6TapSignedFilterColumns[6][16] =
+      {{0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0},
+       {0, -3, -5, -6, -7, -7, -8, -7, -7, -6, -6, -6, -5, -4, -2, -1},
+       {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+       {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+       {0, -1, -2, -4, -5, -6, -6, -6, -7, -7, -8, -7, -7, -6, -5, -3},
+       {0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}};
+  // Filter 1
+  alignas(16) static constexpr int8_t
+      kHalfSubPixel6TapMixedSignedFilterColumns[6][16] = {
+          {0, 1, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0},
+          {0, 14, 13, 11, 10, 9, 8, 8, 7, 6, 5, 4, 3, 2, 2, 1},
+          {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
+          {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
+          {0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 13, 14},
+          {0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 1}};
+  // Filter 2
+  alignas(
+      16) static constexpr int8_t kHalfSubPixel8TapSignedFilterColumns[8][16] =
+      {{0, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, 0},
+       {0, 1, 3, 4, 5, 5, 5, 5, 6, 5, 4, 4, 3, 3, 2, 1},
+       {0, -3, -6, -9, -11, -11, -12, -12, -12, -11, -10, -9, -7, -5, -3, -1},
+       {64, 63, 62, 60, 58, 54, 50, 45, 40, 35, 30, 24, 19, 13, 8, 4},
+       {0, 4, 8, 13, 19, 24, 30, 35, 40, 45, 50, 54, 58, 60, 62, 63},
+       {0, -1, -3, -5, -7, -9, -10, -11, -12, -12, -12, -11, -11, -9, -6, -3},
+       {0, 1, 2, 3, 3, 4, 4, 5, 6, 5, 5, 5, 5, 4, 3, 1},
+       {0, 0, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1}};
+  // Filter 3
+  alignas(16) static constexpr uint8_t kHalfSubPixel2TapFilterColumns[2][16] = {
+      {64, 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4},
+      {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60}};
+  // Filter 4
+  alignas(16) static constexpr int8_t
kHalfSubPixel4TapSignedFilterColumns[4][16] = + {{0, -2, -4, -5, -6, -6, -7, -6, -6, -5, -5, -5, -4, -3, -2, -1}, + {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4}, + {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63}, + {0, -1, -2, -3, -4, -5, -5, -5, -6, -6, -7, -6, -6, -5, -4, -2}}; + // Filter 5 + alignas( + 16) static constexpr uint8_t kSubPixel4TapPositiveFilterColumns[4][16] = { + {0, 15, 13, 11, 10, 9, 8, 7, 6, 6, 5, 4, 3, 2, 2, 1}, + {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17}, + {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31}, + {0, 1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11, 13, 15}}; + switch (filter_index) { + case 0: + output[0] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[0]); + output[1] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[1]); + output[2] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[2]); + output[3] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[3]); + output[4] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[4]); + output[5] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[5]); + break; + case 1: + // The term "mixed" refers to the fact that the outer taps have a mix of + // negative and positive values. + output[0] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[0]); + output[1] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[1]); + output[2] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[2]); + output[3] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[3]); + output[4] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[4]); + output[5] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[5]); + break; + case 2: + output[0] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[0]); + output[1] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[1]); + output[2] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[2]); + output[3] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[3]); + output[4] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[4]); + output[5] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[5]); + output[6] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[6]); + output[7] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[7]); + break; + case 3: + output[0] = LoadAligned16(kHalfSubPixel2TapFilterColumns[0]); + output[1] = LoadAligned16(kHalfSubPixel2TapFilterColumns[1]); + break; + case 4: + output[0] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[0]); + output[1] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[1]); + output[2] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[2]); + output[3] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[3]); + break; + default: + assert(filter_index == 5); + output[0] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[0]); + output[1] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[1]); + output[2] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[2]); + output[3] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[3]); + break; + } +} + +// There are many opportunities for overreading in scaled convolve, because +// the range of starting points for filter windows is anywhere from 0 to 16 +// for 8 destination pixels, and the window sizes range from 2 to 8. To +// accommodate this range concisely, we use |grade_x| to mean the most steps +// in src that can be traversed in a single |step_x| increment, i.e. 1 or 2. 
+// More importantly, |grade_x| answers the question "how many vector loads are
+// needed to cover the source values?"
+// When |grade_x| == 1, the maximum number of source values needed is 8
+// separate starting positions plus 7 more to cover taps, all fitting into 16
+// bytes.
+// When |grade_x| > 1, we are guaranteed to exceed 8 whole steps in src for
+// every 8 |step_x| increments, on top of 8 possible taps. The first load
+// covers the starting sources for each kernel, while the final load covers
+// the taps.
+// Since the offset value of src_x cannot exceed 8 and |num_taps| does not
+// exceed 4 when width <= 4, |grade_x| is set to 1 regardless of the value of
+// |step_x|.
+template <int num_taps, int grade_x>
+inline void PrepareSourceVectors(const uint8_t* LIBGAV1_RESTRICT src,
+                                 const __m128i src_indices,
+                                 __m128i* const source /*[num_taps >> 1]*/) {
+  // |used_bytes| is only computed in msan builds. Mask away unused bytes for
+  // msan because it incorrectly models the outcome of the shuffles in some
+  // cases. This has not been reproduced out of context.
+  const int used_bytes = _mm_extract_epi8(src_indices, 15) + 1 + num_taps - 2;
+  const __m128i src_vals = LoadUnaligned16Msan(src, 16 - used_bytes);
+  source[0] = _mm_shuffle_epi8(src_vals, src_indices);
+  if (grade_x == 1) {
+    if (num_taps > 2) {
+      source[1] = _mm_shuffle_epi8(_mm_srli_si128(src_vals, 2), src_indices);
+    }
+    if (num_taps > 4) {
+      source[2] = _mm_shuffle_epi8(_mm_srli_si128(src_vals, 4), src_indices);
+    }
+    if (num_taps > 6) {
+      source[3] = _mm_shuffle_epi8(_mm_srli_si128(src_vals, 6), src_indices);
+    }
+  } else {
+    assert(grade_x > 1);
+    assert(num_taps != 4);
+    // grade_x > 1 also means width >= 8 && num_taps != 4
+    const __m128i src_vals_ext = LoadLo8Msan(src + 16, 24 - used_bytes);
+    if (num_taps > 2) {
+      source[1] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 2),
+                                   src_indices);
+      source[2] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 4),
+                                   src_indices);
+    }
+    if (num_taps > 6) {
+      source[3] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 6),
+                                   src_indices);
+    }
+  }
+}
+
+template <int num_taps>
+inline void PrepareHorizontalTaps(const __m128i subpel_indices,
+                                  const __m128i* filter_taps,
+                                  __m128i* out_taps) {
+  const __m128i scale_index_offsets =
+      _mm_srli_epi16(subpel_indices, kFilterIndexShift);
+  const __m128i filter_index_mask = _mm_set1_epi8(kSubPixelMask);
+  const __m128i filter_indices =
+      _mm_and_si128(_mm_packus_epi16(scale_index_offsets, scale_index_offsets),
+                    filter_index_mask);
+  // Line up taps for maddubs_epi16.
+  // The unpack is also assumed to be lighter than shift+alignr.
+  for (int k = 0; k < (num_taps >> 1); ++k) {
+    const __m128i taps0 = _mm_shuffle_epi8(filter_taps[2 * k], filter_indices);
+    const __m128i taps1 =
+        _mm_shuffle_epi8(filter_taps[2 * k + 1], filter_indices);
+    out_taps[k] = _mm_unpacklo_epi8(taps0, taps1);
+  }
+}
+
+inline __m128i HorizontalScaleIndices(const __m128i subpel_indices) {
+  const __m128i src_indices16 =
+      _mm_srli_epi16(subpel_indices, kScaleSubPixelBits);
+  const __m128i src_indices = _mm_packus_epi16(src_indices16, src_indices16);
+  return _mm_unpacklo_epi8(src_indices,
+                           _mm_add_epi8(src_indices, _mm_set1_epi8(1)));
+}
+
+template <int grade_x, int filter_index, int num_taps>
+inline void ConvolveHorizontalScale(const uint8_t* LIBGAV1_RESTRICT src,
+                                    ptrdiff_t src_stride, int width,
+                                    int subpixel_x, int step_x,
+                                    int intermediate_height,
+                                    int16_t* LIBGAV1_RESTRICT intermediate) {
+  // Account for the 0-taps that precede the 2 nonzero taps.
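+  // e.g. for num_taps == 2 the 8-entry kernel is {0, 0, 0, k3, k4, 0, 0, 0},
+  // so kernel_offset == (8 - 2) >> 1 == 3 skips the three leading zero taps,
+  // while a full 8-tap filter gets kernel_offset == 0.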
+  const int kernel_offset = (8 - num_taps) >> 1;
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  const int step_x8 = step_x << 3;
+  __m128i filter_taps[num_taps];
+  GetHalfSubPixelFilter<filter_index>(filter_taps);
+  const __m128i index_steps =
+      _mm_mullo_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0),
+                      _mm_set1_epi16(static_cast<int16_t>(step_x)));
+
+  __m128i taps[num_taps >> 1];
+  __m128i source[num_taps >> 1];
+  int p = subpixel_x;
+  // Case when width <= 4 is possible.
+  if (filter_index >= 3) {
+    if (filter_index > 3 || width <= 4) {
+      const uint8_t* src_x =
+          &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+      // Only add steps to the 10-bit truncated p to avoid overflow.
+      const __m128i p_fraction = _mm_set1_epi16(p & 1023);
+      const __m128i subpel_indices = _mm_add_epi16(index_steps, p_fraction);
+      PrepareHorizontalTaps<num_taps>(subpel_indices, filter_taps, taps);
+      const __m128i packed_indices = HorizontalScaleIndices(subpel_indices);
+
+      int y = intermediate_height;
+      do {
+        // Load and line up source values with the taps. Width 4 means no need
+        // to load extended source.
+        PrepareSourceVectors<num_taps, /*grade_x=*/1>(src_x, packed_indices,
+                                                      source);
+
+        StoreLo8(intermediate,
+                 RightShiftWithRounding_S16(
+                     SumOnePassTaps<filter_index>(source, taps),
+                     kInterRoundBitsHorizontal - 1));
+        src_x += src_stride;
+        intermediate += kIntermediateStride;
+      } while (--y != 0);
+      return;
+    }
+  }
+
+  // |width| >= 8
+  int16_t* intermediate_x = intermediate;
+  int x = 0;
+  do {
+    const uint8_t* src_x =
+        &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+    // Only add steps to the 10-bit truncated p to avoid overflow.
+    const __m128i p_fraction = _mm_set1_epi16(p & 1023);
+    const __m128i subpel_indices = _mm_add_epi16(index_steps, p_fraction);
+    PrepareHorizontalTaps<num_taps>(subpel_indices, filter_taps, taps);
+    const __m128i packed_indices = HorizontalScaleIndices(subpel_indices);
+
+    int y = intermediate_height;
+    do {
+      // For each x, a lane of src_k[k] contains src_x[k].
+      PrepareSourceVectors<num_taps, grade_x>(src_x, packed_indices, source);
+
+      // Shift by one less because the taps are halved.
+      StoreAligned16(
+          intermediate_x,
+          RightShiftWithRounding_S16(SumOnePassTaps<filter_index>(source, taps),
+                                     kInterRoundBitsHorizontal - 1));
+      src_x += src_stride;
+      intermediate_x += kIntermediateStride;
+    } while (--y != 0);
+    x += 8;
+    p += step_x8;
+  } while (x < width);
+}
+
+template <int num_taps>
+inline void PrepareVerticalTaps(const int8_t* LIBGAV1_RESTRICT taps,
+                                __m128i* output) {
+  // Avoid overreading the filter due to starting at kernel_offset.
+  // The only danger of overread is in the final filter, which has 4 taps.
+  const __m128i filter =
+      _mm_cvtepi8_epi16((num_taps > 4) ? LoadLo8(taps) : Load4(taps));
+  output[0] = _mm_shuffle_epi32(filter, 0);
+  if (num_taps > 2) {
+    output[1] = _mm_shuffle_epi32(filter, 0x55);
+  }
+  if (num_taps > 4) {
+    output[2] = _mm_shuffle_epi32(filter, 0xAA);
+  }
+  if (num_taps > 6) {
+    output[3] = _mm_shuffle_epi32(filter, 0xFF);
+  }
+}
+
+// Process eight 16 bit inputs and output eight 16 bit values.
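+// Sketch of the reduction: rows are interleaved in pairs so _mm_madd_epi16
+// can apply two taps per multiply, e.g. the low half accumulates
+// (s0*t0 + s1*t1) + (s2*t2 + s3*t3) + ... in 32-bit lanes before a rounding
+// shift narrows the result back to 16 bits.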
+template <int num_taps, bool is_compound>
+inline __m128i Sum2DVerticalTaps(const __m128i* const src,
+                                 const __m128i* taps) {
+  const __m128i src_lo_01 = _mm_unpacklo_epi16(src[0], src[1]);
+  __m128i sum_lo = _mm_madd_epi16(src_lo_01, taps[0]);
+  const __m128i src_hi_01 = _mm_unpackhi_epi16(src[0], src[1]);
+  __m128i sum_hi = _mm_madd_epi16(src_hi_01, taps[0]);
+  if (num_taps > 2) {
+    const __m128i src_lo_23 = _mm_unpacklo_epi16(src[2], src[3]);
+    sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_23, taps[1]));
+    const __m128i src_hi_23 = _mm_unpackhi_epi16(src[2], src[3]);
+    sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_23, taps[1]));
+  }
+  if (num_taps > 4) {
+    const __m128i src_lo_45 = _mm_unpacklo_epi16(src[4], src[5]);
+    sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_45, taps[2]));
+    const __m128i src_hi_45 = _mm_unpackhi_epi16(src[4], src[5]);
+    sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_45, taps[2]));
+  }
+  if (num_taps > 6) {
+    const __m128i src_lo_67 = _mm_unpacklo_epi16(src[6], src[7]);
+    sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_67, taps[3]));
+    const __m128i src_hi_67 = _mm_unpackhi_epi16(src[6], src[7]);
+    sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_67, taps[3]));
+  }
+  if (is_compound) {
+    return _mm_packs_epi32(
+        RightShiftWithRounding_S32(sum_lo,
+                                   kInterRoundBitsCompoundVertical - 1),
+        RightShiftWithRounding_S32(sum_hi,
+                                   kInterRoundBitsCompoundVertical - 1));
+  }
+  return _mm_packs_epi32(
+      RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
+      RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
+}
+
+// Bottom half of each src[k] is the source for one filter, and the top half
+// is the source for the other filter, for the next destination row.
+template <int num_taps, bool is_compound>
+__m128i Sum2DVerticalTaps4x2(const __m128i* const src, const __m128i* taps_lo,
+                             const __m128i* taps_hi) {
+  const __m128i src_lo_01 = _mm_unpacklo_epi16(src[0], src[1]);
+  __m128i sum_lo = _mm_madd_epi16(src_lo_01, taps_lo[0]);
+  const __m128i src_hi_01 = _mm_unpackhi_epi16(src[0], src[1]);
+  __m128i sum_hi = _mm_madd_epi16(src_hi_01, taps_hi[0]);
+  if (num_taps > 2) {
+    const __m128i src_lo_23 = _mm_unpacklo_epi16(src[2], src[3]);
+    sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_23, taps_lo[1]));
+    const __m128i src_hi_23 = _mm_unpackhi_epi16(src[2], src[3]);
+    sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_23, taps_hi[1]));
+  }
+  if (num_taps > 4) {
+    const __m128i src_lo_45 = _mm_unpacklo_epi16(src[4], src[5]);
+    sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_45, taps_lo[2]));
+    const __m128i src_hi_45 = _mm_unpackhi_epi16(src[4], src[5]);
+    sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_45, taps_hi[2]));
+  }
+  if (num_taps > 6) {
+    const __m128i src_lo_67 = _mm_unpacklo_epi16(src[6], src[7]);
+    sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_67, taps_lo[3]));
+    const __m128i src_hi_67 = _mm_unpackhi_epi16(src[6], src[7]);
+    sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_67, taps_hi[3]));
+  }
+
+  if (is_compound) {
+    return _mm_packs_epi32(
+        RightShiftWithRounding_S32(sum_lo,
+                                   kInterRoundBitsCompoundVertical - 1),
+        RightShiftWithRounding_S32(sum_hi,
+                                   kInterRoundBitsCompoundVertical - 1));
+  }
+  return _mm_packs_epi32(
+      RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
+      RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
+}
+
+// |width_class| is 2, 4, or 8, according to the Store function that should be
+// used.
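+// That is, width_class 2 pairs with Store2, 4 with Store4, and 8 with
+// StoreLo8/StoreUnaligned16; the 2- and 4-wide paths below also compute two
+// output rows per iteration, each row with its own filter phase.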
+template <int num_taps, int width_class, bool is_compound>
+inline void ConvolveVerticalScale(const int16_t* LIBGAV1_RESTRICT src,
+                                  const int intermediate_height,
+                                  const int width, const int subpixel_y,
+                                  const int filter_index, const int step_y,
+                                  const int height, void* LIBGAV1_RESTRICT dest,
+                                  const ptrdiff_t dest_stride) {
+  constexpr ptrdiff_t src_stride = kIntermediateStride;
+  constexpr int kernel_offset = (8 - num_taps) / 2;
+  const int16_t* src_y = src;
+  // |dest| is 16-bit in compound mode, Pixel otherwise.
+  auto* dest16_y = static_cast<uint16_t*>(dest);
+  auto* dest_y = static_cast<uint8_t*>(dest);
+  __m128i s[num_taps];
+
+  int p = subpixel_y & 1023;
+  int y = height;
+  if (width_class <= 4) {
+    __m128i filter_taps_lo[num_taps >> 1];
+    __m128i filter_taps_hi[num_taps >> 1];
+    do {  // y > 0
+      for (int i = 0; i < num_taps; ++i) {
+        s[i] = LoadLo8(src_y + i * src_stride);
+      }
+      int filter_id = (p >> 6) & kSubPixelMask;
+      const int8_t* filter0 =
+          kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset;
+      PrepareVerticalTaps<num_taps>(filter0, filter_taps_lo);
+      p += step_y;
+      src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+
+      for (int i = 0; i < num_taps; ++i) {
+        s[i] = LoadHi8(s[i], src_y + i * src_stride);
+      }
+      filter_id = (p >> 6) & kSubPixelMask;
+      const int8_t* filter1 =
+          kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset;
+      PrepareVerticalTaps<num_taps>(filter1, filter_taps_hi);
+      p += step_y;
+      src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+
+      const __m128i sums = Sum2DVerticalTaps4x2<num_taps, is_compound>(
+          s, filter_taps_lo, filter_taps_hi);
+      if (is_compound) {
+        assert(width_class > 2);
+        StoreLo8(dest16_y, sums);
+        dest16_y += dest_stride;
+        StoreHi8(dest16_y, sums);
+        dest16_y += dest_stride;
+      } else {
+        const __m128i result = _mm_packus_epi16(sums, sums);
+        if (width_class == 2) {
+          Store2(dest_y, result);
+          dest_y += dest_stride;
+          Store2(dest_y, _mm_srli_si128(result, 4));
+        } else {
+          Store4(dest_y, result);
+          dest_y += dest_stride;
+          Store4(dest_y, _mm_srli_si128(result, 4));
+        }
+        dest_y += dest_stride;
+      }
+      y -= 2;
+    } while (y != 0);
+    return;
+  }
+
+  // |width_class| >= 8
+  __m128i filter_taps[num_taps >> 1];
+  int x = 0;
+  do {  // x < width
+    auto* dest_y = static_cast<uint8_t*>(dest) + x;
+    auto* dest16_y = static_cast<uint16_t*>(dest) + x;
+    int p = subpixel_y & 1023;
+    int y = height;
+    do {  // y > 0
+      const int filter_id = (p >> 6) & kSubPixelMask;
+      const int8_t* filter =
+          kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset;
+      PrepareVerticalTaps<num_taps>(filter, filter_taps);
+
+      src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+      for (int i = 0; i < num_taps; ++i) {
+        s[i] = LoadUnaligned16(src_y + i * src_stride);
+      }
+
+      const __m128i sums =
+          Sum2DVerticalTaps<num_taps, is_compound>(s, filter_taps);
+      if (is_compound) {
+        StoreUnaligned16(dest16_y, sums);
+      } else {
+        StoreLo8(dest_y, _mm_packus_epi16(sums, sums));
+      }
+      p += step_y;
+      dest_y += dest_stride;
+      dest16_y += dest_stride;
+    } while (--y != 0);
+    src += kIntermediateStride * intermediate_height;
+    x += 8;
+  } while (x < width);
+}
+
+template <bool is_compound>
+void ConvolveScale2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
+                            const ptrdiff_t reference_stride,
+                            const int horizontal_filter_index,
+                            const int vertical_filter_index,
+                            const int subpixel_x, const int subpixel_y,
+                            const int step_x, const int step_y, const int width,
+                            const int height, void* LIBGAV1_RESTRICT prediction,
+                            const ptrdiff_t pred_stride) {
+  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+  assert(step_x <= 2048);
+
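+  // 2048 is 2.0 in the Q10 fixed-point format used for |step_x|/|step_y|
+  // (kScaleSubPixelBits == 10), i.e. at most two source pixels are traversed
+  // per output pixel, which also bounds the |grade_x| selection below to 1
+  // or 2.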
// The output of the horizontal filter, i.e. the intermediate_result, is + // guaranteed to fit in int16_t. + alignas(16) int16_t + intermediate_result[kIntermediateAllocWidth * + (2 * kIntermediateAllocWidth + kSubPixelTaps)]; + const int num_vert_taps = GetNumTapsInFilter(vert_filter_index); + const int intermediate_height = + (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >> + kScaleSubPixelBits) + + num_vert_taps; + + // Horizontal filter. + // Filter types used for width <= 4 are different from those for width > 4. + // When width > 4, the valid filter index range is always [0, 3]. + // When width <= 4, the valid filter index range is always [3, 5]. + // Similarly for height. + int16_t* intermediate = intermediate_result; + const ptrdiff_t src_stride = reference_stride; + const auto* src = static_cast(reference); + const int vert_kernel_offset = (8 - num_vert_taps) / 2; + src += vert_kernel_offset * src_stride; + + // Derive the maximum value of |step_x| at which all source values fit in one + // 16-byte load. Final index is src_x + |num_taps| - 1 < 16 + // step_x*7 is the final base sub-pixel index for the shuffle mask for filter + // inputs in each iteration on large blocks. When step_x is large, we need a + // second register and alignr in order to gather all filter inputs. + // |num_taps| - 1 is the offset for the shuffle of inputs to the final tap. + const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index); + const int kernel_start_ceiling = 16 - num_horiz_taps; + // This truncated quotient |grade_x_threshold| selects |step_x| such that: + // (step_x * 7) >> kScaleSubPixelBits < single load limit + const int grade_x_threshold = + (kernel_start_ceiling << kScaleSubPixelBits) / 7; + switch (horiz_filter_index) { + case 0: + if (step_x > grade_x_threshold) { + ConvolveHorizontalScale<2, 0, 6>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + } else { + ConvolveHorizontalScale<1, 0, 6>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + } + break; + case 1: + if (step_x > grade_x_threshold) { + ConvolveHorizontalScale<2, 1, 6>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + + } else { + ConvolveHorizontalScale<1, 1, 6>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + } + break; + case 2: + if (step_x > grade_x_threshold) { + ConvolveHorizontalScale<2, 2, 8>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + } else { + ConvolveHorizontalScale<1, 2, 8>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + } + break; + case 3: + if (step_x > grade_x_threshold) { + ConvolveHorizontalScale<2, 3, 2>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + } else { + ConvolveHorizontalScale<1, 3, 2>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + } + break; + case 4: + assert(width <= 4); + ConvolveHorizontalScale<1, 4, 4>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + break; + default: + assert(horiz_filter_index == 5); + assert(width <= 4); + ConvolveHorizontalScale<1, 5, 4>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + } + + // Vertical filter. 
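+  // Worked example of the rows consumed here, with assumed values
+  // height == 32, step_y == 1448 (~1.41x) and an 8-tap vertical filter:
+  // intermediate_height = ((31 * 1448 + 1023) >> 10) + 8 = 44 + 8 = 52 rows
+  // of 16-bit horizontal output per 8-wide column strip.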
+ intermediate = intermediate_result; + switch (vert_filter_index) { + case 0: + case 1: + if (!is_compound && width == 2) { + ConvolveVerticalScale<6, 2, is_compound>( + intermediate, intermediate_height, width, subpixel_y, + vert_filter_index, step_y, height, prediction, pred_stride); + } else if (width == 4) { + ConvolveVerticalScale<6, 4, is_compound>( + intermediate, intermediate_height, width, subpixel_y, + vert_filter_index, step_y, height, prediction, pred_stride); + } else { + ConvolveVerticalScale<6, 8, is_compound>( + intermediate, intermediate_height, width, subpixel_y, + vert_filter_index, step_y, height, prediction, pred_stride); + } + break; + case 2: + if (!is_compound && width == 2) { + ConvolveVerticalScale<8, 2, is_compound>( + intermediate, intermediate_height, width, subpixel_y, + vert_filter_index, step_y, height, prediction, pred_stride); + } else if (width == 4) { + ConvolveVerticalScale<8, 4, is_compound>( + intermediate, intermediate_height, width, subpixel_y, + vert_filter_index, step_y, height, prediction, pred_stride); + } else { + ConvolveVerticalScale<8, 8, is_compound>( + intermediate, intermediate_height, width, subpixel_y, + vert_filter_index, step_y, height, prediction, pred_stride); + } + break; + case 3: + if (!is_compound && width == 2) { + ConvolveVerticalScale<2, 2, is_compound>( + intermediate, intermediate_height, width, subpixel_y, + vert_filter_index, step_y, height, prediction, pred_stride); + } else if (width == 4) { + ConvolveVerticalScale<2, 4, is_compound>( + intermediate, intermediate_height, width, subpixel_y, + vert_filter_index, step_y, height, prediction, pred_stride); + } else { + ConvolveVerticalScale<2, 8, is_compound>( + intermediate, intermediate_height, width, subpixel_y, + vert_filter_index, step_y, height, prediction, pred_stride); + } + break; + default: + assert(vert_filter_index == 4 || vert_filter_index == 5); + if (!is_compound && width == 2) { + ConvolveVerticalScale<4, 2, is_compound>( + intermediate, intermediate_height, width, subpixel_y, + vert_filter_index, step_y, height, prediction, pred_stride); + } else if (width == 4) { + ConvolveVerticalScale<4, 4, is_compound>( + intermediate, intermediate_height, width, subpixel_y, + vert_filter_index, step_y, height, prediction, pred_stride); + } else { + ConvolveVerticalScale<4, 8, is_compound>( + intermediate, intermediate_height, width, subpixel_y, + vert_filter_index, step_y, height, prediction, pred_stride); + } + } +} + +inline void HalfAddHorizontal(const uint8_t* LIBGAV1_RESTRICT src, + uint8_t* LIBGAV1_RESTRICT dst) { + const __m128i left = LoadUnaligned16(src); + const __m128i right = LoadUnaligned16(src + 1); + StoreUnaligned16(dst, _mm_avg_epu8(left, right)); +} + +template +inline void IntraBlockCopyHorizontal(const uint8_t* LIBGAV1_RESTRICT src, + const ptrdiff_t src_stride, + const int height, + uint8_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t dst_stride) { + const ptrdiff_t src_remainder_stride = src_stride - (width - 16); + const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16); + + int y = height; + do { + HalfAddHorizontal(src, dst); + if (width >= 32) { + src += 16; + dst += 16; + HalfAddHorizontal(src, dst); + if (width >= 64) { + src += 16; + dst += 16; + HalfAddHorizontal(src, dst); + src += 16; + dst += 16; + HalfAddHorizontal(src, dst); + if (width == 128) { + src += 16; + dst += 16; + HalfAddHorizontal(src, dst); + src += 16; + dst += 16; + HalfAddHorizontal(src, dst); + src += 16; + dst += 16; + HalfAddHorizontal(src, dst); + src += 16; + 
dst += 16; + HalfAddHorizontal(src, dst); + } + } + } + src += src_remainder_stride; + dst += dst_remainder_stride; + } while (--y != 0); +} + +void ConvolveIntraBlockCopyHorizontal_SSE4_1( + const void* LIBGAV1_RESTRICT const reference, + const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, + const int /*vertical_filter_index*/, const int /*subpixel_x*/, + const int /*subpixel_y*/, const int width, const int height, + void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) { + const auto* src = static_cast(reference); + auto* dest = static_cast(prediction); + + if (width == 128) { + IntraBlockCopyHorizontal<128>(src, reference_stride, height, dest, + pred_stride); + } else if (width == 64) { + IntraBlockCopyHorizontal<64>(src, reference_stride, height, dest, + pred_stride); + } else if (width == 32) { + IntraBlockCopyHorizontal<32>(src, reference_stride, height, dest, + pred_stride); + } else if (width == 16) { + IntraBlockCopyHorizontal<16>(src, reference_stride, height, dest, + pred_stride); + } else if (width == 8) { + int y = height; + do { + const __m128i left = LoadLo8(src); + const __m128i right = LoadLo8(src + 1); + StoreLo8(dest, _mm_avg_epu8(left, right)); + + src += reference_stride; + dest += pred_stride; + } while (--y != 0); + } else if (width == 4) { + int y = height; + do { + __m128i left = Load4(src); + __m128i right = Load4(src + 1); + src += reference_stride; + left = _mm_unpacklo_epi32(left, Load4(src)); + right = _mm_unpacklo_epi32(right, Load4(src + 1)); + src += reference_stride; + + const __m128i result = _mm_avg_epu8(left, right); + + Store4(dest, result); + dest += pred_stride; + Store4(dest, _mm_srli_si128(result, 4)); + dest += pred_stride; + y -= 2; + } while (y != 0); + } else { + assert(width == 2); + __m128i left = _mm_setzero_si128(); + __m128i right = _mm_setzero_si128(); + int y = height; + do { + left = Load2<0>(src, left); + right = Load2<0>(src + 1, right); + src += reference_stride; + left = Load2<1>(src, left); + right = Load2<1>(src + 1, right); + src += reference_stride; + + const __m128i result = _mm_avg_epu8(left, right); + + Store2(dest, result); + dest += pred_stride; + Store2(dest, _mm_srli_si128(result, 2)); + dest += pred_stride; + y -= 2; + } while (y != 0); + } +} + +template +inline void IntraBlockCopyVertical(const uint8_t* LIBGAV1_RESTRICT src, + const ptrdiff_t src_stride, const int height, + uint8_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t dst_stride) { + const ptrdiff_t src_remainder_stride = src_stride - (width - 16); + const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16); + __m128i row[8], below[8]; + + row[0] = LoadUnaligned16(src); + if (width >= 32) { + src += 16; + row[1] = LoadUnaligned16(src); + if (width >= 64) { + src += 16; + row[2] = LoadUnaligned16(src); + src += 16; + row[3] = LoadUnaligned16(src); + if (width == 128) { + src += 16; + row[4] = LoadUnaligned16(src); + src += 16; + row[5] = LoadUnaligned16(src); + src += 16; + row[6] = LoadUnaligned16(src); + src += 16; + row[7] = LoadUnaligned16(src); + } + } + } + src += src_remainder_stride; + + int y = height; + do { + below[0] = LoadUnaligned16(src); + if (width >= 32) { + src += 16; + below[1] = LoadUnaligned16(src); + if (width >= 64) { + src += 16; + below[2] = LoadUnaligned16(src); + src += 16; + below[3] = LoadUnaligned16(src); + if (width == 128) { + src += 16; + below[4] = LoadUnaligned16(src); + src += 16; + below[5] = LoadUnaligned16(src); + src += 16; + below[6] = LoadUnaligned16(src); + src += 16; + below[7] = 
LoadUnaligned16(src); + } + } + } + src += src_remainder_stride; + + StoreUnaligned16(dst, _mm_avg_epu8(row[0], below[0])); + row[0] = below[0]; + if (width >= 32) { + dst += 16; + StoreUnaligned16(dst, _mm_avg_epu8(row[1], below[1])); + row[1] = below[1]; + if (width >= 64) { + dst += 16; + StoreUnaligned16(dst, _mm_avg_epu8(row[2], below[2])); + row[2] = below[2]; + dst += 16; + StoreUnaligned16(dst, _mm_avg_epu8(row[3], below[3])); + row[3] = below[3]; + if (width >= 128) { + dst += 16; + StoreUnaligned16(dst, _mm_avg_epu8(row[4], below[4])); + row[4] = below[4]; + dst += 16; + StoreUnaligned16(dst, _mm_avg_epu8(row[5], below[5])); + row[5] = below[5]; + dst += 16; + StoreUnaligned16(dst, _mm_avg_epu8(row[6], below[6])); + row[6] = below[6]; + dst += 16; + StoreUnaligned16(dst, _mm_avg_epu8(row[7], below[7])); + row[7] = below[7]; + } + } + } + dst += dst_remainder_stride; + } while (--y != 0); +} + +void ConvolveIntraBlockCopyVertical_SSE4_1( + const void* LIBGAV1_RESTRICT const reference, + const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, + const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/, + const int /*vertical_filter_id*/, const int width, const int height, + void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) { + const auto* src = static_cast(reference); + auto* dest = static_cast(prediction); + + if (width == 128) { + IntraBlockCopyVertical<128>(src, reference_stride, height, dest, + pred_stride); + } else if (width == 64) { + IntraBlockCopyVertical<64>(src, reference_stride, height, dest, + pred_stride); + } else if (width == 32) { + IntraBlockCopyVertical<32>(src, reference_stride, height, dest, + pred_stride); + } else if (width == 16) { + IntraBlockCopyVertical<16>(src, reference_stride, height, dest, + pred_stride); + } else if (width == 8) { + __m128i row, below; + row = LoadLo8(src); + src += reference_stride; + + int y = height; + do { + below = LoadLo8(src); + src += reference_stride; + + StoreLo8(dest, _mm_avg_epu8(row, below)); + dest += pred_stride; + + row = below; + } while (--y != 0); + } else if (width == 4) { + __m128i row = Load4(src); + src += reference_stride; + + int y = height; + do { + __m128i below = Load4(src); + src += reference_stride; + + Store4(dest, _mm_avg_epu8(row, below)); + dest += pred_stride; + + row = below; + } while (--y != 0); + } else { + assert(width == 2); + __m128i row = Load2(src); + __m128i below = _mm_setzero_si128(); + src += reference_stride; + + int y = height; + do { + below = Load2<0>(src, below); + src += reference_stride; + + Store2(dest, _mm_avg_epu8(row, below)); + dest += pred_stride; + + row = below; + } while (--y != 0); + } +} + +// Load then add two uint8_t vectors. Return the uint16_t vector result. +inline __m128i LoadU8AndAddLong(const uint8_t* LIBGAV1_RESTRICT src, + const uint8_t* LIBGAV1_RESTRICT src1) { + const __m128i a = _mm_cvtepu8_epi16(LoadLo8(src)); + const __m128i b = _mm_cvtepu8_epi16(LoadLo8(src1)); + return _mm_add_epi16(a, b); +} + +inline __m128i AddU16RightShift2AndPack(__m128i v0, __m128i v1) { + const __m128i a = _mm_add_epi16(v0, v1); + const __m128i b = _mm_srli_epi16(a, 1); + // Use avg here to shift right by 1 with round. 
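+  // Per lane this computes (((v0 + v1) >> 1) + 1) >> 1, which equals
+  // (v0 + v1 + 2) >> 2 for all inputs (check the four residues mod 4), i.e.
+  // the rounded 2D average (tl + tr + bl + br + 2) >> 2 used by intra block
+  // copy. The pair sums are at most 9 bits, so the uint16_t math is exact.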
+ const __m128i c = _mm_avg_epu16(b, _mm_setzero_si128()); + return _mm_packus_epi16(c, c); +} + +template +inline void IntraBlockCopy2D(const uint8_t* LIBGAV1_RESTRICT src, + const ptrdiff_t src_stride, const int height, + uint8_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t dst_stride) { + const ptrdiff_t src_remainder_stride = src_stride - (width - 8); + const ptrdiff_t dst_remainder_stride = dst_stride - (width - 8); + __m128i row[16]; + row[0] = LoadU8AndAddLong(src, src + 1); + if (width >= 16) { + src += 8; + row[1] = LoadU8AndAddLong(src, src + 1); + if (width >= 32) { + src += 8; + row[2] = LoadU8AndAddLong(src, src + 1); + src += 8; + row[3] = LoadU8AndAddLong(src, src + 1); + if (width >= 64) { + src += 8; + row[4] = LoadU8AndAddLong(src, src + 1); + src += 8; + row[5] = LoadU8AndAddLong(src, src + 1); + src += 8; + row[6] = LoadU8AndAddLong(src, src + 1); + src += 8; + row[7] = LoadU8AndAddLong(src, src + 1); + if (width == 128) { + src += 8; + row[8] = LoadU8AndAddLong(src, src + 1); + src += 8; + row[9] = LoadU8AndAddLong(src, src + 1); + src += 8; + row[10] = LoadU8AndAddLong(src, src + 1); + src += 8; + row[11] = LoadU8AndAddLong(src, src + 1); + src += 8; + row[12] = LoadU8AndAddLong(src, src + 1); + src += 8; + row[13] = LoadU8AndAddLong(src, src + 1); + src += 8; + row[14] = LoadU8AndAddLong(src, src + 1); + src += 8; + row[15] = LoadU8AndAddLong(src, src + 1); + } + } + } + } + src += src_remainder_stride; + + int y = height; + do { + const __m128i below_0 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[0], below_0)); + row[0] = below_0; + if (width >= 16) { + src += 8; + dst += 8; + + const __m128i below_1 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[1], below_1)); + row[1] = below_1; + if (width >= 32) { + src += 8; + dst += 8; + + const __m128i below_2 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[2], below_2)); + row[2] = below_2; + src += 8; + dst += 8; + + const __m128i below_3 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[3], below_3)); + row[3] = below_3; + if (width >= 64) { + src += 8; + dst += 8; + + const __m128i below_4 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[4], below_4)); + row[4] = below_4; + src += 8; + dst += 8; + + const __m128i below_5 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[5], below_5)); + row[5] = below_5; + src += 8; + dst += 8; + + const __m128i below_6 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[6], below_6)); + row[6] = below_6; + src += 8; + dst += 8; + + const __m128i below_7 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[7], below_7)); + row[7] = below_7; + if (width == 128) { + src += 8; + dst += 8; + + const __m128i below_8 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[8], below_8)); + row[8] = below_8; + src += 8; + dst += 8; + + const __m128i below_9 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[9], below_9)); + row[9] = below_9; + src += 8; + dst += 8; + + const __m128i below_10 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[10], below_10)); + row[10] = below_10; + src += 8; + dst += 8; + + const __m128i below_11 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[11], below_11)); + row[11] = below_11; + src += 8; + dst += 8; + + const __m128i 
below_12 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[12], below_12)); + row[12] = below_12; + src += 8; + dst += 8; + + const __m128i below_13 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[13], below_13)); + row[13] = below_13; + src += 8; + dst += 8; + + const __m128i below_14 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[14], below_14)); + row[14] = below_14; + src += 8; + dst += 8; + + const __m128i below_15 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[15], below_15)); + row[15] = below_15; + } + } + } + } + src += src_remainder_stride; + dst += dst_remainder_stride; + } while (--y != 0); +} + +void ConvolveIntraBlockCopy2D_SSE4_1( + const void* LIBGAV1_RESTRICT const reference, + const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/, + const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/, + const int /*vertical_filter_id*/, const int width, const int height, + void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) { + const auto* src = static_cast(reference); + auto* dest = static_cast(prediction); + // Note: allow vertical access to height + 1. Because this function is only + // for u/v plane of intra block copy, such access is guaranteed to be within + // the prediction block. + + if (width == 128) { + IntraBlockCopy2D<128>(src, reference_stride, height, dest, pred_stride); + } else if (width == 64) { + IntraBlockCopy2D<64>(src, reference_stride, height, dest, pred_stride); + } else if (width == 32) { + IntraBlockCopy2D<32>(src, reference_stride, height, dest, pred_stride); + } else if (width == 16) { + IntraBlockCopy2D<16>(src, reference_stride, height, dest, pred_stride); + } else if (width == 8) { + IntraBlockCopy2D<8>(src, reference_stride, height, dest, pred_stride); + } else if (width == 4) { + __m128i left = _mm_cvtepu8_epi16(Load4(src)); + __m128i right = _mm_cvtepu8_epi16(Load4(src + 1)); + src += reference_stride; + + __m128i row = _mm_add_epi16(left, right); + + int y = height; + do { + left = Load4(src); + right = Load4(src + 1); + src += reference_stride; + left = _mm_unpacklo_epi32(left, Load4(src)); + right = _mm_unpacklo_epi32(right, Load4(src + 1)); + src += reference_stride; + + const __m128i below = + _mm_add_epi16(_mm_cvtepu8_epi16(left), _mm_cvtepu8_epi16(right)); + const __m128i result = + AddU16RightShift2AndPack(_mm_unpacklo_epi64(row, below), below); + + Store4(dest, result); + dest += pred_stride; + Store4(dest, _mm_srli_si128(result, 4)); + dest += pred_stride; + + row = _mm_srli_si128(below, 8); + y -= 2; + } while (y != 0); + } else { + __m128i left = Load2(src); + __m128i right = Load2(src + 1); + src += reference_stride; + + __m128i row = + _mm_add_epi16(_mm_cvtepu8_epi16(left), _mm_cvtepu8_epi16(right)); + + int y = height; + do { + left = Load2<0>(src, left); + right = Load2<0>(src + 1, right); + src += reference_stride; + left = Load2<2>(src, left); + right = Load2<2>(src + 1, right); + src += reference_stride; + + const __m128i below = + _mm_add_epi16(_mm_cvtepu8_epi16(left), _mm_cvtepu8_epi16(right)); + const __m128i result = + AddU16RightShift2AndPack(_mm_unpacklo_epi64(row, below), below); + + Store2(dest, result); + dest += pred_stride; + Store2(dest, _mm_srli_si128(result, 4)); + dest += pred_stride; + + row = _mm_srli_si128(below, 8); + y -= 2; + } while (y != 0); + } +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + 
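+  // The table filled in below is indexed as
+  // convolve[is_intra_block_copy][is_compound][has_vertical_filter]
+  //         [has_horizontal_filter]. A lookup sketch (the predicate names
+  // here are illustrative):
+  //   const ConvolveFunc func =
+  //       dsp->convolve[is_intra_block_copy][is_compound]
+  //                    [vertical_filter_is_used][horizontal_filter_is_used];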
+  assert(dsp != nullptr);
+  dsp->convolve[0][0][0][1] = ConvolveHorizontal_SSE4_1;
+  dsp->convolve[0][0][1][0] = ConvolveVertical_SSE4_1;
+  dsp->convolve[0][0][1][1] = Convolve2D_SSE4_1;
+
+  dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_SSE4;
+  dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_SSE4_1;
+  dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_SSE4_1;
+  dsp->convolve[0][1][1][1] = ConvolveCompound2D_SSE4_1;
+
+  dsp->convolve[1][0][0][1] = ConvolveIntraBlockCopyHorizontal_SSE4_1;
+  dsp->convolve[1][0][1][0] = ConvolveIntraBlockCopyVertical_SSE4_1;
+  dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_SSE4_1;
+
+  dsp->convolve_scale[0] = ConvolveScale2D_SSE4_1<false>;
+  dsp->convolve_scale[1] = ConvolveScale2D_SSE4_1<true>;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+void ConvolveInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void ConvolveInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/convolve_sse4.h b/src/dsp/x86/convolve_sse4.h
new file mode 100644
index 0000000..d6c3155
--- /dev/null
+++ b/src/dsp/x86/convolve_sse4.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_CONVOLVE_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_CONVOLVE_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::convolve, see the defines below for specifics. This
+// function is not thread-safe.
+void ConvolveInit_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the sse4 implementation should be used.
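+// Concretely (a sketch of the convention, not a guarantee): each macro names
+// the CPU feature whose implementation serves as the compile-time baseline
+// for that function. If a higher-level header (e.g. the AVX2 one) was
+// included first and already defined LIBGAV1_Dsp8bpp_ConvolveHorizontal, the
+// #ifndef guards below leave that choice in place.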
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveHorizontal
+#define LIBGAV1_Dsp8bpp_ConvolveHorizontal LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveVertical
+#define LIBGAV1_Dsp8bpp_ConvolveVertical LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Convolve2D
+#define LIBGAV1_Dsp8bpp_Convolve2D LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundCopy
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundCopy LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundVertical
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundVertical LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompound2D
+#define LIBGAV1_Dsp8bpp_ConvolveCompound2D LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveScale2D
+#define LIBGAV1_Dsp8bpp_ConvolveScale2D LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_CONVOLVE_SSE4_H_
diff --git a/src/dsp/x86/convolve_sse4.inc b/src/dsp/x86/convolve_sse4.inc
new file mode 100644
index 0000000..550d6a4
--- /dev/null
+++ b/src/dsp/x86/convolve_sse4.inc
@@ -0,0 +1,934 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Common 128-bit functions used by the sse4/avx2 convolve implementations.
+// This file is included inside an anonymous namespace in the files where
+// these functions are needed.
+
+#include "src/dsp/convolve.inc"
+
+// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
+// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
+// sum from outranging int16_t.
+template <int filter_index>
+__m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) {
+  __m128i sum;
+  if (filter_index < 2) {
+    // 6 taps.
+    const __m128i v_madd_21 = _mm_maddubs_epi16(src[0], taps[0]);  // k2k1
+    const __m128i v_madd_43 = _mm_maddubs_epi16(src[1], taps[1]);  // k4k3
+    const __m128i v_madd_65 = _mm_maddubs_epi16(src[2], taps[2]);  // k6k5
+    sum = _mm_add_epi16(v_madd_21, v_madd_43);
+    sum = _mm_add_epi16(sum, v_madd_65);
+  } else if (filter_index == 2) {
+    // 8 taps.
+    const __m128i v_madd_10 = _mm_maddubs_epi16(src[0], taps[0]);  // k1k0
+    const __m128i v_madd_32 = _mm_maddubs_epi16(src[1], taps[1]);  // k3k2
+    const __m128i v_madd_54 = _mm_maddubs_epi16(src[2], taps[2]);  // k5k4
+    const __m128i v_madd_76 = _mm_maddubs_epi16(src[3], taps[3]);  // k7k6
+    const __m128i v_sum_3210 = _mm_add_epi16(v_madd_10, v_madd_32);
+    const __m128i v_sum_7654 = _mm_add_epi16(v_madd_54, v_madd_76);
+    sum = _mm_add_epi16(v_sum_7654, v_sum_3210);
+  } else if (filter_index == 3) {
+    // 2 taps.
+    sum = _mm_maddubs_epi16(src[0], taps[0]);  // k4k3
+  } else {
+    // 4 taps.
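+    // _mm_maddubs_epi16 multiplies unsigned source bytes by signed tap bytes
+    // and sums adjacent pairs, which is why the taps are packed as
+    // (k(n), k(n+1)) pairs. Rough bound (a sketch): with the pre-shifted,
+    // i.e. halved, filters a pair contributes at most 255 * 128 = 32640 per
+    // lane, which is how the sums stay inside int16_t.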
+    const __m128i v_madd_32 = _mm_maddubs_epi16(src[0], taps[0]);  // k3k2
+    const __m128i v_madd_54 = _mm_maddubs_epi16(src[1], taps[1]);  // k5k4
+    sum = _mm_add_epi16(v_madd_32, v_madd_54);
+  }
+  return sum;
+}
+
+template <int filter_index>
+__m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
+                             const __m128i* const v_tap) {
+  // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
+  const __m128i v_src = LoadHi8(LoadLo8(&src[0]), &src[src_stride]);
+
+  if (filter_index == 3) {
+    // 03 04 04 05 05 06 06 07 13 14 14 15 15 16 16 17
+    const __m128i v_src_43 = _mm_shuffle_epi8(
+        v_src, _mm_set_epi32(0x0f0e0e0d, 0x0d0c0c0b, 0x07060605, 0x05040403));
+    const __m128i v_sum_43 = _mm_maddubs_epi16(v_src_43, v_tap[0]);  // k4k3
+    return v_sum_43;
+  }
+
+  // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16
+  const __m128i v_src_32 = _mm_shuffle_epi8(
+      v_src, _mm_set_epi32(0x0e0d0d0c, 0x0c0b0b0a, 0x06050504, 0x04030302));
+  // 04 05 05 06 06 07 07 xx 14 15 15 16 16 17 17 xx
+  const __m128i v_src_54 = _mm_shuffle_epi8(
+      v_src, _mm_set_epi32(static_cast<int>(0x800f0f0e), 0x0e0d0d0c,
+                           static_cast<int>(0x80070706), 0x06050504));
+  const __m128i v_madd_32 = _mm_maddubs_epi16(v_src_32, v_tap[0]);  // k3k2
+  const __m128i v_madd_54 = _mm_maddubs_epi16(v_src_54, v_tap[1]);  // k5k4
+  const __m128i v_sum_5432 = _mm_add_epi16(v_madd_54, v_madd_32);
+  return v_sum_5432;
+}
+
+template <int filter_index>
+__m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
+                                const __m128i* const v_tap) {
+  __m128i sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+  // Normally the Horizontal pass does the downshift in two passes:
+  // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+  // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+  // requires adding the rounding offset from the skipped shift.
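+  // Worked out for 8bpp (kInterRoundBitsHorizontal == 3, kFilterBits == 7,
+  // both reduced by 1 for the pre-shifted taps): the two-stage rounding is
+  //   (((sum + 2) >> 2) + 8) >> 4
+  // while the combined form below computes (sum + 2 + 32) >> 6. These agree
+  // because floor((floor(a / 4) + 8) / 16) == floor((a + 32) / 64), so only
+  // the first stage's offset, 1 << (kInterRoundBitsHorizontal - 2), must be
+  // added back explicitly.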
+ constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2); + + sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit)); + sum = RightShiftWithRounding_S16(sum, kFilterBits - 1); + return _mm_packus_epi16(sum, sum); +} + +template +__m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride, + const __m128i* const v_tap) { + const __m128i sum = + SumHorizontalTaps2x2(src, src_stride, v_tap); + + return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); +} + +template +LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter, + __m128i* v_tap) { + if (num_taps == 8) { + v_tap[0] = _mm_shufflelo_epi16(*filter, 0x0); // k1k0 + v_tap[1] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2 + v_tap[2] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4 + v_tap[3] = _mm_shufflelo_epi16(*filter, 0xff); // k7k6 + if (is_2d_vertical) { + v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); + v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]); + v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]); + v_tap[3] = _mm_cvtepi8_epi16(v_tap[3]); + } else { + v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); + v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); + v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]); + v_tap[3] = _mm_unpacklo_epi64(v_tap[3], v_tap[3]); + } + } else if (num_taps == 6) { + const __m128i adjusted_filter = _mm_srli_si128(*filter, 1); + v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x0); // k2k1 + v_tap[1] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3 + v_tap[2] = _mm_shufflelo_epi16(adjusted_filter, 0xaa); // k6k5 + if (is_2d_vertical) { + v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); + v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]); + v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]); + } else { + v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); + v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); + v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]); + } + } else if (num_taps == 4) { + v_tap[0] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2 + v_tap[1] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4 + if (is_2d_vertical) { + v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); + v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]); + } else { + v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); + v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); + } + } else { // num_taps == 2 + const __m128i adjusted_filter = _mm_srli_si128(*filter, 1); + v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3 + if (is_2d_vertical) { + v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); + } else { + v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); + } + } +} + +template +__m128i SimpleSum2DVerticalTaps(const __m128i* const src, + const __m128i* const taps) { + __m128i sum_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[0], src[1]), taps[0]); + __m128i sum_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[0], src[1]), taps[0]); + if (num_taps >= 4) { + __m128i madd_lo = + _mm_madd_epi16(_mm_unpacklo_epi16(src[2], src[3]), taps[1]); + __m128i madd_hi = + _mm_madd_epi16(_mm_unpackhi_epi16(src[2], src[3]), taps[1]); + sum_lo = _mm_add_epi32(sum_lo, madd_lo); + sum_hi = _mm_add_epi32(sum_hi, madd_hi); + if (num_taps >= 6) { + madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[4], src[5]), taps[2]); + madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[4], src[5]), taps[2]); + sum_lo = _mm_add_epi32(sum_lo, madd_lo); + sum_hi = _mm_add_epi32(sum_hi, madd_hi); + if (num_taps == 8) { + madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[6], src[7]), taps[3]); + madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[6], src[7]), taps[3]); + sum_lo = 
_mm_add_epi32(sum_lo, madd_lo); + sum_hi = _mm_add_epi32(sum_hi, madd_hi); + } + } + } + + if (is_compound) { + return _mm_packs_epi32( + RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1), + RightShiftWithRounding_S32(sum_hi, + kInterRoundBitsCompoundVertical - 1)); + } + + return _mm_packs_epi32( + RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1), + RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1)); +} + +template +void Filter2DVertical(const uint16_t* src, void* const dst, + const ptrdiff_t dst_stride, const int width, + const int height, const __m128i* const taps) { + assert(width >= 8); + constexpr int next_row = num_taps - 1; + // The Horizontal pass uses |width| as |stride| for the intermediate buffer. + const ptrdiff_t src_stride = width; + + auto* dst8 = static_cast(dst); + auto* dst16 = static_cast(dst); + + int x = 0; + do { + __m128i srcs[8]; + const uint16_t* src_x = src + x; + srcs[0] = LoadAligned16(src_x); + src_x += src_stride; + if (num_taps >= 4) { + srcs[1] = LoadAligned16(src_x); + src_x += src_stride; + srcs[2] = LoadAligned16(src_x); + src_x += src_stride; + if (num_taps >= 6) { + srcs[3] = LoadAligned16(src_x); + src_x += src_stride; + srcs[4] = LoadAligned16(src_x); + src_x += src_stride; + if (num_taps == 8) { + srcs[5] = LoadAligned16(src_x); + src_x += src_stride; + srcs[6] = LoadAligned16(src_x); + src_x += src_stride; + } + } + } + + auto* dst8_x = dst8 + x; + auto* dst16_x = dst16 + x; + int y = height; + do { + srcs[next_row] = LoadAligned16(src_x); + src_x += src_stride; + + const __m128i sum = + SimpleSum2DVerticalTaps(srcs, taps); + if (is_compound) { + StoreUnaligned16(dst16_x, sum); + dst16_x += dst_stride; + } else { + StoreLo8(dst8_x, _mm_packus_epi16(sum, sum)); + dst8_x += dst_stride; + } + + srcs[0] = srcs[1]; + if (num_taps >= 4) { + srcs[1] = srcs[2]; + srcs[2] = srcs[3]; + if (num_taps >= 6) { + srcs[3] = srcs[4]; + srcs[4] = srcs[5]; + if (num_taps == 8) { + srcs[5] = srcs[6]; + srcs[6] = srcs[7]; + } + } + } + } while (--y != 0); + x += 8; + } while (x < width); +} + +// Take advantage of |src_stride| == |width| to process two rows at a time. 
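+// The intermediate buffer is 4 uint16_t wide here, so each aligned 16-byte
+// load grabs two adjacent rows at once:
+//   load -> | r0c0 r0c1 r0c2 r0c3 | r1c0 r1c1 r1c2 r1c3 |
+// and the _mm_srli_si128/_mm_unpacklo_epi64 pairs below rebuild the per-tap
+// row registers from halves of consecutive loads.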
+template +void Filter2DVertical4xH(const uint16_t* src, void* const dst, + const ptrdiff_t dst_stride, const int height, + const __m128i* const taps) { + auto* dst8 = static_cast(dst); + auto* dst16 = static_cast(dst); + + __m128i srcs[9]; + srcs[0] = LoadAligned16(src); + src += 8; + if (num_taps >= 4) { + srcs[2] = LoadAligned16(src); + src += 8; + srcs[1] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[0], 8), srcs[2]); + if (num_taps >= 6) { + srcs[4] = LoadAligned16(src); + src += 8; + srcs[3] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[2], 8), srcs[4]); + if (num_taps == 8) { + srcs[6] = LoadAligned16(src); + src += 8; + srcs[5] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[4], 8), srcs[6]); + } + } + } + + int y = height; + do { + srcs[num_taps] = LoadAligned16(src); + src += 8; + srcs[num_taps - 1] = _mm_unpacklo_epi64( + _mm_srli_si128(srcs[num_taps - 2], 8), srcs[num_taps]); + + const __m128i sum = + SimpleSum2DVerticalTaps(srcs, taps); + if (is_compound) { + StoreUnaligned16(dst16, sum); + dst16 += 4 << 1; + } else { + const __m128i results = _mm_packus_epi16(sum, sum); + Store4(dst8, results); + dst8 += dst_stride; + Store4(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + } + + srcs[0] = srcs[2]; + if (num_taps >= 4) { + srcs[1] = srcs[3]; + srcs[2] = srcs[4]; + if (num_taps >= 6) { + srcs[3] = srcs[5]; + srcs[4] = srcs[6]; + if (num_taps == 8) { + srcs[5] = srcs[7]; + srcs[6] = srcs[8]; + } + } + } + y -= 2; + } while (y != 0); +} + +// Take advantage of |src_stride| == |width| to process four rows at a time. +template +void Filter2DVertical2xH(const uint16_t* src, void* const dst, + const ptrdiff_t dst_stride, const int height, + const __m128i* const taps) { + constexpr int next_row = (num_taps < 6) ? 4 : 8; + + auto* dst8 = static_cast(dst); + + __m128i srcs[9]; + srcs[0] = LoadAligned16(src); + src += 8; + if (num_taps >= 6) { + srcs[4] = LoadAligned16(src); + src += 8; + srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4); + if (num_taps == 8) { + srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8); + srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12); + } + } + + int y = height; + do { + srcs[next_row] = LoadAligned16(src); + src += 8; + if (num_taps == 2) { + srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4); + } else if (num_taps == 4) { + srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4); + srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8); + srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12); + } else if (num_taps == 6) { + srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8); + srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12); + srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4); + } else if (num_taps == 8) { + srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4); + srcs[6] = _mm_alignr_epi8(srcs[8], srcs[4], 8); + srcs[7] = _mm_alignr_epi8(srcs[8], srcs[4], 12); + } + + const __m128i sum = + SimpleSum2DVerticalTaps(srcs, taps); + const __m128i results = _mm_packus_epi16(sum, sum); + + Store2(dst8, results); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 2)); + // When |height| <= 4 the taps are restricted to 2 and 4 tap variants. + // Therefore we don't need to check this condition when |height| > 4. 
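+    // (GetFilterIndex remaps blocks with a dimension <= 4 onto the 2- and
+    // 4-tap filter families, so the 6- and 8-tap instantiations can assume
+    // height > 4 and skip this early return entirely.)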
+ if (num_taps <= 4 && height == 2) return; + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 6)); + dst8 += dst_stride; + + srcs[0] = srcs[4]; + if (num_taps == 6) { + srcs[1] = srcs[5]; + srcs[4] = srcs[8]; + } else if (num_taps == 8) { + srcs[1] = srcs[5]; + srcs[2] = srcs[6]; + srcs[3] = srcs[7]; + srcs[4] = srcs[8]; + } + + y -= 4; + } while (y != 0); +} + +// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D +// Vertical calculations. +__m128i Compound1DShift(const __m128i sum) { + return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); +} + +template +__m128i SumVerticalTaps(const __m128i* const srcs, const __m128i* const v_tap) { + __m128i v_src[4]; + + if (filter_index < 2) { + // 6 taps. + v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); + v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]); + v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]); + } else if (filter_index == 2) { + // 8 taps. + v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); + v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]); + v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]); + v_src[3] = _mm_unpacklo_epi8(srcs[6], srcs[7]); + } else if (filter_index == 3) { + // 2 taps. + v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); + } else if (filter_index > 3) { + // 4 taps. + v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); + v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]); + } + const __m128i sum = SumOnePassTaps(v_src, v_tap); + return sum; +} + +// TODO(slavarnway): Use num_taps instead of filter_index for templates. See the +// 2D version. +template +void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, + void* const dst, const ptrdiff_t dst_stride, + const int height, const __m128i* const v_tap) { + auto* dst8 = static_cast(dst); + auto* dst16 = static_cast(dst); + + __m128i srcs[9]; + + if (num_taps == 2) { + srcs[2] = _mm_setzero_si128(); + // 00 01 02 03 + srcs[0] = Load4(src); + src += src_stride; + + int y = height; + do { + // 10 11 12 13 + const __m128i a = Load4(src); + // 00 01 02 03 10 11 12 13 + srcs[0] = _mm_unpacklo_epi32(srcs[0], a); + src += src_stride; + // 20 21 22 23 + srcs[2] = Load4(src); + src += src_stride; + // 10 11 12 13 20 21 22 23 + srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); + + const __m128i sums = SumVerticalTaps(srcs, v_tap); + if (is_compound) { + const __m128i results = Compound1DShift(sums); + StoreUnaligned16(dst16, results); + dst16 += 4 << 1; + } else { + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + Store4(dst8, results); + dst8 += dst_stride; + Store4(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + } + + srcs[0] = srcs[2]; + y -= 2; + } while (y != 0); + } else if (num_taps == 4) { + srcs[4] = _mm_setzero_si128(); + // 00 01 02 03 + srcs[0] = Load4(src); + src += src_stride; + // 10 11 12 13 + const __m128i a = Load4(src); + // 00 01 02 03 10 11 12 13 + srcs[0] = _mm_unpacklo_epi32(srcs[0], a); + src += src_stride; + // 20 21 22 23 + srcs[2] = Load4(src); + src += src_stride; + // 10 11 12 13 20 21 22 23 + srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); + + int y = height; + do { + // 30 31 32 33 + const __m128i b = Load4(src); + // 20 21 22 23 30 31 32 33 + srcs[2] = _mm_unpacklo_epi32(srcs[2], b); + src += src_stride; + // 40 41 42 43 + srcs[4] = Load4(src); + src += src_stride; + // 30 31 32 33 40 41 42 43 + srcs[3] = _mm_unpacklo_epi32(b, srcs[4]); + + 
const __m128i sums = SumVerticalTaps(srcs, v_tap); + if (is_compound) { + const __m128i results = Compound1DShift(sums); + StoreUnaligned16(dst16, results); + dst16 += 4 << 1; + } else { + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + Store4(dst8, results); + dst8 += dst_stride; + Store4(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + } + + srcs[0] = srcs[2]; + srcs[1] = srcs[3]; + srcs[2] = srcs[4]; + y -= 2; + } while (y != 0); + } else if (num_taps == 6) { + srcs[6] = _mm_setzero_si128(); + // 00 01 02 03 + srcs[0] = Load4(src); + src += src_stride; + // 10 11 12 13 + const __m128i a = Load4(src); + // 00 01 02 03 10 11 12 13 + srcs[0] = _mm_unpacklo_epi32(srcs[0], a); + src += src_stride; + // 20 21 22 23 + srcs[2] = Load4(src); + src += src_stride; + // 10 11 12 13 20 21 22 23 + srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); + // 30 31 32 33 + const __m128i b = Load4(src); + // 20 21 22 23 30 31 32 33 + srcs[2] = _mm_unpacklo_epi32(srcs[2], b); + src += src_stride; + // 40 41 42 43 + srcs[4] = Load4(src); + src += src_stride; + // 30 31 32 33 40 41 42 43 + srcs[3] = _mm_unpacklo_epi32(b, srcs[4]); + + int y = height; + do { + // 50 51 52 53 + const __m128i c = Load4(src); + // 40 41 42 43 50 51 52 53 + srcs[4] = _mm_unpacklo_epi32(srcs[4], c); + src += src_stride; + // 60 61 62 63 + srcs[6] = Load4(src); + src += src_stride; + // 50 51 52 53 60 61 62 63 + srcs[5] = _mm_unpacklo_epi32(c, srcs[6]); + + const __m128i sums = SumVerticalTaps(srcs, v_tap); + if (is_compound) { + const __m128i results = Compound1DShift(sums); + StoreUnaligned16(dst16, results); + dst16 += 4 << 1; + } else { + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + Store4(dst8, results); + dst8 += dst_stride; + Store4(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + } + + srcs[0] = srcs[2]; + srcs[1] = srcs[3]; + srcs[2] = srcs[4]; + srcs[3] = srcs[5]; + srcs[4] = srcs[6]; + y -= 2; + } while (y != 0); + } else if (num_taps == 8) { + srcs[8] = _mm_setzero_si128(); + // 00 01 02 03 + srcs[0] = Load4(src); + src += src_stride; + // 10 11 12 13 + const __m128i a = Load4(src); + // 00 01 02 03 10 11 12 13 + srcs[0] = _mm_unpacklo_epi32(srcs[0], a); + src += src_stride; + // 20 21 22 23 + srcs[2] = Load4(src); + src += src_stride; + // 10 11 12 13 20 21 22 23 + srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); + // 30 31 32 33 + const __m128i b = Load4(src); + // 20 21 22 23 30 31 32 33 + srcs[2] = _mm_unpacklo_epi32(srcs[2], b); + src += src_stride; + // 40 41 42 43 + srcs[4] = Load4(src); + src += src_stride; + // 30 31 32 33 40 41 42 43 + srcs[3] = _mm_unpacklo_epi32(b, srcs[4]); + // 50 51 52 53 + const __m128i c = Load4(src); + // 40 41 42 43 50 51 52 53 + srcs[4] = _mm_unpacklo_epi32(srcs[4], c); + src += src_stride; + // 60 61 62 63 + srcs[6] = Load4(src); + src += src_stride; + // 50 51 52 53 60 61 62 63 + srcs[5] = _mm_unpacklo_epi32(c, srcs[6]); + + int y = height; + do { + // 70 71 72 73 + const __m128i d = Load4(src); + // 60 61 62 63 70 71 72 73 + srcs[6] = _mm_unpacklo_epi32(srcs[6], d); + src += src_stride; + // 80 81 82 83 + srcs[8] = Load4(src); + src += src_stride; + // 70 71 72 73 80 81 82 83 + srcs[7] = _mm_unpacklo_epi32(d, srcs[8]); + + const __m128i sums = SumVerticalTaps(srcs, v_tap); + if (is_compound) { + const __m128i results = Compound1DShift(sums); + StoreUnaligned16(dst16, results); 
+ dst16 += 4 << 1; + } else { + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + Store4(dst8, results); + dst8 += dst_stride; + Store4(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + } + + srcs[0] = srcs[2]; + srcs[1] = srcs[3]; + srcs[2] = srcs[4]; + srcs[3] = srcs[5]; + srcs[4] = srcs[6]; + srcs[5] = srcs[7]; + srcs[6] = srcs[8]; + y -= 2; + } while (y != 0); + } +} + +template +void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride, + void* const dst, const ptrdiff_t dst_stride, + const int height, const __m128i* const v_tap) { + auto* dst8 = static_cast(dst); + + __m128i srcs[9]; + + if (num_taps == 2) { + srcs[2] = _mm_setzero_si128(); + // 00 01 + srcs[0] = Load2(src); + src += src_stride; + + int y = height; + do { + // 00 01 10 11 + srcs[0] = Load2<1>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 + srcs[0] = Load2<2>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 30 31 + srcs[0] = Load2<3>(src, srcs[0]); + src += src_stride; + // 40 41 + srcs[2] = Load2<0>(src, srcs[2]); + src += src_stride; + // 00 01 10 11 20 21 30 31 40 41 + const __m128i srcs_0_2 = _mm_unpacklo_epi64(srcs[0], srcs[2]); + // 10 11 20 21 30 31 40 41 + srcs[1] = _mm_srli_si128(srcs_0_2, 2); + // This uses srcs[0]..srcs[1]. + const __m128i sums = SumVerticalTaps(srcs, v_tap); + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + + Store2(dst8, results); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 2)); + if (height == 2) return; + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 6)); + dst8 += dst_stride; + + srcs[0] = srcs[2]; + y -= 4; + } while (y != 0); + } else if (num_taps == 4) { + srcs[4] = _mm_setzero_si128(); + + // 00 01 + srcs[0] = Load2(src); + src += src_stride; + // 00 01 10 11 + srcs[0] = Load2<1>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 + srcs[0] = Load2<2>(src, srcs[0]); + src += src_stride; + + int y = height; + do { + // 00 01 10 11 20 21 30 31 + srcs[0] = Load2<3>(src, srcs[0]); + src += src_stride; + // 40 41 + srcs[4] = Load2<0>(src, srcs[4]); + src += src_stride; + // 40 41 50 51 + srcs[4] = Load2<1>(src, srcs[4]); + src += src_stride; + // 40 41 50 51 60 61 + srcs[4] = Load2<2>(src, srcs[4]); + src += src_stride; + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 + const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]); + // 10 11 20 21 30 31 40 41 + srcs[1] = _mm_srli_si128(srcs_0_4, 2); + // 20 21 30 31 40 41 50 51 + srcs[2] = _mm_srli_si128(srcs_0_4, 4); + // 30 31 40 41 50 51 60 61 + srcs[3] = _mm_srli_si128(srcs_0_4, 6); + + // This uses srcs[0]..srcs[3]. + const __m128i sums = SumVerticalTaps(srcs, v_tap); + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + + Store2(dst8, results); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 2)); + if (height == 2) return; + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 6)); + dst8 += dst_stride; + + srcs[0] = srcs[4]; + y -= 4; + } while (y != 0); + } else if (num_taps == 6) { + // During the vertical pass the number of taps is restricted when + // |height| <= 4. 
+ assert(height > 4); + srcs[8] = _mm_setzero_si128(); + + // 00 01 + srcs[0] = Load2(src); + src += src_stride; + // 00 01 10 11 + srcs[0] = Load2<1>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 + srcs[0] = Load2<2>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 30 31 + srcs[0] = Load2<3>(src, srcs[0]); + src += src_stride; + // 40 41 + srcs[4] = Load2(src); + src += src_stride; + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 + const __m128i srcs_0_4x = _mm_unpacklo_epi64(srcs[0], srcs[4]); + // 10 11 20 21 30 31 40 41 + srcs[1] = _mm_srli_si128(srcs_0_4x, 2); + + int y = height; + do { + // 40 41 50 51 + srcs[4] = Load2<1>(src, srcs[4]); + src += src_stride; + // 40 41 50 51 60 61 + srcs[4] = Load2<2>(src, srcs[4]); + src += src_stride; + // 40 41 50 51 60 61 70 71 + srcs[4] = Load2<3>(src, srcs[4]); + src += src_stride; + // 80 81 + srcs[8] = Load2<0>(src, srcs[8]); + src += src_stride; + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 + const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]); + // 20 21 30 31 40 41 50 51 + srcs[2] = _mm_srli_si128(srcs_0_4, 4); + // 30 31 40 41 50 51 60 61 + srcs[3] = _mm_srli_si128(srcs_0_4, 6); + const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]); + // 50 51 60 61 70 71 80 81 + srcs[5] = _mm_srli_si128(srcs_4_8, 2); + + // This uses srcs[0]..srcs[5]. + const __m128i sums = SumVerticalTaps(srcs, v_tap); + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + + Store2(dst8, results); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 2)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 6)); + dst8 += dst_stride; + + srcs[0] = srcs[4]; + srcs[1] = srcs[5]; + srcs[4] = srcs[8]; + y -= 4; + } while (y != 0); + } else if (num_taps == 8) { + // During the vertical pass the number of taps is restricted when + // |height| <= 4. 
+ assert(height > 4); + srcs[8] = _mm_setzero_si128(); + // 00 01 + srcs[0] = Load2(src); + src += src_stride; + // 00 01 10 11 + srcs[0] = Load2<1>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 + srcs[0] = Load2<2>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 30 31 + srcs[0] = Load2<3>(src, srcs[0]); + src += src_stride; + // 40 41 + srcs[4] = Load2(src); + src += src_stride; + // 40 41 50 51 + srcs[4] = Load2<1>(src, srcs[4]); + src += src_stride; + // 40 41 50 51 60 61 + srcs[4] = Load2<2>(src, srcs[4]); + src += src_stride; + + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 + const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]); + // 10 11 20 21 30 31 40 41 + srcs[1] = _mm_srli_si128(srcs_0_4, 2); + // 20 21 30 31 40 41 50 51 + srcs[2] = _mm_srli_si128(srcs_0_4, 4); + // 30 31 40 41 50 51 60 61 + srcs[3] = _mm_srli_si128(srcs_0_4, 6); + + int y = height; + do { + // 40 41 50 51 60 61 70 71 + srcs[4] = Load2<3>(src, srcs[4]); + src += src_stride; + // 80 81 + srcs[8] = Load2<0>(src, srcs[8]); + src += src_stride; + // 80 81 90 91 + srcs[8] = Load2<1>(src, srcs[8]); + src += src_stride; + // 80 81 90 91 a0 a1 + srcs[8] = Load2<2>(src, srcs[8]); + src += src_stride; + + // 40 41 50 51 60 61 70 71 80 81 90 91 a0 a1 + const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]); + // 50 51 60 61 70 71 80 81 + srcs[5] = _mm_srli_si128(srcs_4_8, 2); + // 60 61 70 71 80 81 90 91 + srcs[6] = _mm_srli_si128(srcs_4_8, 4); + // 70 71 80 81 90 91 a0 a1 + srcs[7] = _mm_srli_si128(srcs_4_8, 6); + + // This uses srcs[0]..srcs[7]. + const __m128i sums = SumVerticalTaps(srcs, v_tap); + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + + Store2(dst8, results); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 2)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 6)); + dst8 += dst_stride; + + srcs[0] = srcs[4]; + srcs[1] = srcs[5]; + srcs[2] = srcs[6]; + srcs[3] = srcs[7]; + srcs[4] = srcs[8]; + y -= 4; + } while (y != 0); + } +} diff --git a/src/dsp/x86/distance_weighted_blend_sse4.cc b/src/dsp/x86/distance_weighted_blend_sse4.cc new file mode 100644 index 0000000..c813df4 --- /dev/null +++ b/src/dsp/x86/distance_weighted_blend_sse4.cc @@ -0,0 +1,461 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "src/dsp/distance_weighted_blend.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_TARGETING_SSE4_1 + +#include + +#include +#include +#include + +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/dsp/x86/common_sse4.h" +#include "src/utils/common.h" + +namespace libgav1 { +namespace dsp { +namespace low_bitdepth { +namespace { + +constexpr int kInterPostRoundBit = 4; + +inline __m128i ComputeWeightedAverage8(const __m128i& pred0, + const __m128i& pred1, + const __m128i& weights) { + // TODO(https://issuetracker.google.com/issues/150325685): Investigate range. + const __m128i preds_lo = _mm_unpacklo_epi16(pred0, pred1); + const __m128i mult_lo = _mm_madd_epi16(preds_lo, weights); + const __m128i result_lo = + RightShiftWithRounding_S32(mult_lo, kInterPostRoundBit + 4); + + const __m128i preds_hi = _mm_unpackhi_epi16(pred0, pred1); + const __m128i mult_hi = _mm_madd_epi16(preds_hi, weights); + const __m128i result_hi = + RightShiftWithRounding_S32(mult_hi, kInterPostRoundBit + 4); + + return _mm_packs_epi32(result_lo, result_hi); +} + +template +inline void DistanceWeightedBlend4xH_SSE4_1( + const int16_t* LIBGAV1_RESTRICT pred_0, + const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0, + const uint8_t weight_1, void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t dest_stride) { + auto* dst = static_cast(dest); + const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16)); + + for (int y = 0; y < height; y += 4) { + // TODO(b/150326556): Use larger loads. + const __m128i src_00 = LoadLo8(pred_0); + const __m128i src_10 = LoadLo8(pred_1); + pred_0 += 4; + pred_1 += 4; + __m128i src_0 = LoadHi8(src_00, pred_0); + __m128i src_1 = LoadHi8(src_10, pred_1); + pred_0 += 4; + pred_1 += 4; + const __m128i res0 = ComputeWeightedAverage8(src_0, src_1, weights); + + const __m128i src_01 = LoadLo8(pred_0); + const __m128i src_11 = LoadLo8(pred_1); + pred_0 += 4; + pred_1 += 4; + src_0 = LoadHi8(src_01, pred_0); + src_1 = LoadHi8(src_11, pred_1); + pred_0 += 4; + pred_1 += 4; + const __m128i res1 = ComputeWeightedAverage8(src_0, src_1, weights); + + const __m128i result_pixels = _mm_packus_epi16(res0, res1); + Store4(dst, result_pixels); + dst += dest_stride; + const int result_1 = _mm_extract_epi32(result_pixels, 1); + memcpy(dst, &result_1, sizeof(result_1)); + dst += dest_stride; + const int result_2 = _mm_extract_epi32(result_pixels, 2); + memcpy(dst, &result_2, sizeof(result_2)); + dst += dest_stride; + const int result_3 = _mm_extract_epi32(result_pixels, 3); + memcpy(dst, &result_3, sizeof(result_3)); + dst += dest_stride; + } +} + +template +inline void DistanceWeightedBlend8xH_SSE4_1( + const int16_t* LIBGAV1_RESTRICT pred_0, + const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0, + const uint8_t weight_1, void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t dest_stride) { + auto* dst = static_cast(dest); + const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16)); + + for (int y = 0; y < height; y += 2) { + const __m128i src_00 = LoadAligned16(pred_0); + const __m128i src_10 = LoadAligned16(pred_1); + pred_0 += 8; + pred_1 += 8; + const __m128i res0 = ComputeWeightedAverage8(src_00, src_10, weights); + + const __m128i src_01 = LoadAligned16(pred_0); + const __m128i src_11 = LoadAligned16(pred_1); + pred_0 += 8; + pred_1 += 8; + const __m128i res1 = ComputeWeightedAverage8(src_01, src_11, weights); + + const __m128i result_pixels = _mm_packus_epi16(res0, res1); + StoreLo8(dst, result_pixels); + dst += dest_stride; + StoreHi8(dst, 
result_pixels); + dst += dest_stride; + } +} + +inline void DistanceWeightedBlendLarge_SSE4_1( + const int16_t* LIBGAV1_RESTRICT pred_0, + const int16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0, + const uint8_t weight_1, const int width, const int height, + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) { + auto* dst = static_cast(dest); + const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16)); + + int y = height; + do { + int x = 0; + do { + const __m128i src_0_lo = LoadAligned16(pred_0 + x); + const __m128i src_1_lo = LoadAligned16(pred_1 + x); + const __m128i res_lo = + ComputeWeightedAverage8(src_0_lo, src_1_lo, weights); + + const __m128i src_0_hi = LoadAligned16(pred_0 + x + 8); + const __m128i src_1_hi = LoadAligned16(pred_1 + x + 8); + const __m128i res_hi = + ComputeWeightedAverage8(src_0_hi, src_1_hi, weights); + + StoreUnaligned16(dst + x, _mm_packus_epi16(res_lo, res_hi)); + x += 16; + } while (x < width); + dst += dest_stride; + pred_0 += width; + pred_1 += width; + } while (--y != 0); +} + +void DistanceWeightedBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + const uint8_t weight_0, + const uint8_t weight_1, const int width, + const int height, + void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t dest_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + if (width == 4) { + if (height == 4) { + DistanceWeightedBlend4xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1, + dest, dest_stride); + } else if (height == 8) { + DistanceWeightedBlend4xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1, + dest, dest_stride); + } else { + assert(height == 16); + DistanceWeightedBlend4xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1, + dest, dest_stride); + } + return; + } + + if (width == 8) { + switch (height) { + case 4: + DistanceWeightedBlend8xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1, + dest, dest_stride); + return; + case 8: + DistanceWeightedBlend8xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1, + dest, dest_stride); + return; + case 16: + DistanceWeightedBlend8xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1, + dest, dest_stride); + return; + default: + assert(height == 32); + DistanceWeightedBlend8xH_SSE4_1<32>(pred_0, pred_1, weight_0, weight_1, + dest, dest_stride); + + return; + } + } + + DistanceWeightedBlendLarge_SSE4_1(pred_0, pred_1, weight_0, weight_1, width, + height, dest, dest_stride); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); +#if DSP_ENABLED_8BPP_SSE4_1(DistanceWeightedBlend) + dsp->distance_weighted_blend = DistanceWeightedBlend_SSE4_1; +#endif +} + +} // namespace +} // namespace low_bitdepth + +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +constexpr int kMax10bppSample = (1 << 10) - 1; +constexpr int kInterPostRoundBit = 4; + +inline __m128i ComputeWeightedAverage8(const __m128i& pred0, + const __m128i& pred1, + const __m128i& weight0, + const __m128i& weight1) { + // This offset is a combination of round_factor and round_offset + // which are to be added and subtracted respectively. + // Here kInterPostRoundBit + 4 is considering bitdepth=10. 
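+  // Scalar form (a sketch): both compound inputs carry +kCompoundOffset, and
+  // the AV1 distance weights satisfy weight_0 + weight_1 == 16, so the
+  // weighted sum contains exactly kCompoundOffset << 4 of bias. |offset|
+  // folds the rounding term and that bias removal into one constant:
+  //   dst = Clip3((p0 * w0 + p1 * w1 + offset) >> (kInterPostRoundBit + 4),
+  //               0, kMax10bppSample);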
+ constexpr int offset = + (1 << ((kInterPostRoundBit + 4) - 1)) - (kCompoundOffset << 4); + const __m128i zero = _mm_setzero_si128(); + const __m128i bias = _mm_set1_epi32(offset); + const __m128i clip_high = _mm_set1_epi16(kMax10bppSample); + + __m128i prediction0 = _mm_cvtepu16_epi32(pred0); + __m128i mult0 = _mm_mullo_epi32(prediction0, weight0); + __m128i prediction1 = _mm_cvtepu16_epi32(pred1); + __m128i mult1 = _mm_mullo_epi32(prediction1, weight1); + __m128i sum = _mm_add_epi32(mult0, mult1); + sum = _mm_add_epi32(sum, bias); + const __m128i result0 = _mm_srai_epi32(sum, kInterPostRoundBit + 4); + + prediction0 = _mm_unpackhi_epi16(pred0, zero); + mult0 = _mm_mullo_epi32(prediction0, weight0); + prediction1 = _mm_unpackhi_epi16(pred1, zero); + mult1 = _mm_mullo_epi32(prediction1, weight1); + sum = _mm_add_epi32(mult0, mult1); + sum = _mm_add_epi32(sum, bias); + const __m128i result1 = _mm_srai_epi32(sum, kInterPostRoundBit + 4); + const __m128i pack = _mm_packus_epi32(result0, result1); + + return _mm_min_epi16(pack, clip_high); +} + +template +inline void DistanceWeightedBlend4xH_SSE4_1( + const uint16_t* LIBGAV1_RESTRICT pred_0, + const uint16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0, + const uint8_t weight_1, void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t dest_stride) { + auto* dst = static_cast(dest); + const __m128i weight0 = _mm_set1_epi32(weight_0); + const __m128i weight1 = _mm_set1_epi32(weight_1); + + int y = height; + do { + const __m128i src_00 = LoadLo8(pred_0); + const __m128i src_10 = LoadLo8(pred_1); + pred_0 += 4; + pred_1 += 4; + __m128i src_0 = LoadHi8(src_00, pred_0); + __m128i src_1 = LoadHi8(src_10, pred_1); + pred_0 += 4; + pred_1 += 4; + const __m128i res0 = + ComputeWeightedAverage8(src_0, src_1, weight0, weight1); + + const __m128i src_01 = LoadLo8(pred_0); + const __m128i src_11 = LoadLo8(pred_1); + pred_0 += 4; + pred_1 += 4; + src_0 = LoadHi8(src_01, pred_0); + src_1 = LoadHi8(src_11, pred_1); + pred_0 += 4; + pred_1 += 4; + const __m128i res1 = + ComputeWeightedAverage8(src_0, src_1, weight0, weight1); + + StoreLo8(dst, res0); + dst += dest_stride; + StoreHi8(dst, res0); + dst += dest_stride; + StoreLo8(dst, res1); + dst += dest_stride; + StoreHi8(dst, res1); + dst += dest_stride; + y -= 4; + } while (y != 0); +} + +template +inline void DistanceWeightedBlend8xH_SSE4_1( + const uint16_t* LIBGAV1_RESTRICT pred_0, + const uint16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0, + const uint8_t weight_1, void* LIBGAV1_RESTRICT const dest, + const ptrdiff_t dest_stride) { + auto* dst = static_cast(dest); + const __m128i weight0 = _mm_set1_epi32(weight_0); + const __m128i weight1 = _mm_set1_epi32(weight_1); + + int y = height; + do { + const __m128i src_00 = LoadAligned16(pred_0); + const __m128i src_10 = LoadAligned16(pred_1); + pred_0 += 8; + pred_1 += 8; + const __m128i res0 = + ComputeWeightedAverage8(src_00, src_10, weight0, weight1); + + const __m128i src_01 = LoadAligned16(pred_0); + const __m128i src_11 = LoadAligned16(pred_1); + pred_0 += 8; + pred_1 += 8; + const __m128i res1 = + ComputeWeightedAverage8(src_01, src_11, weight0, weight1); + + StoreUnaligned16(dst, res0); + dst += dest_stride; + StoreUnaligned16(dst, res1); + dst += dest_stride; + y -= 2; + } while (y != 0); +} + +inline void DistanceWeightedBlendLarge_SSE4_1( + const uint16_t* LIBGAV1_RESTRICT pred_0, + const uint16_t* LIBGAV1_RESTRICT pred_1, const uint8_t weight_0, + const uint8_t weight_1, const int width, const int height, + void* LIBGAV1_RESTRICT const dest, 
+    const ptrdiff_t dest_stride) {
+  auto* dst = static_cast<uint16_t*>(dest);
+  const __m128i weight0 = _mm_set1_epi32(weight_0);
+  const __m128i weight1 = _mm_set1_epi32(weight_1);
+
+  int y = height;
+  do {
+    int x = 0;
+    do {
+      const __m128i src_0_lo = LoadAligned16(pred_0 + x);
+      const __m128i src_1_lo = LoadAligned16(pred_1 + x);
+      const __m128i res_lo =
+          ComputeWeightedAverage8(src_0_lo, src_1_lo, weight0, weight1);
+
+      const __m128i src_0_hi = LoadAligned16(pred_0 + x + 8);
+      const __m128i src_1_hi = LoadAligned16(pred_1 + x + 8);
+      const __m128i res_hi =
+          ComputeWeightedAverage8(src_0_hi, src_1_hi, weight0, weight1);
+
+      StoreUnaligned16(dst + x, res_lo);
+      x += 8;
+      StoreUnaligned16(dst + x, res_hi);
+      x += 8;
+    } while (x < width);
+    dst += dest_stride;
+    pred_0 += width;
+    pred_1 += width;
+  } while (--y != 0);
+}
+
+void DistanceWeightedBlend_SSE4_1(const void* LIBGAV1_RESTRICT prediction_0,
+                                  const void* LIBGAV1_RESTRICT prediction_1,
+                                  const uint8_t weight_0,
+                                  const uint8_t weight_1, const int width,
+                                  const int height,
+                                  void* LIBGAV1_RESTRICT const dest,
+                                  const ptrdiff_t dest_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  const ptrdiff_t dst_stride = dest_stride / sizeof(*pred_0);
+  if (width == 4) {
+    if (height == 4) {
+      DistanceWeightedBlend4xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1,
+                                         dest, dst_stride);
+    } else if (height == 8) {
+      DistanceWeightedBlend4xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1,
+                                         dest, dst_stride);
+    } else {
+      assert(height == 16);
+      DistanceWeightedBlend4xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1,
+                                          dest, dst_stride);
+    }
+    return;
+  }
+
+  if (width == 8) {
+    switch (height) {
+      case 4:
+        DistanceWeightedBlend8xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1,
+                                           dest, dst_stride);
+        return;
+      case 8:
+        DistanceWeightedBlend8xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1,
+                                           dest, dst_stride);
+        return;
+      case 16:
+        DistanceWeightedBlend8xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1,
+                                            dest, dst_stride);
+        return;
+      default:
+        assert(height == 32);
+        DistanceWeightedBlend8xH_SSE4_1<32>(pred_0, pred_1, weight_0, weight_1,
+                                            dest, dst_stride);
+
+        return;
+    }
+  }
+
+  DistanceWeightedBlendLarge_SSE4_1(pred_0, pred_1, weight_0, weight_1, width,
+                                    height, dest, dst_stride);
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+#if DSP_ENABLED_10BPP_SSE4_1(DistanceWeightedBlend)
+  dsp->distance_weighted_blend = DistanceWeightedBlend_SSE4_1;
+#endif
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void DistanceWeightedBlendInit_SSE4_1() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void DistanceWeightedBlendInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/distance_weighted_blend_sse4.h b/src/dsp/x86/distance_weighted_blend_sse4.h
new file mode 100644
index 0000000..dbb9f88
--- /dev/null
+++ b/src/dsp/x86/distance_weighted_blend_sse4.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_DISTANCE_WEIGHTED_BLEND_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_DISTANCE_WEIGHTED_BLEND_SSE4_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::distance_weighted_blend. This function is not thread-safe. +void DistanceWeightedBlendInit_SSE4_1(); + +} // namespace dsp +} // namespace libgav1 + +// If sse4 is enabled and the baseline isn't set due to a higher level of +// optimization being enabled, signal the sse4 implementation should be used. +#if LIBGAV1_TARGETING_SSE4_1 +#ifndef LIBGAV1_Dsp8bpp_DistanceWeightedBlend +#define LIBGAV1_Dsp8bpp_DistanceWeightedBlend LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_DistanceWeightedBlend +#define LIBGAV1_Dsp10bpp_DistanceWeightedBlend LIBGAV1_CPU_SSE4_1 +#endif + +#endif // LIBGAV1_TARGETING_SSE4_1 + +#endif // LIBGAV1_SRC_DSP_X86_DISTANCE_WEIGHTED_BLEND_SSE4_H_ diff --git a/src/dsp/x86/film_grain_sse4.cc b/src/dsp/x86/film_grain_sse4.cc new file mode 100644 index 0000000..9ece947 --- /dev/null +++ b/src/dsp/x86/film_grain_sse4.cc @@ -0,0 +1,494 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/film_grain.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_TARGETING_SSE4_1 +#include + +#include +#include +#include +#include + +#include "src/dsp/common.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/dsp/film_grain_common.h" +#include "src/dsp/x86/common_sse4.h" +#include "src/utils/common.h" +#include "src/utils/compiler_attributes.h" +#include "src/utils/logging.h" + +namespace libgav1 { +namespace dsp { +namespace film_grain { +namespace { + +// Load 8 values from source, widening to int16_t intermediate value size. +// The function is overloaded for each type and bitdepth for simplicity. +inline __m128i LoadSource(const int8_t* src) { + return _mm_cvtepi8_epi16(LoadLo8(src)); +} + +// Load 8 values from source, widening to int16_t intermediate value size. +inline __m128i LoadSource(const uint8_t* src) { + return _mm_cvtepu8_epi16(LoadLo8(src)); +} + +inline __m128i LoadSourceMsan(const uint8_t* src, const int valid_range) { + return _mm_cvtepu8_epi16(LoadLo8Msan(src, 8 - valid_range)); +} + +// Store 8 values to dest, narrowing to uint8_t from int16_t intermediate value. +inline void StoreUnsigned(uint8_t* dest, const __m128i data) { + StoreLo8(dest, _mm_packus_epi16(data, data)); +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +// Load 8 values from source. 
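+// (In the 10bpp overloads below, 8 16-bit samples already fill the whole
+// 128-bit register, so the load itself is the entire operation and no
+// widening conversion is needed.)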
+inline __m128i LoadSource(const int16_t* src) { return LoadUnaligned16(src); } + +// Load 8 values from source. +inline __m128i LoadSource(const uint16_t* src) { return LoadUnaligned16(src); } + +// Store 8 values to dest. +inline void StoreUnsigned(uint16_t* dest, const __m128i data) { + StoreUnaligned16(dest, data); +} +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +// For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed. +inline __m128i GetAverageLuma(const uint8_t* const luma, int subsampling_x) { + if (subsampling_x != 0) { + const __m128i src = LoadUnaligned16(luma); + + return RightShiftWithRounding_U16( + _mm_hadd_epi16(_mm_cvtepu8_epi16(src), + _mm_unpackhi_epi8(src, _mm_setzero_si128())), + 1); + } + return _mm_cvtepu8_epi16(LoadLo8(luma)); +} + +inline __m128i GetAverageLumaMsan(const uint8_t* const luma, int subsampling_x, + int valid_range) { + if (subsampling_x != 0) { + const __m128i src = LoadUnaligned16Msan(luma, 16 - valid_range); + + return RightShiftWithRounding_U16( + _mm_hadd_epi16(_mm_cvtepu8_epi16(src), + _mm_unpackhi_epi8(src, _mm_setzero_si128())), + 1); + } + return _mm_cvtepu8_epi16(LoadLo8Msan(luma, 8 - valid_range)); +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +// For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed. +inline __m128i GetAverageLuma(const uint16_t* const luma, int subsampling_x) { + if (subsampling_x != 0) { + return RightShiftWithRounding_U16( + _mm_hadd_epi16(LoadUnaligned16(luma), LoadUnaligned16(luma + 8)), 1); + } + return LoadUnaligned16(luma); +} + +inline __m128i GetAverageLumaMsan(const uint16_t* const luma, int subsampling_x, + int valid_range) { + if (subsampling_x != 0) { + return RightShiftWithRounding_U16( + _mm_hadd_epi16( + LoadUnaligned16Msan(luma, 16 - valid_range * sizeof(*luma)), + LoadUnaligned16Msan(luma + 8, 32 - valid_range * sizeof(*luma))), + 1); + } + return LoadUnaligned16Msan(luma, 16 - valid_range * sizeof(*luma)); +} +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +inline __m128i Clip3(const __m128i value, const __m128i low, + const __m128i high) { + const __m128i clipped_to_ceiling = _mm_min_epi16(high, value); + return _mm_max_epi16(low, clipped_to_ceiling); +} + +template +inline __m128i GetScalingFactors(const int16_t* scaling_lut, + const Pixel* source) { + alignas(16) int16_t start_vals[8]; + static_assert(bitdepth <= kBitdepth10, + "SSE4 Film Grain is not yet implemented for 12bpp."); + for (int i = 0; i < 8; ++i) { + assert(source[i] < kScalingLookupTableSize << (bitdepth - 2)); + start_vals[i] = scaling_lut[source[i]]; + } + return LoadAligned16(start_vals); +} + +// |scaling_shift| is in range [8,11]. 
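+// ScaleNoise computes RightShiftWithRounding(noise * scaling, scaling_shift)
+// without a 32-bit multiply: the scaling factor is pre-shifted left by
+// (15 - scaling_shift), and _mm_mulhrs_epi16 then returns
+// (noise * shifted_scale + (1 << 14)) >> 15, the same rounded result.
+// Worked example (illustrative numbers, not values from this file): with
+// noise = 40, scaling = 20, scaling_shift = 8:
+//   direct: (40 * 20 + 128) >> 8 = 928 >> 8 = 3
+//   mulhrs: (40 * (20 << 7) + 16384) >> 15 = 118784 >> 15 = 3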
+template <int bitdepth>
+inline __m128i ScaleNoise(const __m128i noise, const __m128i scaling,
+                          const __m128i scaling_shift) {
+  const __m128i shifted_scale_factors = _mm_sll_epi16(scaling, scaling_shift);
+  return _mm_mulhrs_epi16(noise, shifted_scale_factors);
+}
+
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageLuma_SSE4_1(
+    const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_luma,
+    int scaling_shift, int width, int height, int start_height,
+    const int16_t* scaling_lut_y, const void* source_plane_y,
+    ptrdiff_t source_stride_y, void* dest_plane_y, ptrdiff_t dest_stride_y) {
+  const auto* noise_image =
+      static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+  const auto* in_y_row = static_cast<const Pixel*>(source_plane_y);
+  source_stride_y /= sizeof(Pixel);
+  auto* out_y_row = static_cast<Pixel*>(dest_plane_y);
+  dest_stride_y /= sizeof(Pixel);
+  const __m128i floor = _mm_set1_epi16(min_value);
+  const __m128i ceiling = _mm_set1_epi16(max_luma);
+  const int safe_width = width & ~7;
+  const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift);
+  int y = 0;
+  do {
+    int x = 0;
+    for (; x < safe_width; x += 8) {
+      const __m128i orig = LoadSource(&in_y_row[x]);
+      const __m128i scaling =
+          GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]);
+      __m128i noise = LoadSource(&(noise_image[kPlaneY][y + start_height][x]));
+
+      noise = ScaleNoise<bitdepth>(noise, scaling, derived_scaling_shift);
+      const __m128i combined = _mm_add_epi16(orig, noise);
+      StoreUnsigned(&out_y_row[x], Clip3(combined, floor, ceiling));
+    }
+
+    if (x < width) {
+      Pixel luma_buffer[8];
+      // Prevent arbitrary indices from entering GetScalingFactors.
+      memset(luma_buffer, 0, sizeof(luma_buffer));
+      const int valid_range = width - x;
+      memcpy(luma_buffer, &in_y_row[x], valid_range * sizeof(in_y_row[0]));
+      luma_buffer[valid_range] = in_y_row[width - 1];
+      const __m128i orig = LoadSource(&in_y_row[x]);
+      const __m128i scaling =
+          GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, luma_buffer);
+      __m128i noise = LoadSource(&(noise_image[kPlaneY][y + start_height][x]));
+
+      noise = ScaleNoise<bitdepth>(noise, scaling, derived_scaling_shift);
+      const __m128i combined = _mm_add_epi16(orig, noise);
+      StoreUnsigned(&out_y_row[x], Clip3(combined, floor, ceiling));
+    }
+    in_y_row += source_stride_y;
+    out_y_row += dest_stride_y;
+  } while (++y < height);
+}
+
+template <int bitdepth, typename GrainType, typename Pixel>
+inline __m128i BlendChromaValsWithCfl(
+    const Pixel* LIBGAV1_RESTRICT average_luma_buffer,
+    const int16_t* scaling_lut, const Pixel* LIBGAV1_RESTRICT chroma_cursor,
+    const GrainType* LIBGAV1_RESTRICT noise_image_cursor,
+    const __m128i scaling_shift) {
+  const __m128i scaling =
+      GetScalingFactors<bitdepth, Pixel>(scaling_lut, average_luma_buffer);
+  const __m128i orig = LoadSource(chroma_cursor);
+  __m128i noise = LoadSource(noise_image_cursor);
+  noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift);
+  return _mm_add_epi16(orig, noise);
+}
+
+template <int bitdepth, typename GrainType, typename Pixel>
+LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1(
+    const Array2D<GrainType>& noise_image, int min_value, int max_chroma,
+    int width, int height, int start_height, int subsampling_x,
+    int subsampling_y, int scaling_shift, const int16_t* scaling_lut,
+    const Pixel* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y,
+    const Pixel* in_chroma_row, ptrdiff_t source_stride_chroma,
+    Pixel* out_chroma_row, ptrdiff_t dest_stride) {
+  const __m128i floor = _mm_set1_epi16(min_value);
+  const __m128i ceiling = _mm_set1_epi16(max_chroma);
+  alignas(16) Pixel luma_buffer[16];
+
+  const int chroma_height = (height + subsampling_y) >> subsampling_y;
+  const int chroma_width = (width + subsampling_x) >> subsampling_x;
+  // |chroma_width| is rounded up. If |width| is odd, then the final pixel
+  // will need to be guarded from overread, even if |chroma_width| is
+  // divisible by 8.
+  const int safe_chroma_width = (chroma_width - (width & 1)) & ~7;
+
+  // Writing to this buffer avoids the cost of doing 8 lane lookups in a row
+  // in GetScalingFactors.
+  Pixel average_luma_buffer[8];
+  assert(start_height % 2 == 0);
+  start_height >>= subsampling_y;
+  const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift);
+  int y = 0;
+  do {
+    int x = 0;
+    for (; x < safe_chroma_width; x += 8) {
+      const int luma_x = x << subsampling_x;
+      const __m128i average_luma =
+          GetAverageLuma(&in_y_row[luma_x], subsampling_x);
+      StoreUnsigned(average_luma_buffer, average_luma);
+
+      const __m128i blended =
+          BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
+              average_luma_buffer, scaling_lut, &in_chroma_row[x],
+              &(noise_image[y + start_height][x]), derived_scaling_shift);
+      StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
+    }
+
+    // This section only runs if width % (8 << sub_x) != 0. It should never
+    // run on 720p and above.
+    if (x < chroma_width) {
+      // Prevent huge indices from entering GetScalingFactors due to
+      // uninitialized values. This is not a problem in 8bpp because the table
+      // is made larger than 255 values.
+      if (bitdepth > kBitdepth8) {
+        memset(luma_buffer, 0, sizeof(luma_buffer));
+      }
+      const int luma_x = x << subsampling_x;
+      const int valid_range = width - luma_x;
+      assert(valid_range < 16);
+      memcpy(luma_buffer, &in_y_row[luma_x],
+             valid_range * sizeof(in_y_row[0]));
+      luma_buffer[valid_range] = in_y_row[width - 1];
+      const __m128i average_luma =
+          GetAverageLumaMsan(luma_buffer, subsampling_x, valid_range + 1);
+      StoreUnsigned(average_luma_buffer, average_luma);
+
+      const __m128i blended =
+          BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
+              average_luma_buffer, scaling_lut, &in_chroma_row[x],
+              &(noise_image[y + start_height][x]), derived_scaling_shift);
+      StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
+    }
+
+    in_y_row += source_stride_y << subsampling_y;
+    in_chroma_row += source_stride_chroma;
+    out_chroma_row += dest_stride;
+  } while (++y < chroma_height);
+}
+
+// This function is for the case params_.chroma_scaling_from_luma == true.
+// This further implies that scaling_lut_u == scaling_lut_v == scaling_lut_y.
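+// For reference, each vector lane above effectively performs the following
+// scalar computation (a sketch with illustrative names; not code from this
+// file):
+#if 0
+  const int scaling = scaling_lut[average_luma];  // cf. GetScalingFactors.
+  const int noise =
+      RightShiftWithRounding(grain_value * scaling, scaling_shift);
+  *out_chroma = Clip3(*in_chroma + noise, min_value, max_chroma);
+#endif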
+template +void BlendNoiseWithImageChromaWithCfl_SSE4_1( + Plane plane, const FilmGrainParams& params, + const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma, + int width, int height, int start_height, int subsampling_x, + int subsampling_y, const int16_t* scaling_lut, + const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y, + const void* source_plane_uv, ptrdiff_t source_stride_uv, + void* dest_plane_uv, ptrdiff_t dest_stride_uv) { + const auto* noise_image = + static_cast*>(noise_image_ptr); + const auto* in_y = static_cast(source_plane_y); + source_stride_y /= sizeof(Pixel); + + const auto* in_uv = static_cast(source_plane_uv); + source_stride_uv /= sizeof(Pixel); + auto* out_uv = static_cast(dest_plane_uv); + dest_stride_uv /= sizeof(Pixel); + BlendChromaPlaneWithCfl_SSE4_1( + noise_image[plane], min_value, max_chroma, width, height, start_height, + subsampling_x, subsampling_y, params.chroma_scaling, scaling_lut, in_y, + source_stride_y, in_uv, source_stride_uv, out_uv, dest_stride_uv); +} + +} // namespace + +namespace low_bitdepth { +namespace { + +// |offset| is 32x4 packed to add with the result of _mm_madd_epi16. +inline __m128i BlendChromaValsNoCfl8bpp( + const int16_t* scaling_lut, const __m128i& orig, + const int8_t* LIBGAV1_RESTRICT noise_image_cursor, + const __m128i& average_luma, const __m128i& scaling_shift, + const __m128i& offset, const __m128i& weights) { + uint8_t merged_buffer[8]; + const __m128i combined_lo = + _mm_madd_epi16(_mm_unpacklo_epi16(average_luma, orig), weights); + const __m128i combined_hi = + _mm_madd_epi16(_mm_unpackhi_epi16(average_luma, orig), weights); + const __m128i merged_base = _mm_packs_epi32(_mm_srai_epi32((combined_lo), 6), + _mm_srai_epi32((combined_hi), 6)); + + const __m128i merged = _mm_add_epi16(merged_base, offset); + + StoreLo8(merged_buffer, _mm_packus_epi16(merged, merged)); + const __m128i scaling = + GetScalingFactors(scaling_lut, merged_buffer); + __m128i noise = LoadSource(noise_image_cursor); + noise = ScaleNoise(noise, scaling, scaling_shift); + return _mm_add_epi16(orig, noise); +} + +LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_SSE4_1( + const Array2D& noise_image, int min_value, int max_chroma, + int width, int height, int start_height, int subsampling_x, + int subsampling_y, int scaling_shift, int chroma_offset, + int chroma_multiplier, int luma_multiplier, const int16_t* scaling_lut, + const uint8_t* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y, + const uint8_t* in_chroma_row, ptrdiff_t source_stride_chroma, + uint8_t* out_chroma_row, ptrdiff_t dest_stride) { + const __m128i floor = _mm_set1_epi16(min_value); + const __m128i ceiling = _mm_set1_epi16(max_chroma); + + const int chroma_height = (height + subsampling_y) >> subsampling_y; + const int chroma_width = (width + subsampling_x) >> subsampling_x; + // |chroma_width| is rounded up. If |width| is odd, then the final luma pixel + // will need to be guarded from overread, even if |chroma_width| is a + // multiple of 8. 
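+  // Illustrative numbers for the bound below: width = 35 with
+  // subsampling_x = 1 gives chroma_width = 18; the odd luma width forces the
+  // final chroma pixel onto the scalar edge path, so
+  // safe_chroma_width = (18 - 1) & ~7 = 16.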
+ const int safe_chroma_width = (chroma_width - (width & 1)) & ~7; + alignas(16) uint8_t luma_buffer[16]; + const __m128i offset = _mm_set1_epi16(chroma_offset); + const __m128i multipliers = _mm_set1_epi32(LeftShift(chroma_multiplier, 16) | + (luma_multiplier & 0xFFFF)); + const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift); + + start_height >>= subsampling_y; + int y = 0; + do { + int x = 0; + for (; x < safe_chroma_width; x += 8) { + const int luma_x = x << subsampling_x; + const __m128i average_luma = + GetAverageLuma(&in_y_row[luma_x], subsampling_x); + const __m128i orig_chroma = LoadSource(&in_chroma_row[x]); + const __m128i blended = BlendChromaValsNoCfl8bpp( + scaling_lut, orig_chroma, &(noise_image[y + start_height][x]), + average_luma, derived_scaling_shift, offset, multipliers); + StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling)); + } + + if (x < chroma_width) { + // Begin right edge iteration. Same as the normal iterations, but the + // |average_luma| computation requires a duplicated luma value at the + // end. + const int luma_x = x << subsampling_x; + const int valid_range = width - luma_x; + assert(valid_range < 16); + // There is no need to pre-initialize this buffer, because merged values + // used as indices are saturated in the 8bpp case. Uninitialized values + // are written outside the frame. + memcpy(luma_buffer, &in_y_row[luma_x], valid_range * sizeof(in_y_row[0])); + luma_buffer[valid_range] = in_y_row[width - 1]; + const int valid_range_chroma = chroma_width - x; + uint8_t chroma_buffer[8]; + memcpy(chroma_buffer, &in_chroma_row[x], + valid_range_chroma * sizeof(in_chroma_row[0])); + + const __m128i average_luma = + GetAverageLumaMsan(luma_buffer, subsampling_x, valid_range + 1); + const __m128i orig_chroma = + LoadSourceMsan(chroma_buffer, valid_range_chroma); + const __m128i blended = BlendChromaValsNoCfl8bpp( + scaling_lut, orig_chroma, &(noise_image[y + start_height][x]), + average_luma, derived_scaling_shift, offset, multipliers); + StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling)); + // End of right edge iteration. + } + + in_y_row += source_stride_y << subsampling_y; + in_chroma_row += source_stride_chroma; + out_chroma_row += dest_stride; + } while (++y < chroma_height); +} + +// This function is for the case params_.chroma_scaling_from_luma == false. +void BlendNoiseWithImageChroma8bpp_SSE4_1( + Plane plane, const FilmGrainParams& params, + const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma, + int width, int height, int start_height, int subsampling_x, + int subsampling_y, const int16_t* scaling_lut, + const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y, + const void* source_plane_uv, ptrdiff_t source_stride_uv, + void* dest_plane_uv, ptrdiff_t dest_stride_uv) { + assert(plane == kPlaneU || plane == kPlaneV); + const auto* noise_image = + static_cast*>(noise_image_ptr); + const auto* in_y = static_cast(source_plane_y); + const auto* in_uv = static_cast(source_plane_uv); + auto* out_uv = static_cast(dest_plane_uv); + + const int offset = (plane == kPlaneU) ? params.u_offset : params.v_offset; + const int luma_multiplier = + (plane == kPlaneU) ? params.u_luma_multiplier : params.v_luma_multiplier; + const int multiplier = + (plane == kPlaneU) ? 
params.u_multiplier : params.v_multiplier;
+  BlendChromaPlane8bpp_SSE4_1(
+      noise_image[plane], min_value, max_chroma, width, height, start_height,
+      subsampling_x, subsampling_y, params.chroma_scaling, offset, multiplier,
+      luma_multiplier, scaling_lut, in_y, source_stride_y, in_uv,
+      source_stride_uv, out_uv, dest_stride_uv);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+
+  dsp->film_grain.blend_noise_luma =
+      BlendNoiseWithImageLuma_SSE4_1<kBitdepth8, int8_t, uint8_t>;
+  dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma8bpp_SSE4_1;
+  dsp->film_grain.blend_noise_chroma[1] =
+      BlendNoiseWithImageChromaWithCfl_SSE4_1<kBitdepth8, int8_t, uint8_t>;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+
+  dsp->film_grain.blend_noise_luma =
+      BlendNoiseWithImageLuma_SSE4_1<kBitdepth10, int16_t, uint16_t>;
+  dsp->film_grain.blend_noise_chroma[1] =
+      BlendNoiseWithImageChromaWithCfl_SSE4_1<kBitdepth10, int16_t, uint16_t>;
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+}  // namespace film_grain
+
+void FilmGrainInit_SSE4_1() {
+  film_grain::low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  film_grain::high_bitdepth::Init10bpp();
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void FilmGrainInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/film_grain_sse4.h b/src/dsp/x86/film_grain_sse4.h
new file mode 100644
index 0000000..1cacbac
--- /dev/null
+++ b/src/dsp/x86/film_grain_sse4.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_FILM_GRAIN_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_FILM_GRAIN_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes members of Dsp::film_grain. This function is not thread-safe.
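+// A minimal usage sketch (illustrative; in this library the per-architecture
+// init functions are invoked once while the DSP function tables are built):
+//   libgav1::dsp::FilmGrainInit_SSE4_1();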
+void FilmGrainInit_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseLuma LIBGAV1_CPU_SSE4_1
+#define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseLuma LIBGAV1_CPU_SSE4_1
+#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChroma LIBGAV1_CPU_SSE4_1
+#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChromaWithCfl LIBGAV1_CPU_SSE4_1
+#define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseChromaWithCfl LIBGAV1_CPU_SSE4_1
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_FILM_GRAIN_SSE4_H_
diff --git a/src/dsp/x86/intra_edge_sse4.cc b/src/dsp/x86/intra_edge_sse4.cc
new file mode 100644
index 0000000..967be06
--- /dev/null
+++ b/src/dsp/x86/intra_edge_sse4.cc
@@ -0,0 +1,273 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intra_edge.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kKernelTaps = 5;
+constexpr int kKernels[3][kKernelTaps] = {
+    {0, 4, 8, 4, 0}, {0, 5, 6, 5, 0}, {2, 4, 4, 4, 2}};
+constexpr int kMaxEdgeBufferSize = 129;
+
+// This function applies the kernel [0, 4, 8, 4, 0] to 12 values.
+// Assumes |edge| has 16 packed byte values. Produces 12 filter outputs to
+// write as overlapping sets of 8-bytes.
+inline void ComputeKernel1Store12(uint8_t* LIBGAV1_RESTRICT dest,
+                                  const uint8_t* LIBGAV1_RESTRICT source) {
+  const __m128i edge_lo = LoadUnaligned16(source);
+  const __m128i edge_hi = _mm_srli_si128(edge_lo, 6);
+  // Samples matched with the '4' tap, expanded to 16-bit.
+  const __m128i outers_lo = _mm_cvtepu8_epi16(edge_lo);
+  const __m128i outers_hi = _mm_cvtepu8_epi16(edge_hi);
+  // Samples matched with the '8' tap, expanded to 16-bit.
+  const __m128i centers_lo = _mm_srli_si128(outers_lo, 2);
+  const __m128i centers_hi = _mm_srli_si128(outers_hi, 2);
+
+  // Apply the taps by shifting.
+  const __m128i outers4_lo = _mm_slli_epi16(outers_lo, 2);
+  const __m128i outers4_hi = _mm_slli_epi16(outers_hi, 2);
+  const __m128i centers8_lo = _mm_slli_epi16(centers_lo, 3);
+  const __m128i centers8_hi = _mm_slli_epi16(centers_hi, 3);
+  // Move latter 4x values down to add with first 4x values for each output.
+  const __m128i partial_sums_lo =
+      _mm_add_epi16(outers4_lo, _mm_srli_si128(outers4_lo, 4));
+  const __m128i partial_sums_hi =
+      _mm_add_epi16(outers4_hi, _mm_srli_si128(outers4_hi, 4));
+  // Add the 8x center values for the final kernel sum for each output.
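+  // At this point each 16-bit lane holds 4*e[i] + 4*e[i+2], and adding the
+  // centers yields 4*e[i] + 8*e[i+1] + 4*e[i+2]. Sanity check with a flat
+  // edge of 100s: (400 + 800 + 400 + 8) >> 4 = 100, as expected for a
+  // smoothing kernel whose taps sum to 16.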
+ const __m128i sums_lo = RightShiftWithRounding_U16( + _mm_add_epi16(partial_sums_lo, centers8_lo), 4); + const __m128i sums_hi = RightShiftWithRounding_U16( + _mm_add_epi16(partial_sums_hi, centers8_hi), 4); + + const __m128i result_lo = _mm_packus_epi16(sums_lo, sums_lo); + const __m128i result_hi = _mm_packus_epi16(sums_hi, sums_hi); + const __m128i result = + _mm_alignr_epi8(result_hi, _mm_slli_si128(result_lo, 10), 10); + StoreUnaligned16(dest, result); +} + +// This function applies the kernel [0, 5, 6, 5, 0] to 12 values. +// Assumes |edge| has 8 packed byte values, and that the 2 invalid values will +// be overwritten or safely discarded. +inline void ComputeKernel2Store12(uint8_t* LIBGAV1_RESTRICT dest, + const uint8_t* LIBGAV1_RESTRICT source) { + const __m128i edge_lo = LoadUnaligned16(source); + const __m128i edge_hi = _mm_srli_si128(edge_lo, 6); + const __m128i outers_lo = _mm_cvtepu8_epi16(edge_lo); + const __m128i centers_lo = _mm_srli_si128(outers_lo, 2); + const __m128i outers_hi = _mm_cvtepu8_epi16(edge_hi); + const __m128i centers_hi = _mm_srli_si128(outers_hi, 2); + // Samples matched with the '5' tap, expanded to 16-bit. Add x + 4x. + const __m128i outers5_lo = + _mm_add_epi16(outers_lo, _mm_slli_epi16(outers_lo, 2)); + const __m128i outers5_hi = + _mm_add_epi16(outers_hi, _mm_slli_epi16(outers_hi, 2)); + // Samples matched with the '6' tap, expanded to 16-bit. Add 2x + 4x. + const __m128i centers6_lo = _mm_add_epi16(_mm_slli_epi16(centers_lo, 1), + _mm_slli_epi16(centers_lo, 2)); + const __m128i centers6_hi = _mm_add_epi16(_mm_slli_epi16(centers_hi, 1), + _mm_slli_epi16(centers_hi, 2)); + // Move latter 5x values down to add with first 5x values for each output. + const __m128i partial_sums_lo = + _mm_add_epi16(outers5_lo, _mm_srli_si128(outers5_lo, 4)); + // Move 6x values down to add for the final kernel sum for each output. + const __m128i sums_lo = RightShiftWithRounding_U16( + _mm_add_epi16(centers6_lo, partial_sums_lo), 4); + // Shift latter 5x values to add with first 5x values for each output. + const __m128i partial_sums_hi = + _mm_add_epi16(outers5_hi, _mm_srli_si128(outers5_hi, 4)); + // Move 6x values down to add for the final kernel sum for each output. + const __m128i sums_hi = RightShiftWithRounding_U16( + _mm_add_epi16(centers6_hi, partial_sums_hi), 4); + // First 6 values are valid outputs. + const __m128i result_lo = _mm_packus_epi16(sums_lo, sums_lo); + const __m128i result_hi = _mm_packus_epi16(sums_hi, sums_hi); + const __m128i result = + _mm_alignr_epi8(result_hi, _mm_slli_si128(result_lo, 10), 10); + StoreUnaligned16(dest, result); +} + +// This function applies the kernel [2, 4, 4, 4, 2] to 8 values. +inline void ComputeKernel3Store8(uint8_t* LIBGAV1_RESTRICT dest, + const uint8_t* LIBGAV1_RESTRICT source) { + const __m128i edge_lo = LoadUnaligned16(source); + const __m128i edge_hi = _mm_srli_si128(edge_lo, 4); + // Finish |edge_lo| life cycle quickly. + // Multiply for 2x. + const __m128i source2_lo = _mm_slli_epi16(_mm_cvtepu8_epi16(edge_lo), 1); + // Multiply 2x by 2 and align. + const __m128i source4_lo = _mm_srli_si128(_mm_slli_epi16(source2_lo, 1), 2); + // Finish |source2| life cycle quickly. + // Move latter 2x values down to add with first 2x values for each output. + __m128i sum = _mm_add_epi16(source2_lo, _mm_srli_si128(source2_lo, 8)); + // First 4x values already aligned to add with running total. + sum = _mm_add_epi16(sum, source4_lo); + // Move second 4x values down to add with running total. 
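+  // (Together the five terms build 2*e[i] + 4*e[i+1] + 4*e[i+2] + 4*e[i+3] +
+  // 2*e[i+4] per lane; the taps again sum to 16, so a flat input is
+  // preserved by the rounded shift below.)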
+ sum = _mm_add_epi16(sum, _mm_srli_si128(source4_lo, 2)); + // Move third 4x values down to add with running total. + sum = _mm_add_epi16(sum, _mm_srli_si128(source4_lo, 4)); + // Multiply for 2x. + const __m128i source2_hi = _mm_slli_epi16(_mm_cvtepu8_epi16(edge_hi), 1); + // Multiply 2x by 2 and align. + const __m128i source4_hi = _mm_srli_si128(_mm_slli_epi16(source2_hi, 1), 2); + // Move latter 2x values down to add with first 2x values for each output. + __m128i sum_hi = _mm_add_epi16(source2_hi, _mm_srli_si128(source2_hi, 8)); + // First 4x values already aligned to add with running total. + sum_hi = _mm_add_epi16(sum_hi, source4_hi); + // Move second 4x values down to add with running total. + sum_hi = _mm_add_epi16(sum_hi, _mm_srli_si128(source4_hi, 2)); + // Move third 4x values down to add with running total. + sum_hi = _mm_add_epi16(sum_hi, _mm_srli_si128(source4_hi, 4)); + + // Because we have only 8 values here, it is safe to align before packing down + // to 8-bit without losing data. + sum = _mm_alignr_epi8(sum_hi, _mm_slli_si128(sum, 8), 8); + sum = RightShiftWithRounding_U16(sum, 4); + StoreLo8(dest, _mm_packus_epi16(sum, sum)); +} + +void IntraEdgeFilter_SSE4_1(void* buffer, int size, int strength) { + uint8_t edge[kMaxEdgeBufferSize + 4]; + memcpy(edge, buffer, size); + auto* dst_buffer = static_cast(buffer); + + // Only process |size| - 1 elements. Nothing to do in this case. + if (size == 1) return; + + int i = 0; + switch (strength) { + case 1: + // To avoid overwriting, we stop short from the total write size plus the + // initial offset. In this case 12 valid values are written in two blocks + // of 8 bytes each. + for (; i < size - 17; i += 12) { + ComputeKernel1Store12(dst_buffer + i + 1, edge + i); + } + break; + case 2: + // See the comment for case 1. + for (; i < size - 17; i += 12) { + ComputeKernel2Store12(dst_buffer + i + 1, edge + i); + } + break; + default: + assert(strength == 3); + // The first filter input is repeated for taps of value 2 and 4. + dst_buffer[1] = RightShiftWithRounding( + (6 * edge[0] + 4 * edge[1] + 4 * edge[2] + 2 * edge[3]), 4); + // In this case, one block of 8 bytes is written in each iteration, with + // an offset of 2. + for (; i < size - 10; i += 8) { + ComputeKernel3Store8(dst_buffer + i + 2, edge + i); + } + } + const int kernel_index = strength - 1; + for (int final_index = Clip3(i, 1, size - 2); final_index < size; + ++final_index) { + int sum = 0; + for (int j = 0; j < kKernelTaps; ++j) { + const int k = Clip3(final_index + j - 2, 0, size - 1); + sum += kKernels[kernel_index][j] * edge[k]; + } + dst_buffer[final_index] = RightShiftWithRounding(sum, 4); + } +} + +constexpr int kMaxUpsampleSize = 16; + +// Applies the upsampling kernel [-1, 9, 9, -1] to alternating pixels, and +// interleaves the results with the original values. This implementation assumes +// that it is safe to write the maximum number of upsampled pixels (32) to the +// edge buffer, even when |size| is small. 
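+// Worked example of one interpolated output (illustrative values): with
+// pixels {10, 10, 20, 30} feeding one tap position, the upsampled sample is
+// (-1*10 + 9*10 + 9*20 - 1*30 + 8) >> 4 = 238 >> 4 = 14, saturated to
+// [0, 255] by the final packus, then interleaved with the original samples.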
+void IntraEdgeUpsampler_SSE4_1(void* buffer, int size) { + assert(size % 4 == 0 && size <= kMaxUpsampleSize); + auto* const pixel_buffer = static_cast(buffer); + uint8_t temp[kMaxUpsampleSize + 8]; + temp[0] = temp[1] = pixel_buffer[-1]; + memcpy(temp + 2, pixel_buffer, sizeof(temp[0]) * size); + temp[size + 2] = pixel_buffer[size - 1]; + + pixel_buffer[-2] = temp[0]; + const __m128i data = LoadUnaligned16(temp); + const __m128i src_lo = _mm_cvtepu8_epi16(data); + const __m128i src_hi = _mm_unpackhi_epi8(data, _mm_setzero_si128()); + const __m128i src9_hi = _mm_add_epi16(src_hi, _mm_slli_epi16(src_hi, 3)); + const __m128i src9_lo = _mm_add_epi16(src_lo, _mm_slli_epi16(src_lo, 3)); + __m128i sum_lo = _mm_sub_epi16(_mm_alignr_epi8(src9_hi, src9_lo, 2), src_lo); + sum_lo = _mm_add_epi16(sum_lo, _mm_alignr_epi8(src9_hi, src9_lo, 4)); + sum_lo = _mm_sub_epi16(sum_lo, _mm_alignr_epi8(src_hi, src_lo, 6)); + sum_lo = RightShiftWithRounding_S16(sum_lo, 4); + const __m128i result_lo = _mm_unpacklo_epi8(_mm_packus_epi16(sum_lo, sum_lo), + _mm_srli_si128(data, 2)); + StoreUnaligned16(pixel_buffer - 1, result_lo); + if (size > 8) { + const __m128i src_hi_extra = _mm_cvtepu8_epi16(LoadLo8(temp + 16)); + const __m128i src9_hi_extra = + _mm_add_epi16(src_hi_extra, _mm_slli_epi16(src_hi_extra, 3)); + __m128i sum_hi = + _mm_sub_epi16(_mm_alignr_epi8(src9_hi_extra, src9_hi, 2), src_hi); + sum_hi = _mm_add_epi16(sum_hi, _mm_alignr_epi8(src9_hi_extra, src9_hi, 4)); + sum_hi = _mm_sub_epi16(sum_hi, _mm_alignr_epi8(src_hi_extra, src_hi, 6)); + sum_hi = RightShiftWithRounding_S16(sum_hi, 4); + const __m128i result_hi = + _mm_unpacklo_epi8(_mm_packus_epi16(sum_hi, sum_hi), LoadLo8(temp + 10)); + StoreUnaligned16(pixel_buffer + 15, result_hi); + } +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); +#if DSP_ENABLED_8BPP_SSE4_1(IntraEdgeFilter) + dsp->intra_edge_filter = IntraEdgeFilter_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(IntraEdgeUpsampler) + dsp->intra_edge_upsampler = IntraEdgeUpsampler_SSE4_1; +#endif +} + +} // namespace + +void IntraEdgeInit_SSE4_1() { Init8bpp(); } + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_TARGETING_SSE4_1 +namespace libgav1 { +namespace dsp { + +void IntraEdgeInit_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_SSE4_1 diff --git a/src/dsp/x86/intra_edge_sse4.h b/src/dsp/x86/intra_edge_sse4.h new file mode 100644 index 0000000..6ed4d40 --- /dev/null +++ b/src/dsp/x86/intra_edge_sse4.h @@ -0,0 +1,46 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_INTRA_EDGE_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_INTRA_EDGE_SSE4_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::intra_edge_filter and Dsp::intra_edge_upsampler. This +// function is not thread-safe. 
+void IntraEdgeInit_SSE4_1(); + +} // namespace dsp +} // namespace libgav1 + +// If sse4 is enabled and the baseline isn't set due to a higher level of +// optimization being enabled, signal the sse4 implementation should be used. +#if LIBGAV1_TARGETING_SSE4_1 +#ifndef LIBGAV1_Dsp8bpp_IntraEdgeFilter +#define LIBGAV1_Dsp8bpp_IntraEdgeFilter LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_IntraEdgeUpsampler +#define LIBGAV1_Dsp8bpp_IntraEdgeUpsampler LIBGAV1_CPU_SSE4_1 +#endif + +#endif // LIBGAV1_TARGETING_SSE4_1 + +#endif // LIBGAV1_SRC_DSP_X86_INTRA_EDGE_SSE4_H_ diff --git a/src/dsp/x86/intrapred_cfl_sse4.cc b/src/dsp/x86/intrapred_cfl_sse4.cc new file mode 100644 index 0000000..eb7e466 --- /dev/null +++ b/src/dsp/x86/intrapred_cfl_sse4.cc @@ -0,0 +1,1844 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/intrapred_cfl.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_TARGETING_SSE4_1 + +#include + +#include +#include +#include +#include + +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/dsp/x86/common_sse4.h" +#include "src/utils/common.h" +#include "src/utils/compiler_attributes.h" +#include "src/utils/constants.h" + +namespace libgav1 { +namespace dsp { +namespace { + +// This duplicates the last two 16-bit values in |row|. +inline __m128i LastRowSamples(const __m128i row) { + return _mm_shuffle_epi32(row, 0xFF); +} + +// This duplicates the last 16-bit value in |row|. +inline __m128i LastRowResult(const __m128i row) { + const __m128i dup_row = _mm_shufflehi_epi16(row, 0xFF); + return _mm_shuffle_epi32(dup_row, 0xFF); +} + +// Takes in two sums of input row pairs, and completes the computation for two +// output rows. +inline __m128i StoreLumaResults4_420(const __m128i vertical_sum0, + const __m128i vertical_sum1, + int16_t* luma_ptr) { + __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1); + result = _mm_slli_epi16(result, 1); + StoreLo8(luma_ptr, result); + StoreHi8(luma_ptr + kCflLumaBufferStride, result); + return result; +} + +// Takes two halves of a vertically added pair of rows and completes the +// computation for one output row. 
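+// The horizontal add produces (a + b + c + d) for each 2x2 luma block, and
+// the shift left by 1 turns that into 8 * average, matching the << 3 scaling
+// the 444 subsamplers apply to individual samples.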
+inline __m128i StoreLumaResults8_420(const __m128i vertical_sum0, + const __m128i vertical_sum1, + int16_t* luma_ptr) { + __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1); + result = _mm_slli_epi16(result, 1); + StoreUnaligned16(luma_ptr, result); + return result; +} + +} // namespace + +namespace low_bitdepth { +namespace { + +//------------------------------------------------------------------------------ +// CflIntraPredictor_SSE4_1 + +inline __m128i CflPredictUnclipped(const __m128i* input, __m128i alpha_q12, + __m128i alpha_sign, __m128i dc_q0) { + const __m128i ac_q3 = LoadUnaligned16(input); + const __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3); + __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12); + scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign); + return _mm_add_epi16(scaled_luma_q0, dc_q0); +} + +template +void CflIntraPredictor_SSE4_1( + void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int alpha) { + auto* dst = static_cast(dest); + const __m128i alpha_sign = _mm_set1_epi16(alpha); + const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9); + auto* row = reinterpret_cast(luma); + const int kCflLumaBufferStrideLog2_16i = 5; + const int kCflLumaBufferStrideLog2_128i = kCflLumaBufferStrideLog2_16i - 3; + const __m128i* row_end = row + (height << kCflLumaBufferStrideLog2_128i); + const __m128i dc_val = _mm_set1_epi16(dst[0]); + do { + __m128i res = CflPredictUnclipped(row, alpha_q12, alpha_sign, dc_val); + if (width < 16) { + res = _mm_packus_epi16(res, res); + if (width == 4) { + Store4(dst, res); + } else { + StoreLo8(dst, res); + } + } else { + __m128i next = + CflPredictUnclipped(row + 1, alpha_q12, alpha_sign, dc_val); + res = _mm_packus_epi16(res, next); + StoreUnaligned16(dst, res); + if (width == 32) { + res = CflPredictUnclipped(row + 2, alpha_q12, alpha_sign, dc_val); + next = CflPredictUnclipped(row + 3, alpha_q12, alpha_sign, dc_val); + res = _mm_packus_epi16(res, next); + StoreUnaligned16(dst + 16, res); + } + } + dst += stride; + } while ((row += (1 << kCflLumaBufferStrideLog2_128i)) < row_end); +} + +template +void CflSubsampler444_4xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_height, const void* LIBGAV1_RESTRICT const source, + ptrdiff_t stride) { + static_assert(block_height_log2 <= 4, ""); + const int block_height = 1 << block_height_log2; + const int visible_height = max_luma_height; + const auto* src = static_cast(source); + __m128i sum = _mm_setzero_si128(); + int16_t* luma_ptr = luma[0]; + const __m128i zero = _mm_setzero_si128(); + __m128i samples; + int y = 0; + do { + samples = Load4(src); + src += stride; + int src_bytes; + memcpy(&src_bytes, src, 4); + samples = _mm_insert_epi32(samples, src_bytes, 1); + src += stride; + samples = _mm_slli_epi16(_mm_cvtepu8_epi16(samples), 3); + StoreLo8(luma_ptr, samples); + luma_ptr += kCflLumaBufferStride; + StoreHi8(luma_ptr, samples); + luma_ptr += kCflLumaBufferStride; + + // The maximum value here is 2**bd * H * 2**shift. Since the maximum H for + // 4XH is 16 = 2**4, we have 2**(8 + 4 + 3) = 2**15, which fits in 16 bits. + sum = _mm_add_epi16(sum, samples); + y += 2; + } while (y < visible_height); + + if (!is_inside) { + // Replicate the 2 high lanes. 
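+    // (0xee is 0b11'10'11'10, selecting 32-bit lanes {2, 3, 2, 3}: the two
+    // most recently stored rows, which pad the remaining rows below.)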
+ samples = _mm_shuffle_epi32(samples, 0xee); + do { + StoreLo8(luma_ptr, samples); + luma_ptr += kCflLumaBufferStride; + StoreHi8(luma_ptr, samples); + luma_ptr += kCflLumaBufferStride; + sum = _mm_add_epi16(sum, samples); + y += 2; + } while (y < block_height); + } + + __m128i sum_tmp = _mm_unpackhi_epi16(sum, zero); + sum = _mm_cvtepu16_epi32(sum); + sum = _mm_add_epi32(sum, sum_tmp); + sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8)); + sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4)); + + __m128i averages = RightShiftWithRounding_U32( + sum, block_height_log2 + 2 /* log2 of width 4 */); + averages = _mm_shufflelo_epi16(averages, 0); + luma_ptr = luma[0]; + for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) { + const __m128i samples = LoadLo8(luma_ptr); + StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages)); + } +} + +template +void CflSubsampler444_4xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { + static_assert(block_height_log2 <= 4, ""); + assert(max_luma_width >= 4); + assert(max_luma_height >= 4); + static_cast(max_luma_width); + constexpr int block_height = 1 << block_height_log2; + + if (block_height <= max_luma_height) { + CflSubsampler444_4xH_SSE4_1(luma, max_luma_height, + source, stride); + } else { + CflSubsampler444_4xH_SSE4_1(luma, max_luma_height, + source, stride); + } +} + +template +void CflSubsampler444_8xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { + static_assert(block_height_log2 <= 5, ""); + const int block_height = 1 << block_height_log2, block_width = 8; + const int visible_height = max_luma_height; + const int invisible_width = inside ? 0 : block_width - max_luma_width; + const int visible_width = max_luma_width; + const __m128i blend_mask = + inside ? _mm_setzero_si128() : MaskHighNBytes(8 + invisible_width); + const __m128i dup16 = _mm_set1_epi32(0x01000100); + const auto* src = static_cast(source); + int16_t* luma_ptr = luma[0]; + const __m128i zero = _mm_setzero_si128(); + // Since the maximum height is 32, if we split them by parity, each one only + // needs to accumulate 16 rows. Just like the calculation done in 4XH, we can + // store them in 16 bits without casting to 32 bits. 
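+  // Concretely: each sample is at most 255 << 3 = 2040 after the upshift,
+  // and 16 accumulated rows give 16 * 2040 = 32640 < 32767, so a signed
+  // 16-bit lane cannot overflow.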
+ __m128i sum_even = _mm_setzero_si128(), sum_odd = _mm_setzero_si128(); + __m128i sum; + __m128i samples1; + + int y = 0; + do { + __m128i samples0 = LoadLo8(src); + if (!inside) { + const __m128i border0 = + _mm_set1_epi8(static_cast(src[visible_width - 1])); + samples0 = _mm_blendv_epi8(samples0, border0, blend_mask); + } + src += stride; + samples0 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples0), 3); + StoreUnaligned16(luma_ptr, samples0); + luma_ptr += kCflLumaBufferStride; + + sum_even = _mm_add_epi16(sum_even, samples0); + + samples1 = LoadLo8(src); + if (!inside) { + const __m128i border1 = + _mm_set1_epi8(static_cast(src[visible_width - 1])); + samples1 = _mm_blendv_epi8(samples1, border1, blend_mask); + } + src += stride; + samples1 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples1), 3); + StoreUnaligned16(luma_ptr, samples1); + luma_ptr += kCflLumaBufferStride; + + sum_odd = _mm_add_epi16(sum_odd, samples1); + y += 2; + } while (y < visible_height); + + if (!inside) { + for (int y = visible_height; y < block_height; y += 2) { + sum_even = _mm_add_epi16(sum_even, samples1); + StoreUnaligned16(luma_ptr, samples1); + luma_ptr += kCflLumaBufferStride; + + sum_odd = _mm_add_epi16(sum_odd, samples1); + StoreUnaligned16(luma_ptr, samples1); + luma_ptr += kCflLumaBufferStride; + } + } + + sum = _mm_add_epi32(_mm_unpackhi_epi16(sum_even, zero), + _mm_cvtepu16_epi32(sum_even)); + sum = _mm_add_epi32(sum, _mm_unpackhi_epi16(sum_odd, zero)); + sum = _mm_add_epi32(sum, _mm_cvtepu16_epi32(sum_odd)); + + sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8)); + sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4)); + + __m128i averages = RightShiftWithRounding_U32( + sum, block_height_log2 + 3 /* log2 of width 8 */); + averages = _mm_shuffle_epi8(averages, dup16); + luma_ptr = luma[0]; + for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) { + const __m128i samples = LoadUnaligned16(luma_ptr); + StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages)); + } +} + +template +void CflSubsampler444_8xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { + static_assert(block_height_log2 <= 5, ""); + assert(max_luma_width >= 4); + assert(max_luma_height >= 4); + const int block_height = 1 << block_height_log2; + const int block_width = 8; + + const int horz_inside = block_width <= max_luma_width; + const int vert_inside = block_height <= max_luma_height; + if (horz_inside && vert_inside) { + CflSubsampler444_8xH_SSE4_1( + luma, max_luma_width, max_luma_height, source, stride); + } else { + CflSubsampler444_8xH_SSE4_1( + luma, max_luma_width, max_luma_height, source, stride); + } +} + +// This function will only work for block_width 16 and 32. +template +void CflSubsampler444_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { + static_assert(block_width_log2 == 4 || block_width_log2 == 5, ""); + static_assert(block_height_log2 <= 5, ""); + assert(max_luma_width >= 4); + assert(max_luma_height >= 4); + const int block_height = 1 << block_height_log2; + const int block_width = 1 << block_width_log2; + + const int visible_height = max_luma_height; + const int visible_width_16 = inside ? 
16 : std::min(16, max_luma_width); + const int invisible_width_16 = 16 - visible_width_16; + const __m128i blend_mask_16 = MaskHighNBytes(invisible_width_16); + const int visible_width_32 = inside ? 32 : max_luma_width; + const int invisible_width_32 = 32 - visible_width_32; + const __m128i blend_mask_32 = + MaskHighNBytes(std::min(16, invisible_width_32)); + + const __m128i dup16 = _mm_set1_epi32(0x01000100); + const __m128i zero = _mm_setzero_si128(); + const auto* src = static_cast(source); + int16_t* luma_ptr = luma[0]; + __m128i sum = _mm_setzero_si128(); + + __m128i samples0, samples1; + __m128i samples2, samples3; + __m128i inner_sum_lo, inner_sum_hi; + int y = 0; + do { + // We can load uninitialized values here. Even though they are then masked + // off by blendv, MSAN doesn't model that behavior. + __m128i samples01 = LoadUnaligned16Msan(src, invisible_width_16); + + if (!inside) { + const __m128i border16 = + _mm_set1_epi8(static_cast(src[visible_width_16 - 1])); + samples01 = _mm_blendv_epi8(samples01, border16, blend_mask_16); + } + samples0 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples01), 3); + samples1 = _mm_slli_epi16(_mm_unpackhi_epi8(samples01, zero), 3); + + StoreUnaligned16(luma_ptr, samples0); + StoreUnaligned16(luma_ptr + 8, samples1); + __m128i inner_sum = _mm_add_epi16(samples0, samples1); + + if (block_width == 32) { + // We can load uninitialized values here. Even though they are then masked + // off by blendv, MSAN doesn't model that behavior. + __m128i samples23 = LoadUnaligned16Msan(src + 16, invisible_width_32); + if (!inside) { + const __m128i border32 = + _mm_set1_epi8(static_cast(src[visible_width_32 - 1])); + samples23 = _mm_blendv_epi8(samples23, border32, blend_mask_32); + } + samples2 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples23), 3); + samples3 = _mm_slli_epi16(_mm_unpackhi_epi8(samples23, zero), 3); + + StoreUnaligned16(luma_ptr + 16, samples2); + StoreUnaligned16(luma_ptr + 24, samples3); + inner_sum = _mm_add_epi16(samples2, inner_sum); + inner_sum = _mm_add_epi16(samples3, inner_sum); + } + + inner_sum_lo = _mm_cvtepu16_epi32(inner_sum); + inner_sum_hi = _mm_unpackhi_epi16(inner_sum, zero); + sum = _mm_add_epi32(sum, inner_sum_lo); + sum = _mm_add_epi32(sum, inner_sum_hi); + luma_ptr += kCflLumaBufferStride; + src += stride; + } while (++y < visible_height); + + if (!inside) { + for (int y = visible_height; y < block_height; + luma_ptr += kCflLumaBufferStride, ++y) { + sum = _mm_add_epi32(sum, inner_sum_lo); + StoreUnaligned16(luma_ptr, samples0); + sum = _mm_add_epi32(sum, inner_sum_hi); + StoreUnaligned16(luma_ptr + 8, samples1); + if (block_width == 32) { + StoreUnaligned16(luma_ptr + 16, samples2); + StoreUnaligned16(luma_ptr + 24, samples3); + } + } + } + + sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8)); + sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4)); + + __m128i averages = + RightShiftWithRounding_U32(sum, block_width_log2 + block_height_log2); + averages = _mm_shuffle_epi8(averages, dup16); + luma_ptr = luma[0]; + for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) { + for (int x = 0; x < block_width; x += 8) { + __m128i samples = LoadUnaligned16(&luma_ptr[x]); + StoreUnaligned16(&luma_ptr[x], _mm_sub_epi16(samples, averages)); + } + } +} + +template +void CflSubsampler444_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { + static_assert(block_width_log2 == 4 || block_width_log2 == 
5, ""); + static_assert(block_height_log2 <= 5, ""); + assert(max_luma_width >= 4); + assert(max_luma_height >= 4); + + const int block_height = 1 << block_height_log2; + const int block_width = 1 << block_width_log2; + const int horz_inside = block_width <= max_luma_width; + const int vert_inside = block_height <= max_luma_height; + if (horz_inside && vert_inside) { + CflSubsampler444_SSE4_1( + luma, max_luma_width, max_luma_height, source, stride); + } else { + CflSubsampler444_SSE4_1( + luma, max_luma_width, max_luma_height, source, stride); + } +} + +template +void CflSubsampler420_4xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int /*max_luma_width*/, const int max_luma_height, + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { + const int block_height = 1 << block_height_log2; + const auto* src = static_cast(source); + int16_t* luma_ptr = luma[0]; + const __m128i zero = _mm_setzero_si128(); + __m128i final_sum = zero; + const int luma_height = std::min(block_height, max_luma_height >> 1); + int y = 0; + do { + // Note that double sampling and converting to 16bit makes a row fill the + // vector. + const __m128i samples_row0 = _mm_cvtepu8_epi16(LoadLo8(src)); + src += stride; + const __m128i samples_row1 = _mm_cvtepu8_epi16(LoadLo8(src)); + src += stride; + const __m128i luma_sum01 = _mm_add_epi16(samples_row0, samples_row1); + + const __m128i samples_row2 = _mm_cvtepu8_epi16(LoadLo8(src)); + src += stride; + const __m128i samples_row3 = _mm_cvtepu8_epi16(LoadLo8(src)); + src += stride; + const __m128i luma_sum23 = _mm_add_epi16(samples_row2, samples_row3); + __m128i sum = StoreLumaResults4_420(luma_sum01, luma_sum23, luma_ptr); + luma_ptr += kCflLumaBufferStride << 1; + + const __m128i samples_row4 = _mm_cvtepu8_epi16(LoadLo8(src)); + src += stride; + const __m128i samples_row5 = _mm_cvtepu8_epi16(LoadLo8(src)); + src += stride; + const __m128i luma_sum45 = _mm_add_epi16(samples_row4, samples_row5); + + const __m128i samples_row6 = _mm_cvtepu8_epi16(LoadLo8(src)); + src += stride; + const __m128i samples_row7 = _mm_cvtepu8_epi16(LoadLo8(src)); + src += stride; + const __m128i luma_sum67 = _mm_add_epi16(samples_row6, samples_row7); + sum = _mm_add_epi16( + sum, StoreLumaResults4_420(luma_sum45, luma_sum67, luma_ptr)); + luma_ptr += kCflLumaBufferStride << 1; + + final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum)); + final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero)); + y += 4; + } while (y < luma_height); + const __m128i final_fill = LoadLo8(luma_ptr - kCflLumaBufferStride); + const __m128i final_fill_to_sum = _mm_cvtepu16_epi32(final_fill); + for (; y < block_height; ++y) { + StoreLo8(luma_ptr, final_fill); + luma_ptr += kCflLumaBufferStride; + + final_sum = _mm_add_epi32(final_sum, final_fill_to_sum); + } + final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8)); + final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4)); + + __m128i averages = RightShiftWithRounding_U32( + final_sum, block_height_log2 + 2 /*log2 of width 4*/); + + averages = _mm_shufflelo_epi16(averages, 0); + luma_ptr = luma[0]; + for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) { + const __m128i samples = LoadLo8(luma_ptr); + StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages)); + } +} + +template +inline void CflSubsampler420Impl_8xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int /*max_luma_width*/, const int max_luma_height, + const void* LIBGAV1_RESTRICT const 
source, ptrdiff_t stride) { + const int block_height = 1 << block_height_log2; + const auto* src = static_cast(source); + const __m128i zero = _mm_setzero_si128(); + __m128i final_sum = zero; + int16_t* luma_ptr = luma[0]; + const int luma_height = std::min(block_height, max_luma_height >> 1); + int y = 0; + + do { + const __m128i samples_row00 = _mm_cvtepu8_epi16(LoadLo8(src)); + const __m128i samples_row01 = (max_luma_width == 16) + ? _mm_cvtepu8_epi16(LoadLo8(src + 8)) + : LastRowSamples(samples_row00); + src += stride; + const __m128i samples_row10 = _mm_cvtepu8_epi16(LoadLo8(src)); + const __m128i samples_row11 = (max_luma_width == 16) + ? _mm_cvtepu8_epi16(LoadLo8(src + 8)) + : LastRowSamples(samples_row10); + src += stride; + const __m128i luma_sum00 = _mm_add_epi16(samples_row00, samples_row10); + const __m128i luma_sum01 = _mm_add_epi16(samples_row01, samples_row11); + __m128i sum = StoreLumaResults8_420(luma_sum00, luma_sum01, luma_ptr); + luma_ptr += kCflLumaBufferStride; + + const __m128i samples_row20 = _mm_cvtepu8_epi16(LoadLo8(src)); + const __m128i samples_row21 = (max_luma_width == 16) + ? _mm_cvtepu8_epi16(LoadLo8(src + 8)) + : LastRowSamples(samples_row20); + src += stride; + const __m128i samples_row30 = _mm_cvtepu8_epi16(LoadLo8(src)); + const __m128i samples_row31 = (max_luma_width == 16) + ? _mm_cvtepu8_epi16(LoadLo8(src + 8)) + : LastRowSamples(samples_row30); + src += stride; + const __m128i luma_sum10 = _mm_add_epi16(samples_row20, samples_row30); + const __m128i luma_sum11 = _mm_add_epi16(samples_row21, samples_row31); + sum = _mm_add_epi16( + sum, StoreLumaResults8_420(luma_sum10, luma_sum11, luma_ptr)); + luma_ptr += kCflLumaBufferStride; + + const __m128i samples_row40 = _mm_cvtepu8_epi16(LoadLo8(src)); + const __m128i samples_row41 = (max_luma_width == 16) + ? _mm_cvtepu8_epi16(LoadLo8(src + 8)) + : LastRowSamples(samples_row40); + src += stride; + const __m128i samples_row50 = _mm_cvtepu8_epi16(LoadLo8(src)); + const __m128i samples_row51 = (max_luma_width == 16) + ? _mm_cvtepu8_epi16(LoadLo8(src + 8)) + : LastRowSamples(samples_row50); + src += stride; + const __m128i luma_sum20 = _mm_add_epi16(samples_row40, samples_row50); + const __m128i luma_sum21 = _mm_add_epi16(samples_row41, samples_row51); + sum = _mm_add_epi16( + sum, StoreLumaResults8_420(luma_sum20, luma_sum21, luma_ptr)); + luma_ptr += kCflLumaBufferStride; + + const __m128i samples_row60 = _mm_cvtepu8_epi16(LoadLo8(src)); + const __m128i samples_row61 = (max_luma_width == 16) + ? _mm_cvtepu8_epi16(LoadLo8(src + 8)) + : LastRowSamples(samples_row60); + src += stride; + const __m128i samples_row70 = _mm_cvtepu8_epi16(LoadLo8(src)); + const __m128i samples_row71 = (max_luma_width == 16) + ? _mm_cvtepu8_epi16(LoadLo8(src + 8)) + : LastRowSamples(samples_row70); + src += stride; + const __m128i luma_sum30 = _mm_add_epi16(samples_row60, samples_row70); + const __m128i luma_sum31 = _mm_add_epi16(samples_row61, samples_row71); + sum = _mm_add_epi16( + sum, StoreLumaResults8_420(luma_sum30, luma_sum31, luma_ptr)); + luma_ptr += kCflLumaBufferStride; + + final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum)); + final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero)); + y += 4; + } while (y < luma_height); + // Duplicate the final row downward to the end after max_luma_height. 
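+  // The padded rows below max_luma_height repeat the last computed row, so
+  // their contribution to the running total is a constant; it is computed
+  // once (final_fill_to_sum) and added per padded row instead of re-summing.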
+ const __m128i final_fill = LoadUnaligned16(luma_ptr - kCflLumaBufferStride); + const __m128i final_fill_to_sum0 = _mm_cvtepi16_epi32(final_fill); + const __m128i final_fill_to_sum1 = + _mm_cvtepi16_epi32(_mm_srli_si128(final_fill, 8)); + const __m128i final_fill_to_sum = + _mm_add_epi32(final_fill_to_sum0, final_fill_to_sum1); + for (; y < block_height; ++y) { + StoreUnaligned16(luma_ptr, final_fill); + luma_ptr += kCflLumaBufferStride; + + final_sum = _mm_add_epi32(final_sum, final_fill_to_sum); + } + final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8)); + final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4)); + + __m128i averages = RightShiftWithRounding_S32( + final_sum, block_height_log2 + 3 /*log2 of width 8*/); + + averages = _mm_shufflelo_epi16(averages, 0); + averages = _mm_shuffle_epi32(averages, 0); + luma_ptr = luma[0]; + for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) { + const __m128i samples = LoadUnaligned16(luma_ptr); + StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages)); + } +} + +template <int block_height_log2> +void CflSubsampler420_8xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { + if (max_luma_width == 8) { + CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 8>( + luma, max_luma_width, max_luma_height, source, stride); + } else { + CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 16>( + luma, max_luma_width, max_luma_height, source, stride); + } +} + +template <int block_width_log2, int block_height_log2, int max_luma_width> +inline void CflSubsampler420Impl_WxH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int /*max_luma_width*/, const int max_luma_height, + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { + const auto* src = static_cast<const uint8_t*>(source); + const __m128i zero = _mm_setzero_si128(); + __m128i final_sum = zero; + const int block_height = 1 << block_height_log2; + const int luma_height = std::min(block_height, max_luma_height >> 1); + static_assert(max_luma_width <= 32, ""); + + int16_t* luma_ptr = luma[0]; + __m128i final_row_result; + // Begin first y section, covering width up to 32. + int y = 0; + do { + const uint8_t* src_next = src + stride; + const __m128i samples_row0_lo = LoadUnaligned16(src); + const __m128i samples_row00 = _mm_cvtepu8_epi16(samples_row0_lo); + const __m128i samples_row01 = (max_luma_width >= 16) + ? _mm_unpackhi_epi8(samples_row0_lo, zero) + : LastRowSamples(samples_row00); + const __m128i samples_row0_hi = LoadUnaligned16(src + 16); + const __m128i samples_row02 = (max_luma_width >= 24) + ? _mm_cvtepu8_epi16(samples_row0_hi) + : LastRowSamples(samples_row01); + const __m128i samples_row03 = (max_luma_width == 32) + ? _mm_unpackhi_epi8(samples_row0_hi, zero) + : LastRowSamples(samples_row02); + const __m128i samples_row1_lo = LoadUnaligned16(src_next); + const __m128i samples_row10 = _mm_cvtepu8_epi16(samples_row1_lo); + const __m128i samples_row11 = (max_luma_width >= 16) + ? _mm_unpackhi_epi8(samples_row1_lo, zero) + : LastRowSamples(samples_row10); + const __m128i samples_row1_hi = LoadUnaligned16(src_next + 16); + const __m128i samples_row12 = (max_luma_width >= 24) + ? _mm_cvtepu8_epi16(samples_row1_hi) + : LastRowSamples(samples_row11); + const __m128i samples_row13 = (max_luma_width == 32) + ? 
_mm_unpackhi_epi8(samples_row1_hi, zero) + : LastRowSamples(samples_row12); + const __m128i luma_sum0 = _mm_add_epi16(samples_row00, samples_row10); + const __m128i luma_sum1 = _mm_add_epi16(samples_row01, samples_row11); + const __m128i luma_sum2 = _mm_add_epi16(samples_row02, samples_row12); + const __m128i luma_sum3 = _mm_add_epi16(samples_row03, samples_row13); + __m128i sum = StoreLumaResults8_420(luma_sum0, luma_sum1, luma_ptr); + final_row_result = + StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8); + sum = _mm_add_epi16(sum, final_row_result); + if (block_width_log2 == 5) { + const __m128i wide_fill = LastRowResult(final_row_result); + sum = _mm_add_epi16(sum, wide_fill); + sum = _mm_add_epi16(sum, wide_fill); + } + final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum)); + final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero)); + src += stride << 1; + luma_ptr += kCflLumaBufferStride; + } while (++y < luma_height); + + // Begin second y section. + if (y < block_height) { + const __m128i final_fill0 = + LoadUnaligned16(luma_ptr - kCflLumaBufferStride); + const __m128i final_fill1 = + LoadUnaligned16(luma_ptr - kCflLumaBufferStride + 8); + __m128i wide_fill; + + if (block_width_log2 == 5) { + // There are 16 16-bit fill values per row, shifting by 2 accounts for + // the widening to 32-bit. + wide_fill = + _mm_slli_epi32(_mm_cvtepi16_epi32(LastRowResult(final_fill1)), 2); + } + + const __m128i final_inner_sum = _mm_add_epi16(final_fill0, final_fill1); + const __m128i final_inner_sum0 = _mm_cvtepu16_epi32(final_inner_sum); + const __m128i final_inner_sum1 = _mm_unpackhi_epi16(final_inner_sum, zero); + const __m128i final_fill_to_sum = + _mm_add_epi32(final_inner_sum0, final_inner_sum1); + + do { + StoreUnaligned16(luma_ptr, final_fill0); + StoreUnaligned16(luma_ptr + 8, final_fill1); + if (block_width_log2 == 5) { + final_sum = _mm_add_epi32(final_sum, wide_fill); + } + luma_ptr += kCflLumaBufferStride; + + final_sum = _mm_add_epi32(final_sum, final_fill_to_sum); + } while (++y < block_height); + } // End second y section. 
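+  // Every row is now accounted for; fold the four 32-bit lanes of final_sum
+  // into a single total before taking the rounded average.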
+ + final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8)); + final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4)); + + __m128i averages = RightShiftWithRounding_S32( + final_sum, block_width_log2 + block_height_log2); + averages = _mm_shufflelo_epi16(averages, 0); + averages = _mm_shuffle_epi32(averages, 0); + + luma_ptr = luma[0]; + for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) { + const __m128i samples0 = LoadUnaligned16(luma_ptr); + StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples0, averages)); + const __m128i samples1 = LoadUnaligned16(luma_ptr + 8); + final_row_result = _mm_sub_epi16(samples1, averages); + StoreUnaligned16(luma_ptr + 8, final_row_result); + if (block_width_log2 == 5) { + const __m128i wide_fill = LastRowResult(final_row_result); + StoreUnaligned16(luma_ptr + 16, wide_fill); + StoreUnaligned16(luma_ptr + 24, wide_fill); + } + } +} + +template <int block_width_log2, int block_height_log2> +void CflSubsampler420_WxH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { + switch (max_luma_width) { + case 8: + CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 8>( + luma, max_luma_width, max_luma_height, source, stride); + return; + case 16: + CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 16>( + luma, max_luma_width, max_luma_height, source, stride); + return; + case 24: + CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 24>( + luma, max_luma_width, max_luma_height, source, stride); + return; + default: + assert(max_luma_width == 32); + CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 32>( + luma, max_luma_width, max_luma_height, source, stride); + return; + } +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] = + CflSubsampler420_4xH_SSE4_1<2>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] = + CflSubsampler420_4xH_SSE4_1<3>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] = + CflSubsampler420_4xH_SSE4_1<4>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] = + CflSubsampler420_8xH_SSE4_1<2>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] = + CflSubsampler420_8xH_SSE4_1<3>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] = + CflSubsampler420_8xH_SSE4_1<4>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] = + CflSubsampler420_8xH_SSE4_1<5>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] = + CflSubsampler420_WxH_SSE4_1<4, 2>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] = + CflSubsampler420_WxH_SSE4_1<4, 3>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] = + CflSubsampler420_WxH_SSE4_1<4, 4>; +#endif +#if 
DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] = + CflSubsampler420_WxH_SSE4_1<4, 5>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] = + CflSubsampler420_WxH_SSE4_1<5, 3>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] = + CflSubsampler420_WxH_SSE4_1<5, 4>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] = + CflSubsampler420_WxH_SSE4_1<5, 5>; +#endif + +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] = + CflSubsampler444_4xH_SSE4_1<2>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] = + CflSubsampler444_4xH_SSE4_1<3>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] = + CflSubsampler444_4xH_SSE4_1<4>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] = + CflSubsampler444_8xH_SSE4_1<2>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] = + CflSubsampler444_8xH_SSE4_1<3>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] = + CflSubsampler444_8xH_SSE4_1<4>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] = + CflSubsampler444_8xH_SSE4_1<5>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] = + CflSubsampler444_SSE4_1<4, 2>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] = + CflSubsampler444_SSE4_1<4, 3>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] = + CflSubsampler444_SSE4_1<4, 4>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] = + CflSubsampler444_SSE4_1<4, 5>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] = + CflSubsampler444_SSE4_1<5, 3>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] = + CflSubsampler444_SSE4_1<5, 4>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] = + CflSubsampler444_SSE4_1<5, 5>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize4x4] = CflIntraPredictor_SSE4_1<4, 4>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize4x8] = CflIntraPredictor_SSE4_1<4, 8>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize4x16] = + 
CflIntraPredictor_SSE4_1<4, 16>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize8x4] = CflIntraPredictor_SSE4_1<8, 4>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize8x8] = CflIntraPredictor_SSE4_1<8, 8>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize8x16] = + CflIntraPredictor_SSE4_1<8, 16>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize8x32] = + CflIntraPredictor_SSE4_1<8, 32>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize16x4] = + CflIntraPredictor_SSE4_1<16, 4>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize16x8] = + CflIntraPredictor_SSE4_1<16, 8>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize16x16] = + CflIntraPredictor_SSE4_1<16, 16>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize16x32] = + CflIntraPredictor_SSE4_1<16, 32>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize32x8] = + CflIntraPredictor_SSE4_1<32, 8>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize32x16] = + CflIntraPredictor_SSE4_1<32, 16>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize32x32] = + CflIntraPredictor_SSE4_1<32, 32>; +#endif +} + +} // namespace +} // namespace low_bitdepth + +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +//------------------------------------------------------------------------------ +// CflIntraPredictor_10bpp_SSE4_1 + +inline __m128i CflPredictUnclipped(const __m128i* input, __m128i alpha_q12, + __m128i alpha_sign, __m128i dc_q0) { + const __m128i ac_q3 = LoadUnaligned16(input); + const __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3); + __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12); + scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign); + return _mm_add_epi16(scaled_luma_q0, dc_q0); +} + +inline __m128i ClipEpi16(__m128i x, __m128i min, __m128i max) { + return _mm_max_epi16(_mm_min_epi16(x, max), min); +} + +template +void CflIntraPredictor_10bpp_SSE4_1( + void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int alpha) { + constexpr int kCflLumaBufferStrideLog2_16i = 5; + constexpr int kCflLumaBufferStrideLog2_128i = + kCflLumaBufferStrideLog2_16i - 3; + constexpr int kRowIncr = 1 << kCflLumaBufferStrideLog2_128i; + auto* dst = static_cast(dest); + const __m128i alpha_sign = _mm_set1_epi16(alpha); + const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9); + auto* row = reinterpret_cast(luma); + const __m128i* row_end = row + (height << kCflLumaBufferStrideLog2_128i); + const __m128i dc_val = _mm_set1_epi16(dst[0]); + const __m128i min = _mm_setzero_si128(); + const __m128i max = _mm_set1_epi16((1 << kBitdepth10) - 1); + + stride >>= 1; + + do { + __m128i res = CflPredictUnclipped(row, alpha_q12, alpha_sign, dc_val); + res = ClipEpi16(res, min, max); + if (width == 4) { + 
StoreLo8(dst, res); + } else if (width == 8) { + StoreUnaligned16(dst, res); + } else if (width == 16) { + StoreUnaligned16(dst, res); + const __m128i res_1 = + CflPredictUnclipped(row + 1, alpha_q12, alpha_sign, dc_val); + StoreUnaligned16(dst + 8, ClipEpi16(res_1, min, max)); + } else { + StoreUnaligned16(dst, res); + const __m128i res_1 = + CflPredictUnclipped(row + 1, alpha_q12, alpha_sign, dc_val); + StoreUnaligned16(dst + 8, ClipEpi16(res_1, min, max)); + const __m128i res_2 = + CflPredictUnclipped(row + 2, alpha_q12, alpha_sign, dc_val); + StoreUnaligned16(dst + 16, ClipEpi16(res_2, min, max)); + const __m128i res_3 = + CflPredictUnclipped(row + 3, alpha_q12, alpha_sign, dc_val); + StoreUnaligned16(dst + 24, ClipEpi16(res_3, min, max)); + } + + dst += stride; + } while ((row += kRowIncr) < row_end); +} + +template +void CflSubsampler444_4xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_height, const void* LIBGAV1_RESTRICT const source, + ptrdiff_t stride) { + static_assert(block_height_log2 <= 4, ""); + const int block_height = 1 << block_height_log2; + const int visible_height = max_luma_height; + const auto* src = static_cast(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + int16_t* luma_ptr = luma[0]; + __m128i zero = _mm_setzero_si128(); + __m128i sum = zero; + __m128i samples; + int y = visible_height; + + do { + samples = LoadHi8(LoadLo8(src), src + src_stride); + src += src_stride << 1; + sum = _mm_add_epi16(sum, samples); + y -= 2; + } while (y != 0); + + if (!is_inside) { + y = visible_height; + samples = _mm_unpackhi_epi64(samples, samples); + do { + sum = _mm_add_epi16(sum, samples); + y += 2; + } while (y < block_height); + } + + sum = _mm_add_epi32(_mm_unpackhi_epi16(sum, zero), _mm_cvtepu16_epi32(sum)); + sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8)); + sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4)); + + // Here the left shift by 3 (to increase precision) is nullified in right + // shift ((log2 of width 4) + 1). 
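+  // Worked example: a 4x16 block sums 64 raw samples, so the plain average
+  // is sum >> 6; pre-scaling by 8 (<< 3) leaves sum >> 3, which is the shift
+  // of block_height_log2 - 1 == 3 used below.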
+ __m128i averages = RightShiftWithRounding_U32(sum, block_height_log2 - 1); + averages = _mm_shufflelo_epi16(averages, 0); + src = static_cast<const uint16_t*>(source); + luma_ptr = luma[0]; + y = visible_height; + do { + samples = LoadLo8(src); + samples = _mm_slli_epi16(samples, 3); + StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages)); + src += src_stride; + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + // Replicate last line + do { + StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages)); + luma_ptr += kCflLumaBufferStride; + } while (++y < block_height); + } +} + +template <int block_height_log2> +void CflSubsampler444_4xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { + static_cast<void>(max_luma_width); + static_cast<void>(max_luma_height); + static_assert(block_height_log2 <= 4, ""); + assert(max_luma_width >= 4); + assert(max_luma_height >= 4); + const int block_height = 1 << block_height_log2; + + if (block_height <= max_luma_height) { + CflSubsampler444_4xH_SSE4_1<block_height_log2, true>(luma, max_luma_height, + source, stride); + } else { + CflSubsampler444_4xH_SSE4_1<block_height_log2, false>(luma, max_luma_height, + source, stride); + } +} + +template <int block_height_log2, bool is_inside> +void CflSubsampler444_8xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_height, const void* LIBGAV1_RESTRICT const source, + ptrdiff_t stride) { + const int block_height = 1 << block_height_log2; + const int visible_height = max_luma_height; + const __m128i dup16 = _mm_set1_epi32(0x01000100); + const auto* src = static_cast<const uint16_t*>(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + int16_t* luma_ptr = luma[0]; + const __m128i zero = _mm_setzero_si128(); + __m128i sum = zero; + __m128i samples; + int y = visible_height; + + do { + samples = LoadUnaligned16(src); + src += src_stride; + sum = _mm_add_epi16(sum, samples); + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + do { + sum = _mm_add_epi16(sum, samples); + } while (++y < block_height); + } + + sum = _mm_add_epi32(_mm_unpackhi_epi16(sum, zero), _mm_cvtepu16_epi32(sum)); + sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8)); + sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4)); + + // Here the left shift by 3 (to increase precision) is nullified in right + // shift (log2 of width 8).
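+  // Worked example: an 8x8 block sums 64 raw samples, so the plain average
+  // is sum >> 6; the << 3 upscale cancels three of those bits, leaving a
+  // shift of block_height_log2 == 3.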
+ __m128i averages = RightShiftWithRounding_U32(sum, block_height_log2); + averages = _mm_shuffle_epi8(averages, dup16); + + src = static_cast(source); + luma_ptr = luma[0]; + y = visible_height; + do { + samples = LoadUnaligned16(src); + samples = _mm_slli_epi16(samples, 3); + StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages)); + src += src_stride; + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + // Replicate last line + do { + StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages)); + luma_ptr += kCflLumaBufferStride; + } while (++y < block_height); + } +} + +template +void CflSubsampler444_8xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { + static_cast(max_luma_width); + static_cast(max_luma_height); + static_assert(block_height_log2 <= 5, ""); + assert(max_luma_width >= 4); + assert(max_luma_height >= 4); + const int block_height = 1 << block_height_log2; + const int block_width = 8; + + const int horz_inside = block_width <= max_luma_width; + const int vert_inside = block_height <= max_luma_height; + if (horz_inside && vert_inside) { + CflSubsampler444_8xH_SSE4_1(luma, max_luma_height, + source, stride); + } else { + CflSubsampler444_8xH_SSE4_1(luma, max_luma_height, + source, stride); + } +} + +template +void CflSubsampler444_WxH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { + const int block_height = 1 << block_height_log2; + const int visible_height = max_luma_height; + const int block_width = 1 << block_width_log2; + const __m128i dup16 = _mm_set1_epi32(0x01000100); + const __m128i zero = _mm_setzero_si128(); + const auto* src = static_cast(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + int16_t* luma_ptr = luma[0]; + __m128i sum = zero; + __m128i inner_sum_lo, inner_sum_hi; + __m128i samples[4]; + int y = visible_height; + + do { + samples[0] = LoadUnaligned16(src); + samples[1] = (max_luma_width >= 16) ? LoadUnaligned16(src + 8) + : LastRowResult(samples[0]); + __m128i inner_sum = _mm_add_epi16(samples[0], samples[1]); + if (block_width == 32) { + samples[2] = (max_luma_width >= 24) ? LoadUnaligned16(src + 16) + : LastRowResult(samples[1]); + samples[3] = (max_luma_width == 32) ? 
LoadUnaligned16(src + 24) + : LastRowResult(samples[2]); + + inner_sum = _mm_add_epi16(samples[2], inner_sum); + inner_sum = _mm_add_epi16(samples[3], inner_sum); + } + inner_sum_lo = _mm_cvtepu16_epi32(inner_sum); + inner_sum_hi = _mm_unpackhi_epi16(inner_sum, zero); + sum = _mm_add_epi32(sum, inner_sum_lo); + sum = _mm_add_epi32(sum, inner_sum_hi); + src += src_stride; + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + __m128i inner_sum = _mm_add_epi16(samples[0], samples[1]); + if (block_width == 32) { + inner_sum = _mm_add_epi16(samples[2], inner_sum); + inner_sum = _mm_add_epi16(samples[3], inner_sum); + } + inner_sum_lo = _mm_cvtepu16_epi32(inner_sum); + inner_sum_hi = _mm_unpackhi_epi16(inner_sum, zero); + do { + sum = _mm_add_epi32(sum, inner_sum_lo); + sum = _mm_add_epi32(sum, inner_sum_hi); + } while (++y < block_height); + } + + sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8)); + sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4)); + + // Here the left shift by 3 (to increase precision) is subtracted in right + // shift factor (block_width_log2 + block_height_log2 - 3). + __m128i averages = + RightShiftWithRounding_U32(sum, block_width_log2 + block_height_log2 - 3); + averages = _mm_shuffle_epi8(averages, dup16); + + src = static_cast<const uint16_t*>(source); + __m128i samples_ext = zero; + luma_ptr = luma[0]; + y = visible_height; + do { + int idx = 0; + for (int x = 0; x < block_width; x += 8) { + if (max_luma_width > x) { + samples[idx] = LoadUnaligned16(&src[x]); + samples[idx] = _mm_slli_epi16(samples[idx], 3); + samples_ext = samples[idx]; + } else { + samples[idx] = LastRowResult(samples_ext); + } + StoreUnaligned16(&luma_ptr[x], _mm_sub_epi16(samples[idx++], averages)); + } + src += src_stride; + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); + + if (!is_inside) { + y = visible_height; + // Replicate last line + do { + int idx = 0; + for (int x = 0; x < block_width; x += 8) { + StoreUnaligned16(&luma_ptr[x], _mm_sub_epi16(samples[idx++], averages)); + } + luma_ptr += kCflLumaBufferStride; + } while (++y < block_height); + } +} + +template <int block_width_log2, int block_height_log2> +void CflSubsampler444_WxH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { + static_assert(block_width_log2 == 4 || block_width_log2 == 5, + "This function will only work for block_width 16 and 32."); + static_assert(block_height_log2 <= 5, ""); + assert(max_luma_width >= 4); + assert(max_luma_height >= 4); + + const int block_height = 1 << block_height_log2; + const int vert_inside = block_height <= max_luma_height; + if (vert_inside) { + CflSubsampler444_WxH_SSE4_1<block_width_log2, block_height_log2, true>( + luma, max_luma_width, max_luma_height, source, stride); + } else { + CflSubsampler444_WxH_SSE4_1<block_width_log2, block_height_log2, false>( + luma, max_luma_width, max_luma_height, source, stride); + } +} + +template <int block_height_log2> +void CflSubsampler420_4xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int /*max_luma_width*/, const int max_luma_height, + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { + const int block_height = 1 << block_height_log2; + const auto* src = static_cast<const uint16_t*>(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + int16_t* luma_ptr = luma[0]; + const __m128i zero = _mm_setzero_si128(); + __m128i final_sum = zero; + const int luma_height = std::min(block_height, max_luma_height >> 1); + int y = luma_height; + + do { + const __m128i samples_row0 = LoadUnaligned16(src); + src += src_stride; + const __m128i 
samples_row1 = LoadUnaligned16(src); + src += src_stride; + const __m128i luma_sum01 = _mm_add_epi16(samples_row0, samples_row1); + + const __m128i samples_row2 = LoadUnaligned16(src); + src += src_stride; + const __m128i samples_row3 = LoadUnaligned16(src); + src += src_stride; + const __m128i luma_sum23 = _mm_add_epi16(samples_row2, samples_row3); + __m128i sum = StoreLumaResults4_420(luma_sum01, luma_sum23, luma_ptr); + luma_ptr += kCflLumaBufferStride << 1; + + const __m128i samples_row4 = LoadUnaligned16(src); + src += src_stride; + const __m128i samples_row5 = LoadUnaligned16(src); + src += src_stride; + const __m128i luma_sum45 = _mm_add_epi16(samples_row4, samples_row5); + + const __m128i samples_row6 = LoadUnaligned16(src); + src += src_stride; + const __m128i samples_row7 = LoadUnaligned16(src); + src += src_stride; + const __m128i luma_sum67 = _mm_add_epi16(samples_row6, samples_row7); + sum = _mm_add_epi16( + sum, StoreLumaResults4_420(luma_sum45, luma_sum67, luma_ptr)); + luma_ptr += kCflLumaBufferStride << 1; + + final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum)); + final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero)); + y -= 4; + } while (y != 0); + + const __m128i final_fill = LoadLo8(luma_ptr - kCflLumaBufferStride); + const __m128i final_fill_to_sum = _mm_cvtepu16_epi32(final_fill); + for (y = luma_height; y < block_height; ++y) { + StoreLo8(luma_ptr, final_fill); + luma_ptr += kCflLumaBufferStride; + final_sum = _mm_add_epi32(final_sum, final_fill_to_sum); + } + final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8)); + final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4)); + + __m128i averages = RightShiftWithRounding_U32( + final_sum, block_height_log2 + 2 /*log2 of width 4*/); + + averages = _mm_shufflelo_epi16(averages, 0); + luma_ptr = luma[0]; + y = block_height; + do { + const __m128i samples = LoadLo8(luma_ptr); + StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages)); + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); +} + +template +inline void CflSubsampler420Impl_8xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_height, const void* LIBGAV1_RESTRICT const source, + ptrdiff_t stride) { + const int block_height = 1 << block_height_log2; + const auto* src = static_cast(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + const __m128i zero = _mm_setzero_si128(); + __m128i final_sum = zero; + int16_t* luma_ptr = luma[0]; + const int luma_height = std::min(block_height, max_luma_height >> 1); + int y = luma_height; + + do { + const __m128i samples_row00 = LoadUnaligned16(src); + const __m128i samples_row01 = (max_luma_width == 16) + ? LoadUnaligned16(src + 8) + : LastRowSamples(samples_row00); + src += src_stride; + const __m128i samples_row10 = LoadUnaligned16(src); + const __m128i samples_row11 = (max_luma_width == 16) + ? LoadUnaligned16(src + 8) + : LastRowSamples(samples_row10); + src += src_stride; + const __m128i luma_sum00 = _mm_add_epi16(samples_row00, samples_row10); + const __m128i luma_sum01 = _mm_add_epi16(samples_row01, samples_row11); + __m128i sum = StoreLumaResults8_420(luma_sum00, luma_sum01, luma_ptr); + luma_ptr += kCflLumaBufferStride; + + const __m128i samples_row20 = LoadUnaligned16(src); + const __m128i samples_row21 = (max_luma_width == 16) + ? 
LoadUnaligned16(src + 8) + : LastRowSamples(samples_row20); + src += src_stride; + const __m128i samples_row30 = LoadUnaligned16(src); + const __m128i samples_row31 = (max_luma_width == 16) + ? LoadUnaligned16(src + 8) + : LastRowSamples(samples_row30); + src += src_stride; + const __m128i luma_sum10 = _mm_add_epi16(samples_row20, samples_row30); + const __m128i luma_sum11 = _mm_add_epi16(samples_row21, samples_row31); + sum = _mm_add_epi16( + sum, StoreLumaResults8_420(luma_sum10, luma_sum11, luma_ptr)); + luma_ptr += kCflLumaBufferStride; + + const __m128i samples_row40 = LoadUnaligned16(src); + const __m128i samples_row41 = (max_luma_width == 16) + ? LoadUnaligned16(src + 8) + : LastRowSamples(samples_row40); + src += src_stride; + const __m128i samples_row50 = LoadUnaligned16(src); + const __m128i samples_row51 = (max_luma_width == 16) + ? LoadUnaligned16(src + 8) + : LastRowSamples(samples_row50); + src += src_stride; + const __m128i luma_sum20 = _mm_add_epi16(samples_row40, samples_row50); + const __m128i luma_sum21 = _mm_add_epi16(samples_row41, samples_row51); + sum = _mm_add_epi16( + sum, StoreLumaResults8_420(luma_sum20, luma_sum21, luma_ptr)); + luma_ptr += kCflLumaBufferStride; + + const __m128i samples_row60 = LoadUnaligned16(src); + const __m128i samples_row61 = (max_luma_width == 16) + ? LoadUnaligned16(src + 8) + : LastRowSamples(samples_row60); + src += src_stride; + const __m128i samples_row70 = LoadUnaligned16(src); + const __m128i samples_row71 = (max_luma_width == 16) + ? LoadUnaligned16(src + 8) + : LastRowSamples(samples_row70); + src += src_stride; + const __m128i luma_sum30 = _mm_add_epi16(samples_row60, samples_row70); + const __m128i luma_sum31 = _mm_add_epi16(samples_row61, samples_row71); + sum = _mm_add_epi16( + sum, StoreLumaResults8_420(luma_sum30, luma_sum31, luma_ptr)); + luma_ptr += kCflLumaBufferStride; + + final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum)); + final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero)); + y -= 4; + } while (y != 0); + + // Duplicate the final row downward to the end after max_luma_height. 
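+  // As in the 8bpp path, the replicated row's lane total is folded into
+  // final_fill_to_sum once and then added for each padded row.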
+ const __m128i final_fill = LoadUnaligned16(luma_ptr - kCflLumaBufferStride); + const __m128i final_fill_to_sum0 = _mm_cvtepi16_epi32(final_fill); + const __m128i final_fill_to_sum1 = + _mm_cvtepi16_epi32(_mm_srli_si128(final_fill, 8)); + const __m128i final_fill_to_sum = + _mm_add_epi32(final_fill_to_sum0, final_fill_to_sum1); + for (y = luma_height; y < block_height; ++y) { + StoreUnaligned16(luma_ptr, final_fill); + luma_ptr += kCflLumaBufferStride; + final_sum = _mm_add_epi32(final_sum, final_fill_to_sum); + } + final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8)); + final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4)); + + __m128i averages = RightShiftWithRounding_S32( + final_sum, block_height_log2 + 3 /*log2 of width 8*/); + + averages = _mm_shufflelo_epi16(averages, 0); + averages = _mm_shuffle_epi32(averages, 0); + luma_ptr = luma[0]; + y = block_height; + do { + const __m128i samples = LoadUnaligned16(luma_ptr); + StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages)); + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); +} + +template <int block_height_log2> +void CflSubsampler420_8xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { + if (max_luma_width == 8) { + CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 8>(luma, max_luma_height, + source, stride); + } else { + CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 16>( + luma, max_luma_height, source, stride); + } +} + +template <int block_width_log2, int block_height_log2, int max_luma_width> +inline void CflSubsampler420Impl_WxH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_height, const void* LIBGAV1_RESTRICT const source, + ptrdiff_t stride) { + const auto* src = static_cast<const uint16_t*>(source); + const ptrdiff_t src_stride = stride / sizeof(src[0]); + const __m128i zero = _mm_setzero_si128(); + __m128i final_sum = zero; + const int block_height = 1 << block_height_log2; + const int luma_height = std::min(block_height, max_luma_height >> 1); + int16_t* luma_ptr = luma[0]; + __m128i final_row_result; + // Begin first y section, covering width up to 32. + int y = luma_height; + + do { + const uint16_t* src_next = src + src_stride; + const __m128i samples_row00 = LoadUnaligned16(src); + const __m128i samples_row01 = (max_luma_width >= 16) + ? LoadUnaligned16(src + 8) + : LastRowSamples(samples_row00); + const __m128i samples_row02 = (max_luma_width >= 24) + ? LoadUnaligned16(src + 16) + : LastRowSamples(samples_row01); + const __m128i samples_row03 = (max_luma_width == 32) + ? LoadUnaligned16(src + 24) + : LastRowSamples(samples_row02); + const __m128i samples_row10 = LoadUnaligned16(src_next); + const __m128i samples_row11 = (max_luma_width >= 16) + ? LoadUnaligned16(src_next + 8) + : LastRowSamples(samples_row10); + const __m128i samples_row12 = (max_luma_width >= 24) + ? LoadUnaligned16(src_next + 16) + : LastRowSamples(samples_row11); + const __m128i samples_row13 = (max_luma_width == 32) + ? 
LoadUnaligned16(src_next + 24) + : LastRowSamples(samples_row12); + const __m128i luma_sum0 = _mm_add_epi16(samples_row00, samples_row10); + const __m128i luma_sum1 = _mm_add_epi16(samples_row01, samples_row11); + const __m128i luma_sum2 = _mm_add_epi16(samples_row02, samples_row12); + const __m128i luma_sum3 = _mm_add_epi16(samples_row03, samples_row13); + __m128i sum = StoreLumaResults8_420(luma_sum0, luma_sum1, luma_ptr); + final_row_result = + StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8); + sum = _mm_add_epi16(sum, final_row_result); + final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum)); + final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero)); + + // Because max_luma_width is at most 32, any values beyond x=16 will + // necessarily be duplicated. + if (block_width_log2 == 5) { + const __m128i wide_fill = LastRowResult(final_row_result); + // There are 16 16-bit fill values per row, shifting by 2 accounts for + // the widening to 32-bit. + final_sum = _mm_add_epi32( + final_sum, _mm_slli_epi32(_mm_cvtepi16_epi32(wide_fill), 2)); + } + src += src_stride << 1; + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); + + // Begin second y section. + y = luma_height; + if (y < block_height) { + const __m128i final_fill0 = + LoadUnaligned16(luma_ptr - kCflLumaBufferStride); + const __m128i final_fill1 = + LoadUnaligned16(luma_ptr - kCflLumaBufferStride + 8); + __m128i wide_fill; + if (block_width_log2 == 5) { + // There are 16 16-bit fill values per row, shifting by 2 accounts for + // the widening to 32-bit. + wide_fill = + _mm_slli_epi32(_mm_cvtepi16_epi32(LastRowResult(final_fill1)), 2); + } + const __m128i final_inner_sum = _mm_add_epi16(final_fill0, final_fill1); + const __m128i final_inner_sum0 = _mm_cvtepu16_epi32(final_inner_sum); + const __m128i final_inner_sum1 = _mm_unpackhi_epi16(final_inner_sum, zero); + const __m128i final_fill_to_sum = + _mm_add_epi32(final_inner_sum0, final_inner_sum1); + + do { + StoreUnaligned16(luma_ptr, final_fill0); + StoreUnaligned16(luma_ptr + 8, final_fill1); + if (block_width_log2 == 5) { + final_sum = _mm_add_epi32(final_sum, wide_fill); + } + luma_ptr += kCflLumaBufferStride; + final_sum = _mm_add_epi32(final_sum, final_fill_to_sum); + } while (++y < block_height); + } // End second y section. 
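+  // Fold the per-lane partial sums into one scalar total, then broadcast the
+  // rounded average to all eight 16-bit lanes for the mean-subtraction pass.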
+ + final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8)); + final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4)); + + __m128i averages = RightShiftWithRounding_S32( + final_sum, block_width_log2 + block_height_log2); + averages = _mm_shufflelo_epi16(averages, 0); + averages = _mm_shuffle_epi32(averages, 0); + + luma_ptr = luma[0]; + y = block_height; + do { + const __m128i samples0 = LoadUnaligned16(luma_ptr); + StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples0, averages)); + const __m128i samples1 = LoadUnaligned16(luma_ptr + 8); + final_row_result = _mm_sub_epi16(samples1, averages); + StoreUnaligned16(luma_ptr + 8, final_row_result); + + if (block_width_log2 == 5) { + const __m128i wide_fill = LastRowResult(final_row_result); + StoreUnaligned16(luma_ptr + 16, wide_fill); + StoreUnaligned16(luma_ptr + 24, wide_fill); + } + luma_ptr += kCflLumaBufferStride; + } while (--y != 0); +} + +template +void CflSubsampler420_WxH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) { + switch (max_luma_width) { + case 8: + CflSubsampler420Impl_WxH_SSE4_1( + luma, max_luma_height, source, stride); + return; + case 16: + CflSubsampler420Impl_WxH_SSE4_1( + luma, max_luma_height, source, stride); + return; + case 24: + CflSubsampler420Impl_WxH_SSE4_1( + luma, max_luma_height, source, stride); + return; + default: + assert(max_luma_width == 32); + CflSubsampler420Impl_WxH_SSE4_1( + luma, max_luma_height, source, stride); + return; + } +} + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize4x4] = + CflIntraPredictor_10bpp_SSE4_1<4, 4>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize4x8] = + CflIntraPredictor_10bpp_SSE4_1<4, 8>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize4x16] = + CflIntraPredictor_10bpp_SSE4_1<4, 16>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize8x4] = + CflIntraPredictor_10bpp_SSE4_1<8, 4>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize8x8] = + CflIntraPredictor_10bpp_SSE4_1<8, 8>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize8x16] = + CflIntraPredictor_10bpp_SSE4_1<8, 16>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize8x32] = + CflIntraPredictor_10bpp_SSE4_1<8, 32>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize16x4] = + CflIntraPredictor_10bpp_SSE4_1<16, 4>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize16x8] = + CflIntraPredictor_10bpp_SSE4_1<16, 8>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize16x16] = + CflIntraPredictor_10bpp_SSE4_1<16, 16>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize16x32] = + CflIntraPredictor_10bpp_SSE4_1<16, 32>; +#endif +#if 
DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize32x8] = + CflIntraPredictor_10bpp_SSE4_1<32, 8>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize32x16] = + CflIntraPredictor_10bpp_SSE4_1<32, 16>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize32x32] = + CflIntraPredictor_10bpp_SSE4_1<32, 32>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] = + CflSubsampler420_4xH_SSE4_1<2>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] = + CflSubsampler420_4xH_SSE4_1<3>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] = + CflSubsampler420_4xH_SSE4_1<4>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] = + CflSubsampler420_8xH_SSE4_1<2>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] = + CflSubsampler420_8xH_SSE4_1<3>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] = + CflSubsampler420_8xH_SSE4_1<4>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] = + CflSubsampler420_8xH_SSE4_1<5>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] = + CflSubsampler420_WxH_SSE4_1<4, 2>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] = + CflSubsampler420_WxH_SSE4_1<4, 3>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] = + CflSubsampler420_WxH_SSE4_1<4, 4>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] = + CflSubsampler420_WxH_SSE4_1<4, 5>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] = + CflSubsampler420_WxH_SSE4_1<5, 3>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] = + CflSubsampler420_WxH_SSE4_1<5, 4>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] = + CflSubsampler420_WxH_SSE4_1<5, 5>; +#endif + +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] = + CflSubsampler444_4xH_SSE4_1<2>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] = + CflSubsampler444_4xH_SSE4_1<3>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] = + CflSubsampler444_4xH_SSE4_1<4>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_CflSubsampler444) + 
dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] = + CflSubsampler444_8xH_SSE4_1<2>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] = + CflSubsampler444_8xH_SSE4_1<3>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] = + CflSubsampler444_8xH_SSE4_1<4>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] = + CflSubsampler444_8xH_SSE4_1<5>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] = + CflSubsampler444_WxH_SSE4_1<4, 2>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] = + CflSubsampler444_WxH_SSE4_1<4, 3>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] = + CflSubsampler444_WxH_SSE4_1<4, 4>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] = + CflSubsampler444_WxH_SSE4_1<4, 5>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] = + CflSubsampler444_WxH_SSE4_1<5, 3>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] = + CflSubsampler444_WxH_SSE4_1<5, 4>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] = + CflSubsampler444_WxH_SSE4_1<5, 5>; +#endif +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void IntraPredCflInit_SSE4_1() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 +} + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_TARGETING_SSE4_1 + +namespace libgav1 { +namespace dsp { + +void IntraPredCflInit_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 + +#endif // LIBGAV1_TARGETING_SSE4_1 diff --git a/src/dsp/x86/intrapred_cfl_sse4.h b/src/dsp/x86/intrapred_cfl_sse4.h new file mode 100644 index 0000000..5d1a425 --- /dev/null +++ b/src/dsp/x86/intrapred_cfl_sse4.h @@ -0,0 +1,376 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_CFL_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_INTRAPRED_CFL_SSE4_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::cfl_intra_predictors and Dsp::cfl_subsamplers, see the +// defines below for specifics. These functions are not thread-safe. 
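+// Callers are expected to invoke this once from single-threaded setup code
+// (in libgav1 this is handled by dsp::DspInit()) before any worker threads
+// read the dispatch table.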
+void IntraPredCflInit_SSE4_1(); + +} // namespace dsp +} // namespace libgav1 + +// If sse4 is enabled and the baseline isn't set due to a higher level of +// optimization being enabled, signal the sse4 implementation should be used. +#if LIBGAV1_TARGETING_SSE4_1 +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef 
LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +//------------------------------------------------------------------------------ +// 10bpp + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + 
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420 +#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444 +#define 
LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444 +#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor +#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif +#endif // LIBGAV1_TARGETING_SSE4_1 + +#endif // LIBGAV1_SRC_DSP_X86_INTRAPRED_CFL_SSE4_H_ diff --git a/src/dsp/x86/intrapred_directional_sse4.cc b/src/dsp/x86/intrapred_directional_sse4.cc new file mode 100644 index 0000000..e642aee --- /dev/null +++ b/src/dsp/x86/intrapred_directional_sse4.cc @@ -0,0 +1,1478 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
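+
+// Overview (a summary gloss; the zone terminology follows the spec references
+// cited in the code below): zone 1 (7.11.2.4 (7), angle < 90) predicts every
+// pixel from |top_row| only, zone 2 (7.11.2.4 (8)) blends predictions from
+// |top_row| and |left_column|, and zone 3 (7.11.2.4 (9), angle > 180)
+// predicts from |left_column| only. All three share the same two-tap weighted
+// average with weights (32 - shift, shift), which sum to 32.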
+
+#include "src/dsp/intrapred_directional.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+//------------------------------------------------------------------------------
+// 7.11.2.4. Directional intra prediction process
+
+// Special case: An |xstep| of 64 corresponds to an angle delta of 45, meaning
+// upsampling is ruled out. In addition, the bits masked by 0x3F for
+// |shift_val| are 0 for all multiples of 64, so the formula
+// val = top[top_base_x]*shift + top[top_base_x+1]*(32-shift), reduces to
+// val = top[top_base_x+1] << 5, meaning only the second set of pixels is
+// involved in the output. Hence |top| is offset by 1.
+inline void DirectionalZone1_Step64(uint8_t* dst, ptrdiff_t stride,
+                                    const uint8_t* const top, const int width,
+                                    const int height) {
+  ptrdiff_t offset = 1;
+  if (height == 4) {
+    memcpy(dst, top + offset, width);
+    dst += stride;
+    memcpy(dst, top + offset + 1, width);
+    dst += stride;
+    memcpy(dst, top + offset + 2, width);
+    dst += stride;
+    memcpy(dst, top + offset + 3, width);
+    return;
+  }
+  int y = 0;
+  do {
+    memcpy(dst, top + offset, width);
+    dst += stride;
+    memcpy(dst, top + offset + 1, width);
+    dst += stride;
+    memcpy(dst, top + offset + 2, width);
+    dst += stride;
+    memcpy(dst, top + offset + 3, width);
+    dst += stride;
+    memcpy(dst, top + offset + 4, width);
+    dst += stride;
+    memcpy(dst, top + offset + 5, width);
+    dst += stride;
+    memcpy(dst, top + offset + 6, width);
+    dst += stride;
+    memcpy(dst, top + offset + 7, width);
+    dst += stride;
+
+    offset += 8;
+    y += 8;
+  } while (y < height);
+}
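+
+// Illustrative note (a worked example, not from the upstream sources): for a
+// general |xstep|, each output pixel is a two-tap weighted average. Taking
+// xstep = 36, y = 0 and no upsampling, top_x = 36, so
+// top_base_x = 36 >> 6 = 0 and shift_val = (36 & 0x3F) >> 1 = 18. The weight
+// pair is (32 - 18, 18) = (14, 18), and the first output pixel is
+//   (top[0] * 14 + top[1] * 18 + 16) >> 5,
+// i.e. RightShiftWithRounding_U16 renormalizes the weights, which sum to 32.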
+
+inline void DirectionalZone1_4xH(uint8_t* dst, ptrdiff_t stride,
+                                 const uint8_t* const top, const int height,
+                                 const int xstep, const bool upsampled) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int scale_bits = 6 - upsample_shift;
+  const __m128i max_shift = _mm_set1_epi8(32);
+  // Downscaling for a weighted average whose weights sum to 32 (max_shift).
+  const int rounding_bits = 5;
+  const int max_base_x = (height + 3 /* width - 1 */) << upsample_shift;
+  const __m128i final_top_val = _mm_set1_epi16(top[max_base_x]);
+  const __m128i sampler = upsampled ? _mm_set_epi64x(0, 0x0706050403020100)
+                                    : _mm_set_epi64x(0, 0x0403030202010100);
+  // Each 16-bit value here corresponds to a position that may exceed
+  // |max_base_x|. When added to the top_base_x, it is used to mask values
+  // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
+  // not supported for packed integers.
+  const __m128i offsets =
+      _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+  // All rows from |min_corner_only_y| down will simply use memcpy.
+  // |max_base_x| is always greater than |height|, so clipping to 1 is enough
+  // to make the logic work.
+  const int xstep_units = std::max(xstep >> scale_bits, 1);
+  const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+  // Rows up to this y-value can be computed without checking for bounds.
+  int y = 0;
+  int top_x = xstep;
+
+  for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+    const int top_base_x = top_x >> scale_bits;
+
+    // Permit negative values of |top_x|.
+    const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const __m128i shift = _mm_set1_epi8(shift_val);
+    const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+    const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+    __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+    top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+    const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+
+    // Load 8 values because we will select the sampled values based on
+    // |upsampled|.
+    const __m128i values = LoadLo8(top + top_base_x);
+    const __m128i sampled_values = _mm_shuffle_epi8(values, sampler);
+    const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+    __m128i prod = _mm_maddubs_epi16(sampled_values, shifts);
+    prod = RightShiftWithRounding_U16(prod, rounding_bits);
+    // Replace pixels from invalid range with top-right corner.
+    prod = _mm_blendv_epi8(prod, final_top_val, past_max);
+    Store4(dst, _mm_packus_epi16(prod, prod));
+  }
+
+  // Fill in corner-only rows.
+  for (; y < height; ++y) {
+    memset(dst, top[max_base_x], /* width */ 4);
+    dst += stride;
+  }
+}
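+
+// Illustrative note on the |sampler| constants used above and below (a gloss
+// with hypothetical values t0..t4, not from the upstream sources):
+// _mm_maddubs_epi16 multiplies adjacent byte pairs and sums them, so the
+// shuffle mask 0x0403030202010100 expands {t0,t1,t2,t3,t4} into the pairs
+// (t0,t1),(t1,t2),(t2,t3),(t3,t4). Combined with the interleaved
+// (32 - shift, shift) weights this yields one 16-bit weighted sum per output
+// pixel. In the upsampled case the source already alternates base and
+// interpolated samples, so the identity-like mask 0x0706050403020100 keeps
+// consecutive bytes as the pairs.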
+
+// 7.11.2.4 (7) angle < 90
+inline void DirectionalZone1_Large(uint8_t* dest, ptrdiff_t stride,
+                                   const uint8_t* const top_row,
+                                   const int width, const int height,
+                                   const int xstep, const bool upsampled) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const __m128i sampler =
+      upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+                : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+  const int scale_bits = 6 - upsample_shift;
+  const int max_base_x = ((width + height) - 1) << upsample_shift;
+
+  const __m128i max_shift = _mm_set1_epi8(32);
+  // Downscaling for a weighted average whose weights sum to 32 (max_shift).
+  const int rounding_bits = 5;
+  const int base_step = 1 << upsample_shift;
+  const int base_step8 = base_step << 3;
+
+  // All rows from |min_corner_only_y| down will simply use memcpy.
+  // |max_base_x| is always greater than |height|, so clipping to 1 is enough
+  // to make the logic work.
+  const int xstep_units = std::max(xstep >> scale_bits, 1);
+  const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+  // Rows up to this y-value can be computed without checking for bounds.
+  const int max_no_corner_y = std::min(
+      LeftShift((max_base_x - (base_step * width)), scale_bits) / xstep,
+      height);
+  // No need to check for exceeding |max_base_x| in the first loop.
+  int y = 0;
+  int top_x = xstep;
+  for (; y < max_no_corner_y; ++y, dest += stride, top_x += xstep) {
+    int top_base_x = top_x >> scale_bits;
+    // Permit negative values of |top_x|.
+    const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const __m128i shift = _mm_set1_epi8(shift_val);
+    const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+    const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+    int x = 0;
+    do {
+      const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+      __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+      vals = _mm_maddubs_epi16(vals, shifts);
+      vals = RightShiftWithRounding_U16(vals, rounding_bits);
+      StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+      top_base_x += base_step8;
+      x += 8;
+    } while (x < width);
+  }
+
+  // Each 16-bit value here corresponds to a position that may exceed
+  // |max_base_x|. When added to the top_base_x, it is used to mask values
+  // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
+  // not supported for packed integers.
+  const __m128i offsets =
+      _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+  const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+  const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
+  const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
+  for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) {
+    int top_base_x = top_x >> scale_bits;
+
+    const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const __m128i shift = _mm_set1_epi8(shift_val);
+    const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+    const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+    __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+    top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+
+    int x = 0;
+    const int min_corner_only_x =
+        std::min(width, ((max_base_x - top_base_x) >> upsample_shift) + 7) & ~7;
+    for (; x < min_corner_only_x;
+         x += 8, top_base_x += base_step8,
+         top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
+      const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+      // Assuming a buffer zone of 8 bytes at the end of top_row, this prevents
+      // reading out of bounds. If all indices are past max and we don't need to
+      // use the loaded bytes at all, |top_base_x| becomes 0. |top_base_x| will
+      // reset for the next |y|.
+      top_base_x &= ~_mm_cvtsi128_si32(past_max);
+      const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+      __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+      vals = _mm_maddubs_epi16(vals, shifts);
+      vals = RightShiftWithRounding_U16(vals, rounding_bits);
+      vals = _mm_blendv_epi8(vals, final_top_val, past_max);
+      StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+    }
+    // Corner-only section of the row.
+    memset(dest + x, top_row[max_base_x], width - x);
+  }
+  // Fill in corner-only rows.
+  for (; y < height; ++y) {
+    memset(dest, top_row[max_base_x], width);
+    dest += stride;
+  }
+}
+
+// 7.11.2.4 (7) angle < 90
+inline void DirectionalZone1_SSE4_1(uint8_t* dest, ptrdiff_t stride,
+                                    const uint8_t* const top_row,
+                                    const int width, const int height,
+                                    const int xstep, const bool upsampled) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  if (xstep == 64) {
+    DirectionalZone1_Step64(dest, stride, top_row, width, height);
+    return;
+  }
+  if (width == 4) {
+    DirectionalZone1_4xH(dest, stride, top_row, height, xstep, upsampled);
+    return;
+  }
+  if (width >= 32) {
+    DirectionalZone1_Large(dest, stride, top_row, width, height, xstep,
+                           upsampled);
+    return;
+  }
+  const __m128i sampler =
+      upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+                : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+  const int scale_bits = 6 - upsample_shift;
+  const int max_base_x = ((width + height) - 1) << upsample_shift;
+
+  const __m128i max_shift = _mm_set1_epi8(32);
+  // Downscaling for a weighted average whose weights sum to 32 (max_shift).
+  const int rounding_bits = 5;
+  const int base_step = 1 << upsample_shift;
+  const int base_step8 = base_step << 3;
+
+  // No need to check for exceeding |max_base_x| in the loops.
+  if (((xstep * height) >> scale_bits) + base_step * width < max_base_x) {
+    int top_x = xstep;
+    int y = 0;
+    do {
+      int top_base_x = top_x >> scale_bits;
+      // Permit negative values of |top_x|.
+      const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+      const __m128i shift = _mm_set1_epi8(shift_val);
+      const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+      const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+      int x = 0;
+      do {
+        const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+        __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+        vals = _mm_maddubs_epi16(vals, shifts);
+        vals = RightShiftWithRounding_U16(vals, rounding_bits);
+        StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+        top_base_x += base_step8;
+        x += 8;
+      } while (x < width);
+      dest += stride;
+      top_x += xstep;
+    } while (++y < height);
+    return;
+  }
+
+  // Each 16-bit value here corresponds to a position that may exceed
+  // |max_base_x|. When added to the top_base_x, it is used to mask values
+  // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
+  // not supported for packed integers.
+  const __m128i offsets =
+      _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+  const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+  const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
+  const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
+  int top_x = xstep;
+  int y = 0;
+  do {
+    int top_base_x = top_x >> scale_bits;
+
+    if (top_base_x >= max_base_x) {
+      for (int i = y; i < height; ++i) {
+        memset(dest, top_row[max_base_x], width);
+        dest += stride;
+      }
+      return;
+    }
+
+    const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const __m128i shift = _mm_set1_epi8(shift_val);
+    const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+    const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+    __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+    top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+
+    int x = 0;
+    for (; x < width - 8;
+         x += 8, top_base_x += base_step8,
+         top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
+      const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+      // Assuming a buffer zone of 8 bytes at the end of top_row, this prevents
+      // reading out of bounds. If all indices are past max and we don't need to
+      // use the loaded bytes at all, |top_base_x| becomes 0. |top_base_x| will
+      // reset for the next |y|.
+      top_base_x &= ~_mm_cvtsi128_si32(past_max);
+      const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+      __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+      vals = _mm_maddubs_epi16(vals, shifts);
+      vals = RightShiftWithRounding_U16(vals, rounding_bits);
+      vals = _mm_blendv_epi8(vals, final_top_val, past_max);
+      StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+    }
+    const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+    __m128i vals;
+    if (upsampled) {
+      vals = LoadUnaligned16(top_row + top_base_x);
+    } else {
+      const __m128i top_vals = LoadLo8(top_row + top_base_x);
+      vals = _mm_shuffle_epi8(top_vals, sampler);
+      vals = _mm_insert_epi8(vals, top_row[top_base_x + 8], 15);
+    }
+    vals = _mm_maddubs_epi16(vals, shifts);
+    vals = RightShiftWithRounding_U16(vals, rounding_bits);
+    vals = _mm_blendv_epi8(vals, final_top_val, past_max);
+    StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+    dest += stride;
+    top_x += xstep;
+  } while (++y < height);
+}
+
+void DirectionalIntraPredictorZone1_SSE4_1(void* const dest, ptrdiff_t stride,
+                                           const void* const top_row,
+                                           const int width, const int height,
+                                           const int xstep,
+                                           const bool upsampled_top) {
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  auto* dst = static_cast<uint8_t*>(dest);
+  DirectionalZone1_SSE4_1(dst, stride, top_ptr, width, height, xstep,
+                          upsampled_top);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_4x4(uint8_t* dest, ptrdiff_t stride,
+                                 const uint8_t* const left_column,
+                                 const int base_left_y, const int ystep) {
+  // For use in the non-upsampled case.
+  const __m128i sampler = _mm_set_epi64x(0, 0x0403030202010100);
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int scale_bits = 6 - upsample_shift;
+  const __m128i max_shift = _mm_set1_epi8(32);
+  // Downscaling for a weighted average whose weights sum to 32 (max_shift).
+  const int rounding_bits = 5;
+
+  __m128i result_block[4];
+  for (int x = 0, left_y = base_left_y; x < 4; x++, left_y += ystep) {
+    const int left_base_y = left_y >> scale_bits;
+    const int shift_val = ((left_y << upsample_shift) & 0x3F) >> 1;
+    const __m128i shift = _mm_set1_epi8(shift_val);
+    const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+    const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+    __m128i vals;
+    if (upsampled) {
+      vals = LoadLo8(left_column + left_base_y);
+    } else {
+      const __m128i top_vals = LoadLo8(left_column + left_base_y);
+      vals = _mm_shuffle_epi8(top_vals, sampler);
+    }
+    vals = _mm_maddubs_epi16(vals, shifts);
+    vals = RightShiftWithRounding_U16(vals, rounding_bits);
+    result_block[x] = _mm_packus_epi16(vals, vals);
+  }
+  const __m128i result = Transpose4x4_U8(result_block);
+  // This is result_row0.
+  Store4(dest, result);
+  dest += stride;
+  const int result_row1 = _mm_extract_epi32(result, 1);
+  memcpy(dest, &result_row1, sizeof(result_row1));
+  dest += stride;
+  const int result_row2 = _mm_extract_epi32(result, 2);
+  memcpy(dest, &result_row2, sizeof(result_row2));
+  dest += stride;
+  const int result_row3 = _mm_extract_epi32(result, 3);
+  memcpy(dest, &result_row3, sizeof(result_row3));
+}
+
+template <bool upsampled, int height = 8>
+inline void DirectionalZone3_8xH(uint8_t* dest, ptrdiff_t stride,
+                                 const uint8_t* const left_column,
+                                 const int base_left_y, const int ystep) {
+  // For use in the non-upsampled case.
+  const __m128i sampler =
+      _mm_set_epi64x(0x0807070606050504, 0x0403030202010100);
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int scale_bits = 6 - upsample_shift;
+  const __m128i max_shift = _mm_set1_epi8(32);
+  // Downscaling for a weighted average whose weights sum to 32 (max_shift).
+  const int rounding_bits = 5;
+
+  __m128i result_block[8];
+  for (int x = 0, left_y = base_left_y; x < 8; x++, left_y += ystep) {
+    const int left_base_y = left_y >> scale_bits;
+    const int shift_val = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+    const __m128i shift = _mm_set1_epi8(shift_val);
+    const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+    const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+    __m128i vals;
+    if (upsampled) {
+      vals = LoadUnaligned16(left_column + left_base_y);
+    } else {
+      const __m128i top_vals = LoadUnaligned16(left_column + left_base_y);
+      vals = _mm_shuffle_epi8(top_vals, sampler);
+    }
+    vals = _mm_maddubs_epi16(vals, shifts);
+    result_block[x] = RightShiftWithRounding_U16(vals, rounding_bits);
+  }
+  Transpose8x8_U16(result_block, result_block);
+  for (int y = 0; y < height; ++y) {
+    StoreLo8(dest, _mm_packus_epi16(result_block[y], result_block[y]));
+    dest += stride;
+  }
+}
+
+// 7.11.2.4 (9) angle > 180
+void DirectionalIntraPredictorZone3_SSE4_1(void* dest, ptrdiff_t stride,
+                                           const void* const left_column,
+                                           const int width, const int height,
+                                           const int ystep,
+                                           const bool upsampled) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const int upsample_shift = static_cast<int>(upsampled);
+  if (width == 4 || height == 4) {
+    const ptrdiff_t stride4 = stride << 2;
+    if (upsampled) {
+      int left_y = ystep;
+      int x = 0;
+      do {
+        uint8_t* dst_x = dst + x;
+        int y = 0;
+        do {
+          DirectionalZone3_4x4<true>(
+              dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
+          dst_x += stride4;
+          y += 4;
+        } while (y < height);
+        left_y += ystep << 2;
+        x += 4;
+      } while (x < width);
+    } else {
+      int left_y = ystep;
+      int x = 0;
+      do {
+        uint8_t* dst_x = dst + x;
+        int y = 0;
+        do {
+          DirectionalZone3_4x4<false>(dst_x, stride, left_ptr + y, left_y,
+                                      ystep);
+          dst_x += stride4;
+          y += 4;
+        } while (y < height);
+        left_y += ystep << 2;
+        x += 4;
+      } while (x < width);
+    }
+    return;
+  }
+
+  const ptrdiff_t stride8 = stride << 3;
+  if (upsampled) {
+    int left_y = ystep;
+    int x = 0;
+    do {
+      uint8_t* dst_x = dst + x;
+      int y = 0;
+      do {
+        DirectionalZone3_8xH<true>(
+            dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
+        dst_x += stride8;
+        y += 8;
+      } while (y < height);
+      left_y += ystep << 3;
+      x += 8;
+    } while (x < width);
+  } else {
+    int left_y = ystep;
+    int x = 0;
+    do {
+      uint8_t* dst_x = dst + x;
+      int y = 0;
+      do {
+        DirectionalZone3_8xH<false>(
+            dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
+        dst_x += stride8;
+        y += 8;
+      } while (y < height);
+      left_y += ystep << 3;
+      x += 8;
+    } while (x < width);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Directional Zone 2 Functions
+// 7.11.2.4 (8)
+
+// DirectionalBlend* selectively overwrites the values written by
+// DirectionalZone2FromLeftCol*. |zone_bounds| has one 16-bit index for each
+// row.
+template <int y_selector>
+inline void DirectionalBlend4_SSE4_1(uint8_t* dest,
+                                     const __m128i& dest_index_vect,
+                                     const __m128i& vals,
+                                     const __m128i& zone_bounds) {
+  const __m128i max_dest_x_vect = _mm_shufflelo_epi16(zone_bounds, y_selector);
+  const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect);
+  const __m128i original_vals = _mm_cvtepu8_epi16(Load4(dest));
+  const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left);
+  Store4(dest, _mm_packus_epi16(blended_vals, blended_vals));
+}
+
+inline void DirectionalBlend8_SSE4_1(uint8_t* dest,
+                                     const __m128i& dest_index_vect,
+                                     const __m128i& vals,
+                                     const __m128i& zone_bounds,
+                                     const __m128i& bounds_selector) {
+  const __m128i max_dest_x_vect =
+      _mm_shuffle_epi8(zone_bounds, bounds_selector);
+  const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect);
+  const __m128i original_vals = _mm_cvtepu8_epi16(LoadLo8(dest));
+  const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left);
+  StoreLo8(dest, _mm_packus_epi16(blended_vals, blended_vals));
+}
+
+constexpr int kDirectionalWeightBits = 5;
+// |source| is packed with 4 or 8 pairs of 8-bit values from left or top.
+// |shifts| is named to match the specification, with 4 or 8 pairs of (32 -
+// shift) and shift. Shift is guaranteed to be between 0 and 32.
+inline __m128i DirectionalZone2FromSource_SSE4_1(const uint8_t* const source,
+                                                 const __m128i& shifts,
+                                                 const __m128i& sampler) {
+  const __m128i src_vals = LoadUnaligned16(source);
+  __m128i vals = _mm_shuffle_epi8(src_vals, sampler);
+  vals = _mm_maddubs_epi16(vals, shifts);
+  return RightShiftWithRounding_U16(vals, kDirectionalWeightBits);
+}
+
+// Because the source values "move backwards" as the row index increases, the
+// indices derived from ystep are generally negative. This is accommodated by
+// making sure the relative indices are within [-15, 0] when the function is
+// called, and sliding them into the inclusive range [0, 15], relative to a
+// lower base address.
+constexpr int kPositiveIndexOffset = 15;
+
+template <bool upsampled>
+inline void DirectionalZone2FromLeftCol_4x4_SSE4_1(
+    uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column_base,
+    __m128i left_y) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int scale_bits = 6 - upsample_shift;
+  const __m128i max_shifts = _mm_set1_epi8(32);
+  const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+  const __m128i index_increment = _mm_cvtsi32_si128(0x01010101);
+  const __m128i positive_offset = _mm_set1_epi8(kPositiveIndexOffset);
+  // Left_column and sampler are both offset by 15 so the indices are always
+  // positive.
+  const uint8_t* left_column = left_column_base - kPositiveIndexOffset;
+  for (int y = 0; y < 4; dst += stride, ++y) {
+    __m128i offset_y = _mm_srai_epi16(left_y, scale_bits);
+    offset_y = _mm_packs_epi16(offset_y, offset_y);
+
+    const __m128i adjacent = _mm_add_epi8(offset_y, index_increment);
+    __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent);
+    // Slide valid |offset_y| indices from range [-15, 0] to [0, 15] so they
+    // can work as shuffle indices. Some values may be out of bounds, but their
+    // pred results will be masked over by top prediction.
+    sampler = _mm_add_epi8(sampler, positive_offset);
+
+    __m128i shifts = _mm_srli_epi16(
+        _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1);
+    shifts = _mm_packus_epi16(shifts, shifts);
+    const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts);
+    shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+    const __m128i vals = DirectionalZone2FromSource_SSE4_1(
+        left_column + (y << upsample_shift), shifts, sampler);
+    Store4(dst, _mm_packus_epi16(vals, vals));
+  }
+}
+
+// The height at which a load of 16 bytes will not contain enough source
+// pixels from |left_column| to supply an accurate row when computing 8 pixels
+// at a time. The values are found by inspection. By coincidence, all angles
+// that satisfy (ystep >> 6) == 2 map to the same value, so it is enough to
+// look up by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15.
+constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = {
+    1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40};
+
+template <bool upsampled>
+inline void DirectionalZone2FromLeftCol_8x8_SSE4_1(
+    uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column,
+    __m128i left_y) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int scale_bits = 6 - upsample_shift;
+  const __m128i max_shifts = _mm_set1_epi8(32);
+  const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+  const __m128i index_increment = _mm_set1_epi8(1);
+  const __m128i denegation = _mm_set1_epi8(kPositiveIndexOffset);
+  for (int y = 0; y < 8; dst += stride, ++y) {
+    __m128i offset_y = _mm_srai_epi16(left_y, scale_bits);
+    offset_y = _mm_packs_epi16(offset_y, offset_y);
+    const __m128i adjacent = _mm_add_epi8(offset_y, index_increment);
+
+    // Offset the relative index because ystep is negative in Zone 2 and
+    // shuffle indices must be nonnegative.
+    __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent);
+    sampler = _mm_add_epi8(sampler, denegation);
+
+    __m128i shifts = _mm_srli_epi16(
+        _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1);
+    shifts = _mm_packus_epi16(shifts, shifts);
+    const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts);
+    shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+
+    // The specification adds (y << 6) to left_y, which is subject to
+    // upsampling, but this puts sampler indices out of the 0-15 range. It is
+    // equivalent to offset the source address by (y << upsample_shift)
+    // instead.
+    const __m128i vals = DirectionalZone2FromSource_SSE4_1(
+        left_column - kPositiveIndexOffset + (y << upsample_shift), shifts,
+        sampler);
+    StoreLo8(dst, _mm_packus_epi16(vals, vals));
+  }
+}
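+
+// Illustrative note (a worked example, not from the upstream sources): for
+// ystep = 160, ystep >> 6 == 2, so
+// kDirectionalZone2ShuffleInvalidHeight[2] == 16 caps the rows handled by the
+// shuffle-based path above at 16; past that boundary the zone 2 driver below
+// falls back to the slower but bounds-safe DirectionalZone3_8xH computation.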
+
+// |zone_bounds| is an epi16 of the relative x index at which base >= -(1 <<
+// upsampled_top), for each row. When there are 4 values, they can be
+// duplicated with a non-register shuffle mask.
+// |shifts| is one pair of weights that applies throughout a given row.
+template <bool upsampled_top>
+inline void DirectionalZone1Blend_4x4(
+    uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride,
+    __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts,
+    const __m128i& dest_index_x, int top_x, const int xstep) {
+  const int upsample_shift = static_cast<int>(upsampled_top);
+  const int scale_bits_x = 6 - upsample_shift;
+  top_x -= xstep;
+
+  int top_base_x = (top_x >> scale_bits_x);
+  const __m128i vals0 = DirectionalZone2FromSource_SSE4_1(
+      top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x00), sampler);
+  DirectionalBlend4_SSE4_1<0x00>(dest, dest_index_x, vals0, zone_bounds);
+  top_x -= xstep;
+  dest += stride;
+
+  top_base_x = (top_x >> scale_bits_x);
+  const __m128i vals1 = DirectionalZone2FromSource_SSE4_1(
+      top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x55), sampler);
+  DirectionalBlend4_SSE4_1<0x55>(dest, dest_index_x, vals1, zone_bounds);
+  top_x -= xstep;
+  dest += stride;
+
+  top_base_x = (top_x >> scale_bits_x);
+  const __m128i vals2 = DirectionalZone2FromSource_SSE4_1(
+      top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xAA), sampler);
+  DirectionalBlend4_SSE4_1<0xAA>(dest, dest_index_x, vals2, zone_bounds);
+  top_x -= xstep;
+  dest += stride;
+
+  top_base_x = (top_x >> scale_bits_x);
+  const __m128i vals3 = DirectionalZone2FromSource_SSE4_1(
+      top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xFF), sampler);
+  DirectionalBlend4_SSE4_1<0xFF>(dest, dest_index_x, vals3, zone_bounds);
+}
+
+template <bool upsampled_top, int height>
+inline void DirectionalZone1Blend_8xH(
+    uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride,
+    __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts,
+    const __m128i& dest_index_x, int top_x, const int xstep) {
+  const int upsample_shift = static_cast<int>(upsampled_top);
+  const int scale_bits_x = 6 - upsample_shift;
+
+  __m128i y_selector = _mm_set1_epi32(0x01000100);
+  const __m128i index_increment = _mm_set1_epi32(0x02020202);
+  for (int y = 0; y < height; ++y,
+           y_selector = _mm_add_epi8(y_selector, index_increment),
+           dest += stride) {
+    top_x -= xstep;
+    const int top_base_x = top_x >> scale_bits_x;
+    const __m128i vals = DirectionalZone2FromSource_SSE4_1(
+        top_row + top_base_x, _mm_shuffle_epi8(shifts, y_selector), sampler);
+    DirectionalBlend8_SSE4_1(dest, dest_index_x, vals, zone_bounds,
+                             y_selector);
+  }
+}
+
+// 7.11.2.4 (8) 90 < angle > 180
+// The strategy for this function is to know how many blocks can be processed
+// with just pixels from |top_ptr|, then handle mixed blocks, then handle only
+// blocks that take from |left_ptr|. Additionally, a fast index-shuffle
+// approach is used for pred values from |left_column| in sections that permit
+// it.
+template <bool upsampled_left, bool upsampled_top>
+inline void DirectionalZone2_SSE4_1(void* dest, ptrdiff_t stride,
+                                    const uint8_t* const top_row,
+                                    const uint8_t* const left_column,
+                                    const int width, const int height,
+                                    const int xstep, const int ystep) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  const int upsample_left_shift = static_cast<int>(upsampled_left);
+  const int upsample_top_shift = static_cast<int>(upsampled_top);
+  const __m128i max_shift = _mm_set1_epi8(32);
+  const ptrdiff_t stride8 = stride << 3;
+  const __m128i dest_index_x =
+      _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
+  const __m128i sampler_top =
+      upsampled_top
+          ?
+          _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+          : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+  const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+  // All columns from |min_top_only_x| to the right will only need |top_row|
+  // to compute. This assumes minimum |xstep| is 3.
+  const int min_top_only_x = std::min((height * xstep) >> 6, width);
+
+  // For steep angles, the source pixels from left_column may not fit in a
+  // 16-byte load for shuffling.
+  // TODO(petersonab): Find a more precise formula for this subject to x.
+  const int max_shuffle_height =
+      std::min(height, kDirectionalZone2ShuffleInvalidHeight[ystep >> 6]);
+
+  const int xstep8 = xstep << 3;
+  const __m128i xstep8_vect = _mm_set1_epi16(xstep8);
+  // Accumulate xstep across 8 rows.
+  const __m128i xstep_dup = _mm_set1_epi16(-xstep);
+  const __m128i increments = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+  const __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments);
+  // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1
+  const __m128i scaled_one = _mm_set1_epi16(-64);
+  __m128i xstep_bounds_base =
+      (xstep == 64) ? _mm_sub_epi16(scaled_one, xstep_for_shift)
+                    : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift);
+
+  const int left_base_increment = ystep >> 6;
+  const int ystep_remainder = ystep & 0x3F;
+  const int ystep8 = ystep << 3;
+  const int left_base_increment8 = ystep8 >> 6;
+  const int ystep_remainder8 = ystep8 & 0x3F;
+  const __m128i increment_left8 = _mm_set1_epi16(-ystep_remainder8);
+
+  // If the 64 scaling is regarded as a decimal point, the first value of the
+  // left_y vector omits the portion which is covered under the left_column
+  // offset. Following values need the full ystep as a relative offset.
+  const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder);
+  const __m128i ystep_dup = _mm_set1_epi16(-ystep);
+  __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x);
+  left_y = _mm_add_epi16(ystep_init, left_y);
+
+  const __m128i increment_top8 = _mm_set1_epi16(8 << 6);
+  int x = 0;
+
+  // This loop treats each set of 4 columns in 3 stages with y-value
+  // boundaries. The first stage, before the first y-loop, covers blocks that
+  // are only computed from the top row. The second stage, comprising two
+  // y-loops, covers blocks that have a mixture of values computed from top or
+  // left. The final stage covers blocks that are only computed from the left.
+  for (int left_offset = -left_base_increment; x < min_top_only_x;
+       x += 8,
+       xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top8),
+       // Watch left_y because it can still get big.
+       left_y = _mm_add_epi16(left_y, increment_left8),
+       left_offset -= left_base_increment8) {
+    uint8_t* dst_x = dst + x;
+
+    // Round down to the nearest multiple of 8.
+    const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
+    DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
+                         max_top_only_y, -xstep, upsampled_top);
+    DirectionalZone1_4xH(dst_x + 4, stride,
+                         top_row + ((x + 4) << upsample_top_shift),
+                         max_top_only_y, -xstep, upsampled_top);
+
+    int y = max_top_only_y;
+    dst_x += stride * y;
+    const int xstep_y = xstep * y;
+    const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
+    // All rows from |min_left_only_y| down for this set of columns, only need
+    // |left_column| to compute.
+    const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height);
+    // At high angles such that min_left_only_y < 8, ystep is low and xstep is
+    // high.
+    // This means that max_shuffle_height is unbounded and xstep_bounds will
+    // overflow in 16 bits. This is prevented by stopping the first blending
+    // loop at min_left_only_y for such cases, which means we skip over the
+    // second blending loop as well.
+    const int left_shuffle_stop_y =
+        std::min(max_shuffle_height, min_left_only_y);
+    __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
+    __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
+    int top_x = -xstep_y;
+
+    for (; y < left_shuffle_stop_y;
+         y += 8, dst_x += stride8,
+         xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
+         xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
+         top_x -= xstep8) {
+      DirectionalZone2FromLeftCol_8x8_SSE4_1<upsampled_left>(
+          dst_x, stride,
+          left_column + ((left_offset + y) << upsample_left_shift), left_y);
+
+      __m128i shifts = _mm_srli_epi16(
+          _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
+                        shift_mask),
+          1);
+      shifts = _mm_packus_epi16(shifts, shifts);
+      __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
+      shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+      __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
+      DirectionalZone1Blend_8xH<upsampled_top, 8>(
+          dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
+          xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
+    }
+    // Pick up from the last y-value, using the 10% slower but secure method
+    // for left prediction.
+    const auto base_left_y =
+        static_cast<int16_t>(_mm_extract_epi16(left_y, 0));
+    for (; y < min_left_only_y;
+         y += 8, dst_x += stride8,
+         xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
+         xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
+         top_x -= xstep8) {
+      const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
+
+      DirectionalZone3_8xH<upsampled_left, 8>(
+          dst_x, stride,
+          left_column + ((left_offset + y) << upsample_left_shift),
+          base_left_y, -ystep);
+
+      __m128i shifts = _mm_srli_epi16(
+          _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
+                        shift_mask),
+          1);
+      shifts = _mm_packus_epi16(shifts, shifts);
+      __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
+      shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+      DirectionalZone1Blend_8xH<upsampled_top, 8>(
+          dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
+          xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
+    }
+    // Loop over y for left_only rows.
+    for (; y < height; y += 8, dst_x += stride8) {
+      DirectionalZone3_8xH<upsampled_left, 8>(
+          dst_x, stride,
+          left_column + ((left_offset + y) << upsample_left_shift),
+          base_left_y, -ystep);
+    }
+  }
+  for (; x < width; x += 4) {
+    DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift),
+                         height, -xstep, upsampled_top);
+  }
+}
+
+template <bool upsampled_left, bool upsampled_top>
+inline void DirectionalZone2_4_SSE4_1(void* dest, ptrdiff_t stride,
+                                      const uint8_t* const top_row,
+                                      const uint8_t* const left_column,
+                                      const int width, const int height,
+                                      const int xstep, const int ystep) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  const int upsample_left_shift = static_cast<int>(upsampled_left);
+  const int upsample_top_shift = static_cast<int>(upsampled_top);
+  const __m128i max_shift = _mm_set1_epi8(32);
+  const ptrdiff_t stride4 = stride << 2;
+  const __m128i dest_index_x = _mm_set_epi32(0, 0, 0x00030002, 0x00010000);
+  const __m128i sampler_top =
+      upsampled_top
+          ?
+          _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+          : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+  // All columns from |min_top_only_x| to the right will only need |top_row|
+  // to compute.
+  assert(xstep >= 3);
+  const int min_top_only_x = std::min((height * xstep) >> 6, width);
+
+  const int xstep4 = xstep << 2;
+  const __m128i xstep4_vect = _mm_set1_epi16(xstep4);
+  const __m128i xstep_dup = _mm_set1_epi16(-xstep);
+  const __m128i increments = _mm_set_epi32(0, 0, 0x00040003, 0x00020001);
+  __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments);
+  const __m128i scaled_one = _mm_set1_epi16(-64);
+  // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1
+  __m128i xstep_bounds_base =
+      (xstep == 64) ? _mm_sub_epi16(scaled_one, xstep_for_shift)
+                    : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift);
+
+  const int left_base_increment = ystep >> 6;
+  const int ystep_remainder = ystep & 0x3F;
+  const int ystep4 = ystep << 2;
+  const int left_base_increment4 = ystep4 >> 6;
+  // This is guaranteed to be less than 64, but accumulation may bring it past
+  // 64 for higher x values.
+  const int ystep_remainder4 = ystep4 & 0x3F;
+  const __m128i increment_left4 = _mm_set1_epi16(-ystep_remainder4);
+  const __m128i increment_top4 = _mm_set1_epi16(4 << 6);
+
+  // If the 64 scaling is regarded as a decimal point, the first value of the
+  // left_y vector omits the portion which will go into the left_column
+  // offset. Following values need the full ystep as a relative offset.
+  const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder);
+  const __m128i ystep_dup = _mm_set1_epi16(-ystep);
+  __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x);
+  left_y = _mm_add_epi16(ystep_init, left_y);
+  const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+
+  int x = 0;
+  // Loop over x for columns with a mixture of sources.
+  for (int left_offset = -left_base_increment; x < min_top_only_x; x += 4,
+       xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top4),
+       left_y = _mm_add_epi16(left_y, increment_left4),
+       left_offset -= left_base_increment4) {
+    uint8_t* dst_x = dst + x;
+
+    // Round down to the nearest multiple of 8.
+    const int max_top_only_y = std::min((x << 6) / xstep, height) & 0xFFFFFFF4;
+    DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
+                         max_top_only_y, -xstep, upsampled_top);
+    int y = max_top_only_y;
+    dst_x += stride * y;
+    const int xstep_y = xstep * y;
+    const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
+    // All rows from |min_left_only_y| down for this set of columns, only need
+    // |left_column| to compute. Rounded up to the nearest multiple of 4.
+    const int min_left_only_y = std::min(((x + 4) << 6) / xstep, height);
+
+    __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
+    __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
+    int top_x = -xstep_y;
+
+    // Loop over y for mixed rows.
+    for (; y < min_left_only_y;
+         y += 4, dst_x += stride4,
+         xstep_bounds = _mm_add_epi16(xstep_bounds, xstep4_vect),
+         xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep4_vect),
+         top_x -= xstep4) {
+      DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>(
+          dst_x, stride,
+          left_column + ((left_offset + y) * (1 << upsample_left_shift)),
+          left_y);
+
+      __m128i shifts = _mm_srli_epi16(
+          _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
+                        shift_mask),
+          1);
+      shifts = _mm_packus_epi16(shifts, shifts);
+      const __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
+      shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+      const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
+      DirectionalZone1Blend_4x4<upsampled_top>(
+          dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
+          xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
+    }
+    // Loop over y for left-only rows, if any.
+    for (; y < height; y += 4, dst_x += stride4) {
+      DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>(
+          dst_x, stride,
+          left_column + ((left_offset + y) << upsample_left_shift), left_y);
+    }
+  }
+  // Loop over top-only columns, if any.
+  for (; x < width; x += 4) {
+    DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift),
+                         height, -xstep, upsampled_top);
+  }
+}
+
+void DirectionalIntraPredictorZone2_SSE4_1(void* const dest, ptrdiff_t stride,
+                                           const void* const top_row,
+                                           const void* const left_column,
+                                           const int width, const int height,
+                                           const int xstep, const int ystep,
+                                           const bool upsampled_top,
+                                           const bool upsampled_left) {
+  // Increasing the negative buffer for this function allows more rows to be
+  // processed at a time without branching in an inner loop to check the base.
+  uint8_t top_buffer[288];
+  uint8_t left_buffer[288];
+  memcpy(top_buffer + 128, static_cast<const uint8_t*>(top_row) - 16, 160);
+  memcpy(left_buffer + 128, static_cast<const uint8_t*>(left_column) - 16,
+         160);
+  const uint8_t* top_ptr = top_buffer + 144;
+  const uint8_t* left_ptr = left_buffer + 144;
+  if (width == 4 || height == 4) {
+    if (upsampled_left) {
+      if (upsampled_top) {
+        DirectionalZone2_4_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr,
+                                              width, height, xstep, ystep);
+      } else {
+        DirectionalZone2_4_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr,
+                                               width, height, xstep, ystep);
+      }
+    } else {
+      if (upsampled_top) {
+        DirectionalZone2_4_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr,
+                                               width, height, xstep, ystep);
+      } else {
+        DirectionalZone2_4_SSE4_1<false, false>(dest, stride, top_ptr,
+                                                left_ptr, width, height, xstep,
+                                                ystep);
+      }
+    }
+    return;
+  }
+  if (upsampled_left) {
+    if (upsampled_top) {
+      DirectionalZone2_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr,
+                                          width, height, xstep, ystep);
+    } else {
+      DirectionalZone2_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr,
+                                           width, height, xstep, ystep);
+    }
+  } else {
+    if (upsampled_top) {
+      DirectionalZone2_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr,
+                                           width, height, xstep, ystep);
+    } else {
+      DirectionalZone2_SSE4_1<false, false>(dest, stride, top_ptr, left_ptr,
+                                            width, height, xstep, ystep);
+    }
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  static_cast<void>(dsp);
+#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone1)
+  dsp->directional_intra_predictor_zone1 =
+      DirectionalIntraPredictorZone1_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone2)
+  dsp->directional_intra_predictor_zone2 =
+      DirectionalIntraPredictorZone2_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone3)
+  dsp->directional_intra_predictor_zone3 =
+      DirectionalIntraPredictorZone3_SSE4_1;
+#endif
+}
+
+}  // namespace
+}  // namespace low_bitdepth
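+
+// Note on the high-bitdepth section below (an illustrative summary, not from
+// the upstream sources): 10-bit pixels no longer fit the byte-wise
+// _mm_maddubs_epi16 trick used above, so the 10bpp paths load 16-bit pixels,
+// apply the (32 - shift, shift) weights with _mm_mullo_epi16, and combine the
+// products with _mm_hadd_epi16 before the same rounded shift by 5.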
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+//------------------------------------------------------------------------------
+// 7.11.2.4. Directional intra prediction process
+
+// Special case: An |xstep| of 64 corresponds to an angle delta of 45, meaning
+// upsampling is ruled out. In addition, the bits masked by 0x3F for
+// |shift_val| are 0 for all multiples of 64, so the formula
+// val = top[top_base_x]*shift + top[top_base_x+1]*(32-shift), reduces to
+// val = top[top_base_x+1] << 5, meaning only the second set of pixels is
+// involved in the output. Hence |top| is offset by 1.
+inline void DirectionalZone1_Step64(uint16_t* dst, ptrdiff_t stride,
+                                    const uint16_t* const top, const int width,
+                                    const int height) {
+  ptrdiff_t offset = 1;
+  if (height == 4) {
+    memcpy(dst, top + offset, width * sizeof(dst[0]));
+    dst += stride;
+    memcpy(dst, top + offset + 1, width * sizeof(dst[0]));
+    dst += stride;
+    memcpy(dst, top + offset + 2, width * sizeof(dst[0]));
+    dst += stride;
+    memcpy(dst, top + offset + 3, width * sizeof(dst[0]));
+    return;
+  }
+  int y = height;
+  do {
+    memcpy(dst, top + offset, width * sizeof(dst[0]));
+    dst += stride;
+    memcpy(dst, top + offset + 1, width * sizeof(dst[0]));
+    dst += stride;
+    memcpy(dst, top + offset + 2, width * sizeof(dst[0]));
+    dst += stride;
+    memcpy(dst, top + offset + 3, width * sizeof(dst[0]));
+    dst += stride;
+    memcpy(dst, top + offset + 4, width * sizeof(dst[0]));
+    dst += stride;
+    memcpy(dst, top + offset + 5, width * sizeof(dst[0]));
+    dst += stride;
+    memcpy(dst, top + offset + 6, width * sizeof(dst[0]));
+    dst += stride;
+    memcpy(dst, top + offset + 7, width * sizeof(dst[0]));
+    dst += stride;
+
+    offset += 8;
+    y -= 8;
+  } while (y != 0);
+}
+
+// Produce a weighted average whose weights sum to 32.
+inline __m128i CombineTopVals4(const __m128i& top_vals, const __m128i& sampler,
+                               const __m128i& shifts,
+                               const __m128i& top_indices,
+                               const __m128i& final_top_val,
+                               const __m128i& border_index) {
+  const __m128i sampled_values = _mm_shuffle_epi8(top_vals, sampler);
+  __m128i prod = _mm_mullo_epi16(sampled_values, shifts);
+  prod = _mm_hadd_epi16(prod, prod);
+  const __m128i result = RightShiftWithRounding_U16(prod, 5 /*log2(32)*/);
+
+  const __m128i past_max = _mm_cmpgt_epi16(top_indices, border_index);
+  // Replace pixels from invalid range with top-right corner.
+  return _mm_blendv_epi8(result, final_top_val, past_max);
+}
+
+// When width is 4, only one load operation is needed per iteration. We also
+// avoid extra loop precomputations that cause too much overhead.
+inline void DirectionalZone1_4xH(uint16_t* dst, ptrdiff_t stride,
+                                 const uint16_t* const top, const int height,
+                                 const int xstep, const bool upsampled,
+                                 const __m128i& sampler) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int index_scale_bits = 6 - upsample_shift;
+  const int max_base_x = (height + 3 /* width - 1 */) << upsample_shift;
+  const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+  const __m128i final_top_val = _mm_set1_epi16(top[max_base_x]);
+
+  // Each 16-bit value here corresponds to a position that may exceed
+  // |max_base_x|. When added to the top_base_x, it is used to mask values
+  // that pass the end of |top|. Starting from 1 to simulate "cmpge" because
+  // only cmpgt is available.
+  const __m128i offsets =
+      _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+  // All rows from |min_corner_only_y| down will simply use memcpy.
+  // |max_base_x| is always greater than |height|, so clipping the denominator
+  // to 1 is enough to make the logic work.
+  const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+  const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+  int y = 0;
+  int top_x = xstep;
+  const __m128i max_shift = _mm_set1_epi16(32);
+
+  for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+    const int top_base_x = top_x >> index_scale_bits;
+
+    // Permit negative values of |top_x|.
+    const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const __m128i shift = _mm_set1_epi16(shift_val);
+    const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
+    const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
+    __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+    top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+
+    // Load 8 values because we will select the sampled values based on
+    // |upsampled|.
+    const __m128i values = LoadUnaligned16(top + top_base_x);
+    const __m128i pred =
+        CombineTopVals4(values, sampler, shifts, top_index_vect, final_top_val,
+                        max_base_x_vect);
+    StoreLo8(dst, pred);
+  }
+
+  // Fill in corner-only rows.
+  for (; y < height; ++y) {
+    Memset(dst, top[max_base_x], /* width */ 4);
+    dst += stride;
+  }
+}
+
+// General purpose combine function.
+// |check_border| means the final source value has to be duplicated into the
+// result. This simplifies the loop structures that use precomputed boundaries
+// to identify sections where it is safe to compute without checking for the
+// right border.
+template <bool check_border>
+inline __m128i CombineTopVals(
+    const __m128i& top_vals_0, const __m128i& top_vals_1,
+    const __m128i& sampler, const __m128i& shifts,
+    const __m128i& top_indices = _mm_setzero_si128(),
+    const __m128i& final_top_val = _mm_setzero_si128(),
+    const __m128i& border_index = _mm_setzero_si128()) {
+  constexpr int scale_int_bits = 5;
+  const __m128i sampled_values_0 = _mm_shuffle_epi8(top_vals_0, sampler);
+  const __m128i sampled_values_1 = _mm_shuffle_epi8(top_vals_1, sampler);
+  const __m128i prod_0 = _mm_mullo_epi16(sampled_values_0, shifts);
+  const __m128i prod_1 = _mm_mullo_epi16(sampled_values_1, shifts);
+  const __m128i combined = _mm_hadd_epi16(prod_0, prod_1);
+  const __m128i result = RightShiftWithRounding_U16(combined, scale_int_bits);
+  if (check_border) {
+    const __m128i past_max = _mm_cmpgt_epi16(top_indices, border_index);
+    // Replace pixels from invalid range with top-right corner.
+    return _mm_blendv_epi8(result, final_top_val, past_max);
+  }
+  return result;
+}
+
+// 7.11.2.4 (7) angle < 90
+inline void DirectionalZone1_Large(uint16_t* dest, ptrdiff_t stride,
+                                   const uint16_t* const top_row,
+                                   const int width, const int height,
+                                   const int xstep, const bool upsampled,
+                                   const __m128i& sampler) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int index_scale_bits = 6 - upsample_shift;
+  const int max_base_x = ((width + height) - 1) << upsample_shift;
+
+  const __m128i max_shift = _mm_set1_epi16(32);
+  const int base_step = 1 << upsample_shift;
+  const int base_step8 = base_step << 3;
+
+  // All rows from |min_corner_only_y| down will simply use memcpy.
+  // |max_base_x| is always greater than |height|, so clipping to 1 is enough
+  // to make the logic work.
+  const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+  const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+  // Rows up to this y-value can be computed without checking for bounds.
+  const int max_no_corner_y = std::min(
+      LeftShift((max_base_x - (base_step * width)), index_scale_bits) / xstep,
+      height);
+  // No need to check for exceeding |max_base_x| in the first loop.
+  int y = 0;
+  int top_x = xstep;
+  for (; y < max_no_corner_y; ++y, dest += stride, top_x += xstep) {
+    int top_base_x = top_x >> index_scale_bits;
+    // Permit negative values of |top_x|.
+    const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const __m128i shift = _mm_set1_epi16(shift_val);
+    const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
+    const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
+    int x = 0;
+    do {
+      const __m128i top_vals_0 = LoadUnaligned16(top_row + top_base_x);
+      const __m128i top_vals_1 =
+          LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift));
+
+      const __m128i pred =
+          CombineTopVals<false>(top_vals_0, top_vals_1, sampler, shifts);
+
+      StoreUnaligned16(dest + x, pred);
+      top_base_x += base_step8;
+      x += 8;
+    } while (x < width);
+  }
+
+  // Each 16-bit value here corresponds to a position that may exceed
+  // |max_base_x|. When added to |top_base_x|, it is used to mask values
+  // that pass the end of the |top| buffer. Starting from 1 to simulate "cmpge"
+  // which is not supported for packed integers.
+  const __m128i offsets =
+      _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+  const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+  const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
+  const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
+  for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) {
+    int top_base_x = top_x >> index_scale_bits;
+
+    const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const __m128i shift = _mm_set1_epi16(shift_val);
+    const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
+    const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
+    __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+    top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+
+    int x = 0;
+    const int min_corner_only_x =
+        std::min(width, ((max_base_x - top_base_x) >> upsample_shift) + 7) & ~7;
+    for (; x < min_corner_only_x;
+         x += 8, top_base_x += base_step8,
+         top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
+      const __m128i top_vals_0 = LoadUnaligned16(top_row + top_base_x);
+      const __m128i top_vals_1 =
+          LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift));
+      const __m128i pred =
+          CombineTopVals<true>(top_vals_0, top_vals_1, sampler, shifts,
+                               top_index_vect, final_top_val, max_base_x_vect);
+      StoreUnaligned16(dest + x, pred);
+    }
+    // Corner-only section of the row.
+    Memset(dest + x, top_row[max_base_x], width - x);
+  }
+  // Fill in corner-only rows.
+  for (; y < height; ++y) {
+    Memset(dest, top_row[max_base_x], width);
+    dest += stride;
+  }
+}
+
+// 7.11.2.4 (7) angle < 90
+inline void DirectionalIntraPredictorZone1_SSE4_1(
+    void* dest_ptr, ptrdiff_t stride, const void* const top_ptr,
+    const int width, const int height, const int xstep, const bool upsampled) {
+  const auto* const top_row = static_cast<const uint16_t*>(top_ptr);
+  auto* dest = static_cast<uint16_t*>(dest_ptr);
+  stride /= sizeof(uint16_t);
+  const int upsample_shift = static_cast<int>(upsampled);
+  if (xstep == 64) {
+    DirectionalZone1_Step64(dest, stride, top_row, width, height);
+    return;
+  }
+  // Each base pixel paired with its following pixel, for hadd purposes.
+  const __m128i adjacency_shuffler = _mm_set_epi16(
+      0x0908, 0x0706, 0x0706, 0x0504, 0x0504, 0x0302, 0x0302, 0x0100);
+  // This is equivalent to not shuffling at all.
+  const __m128i identity_shuffler = _mm_set_epi16(
+      0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
+  // This represents a trade-off between code size and speed. When upsampled
+  // is true, no shuffle is necessary. But to avoid in-loop branching, we
+  // would need 2 copies of the main function body.
+  const __m128i sampler = upsampled ? identity_shuffler : adjacency_shuffler;
+  if (width == 4) {
+    DirectionalZone1_4xH(dest, stride, top_row, height, xstep, upsampled,
+                         sampler);
+    return;
+  }
+  if (width >= 32) {
+    DirectionalZone1_Large(dest, stride, top_row, width, height, xstep,
+                           upsampled, sampler);
+    return;
+  }
+  const int index_scale_bits = 6 - upsample_shift;
+  const int max_base_x = ((width + height) - 1) << upsample_shift;
+
+  const __m128i max_shift = _mm_set1_epi16(32);
+  const int base_step = 1 << upsample_shift;
+  const int base_step8 = base_step << 3;
+
+  // No need to check for exceeding |max_base_x| in the loops.
+  if (((xstep * height) >> index_scale_bits) + base_step * width <
+      max_base_x) {
+    int top_x = xstep;
+    int y = height;
+    do {
+      int top_base_x = top_x >> index_scale_bits;
+      // Permit negative values of |top_x|.
+      const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+      const __m128i shift = _mm_set1_epi16(shift_val);
+      const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
+      const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
+      int x = 0;
+      do {
+        const __m128i top_vals_0 = LoadUnaligned16(top_row + top_base_x);
+        const __m128i top_vals_1 =
+            LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift));
+        const __m128i pred =
+            CombineTopVals<false>(top_vals_0, top_vals_1, sampler, shifts);
+        StoreUnaligned16(dest + x, pred);
+        top_base_x += base_step8;
+        x += 8;
+      } while (x < width);
+      dest += stride;
+      top_x += xstep;
+    } while (--y != 0);
+    return;
+  }
+
+  // General case. Blocks with width less than 32 do not benefit from x-wise
+  // loop splitting, but do benefit from using memset on appropriate rows.
+
+  // Each 16-bit value here corresponds to a position that may exceed
+  // |max_base_x|. When added to the top_base_x, it is used to mask values
+  // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
+  // not supported for packed integers.
+  const __m128i offsets =
+      _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+  const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+  const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
+  const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
+
+  // All rows from |min_corner_only_y| down will simply use memcpy.
+  // |max_base_x| is always greater than |height|, so clipping the denominator
+  // to 1 is enough to make the logic work.
+  const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+  const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+  int top_x = xstep;
+  int y = 0;
+  for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) {
+    int top_base_x = top_x >> index_scale_bits;
+
+    const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const __m128i shift = _mm_set1_epi16(shift_val);
+    const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
+    const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
+    __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+    top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+
+    for (int x = 0; x < width; x += 8, top_base_x += base_step8,
+             top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
+      const __m128i top_vals_0 = LoadUnaligned16(top_row + top_base_x);
+      const __m128i top_vals_1 =
+          LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift));
+      const __m128i pred =
+          CombineTopVals<true>(top_vals_0, top_vals_1, sampler, shifts,
+                               top_index_vect, final_top_val, max_base_x_vect);
+      StoreUnaligned16(dest + x, pred);
+    }
+  }
+
+  // Fill in corner-only rows.
+  for (; y < height; ++y) {
+    Memset(dest, top_row[max_base_x], width);
+    dest += stride;
+  }
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+  static_cast<void>(dsp);
+#if DSP_ENABLED_10BPP_SSE4_1(DirectionalIntraPredictorZone1)
+  dsp->directional_intra_predictor_zone1 =
+      DirectionalIntraPredictorZone1_SSE4_1;
+#endif
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredDirectionalInit_SSE4_1() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredDirectionalInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/intrapred_directional_sse4.h b/src/dsp/x86/intrapred_directional_sse4.h
new file mode 100644
index 0000000..b352450
--- /dev/null
+++ b/src/dsp/x86/intrapred_directional_sse4.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_DIRECTIONAL_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_INTRAPRED_DIRECTIONAL_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::directional_intra_predictor_zone*, see the defines below
+// for specifics. These functions are not thread-safe.
+void IntraPredDirectionalInit_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the sse4 implementation should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1
+#define LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_INTRAPRED_DIRECTIONAL_SSE4_H_
diff --git a/src/dsp/x86/intrapred_filter_sse4.cc b/src/dsp/x86/intrapred_filter_sse4.cc
new file mode 100644
index 0000000..a43a5cf
--- /dev/null
+++ b/src/dsp/x86/intrapred_filter_sse4.cc
@@ -0,0 +1,433 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_filter.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+//------------------------------------------------------------------------------
+// FilterIntraPredictor_SSE4_1
+// Section 7.11.2.3. Recursive intra prediction process
+// This filter applies recursively to 4x2 sub-blocks within the transform
+// block, meaning that the predicted pixels in each sub-block are used as
+// inputs to sub-blocks below and to the right, if present.
+//
+// Each output value in the sub-block is predicted by a different filter
+// applied to the same array of top-left, top, and left values. If fn refers
+// to the output of the nth filter, given this block:
+// TL T0 T1 T2 T3
+// L0 f0 f1 f2 f3
+// L1 f4 f5 f6 f7
+// The filter input order is p0, p1, p2, p3, p4, p5, p6:
+// p0 p1 p2 p3 p4
+// p5 f0 f1 f2 f3
+// p6 f4 f5 f6 f7
+// Filters usually apply to 8 values for convenience, so in this case we fix
+// the 8th filter tap to 0 and disregard the value of the 8th input.
+
+// This shuffle mask selects 32-bit blocks in the order 0, 1, 0, 1, which
+// duplicates the first 8 bytes of a 128-bit vector into the second 8 bytes.
+constexpr int kDuplicateFirstHalf = 0x44;
+
+// Apply all filter taps to the given 7 packed 16-bit values, keeping the 8th
+// at zero to preserve the sum.
+// |pixels| contains p0-p7 in order as shown above.
+// |taps_0_1| contains the filter kernels used to predict f0 and f1, and so on.
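+// As a rough scalar sketch (editorial, not from the original source) of what
+// the intrinsics below compute, output f0 is
+//   f0 = Clip(RightShiftWithRounding(taps[0] * p0 + taps[1] * p1 + ... +
+//                                    taps[6] * p6, 4))
+// where the final packing to uint8_t provides the clip.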
+inline void Filter4x2_SSE4_1(uint8_t* LIBGAV1_RESTRICT dst,
+                             const ptrdiff_t stride, const __m128i& pixels,
+                             const __m128i& taps_0_1, const __m128i& taps_2_3,
+                             const __m128i& taps_4_5,
+                             const __m128i& taps_6_7) {
+  const __m128i mul_0_01 = _mm_maddubs_epi16(pixels, taps_0_1);
+  const __m128i mul_0_23 = _mm_maddubs_epi16(pixels, taps_2_3);
+  // |output_half| contains 8 partial sums for f0-f7.
+  __m128i output_half = _mm_hadd_epi16(mul_0_01, mul_0_23);
+  __m128i output = _mm_hadd_epi16(output_half, output_half);
+  const __m128i output_row0 =
+      _mm_packus_epi16(RightShiftWithRounding_S16(output, 4),
+                       /* unused half */ output);
+  Store4(dst, output_row0);
+  const __m128i mul_1_01 = _mm_maddubs_epi16(pixels, taps_4_5);
+  const __m128i mul_1_23 = _mm_maddubs_epi16(pixels, taps_6_7);
+  output_half = _mm_hadd_epi16(mul_1_01, mul_1_23);
+  output = _mm_hadd_epi16(output_half, output_half);
+  const __m128i output_row1 =
+      _mm_packus_epi16(RightShiftWithRounding_S16(output, 4),
+                       /* arbitrary pack arg */ output);
+  Store4(dst + stride, output_row1);
+}
+
+// 4xH transform sizes are given special treatment because LoadLo8 goes out
+// of bounds and every block involves the left column. The top-left pixel, p0,
+// is stored in the top buffer for the first 4x2, but comes from the left
+// buffer for successive blocks. This implementation takes advantage of the
+// fact that the p5 and p6 for each sub-block come solely from the |left_ptr|
+// buffer, using shifts to arrange things to fit reusable shuffle vectors.
+inline void Filter4xH(uint8_t* LIBGAV1_RESTRICT dest, ptrdiff_t stride,
+                      const uint8_t* LIBGAV1_RESTRICT const top_ptr,
+                      const uint8_t* LIBGAV1_RESTRICT const left_ptr,
+                      FilterIntraPredictor pred, const int height) {
+  // Two filter kernels per vector.
+  const __m128i taps_0_1 = LoadAligned16(kFilterIntraTaps[pred][0]);
+  const __m128i taps_2_3 = LoadAligned16(kFilterIntraTaps[pred][2]);
+  const __m128i taps_4_5 = LoadAligned16(kFilterIntraTaps[pred][4]);
+  const __m128i taps_6_7 = LoadAligned16(kFilterIntraTaps[pred][6]);
+  __m128i top = Load4(top_ptr - 1);
+  __m128i pixels = _mm_insert_epi8(top, top_ptr[3], 4);
+  __m128i left = (height == 4 ? Load4(left_ptr) : LoadLo8(left_ptr));
+  left = _mm_slli_si128(left, 5);
+
+  // Relative pixels: top[-1], top[0], top[1], top[2], top[3], left[0],
+  // left[1], left[2], left[3], left[4], left[5], left[6], left[7]
+  // 0  1  2  3  4  5  6  7  8  9  10 11 12 13 14 15
+  // p0 p1 p2 p3 p4 p5 p6 r5 r6 ...
+  //                r0
+  pixels = _mm_or_si128(left, pixels);
+
+  // Two sets of the same input pixels to apply two filters at once.
+  pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+  Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                   taps_6_7);
+  dest += stride;  // Move to y = 1.
+  pixels = Load4(dest);
+
+  // Relative pixels: top[0], top[1], top[2], top[3], empty, left[-2],
+  // left[-1], left[0], left[1], ...
+  // 0  1  2  3  4  5  6  7  8  9  10 11 12 13 14 15
+  // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
+  //                   r0
+  pixels = _mm_or_si128(left, pixels);
+
+  // This mask rearranges bytes in the order: 6, 0, 1, 2, 3, 7, 8, 15. The
+  // last byte is an unused value, which shall be multiplied by 0 when we
+  // apply the filter.
+  constexpr int64_t kInsertTopLeftFirstMask = 0x0F08070302010006;
+
+  // Insert left[-1] in front as TL and put left[0] and left[1] at the end.
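+  // _mm_set1_epi64x() repeats the mask in both 64-bit halves, so the shuffle
+  // writes the same 8-byte arrangement to both halves of |pixels|, matching
+  // the duplicated layout that Filter4x2_SSE4_1 expects.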
+  const __m128i pixel_order1 = _mm_set1_epi64x(kInsertTopLeftFirstMask);
+  pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+  dest += stride;  // Move to y = 2.
+  Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                   taps_6_7);
+  dest += stride;  // Move to y = 3.
+
+  // Compute the middle 8 rows before using common code for the final 4 rows,
+  // in order to fit the assumption that |left| has the next TL at position 8.
+  if (height == 16) {
+    // This shift allows us to use pixel_order2 twice after shifting by 2
+    // later.
+    left = _mm_slli_si128(left, 1);
+    pixels = Load4(dest);
+
+    // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
+    // left[-4], left[-3], left[-2], left[-1], left[0], left[1], left[2],
+    // left[3]
+    // 0  1  2  3  4  5  6  7  8  9  10 11 12 13 14 15
+    // p1 p2 p3 p4 xx xx xx xx xx p0 p5 p6 r5 r6 ...
+    //                            r0
+    pixels = _mm_or_si128(left, pixels);
+
+    // This mask rearranges bytes in the order: 9, 0, 1, 2, 3, 7, 8, 15. The
+    // last byte is an unused value, as above. The top-left was shifted to
+    // position nine to keep two empty spaces after the top pixels.
+    constexpr int64_t kInsertTopLeftSecondMask = 0x0F0B0A0302010009;
+
+    // Insert (relative) left[-1] in front as TL and put left[0] and left[1]
+    // at the end.
+    const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftSecondMask);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+    dest += stride;  // Move to y = 4.
+
+    // First 4x2 in the if body.
+    Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                     taps_6_7);
+
+    // Clear all but final pixel in the first 8 of left column.
+    __m128i keep_top_left = _mm_srli_si128(left, 13);
+    dest += stride;  // Move to y = 5.
+    pixels = Load4(dest);
+    left = _mm_srli_si128(left, 2);
+
+    // Relative pixels: top[0], top[1], top[2], top[3], left[-6],
+    // left[-5], left[-4], left[-3], left[-2], left[-1], left[0], left[1]
+    // 0  1  2  3  4  5  6  7  8  9  10 11 12 13 14 15
+    // p1 p2 p3 p4 xx xx xx xx xx p0 p5 p6 r5 r6 ...
+    //                            r0
+    pixels = _mm_or_si128(left, pixels);
+    left = LoadLo8(left_ptr + 8);
+
+    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+    dest += stride;  // Move to y = 6.
+
+    // Second 4x2 in the if body.
+    Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                     taps_6_7);
+
+    // Position TL value so we can use pixel_order1.
+    keep_top_left = _mm_slli_si128(keep_top_left, 6);
+    dest += stride;  // Move to y = 7.
+    pixels = Load4(dest);
+    left = _mm_slli_si128(left, 7);
+    left = _mm_or_si128(left, keep_top_left);
+
+    // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
+    // left[-1], left[0], left[1], left[2], left[3], ...
+    // 0  1  2  3  4  5  6  7  8  9  10 11 12 13 14 15
+    // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
+    //                   r0
+    pixels = _mm_or_si128(left, pixels);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+    dest += stride;  // Move to y = 8.
+
+    // Third 4x2 in the if body.
+    Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                     taps_6_7);
+    dest += stride;  // Move to y = 9.
+
+    // Prepare final inputs.
+    pixels = Load4(dest);
+    left = _mm_srli_si128(left, 2);
+
+    // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
+    // left[-1], left[0], left[1], left[2], left[3], ...
+    // 0  1  2  3  4  5  6  7  8  9  10 11 12 13 14 15
+    // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
+    //                   r0
+    pixels = _mm_or_si128(left, pixels);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+    dest += stride;  // Move to y = 10.
+
+    // Fourth 4x2 in the if body.
+    Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                     taps_6_7);
+    dest += stride;  // Move to y = 11.
+  }
+
+  // In both the 8 and 16 case at this point, we can assume that |left| has
+  // the next TL at position 8.
+  if (height > 4) {
+    // Erase prior left pixels by shifting TL to position 0.
+    left = _mm_srli_si128(left, 8);
+    left = _mm_slli_si128(left, 6);
+    pixels = Load4(dest);
+
+    // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
+    // left[-1], left[0], left[1], left[2], left[3], ...
+    // 0  1  2  3  4  5  6  7  8  9  10 11 12 13 14 15
+    // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
+    //                   r0
+    pixels = _mm_or_si128(left, pixels);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+    dest += stride;  // Move to y = 12 or 4.
+
+    // First of final two 4x2 blocks.
+    Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                     taps_6_7);
+    dest += stride;  // Move to y = 13 or 5.
+    pixels = Load4(dest);
+    left = _mm_srli_si128(left, 2);
+
+    // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
+    // left[-1], left[0], left[1], left[2], left[3], ...
+    // 0  1  2  3  4  5  6  7  8  9  10 11 12 13 14 15
+    // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
+    //                   r0
+    pixels = _mm_or_si128(left, pixels);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+    dest += stride;  // Move to y = 14 or 6.
+
+    // Last of final two 4x2 blocks.
+    Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                     taps_6_7);
+  }
+}
+
+void FilterIntraPredictor_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                                 ptrdiff_t stride,
+                                 const void* LIBGAV1_RESTRICT const top_row,
+                                 const void* LIBGAV1_RESTRICT const left_column,
+                                 FilterIntraPredictor pred, const int width,
+                                 const int height) {
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  if (width == 4) {
+    Filter4xH(dst, stride, top_ptr, left_ptr, pred, height);
+    return;
+  }
+
+  // There is one set of 7 taps for each of the 4x2 output pixels.
+  const __m128i taps_0_1 = LoadAligned16(kFilterIntraTaps[pred][0]);
+  const __m128i taps_2_3 = LoadAligned16(kFilterIntraTaps[pred][2]);
+  const __m128i taps_4_5 = LoadAligned16(kFilterIntraTaps[pred][4]);
+  const __m128i taps_6_7 = LoadAligned16(kFilterIntraTaps[pred][6]);
+
+  // This mask rearranges bytes in the order: 0, 1, 2, 3, 4, 8, 9, 15. The 15
+  // at the end is an unused value, which shall be multiplied by 0 when we
+  // apply the filter.
+  constexpr int64_t kCondenseLeftMask = 0x0F09080403020100;
+
+  // Takes the "left section" and puts it right after p0-p4.
+  const __m128i pixel_order1 = _mm_set1_epi64x(kCondenseLeftMask);
+
+  // This mask rearranges bytes in the order: 8, 0, 1, 2, 3, 9, 10, 15. The
+  // last byte is unused as above.
+  constexpr int64_t kInsertTopLeftMask = 0x0F0A090302010008;
+
+  // Shuffles the "top left" from the left section, to the front. Used when
+  // grabbing data from left_column and not top_row.
+  const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftMask);
+
+  // This first pass takes care of the cases where the top left pixel comes
+  // from top_row.
+  __m128i pixels = LoadLo8(top_ptr - 1);
+  __m128i left = _mm_slli_si128(Load4(left_column), 8);
+  pixels = _mm_or_si128(pixels, left);
+
+  // Two sets of the same pixels to multiply with two sets of taps.
+  pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+  Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                   taps_6_7);
+  left = _mm_srli_si128(left, 1);
+
+  // Load the row written above; it provides the top pixels for the next 4x2.
+  pixels = Load4(dst + stride);
+
+  // Because of the above shift, this OR 'invades' the final of the first 8
+  // bytes of |pixels|. This is acceptable because the 8th filter tap is
+  // always a padded 0.
+  pixels = _mm_or_si128(pixels, left);
+  pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+  const ptrdiff_t stride2 = stride << 1;
+  const ptrdiff_t stride4 = stride << 2;
+  Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
+                   taps_4_5, taps_6_7);
+  dst += 4;
+  for (int x = 3; x < width - 4; x += 4) {
+    pixels = Load4(top_ptr + x);
+    pixels = _mm_insert_epi8(pixels, top_ptr[x + 4], 4);
+    pixels = _mm_insert_epi8(pixels, dst[-1], 5);
+    pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6);
+
+    // Duplicate bottom half into upper half.
+    pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+    Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                     taps_6_7);
+    pixels = Load4(dst + stride - 1);
+    pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4);
+    pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5);
+    pixels = _mm_insert_epi8(pixels, dst[stride + stride2 - 1], 6);
+
+    // Duplicate bottom half into upper half.
+    pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+    Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
+                     taps_4_5, taps_6_7);
+    dst += 4;
+  }
+
+  // Now we handle heights that reference previous blocks rather than
+  // top_row.
+  for (int y = 4; y < height; y += 4) {
+    // Leftmost 4x4 block for this height.
+    dst -= width;
+    dst += stride4;
+
+    // Top Left is not available by offset in these leftmost blocks.
+    pixels = Load4(dst - stride);
+    left = _mm_slli_si128(Load4(left_ptr + y - 1), 8);
+    left = _mm_insert_epi8(left, left_ptr[y + 3], 12);
+    pixels = _mm_or_si128(pixels, left);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+    Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                     taps_6_7);
+
+    // The bytes shifted into positions 6 and 7 will be ignored by the
+    // shuffle.
+    left = _mm_srli_si128(left, 2);
+    pixels = Load4(dst + stride);
+    pixels = _mm_or_si128(pixels, left);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+    Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
+                     taps_4_5, taps_6_7);
+
+    dst += 4;
+
+    // Remaining 4x4 blocks for this height.
+    for (int x = 4; x < width; x += 4) {
+      pixels = Load4(dst - stride - 1);
+      pixels = _mm_insert_epi8(pixels, dst[-stride + 3], 4);
+      pixels = _mm_insert_epi8(pixels, dst[-1], 5);
+      pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6);
+
+      // Duplicate bottom half into upper half.
+      pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+      Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                       taps_6_7);
+      pixels = Load4(dst + stride - 1);
+      pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4);
+      pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5);
+      pixels = _mm_insert_epi8(pixels, dst[stride2 + stride - 1], 6);
+
+      // Duplicate bottom half into upper half.
+      pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+      Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
+                       taps_4_5, taps_6_7);
+      dst += 4;
+    }
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  static_cast<void>(dsp);
+// These guards check if this version of the function was not superseded by
+// a higher optimization level, such as AVX. The corresponding #define also
+// prevents the C version from being added to the table.
+#if DSP_ENABLED_8BPP_SSE4_1(FilterIntraPredictor)
+  dsp->filter_intra_predictor = FilterIntraPredictor_SSE4_1;
+#endif
+}
+
+}  // namespace
+
+void IntraPredFilterInit_SSE4_1() { Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredFilterInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/intrapred_filter_sse4.h b/src/dsp/x86/intrapred_filter_sse4.h
new file mode 100644
index 0000000..ce28f93
--- /dev/null
+++ b/src/dsp/x86/intrapred_filter_sse4.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_FILTER_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_INTRAPRED_FILTER_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::filter_intra_predictor, see the defines below for
+// specifics. These functions are not thread-safe.
+void IntraPredFilterInit_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the sse4 implementation should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_FilterIntraPredictor
+#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_INTRAPRED_FILTER_SSE4_H_
diff --git a/src/dsp/x86/intrapred_smooth_sse4.cc b/src/dsp/x86/intrapred_smooth_sse4.cc
new file mode 100644
index 0000000..b53ee8c
--- /dev/null
+++ b/src/dsp/x86/intrapred_smooth_sse4.cc
@@ -0,0 +1,2687 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
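+
+// SMOOTH prediction (AV1 spec section 7.11.2.6) blends the top row and left
+// column toward the bottom-left and top-right corner pixels. As a scalar
+// sketch of the sums vectorized below (weights come from kSmoothWeights and
+// the scale is 256):
+//   pred[y][x] = RightShiftWithRounding(
+//       w[y] * top[x] + (256 - w[y]) * left[height - 1] +
+//       w[x] * left[y] + (256 - w[x]) * top[width - 1], 9);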
+ +#include "src/dsp/intrapred_smooth.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_TARGETING_SSE4_1 + +#include + +#include +#include +#include + +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/dsp/x86/common_sse4.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" + +namespace libgav1 { +namespace dsp { +namespace low_bitdepth { +namespace { + +// Note these constants are duplicated from intrapred.cc to allow the compiler +// to have visibility of the values. This helps reduce loads and in the +// creation of the inverse weights. +constexpr uint8_t kSmoothWeights[] = { +#include "src/dsp/smooth_weights.inc" +}; + +template +inline void WriteSmoothHorizontalSum4(void* LIBGAV1_RESTRICT const dest, + const __m128i& left, + const __m128i& weights, + const __m128i& scaled_top_right, + const __m128i& round) { + const __m128i left_y = _mm_shuffle_epi32(left, y_mask); + const __m128i weighted_left_y = _mm_mullo_epi16(left_y, weights); + const __m128i pred_sum = _mm_add_epi32(scaled_top_right, weighted_left_y); + // Equivalent to RightShiftWithRounding(pred[x][y], 8). + const __m128i pred = _mm_srli_epi32(_mm_add_epi32(pred_sum, round), 8); + const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400); + Store4(dest, _mm_shuffle_epi8(pred, cvtepi32_epi8)); +} + +// For SMOOTH_H, |pixels| is the repeated left value for the row. For SMOOTH_V, +// |pixels| is a segment of the top row or the whole top row, and |weights| is +// repeated. +inline __m128i SmoothDirectionalSum8(const __m128i& pixels, + const __m128i& weights, + const __m128i& scaled_corner) { + const __m128i weighted_px = _mm_mullo_epi16(pixels, weights); + return _mm_add_epi16(scaled_corner, weighted_px); +} + +inline void WriteSmoothDirectionalSum8(uint8_t* LIBGAV1_RESTRICT dest, + const __m128i& pixels, + const __m128i& weights, + const __m128i& scaled_corner, + const __m128i& round) { + const __m128i pred_sum = + SmoothDirectionalSum8(pixels, weights, scaled_corner); + // Equivalent to RightShiftWithRounding(pred[x][y], 8). + const __m128i pred = _mm_srli_epi16(_mm_add_epi16(pred_sum, round), 8); + StoreLo8(dest, _mm_packus_epi16(pred, pred)); +} + +// For Horizontal, pixels1 and pixels2 are the same repeated value. For +// Vertical, weights1 and weights2 are the same, and scaled_corner1 and +// scaled_corner2 are the same. +inline void WriteSmoothDirectionalSum16( + uint8_t* LIBGAV1_RESTRICT dest, const __m128i& pixels1, + const __m128i& pixels2, const __m128i& weights1, const __m128i& weights2, + const __m128i& scaled_corner1, const __m128i& scaled_corner2, + const __m128i& round) { + const __m128i weighted_px1 = _mm_mullo_epi16(pixels1, weights1); + const __m128i weighted_px2 = _mm_mullo_epi16(pixels2, weights2); + const __m128i pred_sum1 = _mm_add_epi16(scaled_corner1, weighted_px1); + const __m128i pred_sum2 = _mm_add_epi16(scaled_corner2, weighted_px2); + // Equivalent to RightShiftWithRounding(pred[x][y], 8). 
+  const __m128i pred1 = _mm_srli_epi16(_mm_add_epi16(pred_sum1, round), 8);
+  const __m128i pred2 = _mm_srli_epi16(_mm_add_epi16(pred_sum2, round), 8);
+  StoreUnaligned16(dest, _mm_packus_epi16(pred1, pred2));
+}
+
+template <int y_mask>
+inline void WriteSmoothPredSum4(uint8_t* LIBGAV1_RESTRICT const dest,
+                                const __m128i& top, const __m128i& left,
+                                const __m128i& weights_x,
+                                const __m128i& weights_y,
+                                const __m128i& scaled_bottom_left,
+                                const __m128i& scaled_top_right,
+                                const __m128i& round) {
+  const __m128i left_y = _mm_shuffle_epi32(left, y_mask);
+  const __m128i weighted_left_y = _mm_mullo_epi32(left_y, weights_x);
+  const __m128i weight_y = _mm_shuffle_epi32(weights_y, y_mask);
+  const __m128i weighted_top = _mm_mullo_epi32(weight_y, top);
+  const __m128i scaled_bottom_left_y =
+      _mm_shuffle_epi32(scaled_bottom_left, y_mask);
+  const __m128i col_pred =
+      _mm_add_epi32(scaled_bottom_left_y, weighted_left_y);
+  const __m128i row_pred = _mm_add_epi32(scaled_top_right, weighted_top);
+  const __m128i pred_sum = _mm_add_epi32(row_pred, col_pred);
+
+  // Equivalent to RightShiftWithRounding(pred[x][y], 9).
+  const __m128i pred = _mm_srli_epi32(_mm_add_epi32(pred_sum, round), 9);
+
+  const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
+  Store4(dest, _mm_shuffle_epi8(pred, cvtepi32_epi8));
+}
+
+// pixels[0]: above and below_pred interleave vector
+// pixels[1]: left vector
+// pixels[2]: right_pred vector
+inline void LoadSmoothPixels4(const uint8_t* LIBGAV1_RESTRICT above,
+                              const uint8_t* LIBGAV1_RESTRICT left,
+                              const int height, __m128i* pixels) {
+  if (height == 4) {
+    pixels[1] = Load4(left);
+  } else if (height == 8) {
+    pixels[1] = LoadLo8(left);
+  } else {
+    pixels[1] = LoadUnaligned16(left);
+  }
+
+  const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
+  const __m128i top = _mm_cvtepu8_epi16(Load4(above));
+  pixels[0] = _mm_unpacklo_epi16(top, bottom_left);
+  pixels[2] = _mm_set1_epi16(above[3]);
+}
+
+// weight_h[0]: weight_h vector
+// weight_h[1]: scale - weight_h vector
+// weight_h[2]: same as [0], second half for height = 16 only
+// weight_h[3]: same as [1], second half for height = 16 only
+// weight_w[0]: weights_w and scale - weights_w interleave vector
+inline void LoadSmoothWeights4(const uint8_t* LIBGAV1_RESTRICT weight_array,
+                               const int height, __m128i* weight_h,
+                               __m128i* weight_w) {
+  const __m128i scale = _mm_set1_epi16(256);
+  const __m128i x_weights = Load4(weight_array);
+  weight_h[0] = _mm_cvtepu8_epi16(x_weights);
+  weight_h[1] = _mm_sub_epi16(scale, weight_h[0]);
+  weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
+
+  if (height == 8) {
+    const __m128i y_weights = LoadLo8(weight_array + 4);
+    weight_h[0] = _mm_cvtepu8_epi16(y_weights);
+    weight_h[1] = _mm_sub_epi16(scale, weight_h[0]);
+  } else if (height == 16) {
+    const __m128i zero = _mm_setzero_si128();
+    const __m128i y_weights = LoadUnaligned16(weight_array + 12);
+    weight_h[0] = _mm_cvtepu8_epi16(y_weights);
+    weight_h[1] = _mm_sub_epi16(scale, weight_h[0]);
+    weight_h[2] = _mm_unpackhi_epi8(y_weights, zero);
+    weight_h[3] = _mm_sub_epi16(scale, weight_h[2]);
+  }
+}
+
+inline void WriteSmoothPred4x8(const __m128i* pixel, const __m128i* weights_y,
+                               const __m128i* weight_x,
+                               uint8_t* LIBGAV1_RESTRICT dst,
+                               const ptrdiff_t stride,
+                               const bool use_second_half) {
+  const __m128i round = _mm_set1_epi32(256);
+  const __m128i mask_increment = _mm_set1_epi16(0x0202);
+  const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i left = use_second_half
+                           ? _mm_unpackhi_epi8(pixel[1], zero)
+                           : _mm_unpacklo_epi8(pixel[1], zero);
+  __m128i y_select = _mm_set1_epi16(0x0100);
+
+  for (int i = 0; i < 8; ++i) {
+    const __m128i weight_y = _mm_shuffle_epi8(weights_y[0], y_select);
+    const __m128i inverted_weight_y = _mm_shuffle_epi8(weights_y[1], y_select);
+    const __m128i interleaved_weights =
+        _mm_unpacklo_epi16(weight_y, inverted_weight_y);
+    __m128i vertical_pred = _mm_madd_epi16(pixel[0], interleaved_weights);
+
+    __m128i horizontal_vect = _mm_shuffle_epi8(left, y_select);
+    horizontal_vect = _mm_unpacklo_epi16(horizontal_vect, pixel[2]);
+    __m128i sum = _mm_madd_epi16(horizontal_vect, weight_x[0]);
+
+    sum = _mm_add_epi32(vertical_pred, sum);
+    sum = _mm_add_epi32(sum, round);
+    sum = _mm_srai_epi32(sum, 9);
+
+    sum = _mm_shuffle_epi8(sum, cvtepi32_epi8);
+    Store4(dst, sum);
+    dst += stride;
+
+    y_select = _mm_add_epi16(y_select, mask_increment);
+  }
+}
+
+// The interleaving approach has some overhead that causes it to underperform
+// in the 4x4 case.
+void Smooth4x4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                      const ptrdiff_t stride,
+                      const void* LIBGAV1_RESTRICT top_row,
+                      const void* LIBGAV1_RESTRICT left_column) {
+  const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
+  const __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
+  const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
+  const __m128i scale = _mm_set1_epi32(256);
+  // Fourth short is top_row[3].
+  const __m128i top_right = _mm_shuffle_epi32(top, 0xFF);
+  // Fourth short is left_column[3].
+  const __m128i bottom_left = _mm_shuffle_epi32(left, 0xFF);
+  const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+  const __m128i scaled_top_right =
+      _mm_mullo_epi16(inverted_weights, top_right);
+  const __m128i scaled_bottom_left =
+      _mm_mullo_epi16(inverted_weights, bottom_left);
+  auto* dst = static_cast<uint8_t*>(dest);
+  // AV1 spec 7.11.2.6 (3) describes the sum:
+  // smoothPred[y][x:x+3] = weighted_top + scaled_right + weighted_left[y] +
+  //     scaled_bottom[y]
+  // This could be a loop, but for the immediate value in the shuffles.
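+  // The shuffle immediates 0, 0x55, 0xAA, and 0xFF broadcast 32-bit lanes 0
+  // through 3 respectively, selecting the |left| value and y-weight for rows
+  // 0 to 3.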
+  WriteSmoothPredSum4<0>(dst, top, left, weights, weights, scaled_bottom_left,
+                         scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothPredSum4<0x55>(dst, top, left, weights, weights,
+                            scaled_bottom_left, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothPredSum4<0xAA>(dst, top, left, weights, weights,
+                            scaled_bottom_left, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothPredSum4<0xFF>(dst, top, left, weights, weights,
+                            scaled_bottom_left, scaled_top_right, scale);
+}
+
+void Smooth4x8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                      const ptrdiff_t stride,
+                      const void* LIBGAV1_RESTRICT top_row,
+                      const void* LIBGAV1_RESTRICT left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  __m128i weights_x[1];
+  __m128i weights_y[2];
+  LoadSmoothWeights4(kSmoothWeights, 8, weights_y, weights_x);
+  __m128i pixels[3];
+  LoadSmoothPixels4(top_ptr, left_ptr, 8, pixels);
+  auto* dst = static_cast<uint8_t*>(dest);
+  WriteSmoothPred4x8(pixels, weights_y, weights_x, dst, stride, false);
+}
+
+void Smooth4x16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                       const ptrdiff_t stride,
+                       const void* LIBGAV1_RESTRICT top_row,
+                       const void* LIBGAV1_RESTRICT left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  __m128i weights_x[1];
+  __m128i weights_y[4];
+  LoadSmoothWeights4(kSmoothWeights, 16, weights_y, weights_x);
+  __m128i pixels[3];
+  LoadSmoothPixels4(top_ptr, left_ptr, 16, pixels);
+  auto* dst = static_cast<uint8_t*>(dest);
+  WriteSmoothPred4x8(pixels, weights_y, weights_x, dst, stride, false);
+  dst += stride << 3;
+  WriteSmoothPred4x8(pixels, &weights_y[2], weights_x, dst, stride, true);
+}
+
+// pixels[0]: above and below_pred interleave vector, first half
+// pixels[1]: above and below_pred interleave vector, second half
+// pixels[2]: left vector
+// pixels[3]: right_pred vector
+// pixels[4]: above and below_pred interleave vector, first half
+// pixels[5]: above and below_pred interleave vector, second half
+// pixels[6]: left vector + 16
+// pixels[7]: right_pred vector
+inline void LoadSmoothPixels8(const uint8_t* LIBGAV1_RESTRICT above,
+                              const uint8_t* LIBGAV1_RESTRICT left,
+                              const int height, __m128i* pixels) {
+  const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
+  __m128i top_row = _mm_cvtepu8_epi16(LoadLo8(above));
+  pixels[0] = _mm_unpacklo_epi16(top_row, bottom_left);
+  pixels[1] = _mm_unpackhi_epi16(top_row, bottom_left);
+
+  pixels[3] = _mm_set1_epi16(above[7]);
+
+  if (height == 4) {
+    pixels[2] = Load4(left);
+  } else if (height == 8) {
+    pixels[2] = LoadLo8(left);
+  } else if (height == 16) {
+    pixels[2] = LoadUnaligned16(left);
+  } else {
+    pixels[2] = LoadUnaligned16(left);
+    pixels[4] = pixels[0];
+    pixels[5] = pixels[1];
+    pixels[6] = LoadUnaligned16(left + 16);
+    pixels[7] = pixels[3];
+  }
+}
+
+// weight_h[0]: weight_h vector
+// weight_h[1]: scale - weight_h vector
+// weight_h[2]: same as [0], offset 8
+// weight_h[3]: same as [1], offset 8
+// weight_h[4]: same as [0], offset 16
+// weight_h[5]: same as [1], offset 16
+// weight_h[6]: same as [0], offset 24
+// weight_h[7]: same as [1], offset 24
+// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
+// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
+inline void LoadSmoothWeights8(const uint8_t* LIBGAV1_RESTRICT weight_array,
+                               const int height, __m128i* weight_w,
+                               __m128i* weight_h) {
+  const int offset = (height < 8) ? 0 : 4;
+  __m128i loaded_weights = LoadUnaligned16(&weight_array[offset]);
+  weight_h[0] = _mm_cvtepu8_epi16(loaded_weights);
+  const __m128i inverter = _mm_set1_epi16(256);
+  weight_h[1] = _mm_sub_epi16(inverter, weight_h[0]);
+
+  if (height == 4) {
+    loaded_weights = _mm_srli_si128(loaded_weights, 4);
+    __m128i weights_x = _mm_cvtepu8_epi16(loaded_weights);
+    __m128i inverted_weights_x = _mm_sub_epi16(inverter, weights_x);
+    weight_w[0] = _mm_unpacklo_epi16(weights_x, inverted_weights_x);
+    weight_w[1] = _mm_unpackhi_epi16(weights_x, inverted_weights_x);
+  } else {
+    weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
+    weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
+  }
+
+  if (height == 16) {
+    const __m128i zero = _mm_setzero_si128();
+    loaded_weights = LoadUnaligned16(weight_array + 12);
+    weight_h[0] = _mm_cvtepu8_epi16(loaded_weights);
+    weight_h[1] = _mm_sub_epi16(inverter, weight_h[0]);
+    weight_h[2] = _mm_unpackhi_epi8(loaded_weights, zero);
+    weight_h[3] = _mm_sub_epi16(inverter, weight_h[2]);
+  } else if (height == 32) {
+    const __m128i zero = _mm_setzero_si128();
+    const __m128i weight_lo = LoadUnaligned16(weight_array + 28);
+    weight_h[0] = _mm_cvtepu8_epi16(weight_lo);
+    weight_h[1] = _mm_sub_epi16(inverter, weight_h[0]);
+    weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
+    weight_h[3] = _mm_sub_epi16(inverter, weight_h[2]);
+    const __m128i weight_hi = LoadUnaligned16(weight_array + 44);
+    weight_h[4] = _mm_cvtepu8_epi16(weight_hi);
+    weight_h[5] = _mm_sub_epi16(inverter, weight_h[4]);
+    weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
+    weight_h[7] = _mm_sub_epi16(inverter, weight_h[6]);
+  }
+}
+
+inline void WriteSmoothPred8xH(const __m128i* pixels, const __m128i* weights_x,
+                               const __m128i* weights_y, const int height,
+                               uint8_t* LIBGAV1_RESTRICT dst,
+                               const ptrdiff_t stride,
+                               const bool use_second_half) {
+  const __m128i round = _mm_set1_epi32(256);
+  const __m128i mask_increment = _mm_set1_epi16(0x0202);
+  const __m128i cvt_epu16_epi8 = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
+
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i left = use_second_half
+                           ? _mm_unpackhi_epi8(pixels[2], zero)
+                           : _mm_unpacklo_epi8(pixels[2], zero);
+  __m128i y_select = _mm_set1_epi16(0x100);
+
+  for (int i = 0; i < height; ++i) {
+    const __m128i weight_y = _mm_shuffle_epi8(weights_y[0], y_select);
+    const __m128i inverted_weight_y = _mm_shuffle_epi8(weights_y[1], y_select);
+    const __m128i interleaved_weights =
+        _mm_unpacklo_epi16(weight_y, inverted_weight_y);
+    const __m128i vertical_sum0 =
+        _mm_madd_epi16(pixels[0], interleaved_weights);
+    const __m128i vertical_sum1 =
+        _mm_madd_epi16(pixels[1], interleaved_weights);
+
+    __m128i horizontal_pixels = _mm_shuffle_epi8(left, y_select);
+    horizontal_pixels = _mm_unpacklo_epi16(horizontal_pixels, pixels[3]);
+    const __m128i horizontal_sum0 =
+        _mm_madd_epi16(horizontal_pixels, weights_x[0]);
+    const __m128i horizontal_sum1 =
+        _mm_madd_epi16(horizontal_pixels, weights_x[1]);
+
+    __m128i sum0 = _mm_add_epi32(vertical_sum0, horizontal_sum0);
+    sum0 = _mm_add_epi32(sum0, round);
+    sum0 = _mm_srai_epi32(sum0, 9);
+
+    __m128i sum1 = _mm_add_epi32(vertical_sum1, horizontal_sum1);
+    sum1 = _mm_add_epi32(sum1, round);
+    sum1 = _mm_srai_epi32(sum1, 9);
+
+    sum0 = _mm_packus_epi16(sum0, sum1);
+    sum0 = _mm_shuffle_epi8(sum0, cvt_epu16_epi8);
+    StoreLo8(dst, sum0);
+    dst += stride;
+
+    y_select = _mm_add_epi16(y_select, mask_increment);
+  }
+}
+
+void Smooth8x4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                      const ptrdiff_t stride,
+                      const void* LIBGAV1_RESTRICT top_row,
+                      const void* LIBGAV1_RESTRICT left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  __m128i pixels[4];
+  LoadSmoothPixels8(top_ptr, left_ptr, 4, pixels);
+
+  __m128i weights_x[2], weights_y[2];
+  LoadSmoothWeights8(kSmoothWeights, 4, weights_x, weights_y);
+
+  auto* dst = static_cast<uint8_t*>(dest);
+  WriteSmoothPred8xH(pixels, weights_x, weights_y, 4, dst, stride, false);
+}
+
+void Smooth8x8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                      const ptrdiff_t stride,
+                      const void* LIBGAV1_RESTRICT top_row,
+                      const void* LIBGAV1_RESTRICT left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+
+  __m128i pixels[4];
+  LoadSmoothPixels8(top_ptr, left_ptr, 8, pixels);
+
+  __m128i weights_x[2], weights_y[2];
+  LoadSmoothWeights8(kSmoothWeights, 8, weights_x, weights_y);
+
+  auto* dst = static_cast<uint8_t*>(dest);
+  WriteSmoothPred8xH(pixels, weights_x, weights_y, 8, dst, stride, false);
+}
+
+void Smooth8x16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                       const ptrdiff_t stride,
+                       const void* LIBGAV1_RESTRICT top_row,
+                       const void* LIBGAV1_RESTRICT left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  __m128i pixels[4];
+  LoadSmoothPixels8(top_ptr, left_ptr, 16, pixels);
+
+  __m128i weights_x[2], weights_y[4];
+  LoadSmoothWeights8(kSmoothWeights, 16, weights_x, weights_y);
+
+  auto* dst = static_cast<uint8_t*>(dest);
+  WriteSmoothPred8xH(pixels, weights_x, weights_y, 8, dst, stride, false);
+  dst += stride << 3;
+  WriteSmoothPred8xH(pixels, weights_x, &weights_y[2], 8, dst, stride, true);
+}
+
+void Smooth8x32_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                       const ptrdiff_t stride,
+                       const void* LIBGAV1_RESTRICT top_row,
+                       const void* LIBGAV1_RESTRICT left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  __m128i pixels[8];
+  LoadSmoothPixels8(top_ptr, left_ptr, 32, pixels);
+
+  __m128i weights_x[2], weights_y[8];
+  LoadSmoothWeights8(kSmoothWeights, 32, weights_x, weights_y);
+
+  auto* dst = static_cast<uint8_t*>(dest);
+  WriteSmoothPred8xH(pixels, weights_x, weights_y, 8, dst, stride, false);
+  dst += stride << 3;
+  WriteSmoothPred8xH(pixels, weights_x, &weights_y[2], 8, dst, stride, true);
+  dst += stride << 3;
+  WriteSmoothPred8xH(&pixels[4], weights_x, &weights_y[4], 8, dst, stride,
+                     false);
+  dst += stride << 3;
+  WriteSmoothPred8xH(&pixels[4], weights_x, &weights_y[6], 8, dst, stride,
+                     true);
+}
+
+template <int width, int height>
+void SmoothWxH(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+               const void* LIBGAV1_RESTRICT const top_row,
+               const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const uint8_t* const sm_weights_h = kSmoothWeights + height - 4;
+  const uint8_t* const sm_weights_w = kSmoothWeights + width - 4;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i scale_value = _mm_set1_epi16(256);
+  const __m128i bottom_left = _mm_cvtsi32_si128(left_ptr[height - 1]);
+  const __m128i top_right = _mm_set1_epi16(top_ptr[width - 1]);
+  const __m128i round = _mm_set1_epi32(256);
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y = 0; y < height; ++y) {
+    const __m128i weights_y = _mm_cvtsi32_si128(sm_weights_h[y]);
+    const __m128i left_y = _mm_cvtsi32_si128(left_ptr[y]);
+    const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
+    __m128i scaled_bottom_left =
+        _mm_mullo_epi16(scale_m_weights_y, bottom_left);
+    const __m128i weight_left_y =
+        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
+    scaled_bottom_left = _mm_add_epi32(scaled_bottom_left, round);
+    scaled_bottom_left = _mm_shuffle_epi32(scaled_bottom_left, 0);
+    for (int x = 0; x < width; x += 8) {
+      const __m128i top_x = LoadLo8(top_ptr + x);
+      const __m128i weights_x = LoadLo8(sm_weights_w + x);
+      const __m128i top_weights_x = _mm_unpacklo_epi8(top_x, weights_x);
+      const __m128i top_weights_x_lo = _mm_cvtepu8_epi16(top_weights_x);
+      const __m128i top_weights_x_hi = _mm_unpackhi_epi8(top_weights_x, zero);
+
+      // Here opposite weights and pixels are multiplied, where the order of
+      // interleaving is indicated in the names.
+      __m128i pred_lo = _mm_madd_epi16(top_weights_x_lo, weight_left_y);
+      __m128i pred_hi = _mm_madd_epi16(top_weights_x_hi, weight_left_y);
+
+      // |scaled_bottom_left| is always scaled by the same weight each row, so
+      // we only derive |scaled_top_right| values here.
+      const __m128i inverted_weights_x =
+          _mm_sub_epi16(scale_value, _mm_cvtepu8_epi16(weights_x));
+      const __m128i scaled_top_right =
+          _mm_mullo_epi16(inverted_weights_x, top_right);
+      const __m128i scaled_top_right_lo =
+          _mm_cvtepu16_epi32(scaled_top_right);
+      const __m128i scaled_top_right_hi =
+          _mm_unpackhi_epi16(scaled_top_right, zero);
+      pred_lo = _mm_add_epi32(pred_lo, scaled_bottom_left);
+      pred_hi = _mm_add_epi32(pred_hi, scaled_bottom_left);
+      pred_lo = _mm_add_epi32(pred_lo, scaled_top_right_lo);
+      pred_hi = _mm_add_epi32(pred_hi, scaled_top_right_hi);
+
+      // The round value for RightShiftWithRounding was added with
+      // |scaled_bottom_left|.
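+      // After the shift, each 32-bit lane fits in 8 bits, so the two packus
+      // steps below cannot saturate: the first leaves the 8-bit results
+      // interleaved with zeros, the second collapses them into consecutive
+      // bytes.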
+      pred_lo = _mm_srli_epi32(pred_lo, 9);
+      pred_hi = _mm_srli_epi32(pred_hi, 9);
+      const __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
+      StoreLo8(dst + x, _mm_packus_epi16(pred, pred));
+    }
+    dst += stride;
+  }
+}
+
+void SmoothHorizontal4x4_SSE4_1(void* LIBGAV1_RESTRICT dest,
+                                const ptrdiff_t stride,
+                                const void* LIBGAV1_RESTRICT top_row,
+                                const void* LIBGAV1_RESTRICT left_column) {
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi32(top_ptr[3]);
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const __m128i left = _mm_cvtepu8_epi32(Load4(left_ptr));
+  const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
+  __m128i scale = _mm_set1_epi32(256);
+  const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+  const __m128i scaled_top_right =
+      _mm_mullo_epi16(inverted_weights, top_right);
+  scale = _mm_set1_epi32(128);
+  auto* dst = static_cast<uint8_t*>(dest);
+  WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+}
+
+void SmoothHorizontal4x8_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi32(top[3]);
+  const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
+  __m128i scale = _mm_set1_epi32(256);
+  const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+  const __m128i scaled_top_right =
+      _mm_mullo_epi16(inverted_weights, top_right);
+  scale = _mm_set1_epi32(128);
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
+  auto* dst = static_cast<uint8_t*>(dest);
+  WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+
+  left = _mm_cvtepu8_epi32(Load4(left_ptr + 4));
+  WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+}
+
+void SmoothHorizontal4x16_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi32(top[3]);
+  const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
+  __m128i scale = _mm_set1_epi32(256);
+  const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+  const __m128i scaled_top_right =
+      _mm_mullo_epi16(inverted_weights, top_right);
+  scale = _mm_set1_epi32(128);
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
+  auto* dst = static_cast<uint8_t*>(dest);
+  WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+
+  left = _mm_cvtepu8_epi32(Load4(left_ptr + 4));
+  WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+
+  left = _mm_cvtepu8_epi32(Load4(left_ptr + 8));
+  WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+
+  left = _mm_cvtepu8_epi32(Load4(left_ptr + 12));
+  WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+}
+
+void SmoothHorizontal8x4_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi16(top[7]);
+  const __m128i left = _mm_cvtepu8_epi16(Load4(left_column));
+  const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+  const __m128i scaled_top_right =
+      _mm_mullo_epi16(inverted_weights, top_right);
+  scale = _mm_set1_epi16(128);
+  __m128i y_select = _mm_set1_epi32(0x01000100);
+  __m128i left_y = _mm_shuffle_epi8(left, y_select);
+  auto* dst = static_cast<uint8_t*>(dest);
+  WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+  dst += stride;
+  y_select = _mm_set1_epi32(0x03020302);
+  left_y = _mm_shuffle_epi8(left, y_select);
+  WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+  dst += stride;
+  y_select = _mm_set1_epi32(0x05040504);
+  left_y = _mm_shuffle_epi8(left, y_select);
+  WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+  dst += stride;
+  y_select = _mm_set1_epi32(0x07060706);
+  left_y = _mm_shuffle_epi8(left, y_select);
+  WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+}
+
+void SmoothHorizontal8x8_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi16(top[7]);
+  const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+  const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+  const __m128i scaled_top_right =
+      _mm_mullo_epi16(inverted_weights, top_right);
+  scale = _mm_set1_epi16(128);
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+    dst += stride;
+  }
+}
+
+void SmoothHorizontal8x16_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi16(top[7]);
+  const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+  const __m128i scaled_top_right =
+      _mm_mullo_epi16(inverted_weights, top_right);
+  scale = _mm_set1_epi16(128);
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+    dst += stride;
+  }
+  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+    dst += stride;
+  }
+}
+
+void SmoothHorizontal8x32_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi16(top[7]);
+  const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+  const __m128i scaled_top_right =
+      _mm_mullo_epi16(inverted_weights, top_right);
+  scale = _mm_set1_epi16(128);
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+    dst += stride;
+  }
+  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+    dst += stride;
+  }
+  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 16));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+
+void SmoothHorizontal8x32_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi16(top[7]);
+  const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+  scale = _mm_set1_epi16(128);
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+    dst += stride;
+  }
+  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+    dst += stride;
+  }
+  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 16));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+    dst += stride;
+  }
+  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 24));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
+    dst += stride;
+  }
+}
+
+void SmoothHorizontal16x4_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi16(top[15]);
+  const __m128i left = _mm_cvtepu8_epi16(Load4(left_column));
+  const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i scaled_top_right1 = _mm_mullo_epi16(inverted_weights1, top_right);
+  const __m128i scaled_top_right2 = _mm_mullo_epi16(inverted_weights2, top_right);
+  scale = _mm_set1_epi16(128);
+  __m128i y_mask = _mm_set1_epi32(0x01000100);
+  __m128i left_y = _mm_shuffle_epi8(left, y_mask);
+  auto* dst = static_cast<uint8_t*>(dest);
+  WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                              scaled_top_right1, scaled_top_right2, scale);
+  dst += stride;
+  y_mask = _mm_set1_epi32(0x03020302);
+  left_y = _mm_shuffle_epi8(left, y_mask);
+  WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                              scaled_top_right1, scaled_top_right2, scale);
+  dst += stride;
+  y_mask = _mm_set1_epi32(0x05040504);
+  left_y = _mm_shuffle_epi8(left, y_mask);
+  WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                              scaled_top_right1, scaled_top_right2, scale);
+  dst += stride;
+  y_mask = _mm_set1_epi32(0x07060706);
+  left_y = _mm_shuffle_epi8(left, y_mask);
+  WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                              scaled_top_right1, scaled_top_right2, scale);
+}
+
+void SmoothHorizontal16x8_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi16(top[15]);
+  const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+  const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i scaled_top_right1 = _mm_mullo_epi16(inverted_weights1, top_right);
+  const __m128i scaled_top_right2 = _mm_mullo_epi16(inverted_weights2, top_right);
+  scale = _mm_set1_epi16(128);
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    dst += stride;
+  }
+}
+
+void SmoothHorizontal16x16_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi16(top[15]);
+  const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i scaled_top_right1 = _mm_mullo_epi16(inverted_weights1, top_right);
+  const __m128i scaled_top_right2 = _mm_mullo_epi16(inverted_weights2, top_right);
+  scale = _mm_set1_epi16(128);
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    dst += stride;
+  }
+  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    dst += stride;
+  }
+}
+
+void SmoothHorizontal16x32_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi16(top[15]);
+  const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i scaled_top_right1 = _mm_mullo_epi16(inverted_weights1, top_right);
+  const __m128i scaled_top_right2 = _mm_mullo_epi16(inverted_weights2, top_right);
+  scale = _mm_set1_epi16(128);
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    dst += stride;
+  }
+  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    dst += stride;
+  }
+  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 16));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    dst += stride;
+  }
+  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 24));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    dst += stride;
+  }
+}
+
+void SmoothHorizontal16x64_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi16(top[15]);
+  const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i scaled_top_right1 = _mm_mullo_epi16(inverted_weights1, top_right);
+  const __m128i scaled_top_right2 = _mm_mullo_epi16(inverted_weights2, top_right);
+  scale = _mm_set1_epi16(128);
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int left_offset = 0; left_offset < 64; left_offset += 8) {
+    const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + left_offset));
+    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+      const __m128i y_select = _mm_set1_epi32(y_mask);
+      const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+      WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                  scaled_top_right1, scaled_top_right2, scale);
+      dst += stride;
+    }
+  }
+}
+
+void SmoothHorizontal32x8_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi16(top[31]);
+  const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+  const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+  const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
+  const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+  const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+  const __m128i scaled_top_right1 = _mm_mullo_epi16(inverted_weights1, top_right);
+  const __m128i scaled_top_right2 = _mm_mullo_epi16(inverted_weights2, top_right);
+  const __m128i scaled_top_right3 = _mm_mullo_epi16(inverted_weights3, top_right);
+  const __m128i scaled_top_right4 = _mm_mullo_epi16(inverted_weights4, top_right);
+  scale = _mm_set1_epi16(128);
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    __m128i y_select = _mm_set1_epi32(y_mask);
+    __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+                                scaled_top_right3, scaled_top_right4, scale);
+    dst += stride;
+  }
+}
+
+void SmoothHorizontal32x16_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi16(top[31]);
+  const __m128i left1 = _mm_cvtepu8_epi16(LoadLo8(left_column));
+  const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+  const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
+  const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+  const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+  const __m128i scaled_top_right1 = _mm_mullo_epi16(inverted_weights1, top_right);
+  const __m128i scaled_top_right2 = _mm_mullo_epi16(inverted_weights2, top_right);
+  const __m128i scaled_top_right3 = _mm_mullo_epi16(inverted_weights3, top_right);
+  const __m128i scaled_top_right4 = _mm_mullo_epi16(inverted_weights4, top_right);
+  scale = _mm_set1_epi16(128);
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    __m128i y_select = _mm_set1_epi32(y_mask);
+    __m128i left_y = _mm_shuffle_epi8(left1, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+                                scaled_top_right3, scaled_top_right4, scale);
+    dst += stride;
+  }
+  const __m128i left2 = _mm_cvtepu8_epi16(
+      LoadLo8(static_cast<const uint8_t*>(left_column) + 8));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    __m128i y_select = _mm_set1_epi32(y_mask);
+    __m128i left_y = _mm_shuffle_epi8(left2, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+                                scaled_top_right3, scaled_top_right4, scale);
+    dst += stride;
+  }
+}
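+
+// [Editor's note: illustrative, not part of the imported source.] The row
+// loops above share one idiom: the constant passed to _mm_set1_epi32 is a
+// _mm_shuffle_epi8 control word that broadcasts a single 16-bit lane. The
+// first control, 0x01000100, repeats bytes {0, 1}, i.e. lane 0; adding
+// 0x02020202 per iteration advances every byte index by two, selecting lanes
+// 1, 2, ... 7 in turn:
+//
+//   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+//     // y_mask broadcasts 16-bit lane y (y = 0..7) of a packed register,
+//     // yielding left[y] (or weight[y]) in all eight lanes for this row.
+//   }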
+
+void SmoothHorizontal32x32_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi16(top[31]);
+  const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+  const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
+  const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+  const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+  const __m128i scaled_top_right1 = _mm_mullo_epi16(inverted_weights1, top_right);
+  const __m128i scaled_top_right2 = _mm_mullo_epi16(inverted_weights2, top_right);
+  const __m128i scaled_top_right3 = _mm_mullo_epi16(inverted_weights3, top_right);
+  const __m128i scaled_top_right4 = _mm_mullo_epi16(inverted_weights4, top_right);
+  scale = _mm_set1_epi16(128);
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    __m128i y_select = _mm_set1_epi32(y_mask);
+    __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+                                scaled_top_right3, scaled_top_right4, scale);
+    dst += stride;
+  }
+  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    __m128i y_select = _mm_set1_epi32(y_mask);
+    __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+                                scaled_top_right3, scaled_top_right4, scale);
+    dst += stride;
+  }
+  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 16));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    __m128i y_select = _mm_set1_epi32(y_mask);
+    __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+                                scaled_top_right3, scaled_top_right4, scale);
+    dst += stride;
+  }
+  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 24));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    __m128i y_select = _mm_set1_epi32(y_mask);
+    __m128i left_y = _mm_shuffle_epi8(left, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+                                scaled_top_right3, scaled_top_right4, scale);
+    dst += stride;
+  }
+}
+
+void SmoothHorizontal32x64_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi16(top[31]);
+  const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+  const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
+  const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+  const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+  const __m128i scaled_top_right1 = _mm_mullo_epi16(inverted_weights1, top_right);
+  const __m128i scaled_top_right2 = _mm_mullo_epi16(inverted_weights2, top_right);
+  const __m128i scaled_top_right3 = _mm_mullo_epi16(inverted_weights3, top_right);
+  const __m128i scaled_top_right4 = _mm_mullo_epi16(inverted_weights4, top_right);
+  scale = _mm_set1_epi16(128);
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int left_offset = 0; left_offset < 64; left_offset += 8) {
+    const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + left_offset));
+    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+      const __m128i y_select = _mm_set1_epi32(y_mask);
+      const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+      WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                  scaled_top_right1, scaled_top_right2, scale);
+      WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+                                  scaled_top_right3, scaled_top_right4, scale);
+      dst += stride;
+    }
+  }
+}
+
+void SmoothHorizontal64x16_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi16(top[63]);
+  const __m128i left1 = _mm_cvtepu8_epi16(LoadLo8(left_column));
+  const __m128i weights_lolo = LoadUnaligned16(kSmoothWeights + 60);
+  const __m128i weights_lohi = LoadUnaligned16(kSmoothWeights + 76);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lolo);
+  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
+  const __m128i weights3 = _mm_cvtepu8_epi16(weights_lohi);
+  const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+  const __m128i scaled_top_right1 = _mm_mullo_epi16(inverted_weights1, top_right);
+  const __m128i scaled_top_right2 = _mm_mullo_epi16(inverted_weights2, top_right);
+  const __m128i scaled_top_right3 = _mm_mullo_epi16(inverted_weights3, top_right);
+  const __m128i scaled_top_right4 = _mm_mullo_epi16(inverted_weights4, top_right);
+  const __m128i weights_hilo = LoadUnaligned16(kSmoothWeights + 92);
+  const __m128i weights_hihi = LoadUnaligned16(kSmoothWeights + 108);
+  const __m128i weights5 = _mm_cvtepu8_epi16(weights_hilo);
+  const __m128i weights6 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
+  const __m128i weights7 = _mm_cvtepu8_epi16(weights_hihi);
+  const __m128i weights8 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
+  const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
+  const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
+  const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
+  const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
+  const __m128i scaled_top_right5 = _mm_mullo_epi16(inverted_weights5, top_right);
+  const __m128i scaled_top_right6 = _mm_mullo_epi16(inverted_weights6, top_right);
+  const __m128i scaled_top_right7 = _mm_mullo_epi16(inverted_weights7, top_right);
+  const __m128i scaled_top_right8 = _mm_mullo_epi16(inverted_weights8, top_right);
+  scale = _mm_set1_epi16(128);
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    __m128i y_select = _mm_set1_epi32(y_mask);
+    __m128i left_y = _mm_shuffle_epi8(left1, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+                                scaled_top_right3, scaled_top_right4, scale);
+    WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+                                scaled_top_right5, scaled_top_right6, scale);
+    WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+                                scaled_top_right7, scaled_top_right8, scale);
+    dst += stride;
+  }
+  const __m128i left2 = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    __m128i y_select = _mm_set1_epi32(y_mask);
+    __m128i left_y = _mm_shuffle_epi8(left2, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+                                scaled_top_right3, scaled_top_right4, scale);
+    WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+                                scaled_top_right5, scaled_top_right6, scale);
+    WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+                                scaled_top_right7, scaled_top_right8, scale);
+    dst += stride;
+  }
+}
+
+void SmoothHorizontal64x32_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi16(top[63]);
+  const __m128i left1 = _mm_cvtepu8_epi16(LoadLo8(left_column));
+  const __m128i weights_lolo = LoadUnaligned16(kSmoothWeights + 60);
+  const __m128i weights_lohi = LoadUnaligned16(kSmoothWeights + 76);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lolo);
+  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
+  const __m128i weights3 = _mm_cvtepu8_epi16(weights_lohi);
+  const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+  const __m128i scaled_top_right1 = _mm_mullo_epi16(inverted_weights1, top_right);
+  const __m128i scaled_top_right2 = _mm_mullo_epi16(inverted_weights2, top_right);
+  const __m128i scaled_top_right3 = _mm_mullo_epi16(inverted_weights3, top_right);
+  const __m128i scaled_top_right4 = _mm_mullo_epi16(inverted_weights4, top_right);
+  const __m128i weights_hilo = LoadUnaligned16(kSmoothWeights + 92);
+  const __m128i weights_hihi = LoadUnaligned16(kSmoothWeights + 108);
+  const __m128i weights5 = _mm_cvtepu8_epi16(weights_hilo);
+  const __m128i weights6 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
+  const __m128i weights7 = _mm_cvtepu8_epi16(weights_hihi);
+  const __m128i weights8 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
+  const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
+  const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
+  const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
+  const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
+  const __m128i scaled_top_right5 = _mm_mullo_epi16(inverted_weights5, top_right);
+  const __m128i scaled_top_right6 = _mm_mullo_epi16(inverted_weights6, top_right);
+  const __m128i scaled_top_right7 = _mm_mullo_epi16(inverted_weights7, top_right);
+  const __m128i scaled_top_right8 = _mm_mullo_epi16(inverted_weights8, top_right);
+  scale = _mm_set1_epi16(128);
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left1, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+                                scaled_top_right3, scaled_top_right4, scale);
+    WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+                                scaled_top_right5, scaled_top_right6, scale);
+    WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+                                scaled_top_right7, scaled_top_right8, scale);
+    dst += stride;
+  }
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const __m128i left2 = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left2, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+                                scaled_top_right3, scaled_top_right4, scale);
+    WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+                                scaled_top_right5, scaled_top_right6, scale);
+    WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+                                scaled_top_right7, scaled_top_right8, scale);
+    dst += stride;
+  }
+  const __m128i left3 = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 16));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left3, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+                                scaled_top_right3, scaled_top_right4, scale);
+    WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+                                scaled_top_right5, scaled_top_right6, scale);
+    WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+                                scaled_top_right7, scaled_top_right8, scale);
+    dst += stride;
+  }
+  const __m128i left4 = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 24));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left4, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+                                scaled_top_right3, scaled_top_right4, scale);
+    WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+                                scaled_top_right5, scaled_top_right6, scale);
+    WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+                                scaled_top_right7, scaled_top_right8, scale);
+    dst += stride;
+  }
+}
+
+void SmoothHorizontal64x64_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi16(top[63]);
+  const __m128i weights_lolo = LoadUnaligned16(kSmoothWeights + 60);
+  const __m128i weights_lohi = LoadUnaligned16(kSmoothWeights + 76);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lolo);
+  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
+  const __m128i weights3 = _mm_cvtepu8_epi16(weights_lohi);
+  const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+  const __m128i scaled_top_right1 = _mm_mullo_epi16(inverted_weights1, top_right);
+  const __m128i scaled_top_right2 = _mm_mullo_epi16(inverted_weights2, top_right);
+  const __m128i scaled_top_right3 = _mm_mullo_epi16(inverted_weights3, top_right);
+  const __m128i scaled_top_right4 = _mm_mullo_epi16(inverted_weights4, top_right);
+  const __m128i weights_hilo = LoadUnaligned16(kSmoothWeights + 92);
+  const __m128i weights_hihi = LoadUnaligned16(kSmoothWeights + 108);
+  const __m128i weights5 = _mm_cvtepu8_epi16(weights_hilo);
+  const __m128i weights6 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
+  const __m128i weights7 = _mm_cvtepu8_epi16(weights_hihi);
+  const __m128i weights8 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
+  const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
+  const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
+  const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
+  const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
+  const __m128i scaled_top_right5 = _mm_mullo_epi16(inverted_weights5, top_right);
+  const __m128i scaled_top_right6 = _mm_mullo_epi16(inverted_weights6, top_right);
+  const __m128i scaled_top_right7 = _mm_mullo_epi16(inverted_weights7, top_right);
+  const __m128i scaled_top_right8 = _mm_mullo_epi16(inverted_weights8, top_right);
+  scale = _mm_set1_epi16(128);
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int left_offset = 0; left_offset < 64; left_offset += 8) {
+    const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + left_offset));
+    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+      const __m128i y_select = _mm_set1_epi32(y_mask);
+      const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+      WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                  scaled_top_right1, scaled_top_right2, scale);
+      WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+                                  scaled_top_right3, scaled_top_right4, scale);
+      WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+                                  scaled_top_right5, scaled_top_right6, scale);
+      WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+                                  scaled_top_right7, scaled_top_right8, scale);
+      dst += stride;
+    }
+  }
+}
+
+inline void LoadSmoothVerticalPixels4(const uint8_t* LIBGAV1_RESTRICT above,
+                                      const uint8_t* LIBGAV1_RESTRICT left,
+                                      const int height, __m128i* pixels) {
+  __m128i top = Load4(above);
+  const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
+  top = _mm_cvtepu8_epi16(top);
+  pixels[0] = _mm_unpacklo_epi16(top, bottom_left);
+}
+
+// |weight_array| alternates weight vectors from the table with their inverted
+// (256-w) counterparts. This is precomputed by the compiler when the weights
+// table is visible to this module. Removing this visibility can cut speed by
+// up to half in both 4xH and 8xH transforms.
+inline void LoadSmoothVerticalWeights4(
+    const uint8_t* LIBGAV1_RESTRICT weight_array, const int height,
+    __m128i* weights) {
+  const __m128i inverter = _mm_set1_epi16(256);
+
+  if (height == 4) {
+    const __m128i weight = Load4(weight_array);
+    weights[0] = _mm_cvtepu8_epi16(weight);
+    weights[1] = _mm_sub_epi16(inverter, weights[0]);
+  } else if (height == 8) {
+    const __m128i weight = LoadLo8(weight_array + 4);
+    weights[0] = _mm_cvtepu8_epi16(weight);
+    weights[1] = _mm_sub_epi16(inverter, weights[0]);
+  } else {
+    const __m128i weight = LoadUnaligned16(weight_array + 12);
+    const __m128i zero = _mm_setzero_si128();
+    weights[0] = _mm_cvtepu8_epi16(weight);
+    weights[1] = _mm_sub_epi16(inverter, weights[0]);
+    weights[2] = _mm_unpackhi_epi8(weight, zero);
+    weights[3] = _mm_sub_epi16(inverter, weights[2]);
+  }
+}
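+
+// [Editor's note: illustrative, not part of the imported source.] Under the
+// layout documented above, a height-16 call leaves |weights| holding each
+// weight vector interleaved with its inverse, e.g.:
+//
+//   __m128i weights[4];
+//   LoadSmoothVerticalWeights4(kSmoothWeights, /*height=*/16, weights);
+//   // weights[0] = w[0..7]     weights[1] = 256 - w[0..7]
+//   // weights[2] = w[8..15]    weights[3] = 256 - w[8..15]
+//
+// so each output row later needs only byte shuffles to extract its
+// (w[y], 256 - w[y]) pair.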
+
+inline void WriteSmoothVertical4xH(const __m128i* pixel, const __m128i* weight,
+                                   const int height,
+                                   uint8_t* LIBGAV1_RESTRICT dst,
+                                   const ptrdiff_t stride) {
+  const __m128i pred_round = _mm_set1_epi32(128);
+  const __m128i mask_increment = _mm_set1_epi16(0x0202);
+  const __m128i cvtepu8_epi32 = _mm_set1_epi32(0xC080400);
+  __m128i y_select = _mm_set1_epi16(0x0100);
+
+  for (int y = 0; y < height; ++y) {
+    const __m128i weight_y = _mm_shuffle_epi8(weight[0], y_select);
+    const __m128i inverted_weight_y = _mm_shuffle_epi8(weight[1], y_select);
+    const __m128i alternate_weights =
+        _mm_unpacklo_epi16(weight_y, inverted_weight_y);
+    // Here the pixel vector is top_row[0], corner, top_row[1], corner, ...
+    // The madd instruction yields four results of the form:
+    // (top_row[x] * weight[y] + corner * inverted_weight[y])
+    __m128i sum = _mm_madd_epi16(pixel[0], alternate_weights);
+    sum = _mm_add_epi32(sum, pred_round);
+    sum = _mm_srai_epi32(sum, 8);
+    sum = _mm_shuffle_epi8(sum, cvtepu8_epi32);
+    Store4(dst, sum);
+    dst += stride;
+    y_select = _mm_add_epi16(y_select, mask_increment);
+  }
+}
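+
+// [Editor's note: illustrative sketch, not part of the imported source.] The
+// interleave-then-madd trick above, written out for one pixel: with
+// pixel[0] = (top[0], corner, top[1], corner, ...) and alternate_weights =
+// (w, 256 - w, w, 256 - w, ...), _mm_madd_epi16 multiplies adjacent 16-bit
+// pairs and sums them, so each 32-bit lane holds
+//
+//   int SmoothVerticalPixel(int top_x, int corner, int w) {
+//     return (top_x * w + corner * (256 - w) + 128) >> 8;  // round; scale 256
+//   }
+//
+// which matches the pred_round add and the arithmetic shift by 8 that follow.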
+
+void SmoothVertical4x4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                              const ptrdiff_t stride,
+                              const void* LIBGAV1_RESTRICT const top_row,
+                              const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left = static_cast<const uint8_t*>(left_column);
+  const auto* const above = static_cast<const uint8_t*>(top_row);
+  auto* dst = static_cast<uint8_t*>(dest);
+  __m128i pixels;
+  LoadSmoothVerticalPixels4(above, left, 4, &pixels);
+
+  __m128i weights[2];
+  LoadSmoothVerticalWeights4(kSmoothWeights, 4, weights);
+
+  WriteSmoothVertical4xH(&pixels, weights, 4, dst, stride);
+}
+
+void SmoothVertical4x8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                              const ptrdiff_t stride,
+                              const void* LIBGAV1_RESTRICT const top_row,
+                              const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left = static_cast<const uint8_t*>(left_column);
+  const auto* const above = static_cast<const uint8_t*>(top_row);
+  auto* dst = static_cast<uint8_t*>(dest);
+  __m128i pixels;
+  LoadSmoothVerticalPixels4(above, left, 8, &pixels);
+
+  __m128i weights[2];
+  LoadSmoothVerticalWeights4(kSmoothWeights, 8, weights);
+
+  WriteSmoothVertical4xH(&pixels, weights, 8, dst, stride);
+}
+
+void SmoothVertical4x16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                               const ptrdiff_t stride,
+                               const void* LIBGAV1_RESTRICT const top_row,
+                               const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left = static_cast<const uint8_t*>(left_column);
+  const auto* const above = static_cast<const uint8_t*>(top_row);
+  auto* dst = static_cast<uint8_t*>(dest);
+  __m128i pixels;
+  LoadSmoothVerticalPixels4(above, left, 16, &pixels);
+
+  __m128i weights[4];
+  LoadSmoothVerticalWeights4(kSmoothWeights, 16, weights);
+
+  WriteSmoothVertical4xH(&pixels, weights, 8, dst, stride);
+  dst += stride << 3;
+  WriteSmoothVertical4xH(&pixels, &weights[2], 8, dst, stride);
+}
+
+void SmoothVertical8x4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                              const ptrdiff_t stride,
+                              const void* LIBGAV1_RESTRICT const top_row,
+                              const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[3]);
+  const __m128i weights = _mm_cvtepu8_epi16(Load4(kSmoothWeights));
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+  const __m128i scaled_bottom_left = _mm_mullo_epi16(inverted_weights, bottom_left);
+  scale = _mm_set1_epi16(128);
+
+  auto* dst = static_cast<uint8_t*>(dest);
+  __m128i y_select = _mm_set1_epi32(0x01000100);
+  const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+  __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+  __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+  WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale);
+  dst += stride;
+  y_select = _mm_set1_epi32(0x03020302);
+  weights_y = _mm_shuffle_epi8(weights, y_select);
+  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+  WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale);
+  dst += stride;
+  y_select = _mm_set1_epi32(0x05040504);
+  weights_y = _mm_shuffle_epi8(weights, y_select);
+  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+  WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale);
+  dst += stride;
+  y_select = _mm_set1_epi32(0x07060706);
+  weights_y = _mm_shuffle_epi8(weights, y_select);
+  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+  WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale);
+}
+
+void SmoothVertical8x8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                              const ptrdiff_t stride,
+                              const void* LIBGAV1_RESTRICT const top_row,
+                              const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[7]);
+  const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+  const __m128i scaled_bottom_left = _mm_mullo_epi16(inverted_weights, bottom_left);
+  scale = _mm_set1_epi16(128);
+  const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left, y_select);
+    WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+                               scale);
+    dst += stride;
+  }
+}
+
+void SmoothVertical8x16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                               const ptrdiff_t stride,
+                               const void* LIBGAV1_RESTRICT const top_row,
+                               const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
+  const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i scaled_bottom_left1 =
+      _mm_mullo_epi16(inverted_weights1, bottom_left);
+  const __m128i scaled_bottom_left2 =
+      _mm_mullo_epi16(inverted_weights2, bottom_left);
+  scale = _mm_set1_epi16(128);
+  const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+    WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+                               scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+    WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+                               scale);
+    dst += stride;
+  }
+}
+
+void SmoothVertical8x32_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                               const ptrdiff_t stride,
+                               const void* LIBGAV1_RESTRICT const top_row,
+                               const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
+  const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+  const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+  const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+  const __m128i scaled_bottom_left1 =
+      _mm_mullo_epi16(inverted_weights1, bottom_left);
+  const __m128i scaled_bottom_left2 =
+      _mm_mullo_epi16(inverted_weights2, bottom_left);
+  const __m128i scaled_bottom_left3 =
+      _mm_mullo_epi16(inverted_weights3, bottom_left);
+  const __m128i scaled_bottom_left4 =
+      _mm_mullo_epi16(inverted_weights4, bottom_left);
+  scale = _mm_set1_epi16(128);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+    WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+                               scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+    WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+                               scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+    WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+                               scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+    WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+                               scale);
+    dst += stride;
+  }
+}
+
+void SmoothVertical16x4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                               const ptrdiff_t stride,
+                               const void* LIBGAV1_RESTRICT const top_row,
+                               const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[3]);
+  const __m128i weights = _mm_cvtepu8_epi16(Load4(kSmoothWeights));
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+  const __m128i scaled_bottom_left = _mm_mullo_epi16(inverted_weights, bottom_left);
+  scale = _mm_set1_epi16(128);
+  const __m128i top = LoadUnaligned16(top_row);
+  const __m128i top_lo = _mm_cvtepu8_epi16(top);
+  const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));
+
+  __m128i y_select = _mm_set1_epi32(0x01000100);
+  __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+  __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+  WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                              scaled_bottom_left_y, scaled_bottom_left_y,
+                              scale);
+  dst += stride;
+  y_select = _mm_set1_epi32(0x03020302);
+  weights_y = _mm_shuffle_epi8(weights, y_select);
+  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+  WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                              scaled_bottom_left_y, scaled_bottom_left_y,
+                              scale);
+  dst += stride;
+  y_select = _mm_set1_epi32(0x05040504);
+  weights_y = _mm_shuffle_epi8(weights, y_select);
+  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+  WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                              scaled_bottom_left_y, scaled_bottom_left_y,
+                              scale);
+  dst += stride;
+  y_select = _mm_set1_epi32(0x07060706);
+  weights_y = _mm_shuffle_epi8(weights, y_select);
+  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+  WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                              scaled_bottom_left_y, scaled_bottom_left_y,
+                              scale);
+}
+
+void SmoothVertical16x8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                               const ptrdiff_t stride,
+                               const void* LIBGAV1_RESTRICT const top_row,
+                               const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[7]);
+  const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+  const __m128i scaled_bottom_left = _mm_mullo_epi16(inverted_weights, bottom_left);
+  scale = _mm_set1_epi16(128);
+
+  const __m128i top = LoadUnaligned16(top_row);
+  const __m128i top_lo = _mm_cvtepu8_epi16(top);
+  const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left, y_select);
+    WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+}
+
+void SmoothVertical16x16_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
+  const __m128i zero = _mm_setzero_si128();
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+  const __m128i weights_lo = _mm_cvtepu8_epi16(weights);
+  const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+  const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+  const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+  const __m128i scaled_bottom_left_lo =
+      _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+  const __m128i scaled_bottom_left_hi =
+      _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+  scale = _mm_set1_epi16(128);
+
+  const __m128i top = LoadUnaligned16(top_row);
+  const __m128i top_lo = _mm_cvtepu8_epi16(top);
+  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+    WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+    WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+}
+
+void SmoothVertical16x32_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
+  const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+  const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+  const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+  const __m128i scaled_bottom_left1 =
+      _mm_mullo_epi16(inverted_weights1, bottom_left);
+  const __m128i scaled_bottom_left2 =
+      _mm_mullo_epi16(inverted_weights2, bottom_left);
+  const __m128i scaled_bottom_left3 =
+      _mm_mullo_epi16(inverted_weights3, bottom_left);
+  const __m128i scaled_bottom_left4 =
+      _mm_mullo_epi16(inverted_weights4, bottom_left);
+  scale = _mm_set1_epi16(128);
+
+  const __m128i top = LoadUnaligned16(top_row);
+  const __m128i top_lo = _mm_cvtepu8_epi16(top);
+  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+    WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+    WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+    WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+    WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+}
+
+void SmoothVertical16x64_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[63]);
+  const __m128i scale = _mm_set1_epi16(256);
+  const __m128i round = _mm_set1_epi16(128);
+  const __m128i zero = _mm_setzero_si128();
+
+  const __m128i top = LoadUnaligned16(top_row);
+  const __m128i top_lo = _mm_cvtepu8_epi16(top);
+  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
+  const uint8_t* weights_base_ptr = kSmoothWeights + 60;
+  for (int left_offset = 0; left_offset < 64; left_offset += 16) {
+    const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
+    const __m128i weights_lo = _mm_cvtepu8_epi16(weights);
+    const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+    const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+    const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+    const __m128i scaled_bottom_left_lo =
+        _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+    const __m128i scaled_bottom_left_hi =
+        _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+
+    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+      const __m128i y_select = _mm_set1_epi32(y_mask);
+      const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+      const __m128i scaled_bottom_left_y =
+          _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+      WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                                  scaled_bottom_left_y, scaled_bottom_left_y,
+                                  round);
+      dst += stride;
+    }
+    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+      const __m128i y_select = _mm_set1_epi32(y_mask);
+      const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+      const __m128i scaled_bottom_left_y =
+          _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+      WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                                  scaled_bottom_left_y, scaled_bottom_left_y,
+                                  round);
+      dst += stride;
+    }
+  }
+}
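+
+// [Editor's note: illustrative sketch, not part of the imported source.] The
+// SmoothVertical kernels mirror the horizontal ones with the roles swapped:
+// the weight now varies with the row, and the fixed corner is the bottom-left
+// sample. A scalar reference with illustrative names:
+//
+//   void SmoothVerticalRef(uint8_t* dst, ptrdiff_t stride, int w, int h,
+//                          const uint8_t* top, const uint8_t* left,
+//                          const uint8_t* weights) {
+//     const int bottom_left = left[h - 1];
+//     for (int y = 0; y < h; ++y) {
+//       for (int x = 0; x < w; ++x) {
+//         // dst[x] = (w[y] * top[x] + (256 - w[y]) * bottom_left + 128) >> 8
+//         dst[x] = static_cast<uint8_t>(
+//             (weights[y] * top[x] + (256 - weights[y]) * bottom_left + 128) >>
+//             8);
+//       }
+//       dst += stride;
+//     }
+//   }
+//
+// |scaled_bottom_left| above is the hoisted (256 - w[y]) * bottom_left term.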
const __m128i top3 = _mm_cvtepu8_epi16(top_hi); + const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero); + __m128i scale = _mm_set1_epi16(256); + const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4)); + const __m128i inverted_weights = _mm_sub_epi16(scale, weights); + const __m128i scaled_bottom_left = + _mm_mullo_epi16(inverted_weights, bottom_left); + scale = _mm_set1_epi16(128); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i weights_y = _mm_shuffle_epi8(weights, y_select); + const __m128i scaled_bottom_left_y = + _mm_shuffle_epi8(scaled_bottom_left, y_select); + WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + scale); + WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + scale); + dst += stride; + } +} + +void SmoothVertical32x16_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const auto* const left_ptr = static_cast(left_column); + const auto* const top_ptr = static_cast(top_row); + auto* dst = static_cast(dest); + const __m128i zero = _mm_setzero_si128(); + const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]); + const __m128i top_lo = LoadUnaligned16(top_ptr); + const __m128i top_hi = LoadUnaligned16(top_ptr + 16); + const __m128i top1 = _mm_cvtepu8_epi16(top_lo); + const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero); + const __m128i top3 = _mm_cvtepu8_epi16(top_hi); + const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero); + const __m128i weights = LoadUnaligned16(kSmoothWeights + 12); + const __m128i weights1 = _mm_cvtepu8_epi16(weights); + const __m128i weights2 = _mm_unpackhi_epi8(weights, zero); + __m128i scale = _mm_set1_epi16(256); + const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); + const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); + const __m128i scaled_bottom_left1 = + _mm_mullo_epi16(inverted_weights1, bottom_left); + const __m128i scaled_bottom_left2 = + _mm_mullo_epi16(inverted_weights2, bottom_left); + scale = _mm_set1_epi16(128); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select); + const __m128i scaled_bottom_left_y = + _mm_shuffle_epi8(scaled_bottom_left1, y_select); + WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + scale); + WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + scale); + dst += stride; + } + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select); + const __m128i scaled_bottom_left_y = + _mm_shuffle_epi8(scaled_bottom_left2, y_select); + WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + scale); + WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + scale); + dst += stride; + } +} + +void SmoothVertical32x32_SSE4_1( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* 
+void SmoothVertical32x32_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
+  const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+  const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+  const __m128i zero = _mm_setzero_si128();
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i top_lo = LoadUnaligned16(top_ptr);
+  const __m128i top_hi = LoadUnaligned16(top_ptr + 16);
+  const __m128i top1 = _mm_cvtepu8_epi16(top_lo);
+  const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
+  const __m128i top3 = _mm_cvtepu8_epi16(top_hi);
+  const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+  const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+  const __m128i scaled_bottom_left1 =
+      _mm_mullo_epi16(inverted_weights1, bottom_left);
+  const __m128i scaled_bottom_left2 =
+      _mm_mullo_epi16(inverted_weights2, bottom_left);
+  const __m128i scaled_bottom_left3 =
+      _mm_mullo_epi16(inverted_weights3, bottom_left);
+  const __m128i scaled_bottom_left4 =
+      _mm_mullo_epi16(inverted_weights4, bottom_left);
+  scale = _mm_set1_epi16(128);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+}
+
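[Editor's note: illustration only, not part of the imported sources. The constant offsets into kSmoothWeights used by these kernels (+4 for height 8, +12 for 16, +28 for 32, +60 for 64) are consistent with the table packing the weight vectors for dimensions 4, 8, 16, 32, 64 back to back, which reduces to offset(n) = n - 4; this layout is inferred from the call sites, not stated in the patch.]

#include <cstdio>

constexpr int SmoothWeightOffset(int n) { return n - 4; }  // == 4+8+...+n/2

int main() {
  for (int n = 4; n <= 64; n *= 2) {
    std::printf("dimension %2d -> offset %2d\n", n, SmoothWeightOffset(n));
  }
  return 0;  // prints offsets 0, 4, 12, 28, 60
}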
+void SmoothVertical32x64_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[63]);
+  const __m128i top_lo = LoadUnaligned16(top_ptr);
+  const __m128i top_hi = LoadUnaligned16(top_ptr + 16);
+  const __m128i top1 = _mm_cvtepu8_epi16(top_lo);
+  const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
+  const __m128i top3 = _mm_cvtepu8_epi16(top_hi);
+  const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
+  const __m128i scale = _mm_set1_epi16(256);
+  const __m128i round = _mm_set1_epi16(128);
+  const uint8_t* weights_base_ptr = kSmoothWeights + 60;
+  for (int left_offset = 0; left_offset < 64; left_offset += 16) {
+    const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
+    const __m128i weights_lo = _mm_cvtepu8_epi16(weights);
+    const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+    const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+    const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+    const __m128i scaled_bottom_left_lo =
+        _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+    const __m128i scaled_bottom_left_hi =
+        _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+
+    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+      const __m128i y_select = _mm_set1_epi32(y_mask);
+      const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+      const __m128i scaled_bottom_left_y =
+          _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+      WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                  scaled_bottom_left_y, scaled_bottom_left_y,
+                                  round);
+      WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                  scaled_bottom_left_y, scaled_bottom_left_y,
+                                  round);
+      dst += stride;
+    }
+    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+      const __m128i y_select = _mm_set1_epi32(y_mask);
+      const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+      const __m128i scaled_bottom_left_y =
+          _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+      WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                  scaled_bottom_left_y, scaled_bottom_left_y,
+                                  round);
+      WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                  scaled_bottom_left_y, scaled_bottom_left_y,
+                                  round);
+      dst += stride;
+    }
+  }
+}
+
+void SmoothVertical64x16_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i top_lolo = LoadUnaligned16(top_ptr);
+  const __m128i top_lohi = LoadUnaligned16(top_ptr + 16);
+  const __m128i top1 = _mm_cvtepu8_epi16(top_lolo);
+  const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
+  const __m128i top3 = _mm_cvtepu8_epi16(top_lohi);
+  const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
+
+  const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+  const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i top_hilo = LoadUnaligned16(top_ptr + 32);
+  const __m128i top_hihi = LoadUnaligned16(top_ptr + 48);
+  const __m128i top5 = _mm_cvtepu8_epi16(top_hilo);
+  const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
+  const __m128i top7 = _mm_cvtepu8_epi16(top_hihi);
+  const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
+  const __m128i scaled_bottom_left1 =
+      _mm_mullo_epi16(inverted_weights1, bottom_left);
+  const __m128i scaled_bottom_left2 =
+      _mm_mullo_epi16(inverted_weights2, bottom_left);
+  scale = _mm_set1_epi16(128);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+}
+
+void SmoothVertical64x32_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
+  const __m128i top_lolo = LoadUnaligned16(top_ptr);
+  const __m128i top_lohi = LoadUnaligned16(top_ptr + 16);
+  const __m128i top1 = _mm_cvtepu8_epi16(top_lolo);
+  const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
+  const __m128i top3 = _mm_cvtepu8_epi16(top_lohi);
+  const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
+  const __m128i top_hilo = LoadUnaligned16(top_ptr + 32);
+  const __m128i top_hihi = LoadUnaligned16(top_ptr + 48);
+  const __m128i top5 = _mm_cvtepu8_epi16(top_hilo);
+  const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
+  const __m128i top7 = _mm_cvtepu8_epi16(top_hihi);
+  const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
+  const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+  const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+  const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+  const __m128i scaled_bottom_left1 =
+      _mm_mullo_epi16(inverted_weights1, bottom_left);
+  const __m128i scaled_bottom_left2 =
+      _mm_mullo_epi16(inverted_weights2, bottom_left);
+  const __m128i scaled_bottom_left3 =
+      _mm_mullo_epi16(inverted_weights3, bottom_left);
+  const __m128i scaled_bottom_left4 =
+      _mm_mullo_epi16(inverted_weights4, bottom_left);
+  scale = _mm_set1_epi16(128);
+
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+}
+
+void SmoothVertical64x64_SSE4_1(
+    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_row,
+    const void* LIBGAV1_RESTRICT const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[63]);
+  const __m128i top_lolo = LoadUnaligned16(top_ptr);
+  const __m128i top_lohi = LoadUnaligned16(top_ptr + 16);
+  const __m128i top1 = _mm_cvtepu8_epi16(top_lolo);
+  const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
+  const __m128i top3 = _mm_cvtepu8_epi16(top_lohi);
+  const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
+  const __m128i top_hilo = LoadUnaligned16(top_ptr + 32);
+  const __m128i top_hihi = LoadUnaligned16(top_ptr + 48);
+  const __m128i top5 = _mm_cvtepu8_epi16(top_hilo);
+  const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
+  const __m128i top7 = _mm_cvtepu8_epi16(top_hihi);
+  const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
+  const __m128i scale = _mm_set1_epi16(256);
+  const __m128i round = _mm_set1_epi16(128);
+  const uint8_t* weights_base_ptr = kSmoothWeights + 60;
+  for (int left_offset = 0; left_offset < 64; left_offset += 16) {
+    const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
+    const __m128i weights_lo = _mm_cvtepu8_epi16(weights);
+    const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+    const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+    const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+    const __m128i scaled_bottom_left_lo =
+        _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+    const __m128i scaled_bottom_left_hi =
+        _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+      const __m128i y_select = _mm_set1_epi32(y_mask);
+      const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+      const __m128i scaled_bottom_left_y =
+          _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+      WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                  scaled_bottom_left_y, scaled_bottom_left_y,
+                                  round);
+      WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                  scaled_bottom_left_y, scaled_bottom_left_y,
+                                  round);
+
WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + dst += stride; + } + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select); + const __m128i scaled_bottom_left_y = + _mm_shuffle_epi8(scaled_bottom_left_hi, y_select); + WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + dst += stride; + } + } +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorSmooth) + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] = + Smooth4x4_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorSmooth) + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] = + Smooth4x8_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorSmooth) + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] = + Smooth4x16_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorSmooth) + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] = + Smooth8x4_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorSmooth) + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] = + Smooth8x8_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorSmooth) + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] = + Smooth8x16_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorSmooth) + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] = + Smooth8x32_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorSmooth) + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] = + SmoothWxH<16, 4>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorSmooth) + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] = + SmoothWxH<16, 8>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorSmooth) + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] = + SmoothWxH<16, 16>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorSmooth) + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] = + SmoothWxH<16, 32>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorSmooth) + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] = + SmoothWxH<16, 64>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorSmooth) + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] = + SmoothWxH<32, 8>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorSmooth) + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] = + SmoothWxH<32, 16>; +#endif 
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorSmooth) + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] = + SmoothWxH<32, 32>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorSmooth) + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] = + SmoothWxH<32, 64>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorSmooth) + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] = + SmoothWxH<64, 16>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorSmooth) + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] = + SmoothWxH<64, 32>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorSmooth) + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] = + SmoothWxH<64, 64>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorSmoothVertical) + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] = + SmoothVertical4x4_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorSmoothVertical) + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] = + SmoothVertical4x8_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorSmoothVertical) + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] = + SmoothVertical4x16_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorSmoothVertical) + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] = + SmoothVertical8x4_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorSmoothVertical) + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] = + SmoothVertical8x8_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorSmoothVertical) + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] = + SmoothVertical8x16_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorSmoothVertical) + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] = + SmoothVertical8x32_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorSmoothVertical) + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] = + SmoothVertical16x4_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorSmoothVertical) + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] = + SmoothVertical16x8_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorSmoothVertical) + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] = + SmoothVertical16x16_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorSmoothVertical) + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] = + SmoothVertical16x32_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorSmoothVertical) + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] = + SmoothVertical16x64_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorSmoothVertical) + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] = + SmoothVertical32x8_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorSmoothVertical) + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] = + SmoothVertical32x16_SSE4_1; +#endif +#if 
DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorSmoothVertical) + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] = + SmoothVertical32x32_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorSmoothVertical) + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] = + SmoothVertical32x64_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorSmoothVertical) + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] = + SmoothVertical64x16_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorSmoothVertical) + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] = + SmoothVertical64x32_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorSmoothVertical) + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] = + SmoothVertical64x64_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorSmoothHorizontal) + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal4x4_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorSmoothHorizontal) + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal4x8_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorSmoothHorizontal) + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal4x16_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorSmoothHorizontal) + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal8x4_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorSmoothHorizontal) + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal8x8_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorSmoothHorizontal) + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal8x16_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorSmoothHorizontal) + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal8x32_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorSmoothHorizontal) + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal16x4_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorSmoothHorizontal) + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal16x8_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorSmoothHorizontal) + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal16x16_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorSmoothHorizontal) + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal16x32_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorSmoothHorizontal) + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal16x64_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorSmoothHorizontal) + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal32x8_SSE4_1; +#endif +#if 
DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorSmoothHorizontal) + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal32x16_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorSmoothHorizontal) + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal32x32_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorSmoothHorizontal) + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal32x64_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorSmoothHorizontal) + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal64x16_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorSmoothHorizontal) + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal64x32_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorSmoothHorizontal) + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal64x64_SSE4_1; +#endif +} + +} // namespace +} // namespace low_bitdepth + +void IntraPredSmoothInit_SSE4_1() { low_bitdepth::Init8bpp(); } + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_TARGETING_SSE4_1 + +namespace libgav1 { +namespace dsp { + +void IntraPredSmoothInit_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 + +#endif // LIBGAV1_TARGETING_SSE4_1 diff --git a/src/dsp/x86/intrapred_smooth_sse4.h b/src/dsp/x86/intrapred_smooth_sse4.h new file mode 100644 index 0000000..9353371 --- /dev/null +++ b/src/dsp/x86/intrapred_smooth_sse4.h @@ -0,0 +1,318 @@ +/* + * Copyright 2021 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_SMOOTH_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_INTRAPRED_SMOOTH_SSE4_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::intra_predictors[][kIntraPredictorSmooth.*]. +// This function is not thread-safe. +void IntraPredSmoothInit_SSE4_1(); + +} // namespace dsp +} // namespace libgav1 + +// If sse4 is enabled and the baseline isn't set due to a higher level of +// optimization being enabled, signal the sse4 implementation should be used. 
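[Editor's note: illustration only, not part of the imported sources. The #ifndef/#define block that follows implements a priority scheme: a higher-priority header (for instance an AVX2 one) may have already defined a LIBGAV1_Dsp8bpp_* symbol, and only the entries still unset fall back to the SSE4.1 flag. A reduced, compilable model of the same pattern, with hypothetical names:]

#include <cstdio>

#define CPU_SSE4_1 0x1
#define CPU_AVX2 0x2

// Uncomment to simulate a higher-priority header claiming the slot first:
// #define Dsp8bpp_FuncX CPU_AVX2

#ifndef Dsp8bpp_FuncX
#define Dsp8bpp_FuncX CPU_SSE4_1  // fallback: SSE4.1 owns this entry
#endif

int main() {
  std::printf("FuncX dispatched at level %d\n", Dsp8bpp_FuncX);
  return 0;
}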
+#if LIBGAV1_TARGETING_SSE4_1 +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical +#define 
LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal 
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif +#endif // LIBGAV1_TARGETING_SSE4_1 + +#endif // LIBGAV1_SRC_DSP_X86_INTRAPRED_SMOOTH_SSE4_H_ diff --git a/src/dsp/x86/intrapred_sse4.cc b/src/dsp/x86/intrapred_sse4.cc new file mode 100644 index 0000000..556afed --- /dev/null +++ b/src/dsp/x86/intrapred_sse4.cc @@ -0,0 +1,2200 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+//------------------------------------------------------------------------------
+// Utility Functions
+
+// This is a fast way to divide by a number of the form 2^n + 2^k, n > k.
+// Divide by 2^k by right shifting by k, leaving the denominator 2^m + 1. In
+// the block size cases, n - k is 1 or 2 (block is proportional to 1x2 or
+// 1x4), so we use a multiplier that reflects division by 2+1=3 or 4+1=5 in
+// the high bits.
+constexpr int kThreeInverse = 0x5556;
+constexpr int kFiveInverse = 0x3334;
+template <int shiftk, int multiplier>
+inline __m128i DivideByMultiplyShift_U32(const __m128i dividend) {
+  const __m128i interm = _mm_srli_epi32(dividend, shiftk);
+  return _mm_mulhi_epi16(interm, _mm_cvtsi32_si128(multiplier));
+}
+
+//------------------------------------------------------------------------------
+// DcPredFuncs_SSE4_1
+
+using DcSumFunc = __m128i (*)(const void* ref);
+using DcStoreFunc = void (*)(void* dest, ptrdiff_t stride, const __m128i dc);
+using WriteDuplicateFunc = void (*)(void* dest, ptrdiff_t stride,
+                                    const __m128i column);
+// For copying an entire column across a block.
+using ColumnStoreFunc = void (*)(void* dest, ptrdiff_t stride,
+                                 const void* column);
+
+// DC intra-predictors for non-square blocks.
+template <int width_log2, int height_log2, DcSumFunc top_sumfn,
+          DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
+struct DcPredFuncs_SSE4_1 {
+  DcPredFuncs_SSE4_1() = delete;
+
+  static void DcTop(void* dest, ptrdiff_t stride, const void* top_row,
+                    const void* left_column);
+  static void DcLeft(void* dest, ptrdiff_t stride, const void* top_row,
+                     const void* left_column);
+  static void Dc(void* dest, ptrdiff_t stride, const void* top_row,
+                 const void* left_column);
+};
+
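[Editor's note: illustration only, not part of the imported sources. Why the multiply-shift divide above is exact: 0x5556 = ceil(2^16 / 3) and 0x3334 = ceil(2^16 / 5), so (x * m) >> 16 equals x / 3 (or x / 5) as long as x stays well below 2^15, which holds for the bounded pixel sums used here. A scalar exhaustive check:]

#include <cstdio>

int main() {
  const int kThreeInverse = 0x5556;  // ceil(65536 / 3)
  for (int x = 0; x < 1 << 14; ++x) {
    if (((x * kThreeInverse) >> 16) != x / 3) {
      std::printf("mismatch at %d\n", x);
      return 1;
    }
  }
  std::printf("ok\n");  // exact for the whole tested range
  return 0;
}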
+// Directional intra-predictors for square blocks.
+template <ColumnStoreFunc col_storefn>
+struct DirectionalPredFuncs_SSE4_1 {
+  DirectionalPredFuncs_SSE4_1() = delete;
+
+  static void Vertical(void* dest, ptrdiff_t stride, const void* top_row,
+                       const void* left_column);
+  static void Horizontal(void* dest, ptrdiff_t stride, const void* top_row,
+                         const void* left_column);
+};
+
+template <int width_log2, int height_log2, DcSumFunc top_sumfn,
+          DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
+void DcPredFuncs_SSE4_1<
+    width_log2, height_log2, top_sumfn, left_sumfn, storefn, shiftk,
+    dc_mult>::DcTop(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                    const void* LIBGAV1_RESTRICT const top_row,
+                    const void* /*left_column*/) {
+  const __m128i rounder = _mm_set1_epi32(1 << (width_log2 - 1));
+  const __m128i sum = top_sumfn(top_row);
+  const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, rounder), width_log2);
+  storefn(dest, stride, dc);
+}
+
+template <int width_log2, int height_log2, DcSumFunc top_sumfn,
+          DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
+void DcPredFuncs_SSE4_1<
+    width_log2, height_log2, top_sumfn, left_sumfn, storefn, shiftk,
+    dc_mult>::DcLeft(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                     const void* /*top_row*/,
+                     const void* LIBGAV1_RESTRICT const left_column) {
+  const __m128i rounder = _mm_set1_epi32(1 << (height_log2 - 1));
+  const __m128i sum = left_sumfn(left_column);
+  const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, rounder), height_log2);
+  storefn(dest, stride, dc);
+}
+
+template <int width_log2, int height_log2, DcSumFunc top_sumfn,
+          DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
+void DcPredFuncs_SSE4_1<
+    width_log2, height_log2, top_sumfn, left_sumfn, storefn, shiftk,
+    dc_mult>::Dc(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+                 const void* LIBGAV1_RESTRICT const top_row,
+                 const void* LIBGAV1_RESTRICT const left_column) {
+  const __m128i rounder =
+      _mm_set1_epi32((1 << (width_log2 - 1)) + (1 << (height_log2 - 1)));
+  const __m128i sum_top = top_sumfn(top_row);
+  const __m128i sum_left = left_sumfn(left_column);
+  const __m128i sum = _mm_add_epi32(sum_top, sum_left);
+  if (width_log2 == height_log2) {
+    const __m128i dc =
+        _mm_srli_epi32(_mm_add_epi32(sum, rounder), width_log2 + 1);
+    storefn(dest, stride, dc);
+  } else {
+    const __m128i dc =
+        DivideByMultiplyShift_U32<shiftk, dc_mult>(_mm_add_epi32(sum, rounder));
+    storefn(dest, stride, dc);
+  }
+}
+
+//------------------------------------------------------------------------------
+// DcPredFuncs_SSE4_1 directional predictors
+
+template <ColumnStoreFunc col_storefn>
+void DirectionalPredFuncs_SSE4_1<col_storefn>::Horizontal(
+    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+    const void* /*top_row*/, const void* LIBGAV1_RESTRICT const left_column) {
+  col_storefn(dest, stride, left_column);
+}
+
+}  // namespace
+
+//------------------------------------------------------------------------------
+namespace low_bitdepth {
+namespace {
+
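[Editor's note: illustration only, not part of the imported sources. The DcSumN/DcStoreNxH helpers that follow vectorize plain DC prediction: average the w top and h left neighbors with round-to-nearest, then fill the block. The SIMD sums use _mm_sad_epu8 against zero, which adds 8 bytes per 64-bit half in one instruction. Scalar model:]

#include <cstddef>
#include <cstdint>

void DcPredictorScalar(uint8_t* dst, ptrdiff_t stride, int w, int h,
                       const uint8_t* top, const uint8_t* left) {
  int sum = 0;
  for (int i = 0; i < w; ++i) sum += top[i];
  for (int i = 0; i < h; ++i) sum += left[i];
  const int dc = (sum + ((w + h) >> 1)) / (w + h);  // round-to-nearest average
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) dst[x] = static_cast<uint8_t>(dc);
    dst += stride;
  }
}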
+// |ref| points to 4 bytes containing 4 packed ints.
+inline __m128i DcSum4_SSE4_1(const void* const ref) {
+  const __m128i vals = Load4(ref);
+  const __m128i zero = _mm_setzero_si128();
+  return _mm_sad_epu8(vals, zero);
+}
+
+inline __m128i DcSum8_SSE4_1(const void* const ref) {
+  const __m128i vals = LoadLo8(ref);
+  const __m128i zero = _mm_setzero_si128();
+  return _mm_sad_epu8(vals, zero);
+}
+
+inline __m128i DcSum16_SSE4_1(const void* const ref) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i vals = LoadUnaligned16(ref);
+  const __m128i partial_sum = _mm_sad_epu8(vals, zero);
+  return _mm_add_epi16(partial_sum, _mm_srli_si128(partial_sum, 8));
+}
+
+inline __m128i DcSum32_SSE4_1(const void* const ref) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i vals1 = LoadUnaligned16(ref);
+  const __m128i vals2 = LoadUnaligned16(static_cast<const uint8_t*>(ref) + 16);
+  const __m128i partial_sum1 = _mm_sad_epu8(vals1, zero);
+  const __m128i partial_sum2 = _mm_sad_epu8(vals2, zero);
+  const __m128i partial_sum = _mm_add_epi16(partial_sum1, partial_sum2);
+  return _mm_add_epi16(partial_sum, _mm_srli_si128(partial_sum, 8));
+}
+
+inline __m128i DcSum64_SSE4_1(const void* const ref) {
+  const auto* const ref_ptr = static_cast<const uint8_t*>(ref);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i vals1 = LoadUnaligned16(ref_ptr);
+  const __m128i vals2 = LoadUnaligned16(ref_ptr + 16);
+  const __m128i vals3 = LoadUnaligned16(ref_ptr + 32);
+  const __m128i vals4 = LoadUnaligned16(ref_ptr + 48);
+  const __m128i partial_sum1 = _mm_sad_epu8(vals1, zero);
+  const __m128i partial_sum2 = _mm_sad_epu8(vals2, zero);
+  __m128i partial_sum = _mm_add_epi16(partial_sum1, partial_sum2);
+  const __m128i partial_sum3 = _mm_sad_epu8(vals3, zero);
+  partial_sum = _mm_add_epi16(partial_sum, partial_sum3);
+  const __m128i partial_sum4 = _mm_sad_epu8(vals4, zero);
+  partial_sum = _mm_add_epi16(partial_sum, partial_sum4);
+  return _mm_add_epi16(partial_sum, _mm_srli_si128(partial_sum, 8));
+}
+
+template <int height>
+inline void DcStore4xH_SSE4_1(void* const dest, ptrdiff_t stride,
+                              const __m128i dc) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
+  int y = height - 1;
+  auto* dst = static_cast<uint8_t*>(dest);
+  do {
+    Store4(dst, dc_dup);
+    dst += stride;
+  } while (--y != 0);
+  Store4(dst, dc_dup);
+}
+
+template <int height>
+inline void DcStore8xH_SSE4_1(void* const dest, ptrdiff_t stride,
+                              const __m128i dc) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
+  int y = height - 1;
+  auto* dst = static_cast<uint8_t*>(dest);
+  do {
+    StoreLo8(dst, dc_dup);
+    dst += stride;
+  } while (--y != 0);
+  StoreLo8(dst, dc_dup);
+}
+
+template <int height>
+inline void DcStore16xH_SSE4_1(void* const dest, ptrdiff_t stride,
+                               const __m128i dc) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
+  int y = height - 1;
+  auto* dst = static_cast<uint8_t*>(dest);
+  do {
+    StoreUnaligned16(dst, dc_dup);
+    dst += stride;
+  } while (--y != 0);
+  StoreUnaligned16(dst, dc_dup);
+}
+
+template <int height>
+inline void DcStore32xH_SSE4_1(void* const dest, ptrdiff_t stride,
+                               const __m128i dc) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
+  int y = height - 1;
+  auto* dst = static_cast<uint8_t*>(dest);
+  do {
+    StoreUnaligned16(dst, dc_dup);
+    StoreUnaligned16(dst + 16, dc_dup);
+    dst += stride;
+  } while (--y != 0);
+  StoreUnaligned16(dst, dc_dup);
+  StoreUnaligned16(dst + 16, dc_dup);
+}
+
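[Editor's note: illustration only, not part of the imported sources. The DcStore helpers above rely on _mm_shuffle_epi8 with an all-zero control vector, which broadcasts byte 0 of |dc| into all 16 lanes, turning the single DC value into a full row. Minimal standalone check:]

#include <smmintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  const __m128i dc = _mm_cvtsi32_si128(0x7f);  // DC value lives in byte 0.
  const __m128i row = _mm_shuffle_epi8(dc, _mm_setzero_si128());
  uint8_t out[16];
  _mm_storeu_si128(reinterpret_cast<__m128i*>(out), row);
  for (int i = 0; i < 16; ++i) std::printf("%02x ", out[i]);  // 16x "7f"
  std::printf("\n");
  return 0;
}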
+template <int height>
+inline void DcStore64xH_SSE4_1(void* const dest, ptrdiff_t stride,
+                               const __m128i dc) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
+  int y = height - 1;
+  auto* dst = static_cast<uint8_t*>(dest);
+  do {
+    StoreUnaligned16(dst, dc_dup);
+    StoreUnaligned16(dst + 16, dc_dup);
+    StoreUnaligned16(dst + 32, dc_dup);
+    StoreUnaligned16(dst + 48, dc_dup);
+    dst += stride;
+  } while (--y != 0);
+  StoreUnaligned16(dst, dc_dup);
+  StoreUnaligned16(dst + 16, dc_dup);
+  StoreUnaligned16(dst + 32, dc_dup);
+  StoreUnaligned16(dst + 48, dc_dup);
+}
+
+// WriteDuplicateN assumes dup has 4 sets of 4 identical bytes that are meant
+// to be copied for width N into dest.
+inline void WriteDuplicate4x4(void* const dest, ptrdiff_t stride,
+                              const __m128i dup32) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  Store4(dst, dup32);
+  dst += stride;
+  const int row1 = _mm_extract_epi32(dup32, 1);
+  memcpy(dst, &row1, 4);
+  dst += stride;
+  const int row2 = _mm_extract_epi32(dup32, 2);
+  memcpy(dst, &row2, 4);
+  dst += stride;
+  const int row3 = _mm_extract_epi32(dup32, 3);
+  memcpy(dst, &row3, 4);
+}
+
+inline void WriteDuplicate8x4(void* const dest, ptrdiff_t stride,
+                              const __m128i dup32) {
+  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+  auto* dst = static_cast<uint8_t*>(dest);
+  _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), dup64_lo);
+  dst += stride;
+  _mm_storeh_pi(reinterpret_cast<__m64*>(dst), _mm_castsi128_ps(dup64_lo));
+  dst += stride;
+  _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), dup64_hi);
+  dst += stride;
+  _mm_storeh_pi(reinterpret_cast<__m64*>(dst), _mm_castsi128_ps(dup64_hi));
+}
+
+inline void WriteDuplicate16x4(void* const dest, ptrdiff_t stride,
+                               const __m128i dup32) {
+  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+  dst += stride;
+  const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+  dst += stride;
+  const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+  dst += stride;
+  const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+}
+
+inline void WriteDuplicate32x4(void* const dest, ptrdiff_t stride,
+                               const __m128i dup32) {
+  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_0);
+  dst += stride;
+  const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_1);
+  dst += stride;
+  const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_2);
+  dst += stride;
+  const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_3);
+}
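[Editor's note: illustration only, not part of the imported sources. The WriteDuplicateNx4 helpers all start from |dup32|, which packs four rows as four 32-bit lanes, each lane already holding one byte repeated four times. Unpacking a register with itself widens that replication step by step: 32-bit lanes to 64-bit halves to full 128-bit rows. Minimal check of the first row:]

#include <smmintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  const __m128i dup32 = _mm_set_epi32(0x44444444, 0x33333333, 0x22222222,
                                      0x11111111);  // rows 3, 2, 1, 0
  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);  // rows 0,0,1,1
  const __m128i row0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);  // 16x 0x11
  uint8_t out[16];
  _mm_storeu_si128(reinterpret_cast<__m128i*>(out), row0);
  std::printf("%02x %02x\n", out[0], out[15]);  // prints "11 11"
  return 0;
}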
+
+inline void WriteDuplicate64x4(void* const dest, ptrdiff_t stride,
+                               const __m128i dup32) {
+  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_0);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_0);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_0);
+  dst += stride;
+  const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_1);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_1);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_1);
+  dst += stride;
+  const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_2);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_2);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_2);
+  dst += stride;
+  const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_3);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_3);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_3);
+}
+
+// ColStoreN<height> copies each of the |height| values in |column| across its
+// corresponding row in dest.
+template <WriteDuplicateFunc writefn>
+inline void ColStore4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                             ptrdiff_t stride,
+                             const void* LIBGAV1_RESTRICT const column) {
+  const __m128i col_data = Load4(column);
+  const __m128i col_dup16 = _mm_unpacklo_epi8(col_data, col_data);
+  const __m128i col_dup32 = _mm_unpacklo_epi16(col_dup16, col_dup16);
+  writefn(dest, stride, col_dup32);
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                             ptrdiff_t stride,
+                             const void* LIBGAV1_RESTRICT const column) {
+  const ptrdiff_t stride4 = stride << 2;
+  const __m128i col_data = LoadLo8(column);
+  const __m128i col_dup16 = _mm_unpacklo_epi8(col_data, col_data);
+  const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_dup16, col_dup16);
+  auto* dst = static_cast<uint8_t*>(dest);
+  writefn(dst, stride, col_dup32_lo);
+  dst += stride4;
+  const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_dup16, col_dup16);
+  writefn(dst, stride, col_dup32_hi);
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                              ptrdiff_t stride,
+                              const void* LIBGAV1_RESTRICT const column) {
+  const ptrdiff_t stride4 = stride << 2;
+  const __m128i col_data = _mm_loadu_si128(static_cast<const __m128i*>(column));
+  const __m128i col_dup16_lo = _mm_unpacklo_epi8(col_data, col_data);
+  const __m128i col_dup16_hi = _mm_unpackhi_epi8(col_data, col_data);
+  const __m128i col_dup32_lolo =
+      _mm_unpacklo_epi16(col_dup16_lo, col_dup16_lo);
+  auto* dst = static_cast<uint8_t*>(dest);
+  writefn(dst, stride, col_dup32_lolo);
+  dst += stride4;
+  const __m128i col_dup32_lohi =
+      _mm_unpackhi_epi16(col_dup16_lo, col_dup16_lo);
+  writefn(dst, stride, col_dup32_lohi);
+  dst += stride4;
+  const __m128i col_dup32_hilo =
+      _mm_unpacklo_epi16(col_dup16_hi, col_dup16_hi);
+  writefn(dst, stride, col_dup32_hilo);
+  dst += stride4;
+  const __m128i col_dup32_hihi =
+      _mm_unpackhi_epi16(col_dup16_hi, col_dup16_hi);
+  writefn(dst, stride, col_dup32_hihi);
+}
+
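[Editor's note: illustration only, not part of the imported sources. The ColStore helpers implement the horizontal predictor, which just repeats left[y] across row y; the SIMD code expands the column bytes with unpacks and hands four rows at a time to a WriteDuplicate function. Scalar equivalent:]

#include <cstddef>
#include <cstdint>
#include <cstring>

void HorizontalScalar(uint8_t* dst, ptrdiff_t stride, int width, int height,
                      const uint8_t* left) {
  for (int y = 0; y < height; ++y) {
    std::memset(dst, left[y], width);  // one left-neighbor byte per row
    dst += stride;
  }
}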
+
+template <void (*writefn)(void* const, ptrdiff_t, const __m128i)>
+inline void ColStore16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                              ptrdiff_t stride,
+                              const void* LIBGAV1_RESTRICT const column) {
+  const ptrdiff_t stride4 = stride << 2;
+  const __m128i col_data =
+      _mm_loadu_si128(static_cast<const __m128i*>(column));
+  const __m128i col_dup16_lo = _mm_unpacklo_epi8(col_data, col_data);
+  const __m128i col_dup16_hi = _mm_unpackhi_epi8(col_data, col_data);
+  const __m128i col_dup32_lolo =
+      _mm_unpacklo_epi16(col_dup16_lo, col_dup16_lo);
+  auto* dst = static_cast<uint8_t*>(dest);
+  writefn(dst, stride, col_dup32_lolo);
+  dst += stride4;
+  const __m128i col_dup32_lohi =
+      _mm_unpackhi_epi16(col_dup16_lo, col_dup16_lo);
+  writefn(dst, stride, col_dup32_lohi);
+  dst += stride4;
+  const __m128i col_dup32_hilo =
+      _mm_unpacklo_epi16(col_dup16_hi, col_dup16_hi);
+  writefn(dst, stride, col_dup32_hilo);
+  dst += stride4;
+  const __m128i col_dup32_hihi =
+      _mm_unpackhi_epi16(col_dup16_hi, col_dup16_hi);
+  writefn(dst, stride, col_dup32_hihi);
+}
+
+template <void (*writefn)(void* const, ptrdiff_t, const __m128i)>
+inline void ColStore32_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                              ptrdiff_t stride,
+                              const void* LIBGAV1_RESTRICT const column) {
+  const ptrdiff_t stride4 = stride << 2;
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y = 0; y < 32; y += 16) {
+    const __m128i col_data =
+        LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
+    const __m128i col_dup16_lo = _mm_unpacklo_epi8(col_data, col_data);
+    const __m128i col_dup16_hi = _mm_unpackhi_epi8(col_data, col_data);
+    const __m128i col_dup32_lolo =
+        _mm_unpacklo_epi16(col_dup16_lo, col_dup16_lo);
+    writefn(dst, stride, col_dup32_lolo);
+    dst += stride4;
+    const __m128i col_dup32_lohi =
+        _mm_unpackhi_epi16(col_dup16_lo, col_dup16_lo);
+    writefn(dst, stride, col_dup32_lohi);
+    dst += stride4;
+    const __m128i col_dup32_hilo =
+        _mm_unpacklo_epi16(col_dup16_hi, col_dup16_hi);
+    writefn(dst, stride, col_dup32_hilo);
+    dst += stride4;
+    const __m128i col_dup32_hihi =
+        _mm_unpackhi_epi16(col_dup16_hi, col_dup16_hi);
+    writefn(dst, stride, col_dup32_hihi);
+    dst += stride4;
+  }
+}
+
+template <void (*writefn)(void* const, ptrdiff_t, const __m128i)>
+inline void ColStore64_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                              ptrdiff_t stride,
+                              const void* LIBGAV1_RESTRICT const column) {
+  const ptrdiff_t stride4 = stride << 2;
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y = 0; y < 64; y += 16) {
+    const __m128i col_data =
+        LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
+    const __m128i col_dup16_lo = _mm_unpacklo_epi8(col_data, col_data);
+    const __m128i col_dup16_hi = _mm_unpackhi_epi8(col_data, col_data);
+    const __m128i col_dup32_lolo =
+        _mm_unpacklo_epi16(col_dup16_lo, col_dup16_lo);
+    writefn(dst, stride, col_dup32_lolo);
+    dst += stride4;
+    const __m128i col_dup32_lohi =
+        _mm_unpackhi_epi16(col_dup16_lo, col_dup16_lo);
+    writefn(dst, stride, col_dup32_lohi);
+    dst += stride4;
+    const __m128i col_dup32_hilo =
+        _mm_unpacklo_epi16(col_dup16_hi, col_dup16_hi);
+    writefn(dst, stride, col_dup32_hilo);
+    dst += stride4;
+    const __m128i col_dup32_hihi =
+        _mm_unpackhi_epi16(col_dup16_hi, col_dup16_hi);
+    writefn(dst, stride, col_dup32_hihi);
+    dst += stride4;
+  }
+}
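DcDefs below pairs a top and a left summing function with a store, plus the shiftk/dc_mult pair that replaces division by the pixel count for non-square blocks. A worked sketch of that arithmetic for a 4x8 block follows; it is illustrative only, 0x5556 is assumed here as the Q16 reciprocal of 3 that kThreeInverse names elsewhere in this file, and rounding is omitted.

inline uint32_t DcAverage4x8_Sketch(uint32_t sum) {
  // 12 samples contribute: 1/12 == (1/4) * (1/3). The power-of-two factor is
  // a shift (shiftk == 2); the remaining 1/3 is a Q16 fixed-point multiply.
  const uint32_t quarter = sum >> 2;  // divide by 1 << shiftk
  return (quarter * 0x5556) >> 16;    // ~= quarter / 3, so ~= sum / 12
}

For example, a sum of 1536 (twelve pixels of value 128) gives quarter == 384 and (384 * 0x5556) >> 16 == 128, the expected DC value.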
+
+struct DcDefs {
+  DcDefs() = delete;
+
+  using _4x4 = DcPredFuncs_SSE4_1<2, 2, DcSum4_SSE4_1, DcSum4_SSE4_1,
+                                  DcStore4xH_SSE4_1<4>, 0, 0>;
+  // shiftk is the smaller of width_log2 and height_log2.
+  // dc_mult corresponds to the ratio of the smaller block size to the larger.
+  using _4x8 = DcPredFuncs_SSE4_1<2, 3, DcSum4_SSE4_1, DcSum8_SSE4_1,
+                                  DcStore4xH_SSE4_1<8>, 2, kThreeInverse>;
+  using _4x16 = DcPredFuncs_SSE4_1<2, 4, DcSum4_SSE4_1, DcSum16_SSE4_1,
+                                   DcStore4xH_SSE4_1<16>, 2, kFiveInverse>;
+
+  using _8x4 = DcPredFuncs_SSE4_1<3, 2, DcSum8_SSE4_1, DcSum4_SSE4_1,
+                                  DcStore8xH_SSE4_1<4>, 2, kThreeInverse>;
+  using _8x8 = DcPredFuncs_SSE4_1<3, 3, DcSum8_SSE4_1, DcSum8_SSE4_1,
+                                  DcStore8xH_SSE4_1<8>, 0, 0>;
+  using _8x16 = DcPredFuncs_SSE4_1<3, 4, DcSum8_SSE4_1, DcSum16_SSE4_1,
+                                   DcStore8xH_SSE4_1<16>, 3, kThreeInverse>;
+  using _8x32 = DcPredFuncs_SSE4_1<3, 5, DcSum8_SSE4_1, DcSum32_SSE4_1,
+                                   DcStore8xH_SSE4_1<32>, 3, kFiveInverse>;
+
+  using _16x4 = DcPredFuncs_SSE4_1<4, 2, DcSum16_SSE4_1, DcSum4_SSE4_1,
+                                   DcStore16xH_SSE4_1<4>, 2, kFiveInverse>;
+  using _16x8 = DcPredFuncs_SSE4_1<4, 3, DcSum16_SSE4_1, DcSum8_SSE4_1,
+                                   DcStore16xH_SSE4_1<8>, 3, kThreeInverse>;
+  using _16x16 = DcPredFuncs_SSE4_1<4, 4, DcSum16_SSE4_1, DcSum16_SSE4_1,
+                                    DcStore16xH_SSE4_1<16>, 0, 0>;
+  using _16x32 = DcPredFuncs_SSE4_1<4, 5, DcSum16_SSE4_1, DcSum32_SSE4_1,
+                                    DcStore16xH_SSE4_1<32>, 4, kThreeInverse>;
+  using _16x64 = DcPredFuncs_SSE4_1<4, 6, DcSum16_SSE4_1, DcSum64_SSE4_1,
+                                    DcStore16xH_SSE4_1<64>, 4, kFiveInverse>;
+
+  using _32x8 = DcPredFuncs_SSE4_1<5, 3, DcSum32_SSE4_1, DcSum8_SSE4_1,
+                                   DcStore32xH_SSE4_1<8>, 3, kFiveInverse>;
+  using _32x16 = DcPredFuncs_SSE4_1<5, 4, DcSum32_SSE4_1, DcSum16_SSE4_1,
+                                    DcStore32xH_SSE4_1<16>, 4, kThreeInverse>;
+  using _32x32 = DcPredFuncs_SSE4_1<5, 5, DcSum32_SSE4_1, DcSum32_SSE4_1,
+                                    DcStore32xH_SSE4_1<32>, 0, 0>;
+  using _32x64 = DcPredFuncs_SSE4_1<5, 6, DcSum32_SSE4_1, DcSum64_SSE4_1,
+                                    DcStore32xH_SSE4_1<64>, 5, kThreeInverse>;
+
+  using _64x16 = DcPredFuncs_SSE4_1<6, 4, DcSum64_SSE4_1, DcSum16_SSE4_1,
+                                    DcStore64xH_SSE4_1<16>, 4, kFiveInverse>;
+  using _64x32 = DcPredFuncs_SSE4_1<6, 5, DcSum64_SSE4_1, DcSum32_SSE4_1,
+                                    DcStore64xH_SSE4_1<32>, 5, kThreeInverse>;
+  using _64x64 = DcPredFuncs_SSE4_1<6, 6, DcSum64_SSE4_1, DcSum64_SSE4_1,
+                                    DcStore64xH_SSE4_1<64>, 0, 0>;
+};
+
+struct DirDefs {
+  DirDefs() = delete;
+
+  using _4x4 =
+      DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate4x4>>;
+  using _4x8 =
+      DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate4x4>>;
+  using _4x16 =
+      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate4x4>>;
+  using _8x4 =
+      DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate8x4>>;
+  using _8x8 =
+      DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate8x4>>;
+  using _8x16 =
+      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate8x4>>;
+  using _8x32 =
+      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate8x4>>;
+  using _16x4 =
+      DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate16x4>>;
+  using _16x8 =
+      DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate16x4>>;
+  using _16x16 =
+      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate16x4>>;
+  using _16x32 =
+      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate16x4>>;
+  using _16x64 =
+      DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate16x4>>;
+  using _32x8 =
+      DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate32x4>>;
+  using _32x16 =
+      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate32x4>>;
+  using _32x32 =
+      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate32x4>>;
+  using _32x64 =
+      DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate32x4>>;
+  using _64x16 =
+      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate64x4>>;
+  using _64x32 =
+      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate64x4>>;
+  using _64x64 =
+      DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate64x4>>;
+};
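The Paeth selection that the WritePaethLine helpers below vectorize comes from AV1 spec section 7.11.2.2 and is simple in scalar form. A reference sketch, not part of the imported source:

inline int AbsDiff(int a, int b) { return (a > b) ? a - b : b - a; }

inline uint8_t PaethPredictorScalar(uint8_t left, uint8_t top,
                                    uint8_t top_left) {
  const int base = left + top - top_left;
  const int p_left = AbsDiff(base, left);      // == |top - top_left|
  const int p_top = AbsDiff(base, top);        // == |left - top_left|
  const int p_top_left = AbsDiff(base, top_left);
  // Prefer left, then top, then top_left, breaking ties in that order.
  if (p_left <= p_top && p_left <= p_top_left) return left;
  if (p_top <= p_top_left) return top;
  return top_left;
}

The SIMD helpers cannot express the two less-or-equal comparisons directly, which is why their selection masks are built from inverted greater-than compares.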
+
+template <int y_mask>
+inline void WritePaethLine4(uint8_t* LIBGAV1_RESTRICT dst, const __m128i& top,
+                            const __m128i& left, const __m128i& top_lefts,
+                            const __m128i& top_dists,
+                            const __m128i& left_dists,
+                            const __m128i& top_left_diffs) {
+  const __m128i top_dists_y = _mm_shuffle_epi32(top_dists, y_mask);
+
+  const __m128i lefts_y = _mm_shuffle_epi32(left, y_mask);
+  const __m128i top_left_dists =
+      _mm_abs_epi32(_mm_add_epi32(lefts_y, top_left_diffs));
+
+  // Section 7.11.2.2 specifies the logic and terms here. The less-or-equal
+  // operation is unavailable, so the logic for selecting top, left, or
+  // top_left is inverted.
+  __m128i not_select_left = _mm_cmpgt_epi32(left_dists, top_left_dists);
+  not_select_left =
+      _mm_or_si128(not_select_left, _mm_cmpgt_epi32(left_dists, top_dists_y));
+  const __m128i not_select_top = _mm_cmpgt_epi32(top_dists_y, top_left_dists);
+
+  const __m128i left_out = _mm_andnot_si128(not_select_left, lefts_y);
+
+  const __m128i top_left_out = _mm_and_si128(not_select_top, top_lefts);
+  __m128i top_or_top_left_out = _mm_andnot_si128(not_select_top, top);
+  top_or_top_left_out = _mm_or_si128(top_or_top_left_out, top_left_out);
+  top_or_top_left_out = _mm_and_si128(not_select_left, top_or_top_left_out);
+
+  // The sequence of 32-bit packed operations was found (see CL via blame) to
+  // outperform 16-bit operations, despite the availability of the packus
+  // function, when tested on a Xeon E7 v3.
+  const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
+  const __m128i pred = _mm_shuffle_epi8(
+      _mm_or_si128(left_out, top_or_top_left_out), cvtepi32_epi8);
+  Store4(dst, pred);
+}
+
+// top_left_diffs is the only variable whose ints may exceed 8 bits. Otherwise
+// we would be able to do all of these operations as epi8 for a 16-pixel
+// version of this function. Still, since lefts_y is just a vector of
+// duplicates, it could pay off to accommodate top_left_dists for cmpgt, and
+// repack into epi8 for the blends.
+template <int y_mask>
+inline void WritePaethLine8(uint8_t* LIBGAV1_RESTRICT dst, const __m128i& top,
+                            const __m128i& left, const __m128i& top_lefts,
+                            const __m128i& top_dists,
+                            const __m128i& left_dists,
+                            const __m128i& top_left_diffs) {
+  const __m128i select_y = _mm_set1_epi32(y_mask);
+  const __m128i top_dists_y = _mm_shuffle_epi8(top_dists, select_y);
+
+  const __m128i lefts_y = _mm_shuffle_epi8(left, select_y);
+  const __m128i top_left_dists =
+      _mm_abs_epi16(_mm_add_epi16(lefts_y, top_left_diffs));
+
+  // Section 7.11.2.2 specifies the logic and terms here. The less-or-equal
+  // operation is unavailable, so the logic for selecting top, left, or
+  // top_left is inverted.
+  __m128i not_select_left = _mm_cmpgt_epi16(left_dists, top_left_dists);
+  not_select_left =
+      _mm_or_si128(not_select_left, _mm_cmpgt_epi16(left_dists, top_dists_y));
+  const __m128i not_select_top = _mm_cmpgt_epi16(top_dists_y, top_left_dists);
+
+  const __m128i left_out = _mm_andnot_si128(not_select_left, lefts_y);
+
+  const __m128i top_left_out = _mm_and_si128(not_select_top, top_lefts);
+  __m128i top_or_top_left_out = _mm_andnot_si128(not_select_top, top);
+  top_or_top_left_out = _mm_or_si128(top_or_top_left_out, top_left_out);
+  top_or_top_left_out = _mm_and_si128(not_select_left, top_or_top_left_out);
+
+  const __m128i pred = _mm_packus_epi16(
+      _mm_or_si128(left_out, top_or_top_left_out), /* unused */ left_out);
+  _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), pred);
+}
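Both helpers lean on one mask-select idiom, distilled here for reference as a standalone sketch (the functions above inline it rather than calling a helper):

inline __m128i SelectLanes(const __m128i mask, const __m128i if_set,
                           const __m128i if_clear) {
  // mask lanes must be all-ones or all-zeros, as _mm_cmpgt_* produces.
  return _mm_or_si128(_mm_and_si128(mask, if_set),
                      _mm_andnot_si128(mask, if_clear));
}

A lane satisfies a <= b exactly when cmpgt(a, b) leaves its mask clear, which is why the selections are phrased through not_select_left/not_select_top.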
+
+// |top| is an epi8 of length 16
+// |left| is epi8 of unknown length, as y_mask specifies access
+// |top_lefts| is an epi8 of 16 duplicates
+// |top_dists| is an epi8 of unknown length, as y_mask specifies access
+// |left_dists| is an epi8 of length 16
+// |left_dists_lo| is an epi16 of length 8
+// |left_dists_hi| is an epi16 of length 8
+// |top_left_diffs_lo| is an epi16 of length 8
+// |top_left_diffs_hi| is an epi16 of length 8
+// The latter two vectors are epi16 because their values may reach -510.
+// |left_dists| is provided alongside its spread out version because it
+// doesn't change between calls and interacts with both kinds of packing.
+template <int y_mask>
+inline void WritePaethLine16(uint8_t* LIBGAV1_RESTRICT dst,
+                             const __m128i& top, const __m128i& left,
+                             const __m128i& top_lefts,
+                             const __m128i& top_dists,
+                             const __m128i& left_dists,
+                             const __m128i& left_dists_lo,
+                             const __m128i& left_dists_hi,
+                             const __m128i& top_left_diffs_lo,
+                             const __m128i& top_left_diffs_hi) {
+  const __m128i select_y = _mm_set1_epi32(y_mask);
+  const __m128i top_dists_y8 = _mm_shuffle_epi8(top_dists, select_y);
+  const __m128i top_dists_y16 = _mm_cvtepu8_epi16(top_dists_y8);
+  const __m128i lefts_y8 = _mm_shuffle_epi8(left, select_y);
+  const __m128i lefts_y16 = _mm_cvtepu8_epi16(lefts_y8);
+
+  const __m128i top_left_dists_lo =
+      _mm_abs_epi16(_mm_add_epi16(lefts_y16, top_left_diffs_lo));
+  const __m128i top_left_dists_hi =
+      _mm_abs_epi16(_mm_add_epi16(lefts_y16, top_left_diffs_hi));
+
+  const __m128i left_gt_top_left_lo = _mm_packs_epi16(
+      _mm_cmpgt_epi16(left_dists_lo, top_left_dists_lo), left_dists_lo);
+  const __m128i left_gt_top_left_hi =
+      _mm_packs_epi16(_mm_cmpgt_epi16(left_dists_hi, top_left_dists_hi),
+                      /* unused second arg for pack */ left_dists_hi);
+  const __m128i left_gt_top_left = _mm_alignr_epi8(
+      left_gt_top_left_hi, _mm_slli_si128(left_gt_top_left_lo, 8), 8);
+
+  const __m128i not_select_top_lo =
+      _mm_packs_epi16(_mm_cmpgt_epi16(top_dists_y16, top_left_dists_lo),
+                      /* unused second arg for pack */ top_dists_y16);
+  const __m128i not_select_top_hi =
+      _mm_packs_epi16(_mm_cmpgt_epi16(top_dists_y16, top_left_dists_hi),
+                      /* unused second arg for pack */ top_dists_y16);
+  const __m128i not_select_top = _mm_alignr_epi8(
+      not_select_top_hi, _mm_slli_si128(not_select_top_lo, 8), 8);
+
+  const __m128i left_leq_top =
+      _mm_cmpeq_epi8(left_dists, _mm_min_epu8(top_dists_y8, left_dists));
+  const __m128i select_left = _mm_andnot_si128(left_gt_top_left, left_leq_top);
+
+  // Section 7.11.2.2 specifies the logic and terms here. The less-or-equal
+  // operation is unavailable, so the logic for selecting top, left, or
+  // top_left is inverted.
+ const __m128i left_out = _mm_and_si128(select_left, lefts_y8); + + const __m128i top_left_out = _mm_and_si128(not_select_top, top_lefts); + __m128i top_or_top_left_out = _mm_andnot_si128(not_select_top, top); + top_or_top_left_out = _mm_or_si128(top_or_top_left_out, top_left_out); + top_or_top_left_out = _mm_andnot_si128(select_left, top_or_top_left_out); + const __m128i pred = _mm_or_si128(left_out, top_or_top_left_out); + + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), pred); +} + +void Paeth4x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const __m128i left = _mm_cvtepu8_epi32(Load4(left_column)); + const __m128i top = _mm_cvtepu8_epi32(Load4(top_row)); + + const auto* const top_ptr = static_cast(top_row); + const __m128i top_lefts = _mm_set1_epi32(top_ptr[-1]); + + // Given that the spec defines "base" as top[x] + left[y] - top[-1], + // pLeft = abs(base - left[y]) = abs(top[x] - top[-1]) + // pTop = abs(base - top[x]) = abs(left[y] - top[-1]) + const __m128i left_dists = _mm_abs_epi32(_mm_sub_epi32(top, top_lefts)); + const __m128i top_dists = _mm_abs_epi32(_mm_sub_epi32(left, top_lefts)); + + const __m128i top_left_x2 = _mm_add_epi32(top_lefts, top_lefts); + const __m128i top_left_diff = _mm_sub_epi32(top, top_left_x2); + auto* dst = static_cast(dest); + WritePaethLine4<0>(dst, top, left, top_lefts, top_dists, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0x55>(dst, top, left, top_lefts, top_dists, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0xAA>(dst, top, left, top_lefts, top_dists, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0xFF>(dst, top, left, top_lefts, top_dists, left_dists, + top_left_diff); +} + +void Paeth4x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const __m128i left = LoadLo8(left_column); + const __m128i left_lo = _mm_cvtepu8_epi32(left); + const __m128i left_hi = _mm_cvtepu8_epi32(_mm_srli_si128(left, 4)); + + const __m128i top = _mm_cvtepu8_epi32(Load4(top_row)); + const auto* const top_ptr = static_cast(top_row); + const __m128i top_lefts = _mm_set1_epi32(top_ptr[-1]); + + // Given that the spec defines "base" as top[x] + left[y] - top[-1], + // pLeft = abs(base - left[y]) = abs(top[x] - top[-1]) + // pTop = abs(base - top[x]) = abs(left[y] - top[-1]) + const __m128i left_dists = _mm_abs_epi32(_mm_sub_epi32(top, top_lefts)); + const __m128i top_dists_lo = _mm_abs_epi32(_mm_sub_epi32(left_lo, top_lefts)); + const __m128i top_dists_hi = _mm_abs_epi32(_mm_sub_epi32(left_hi, top_lefts)); + + const __m128i top_left_x2 = _mm_add_epi32(top_lefts, top_lefts); + const __m128i top_left_diff = _mm_sub_epi32(top, top_left_x2); + auto* dst = static_cast(dest); + WritePaethLine4<0>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0x55>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0xAA>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0xFF>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0>(dst, top, left_hi, top_lefts, top_dists_hi, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0x55>(dst, top, left_hi, top_lefts, top_dists_hi, 
left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0xAA>(dst, top, left_hi, top_lefts, top_dists_hi, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0xFF>(dst, top, left_hi, top_lefts, top_dists_hi, left_dists, + top_left_diff); +} + +void Paeth4x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const __m128i left = LoadUnaligned16(left_column); + const __m128i left_0 = _mm_cvtepu8_epi32(left); + const __m128i left_1 = _mm_cvtepu8_epi32(_mm_srli_si128(left, 4)); + const __m128i left_2 = _mm_cvtepu8_epi32(_mm_srli_si128(left, 8)); + const __m128i left_3 = _mm_cvtepu8_epi32(_mm_srli_si128(left, 12)); + + const __m128i top = _mm_cvtepu8_epi32(Load4(top_row)); + const auto* const top_ptr = static_cast(top_row); + const __m128i top_lefts = _mm_set1_epi32(top_ptr[-1]); + + // Given that the spec defines "base" as top[x] + left[y] - top[-1], + // pLeft = abs(base - left[y]) = abs(top[x] - top[-1]) + // pTop = abs(base - top[x]) = abs(left[y] - top[-1]) + const __m128i left_dists = _mm_abs_epi32(_mm_sub_epi32(top, top_lefts)); + const __m128i top_dists_0 = _mm_abs_epi32(_mm_sub_epi32(left_0, top_lefts)); + const __m128i top_dists_1 = _mm_abs_epi32(_mm_sub_epi32(left_1, top_lefts)); + const __m128i top_dists_2 = _mm_abs_epi32(_mm_sub_epi32(left_2, top_lefts)); + const __m128i top_dists_3 = _mm_abs_epi32(_mm_sub_epi32(left_3, top_lefts)); + + const __m128i top_left_x2 = _mm_add_epi32(top_lefts, top_lefts); + const __m128i top_left_diff = _mm_sub_epi32(top, top_left_x2); + + auto* dst = static_cast(dest); + WritePaethLine4<0>(dst, top, left_0, top_lefts, top_dists_0, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0x55>(dst, top, left_0, top_lefts, top_dists_0, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0xAA>(dst, top, left_0, top_lefts, top_dists_0, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0xFF>(dst, top, left_0, top_lefts, top_dists_0, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0>(dst, top, left_1, top_lefts, top_dists_1, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0x55>(dst, top, left_1, top_lefts, top_dists_1, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0xAA>(dst, top, left_1, top_lefts, top_dists_1, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0xFF>(dst, top, left_1, top_lefts, top_dists_1, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0>(dst, top, left_2, top_lefts, top_dists_2, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0x55>(dst, top, left_2, top_lefts, top_dists_2, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0xAA>(dst, top, left_2, top_lefts, top_dists_2, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0xFF>(dst, top, left_2, top_lefts, top_dists_2, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0>(dst, top, left_3, top_lefts, top_dists_3, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0x55>(dst, top, left_3, top_lefts, top_dists_3, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0xAA>(dst, top, left_3, top_lefts, top_dists_3, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0xFF>(dst, top, left_3, top_lefts, top_dists_3, left_dists, + top_left_diff); +} + +void Paeth8x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT 
const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const __m128i left = _mm_cvtepu8_epi16(Load4(left_column)); + const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row)); + const auto* const top_ptr = static_cast(top_row); + const __m128i top_lefts = _mm_set1_epi16(top_ptr[-1]); + + // Given that the spec defines "base" as top[x] + left[y] - top[-1], + // pLeft = abs(base - left[y]) = abs(top[x] - top[-1]) + // pTop = abs(base - top[x]) = abs(left[y] - top[-1]) + const __m128i left_dists = _mm_abs_epi16(_mm_sub_epi16(top, top_lefts)); + const __m128i top_dists = _mm_abs_epi16(_mm_sub_epi16(left, top_lefts)); + + const __m128i top_left_x2 = _mm_add_epi16(top_lefts, top_lefts); + const __m128i top_left_diff = _mm_sub_epi16(top, top_left_x2); + auto* dst = static_cast(dest); + WritePaethLine8<0x01000100>(dst, top, left, top_lefts, top_dists, left_dists, + top_left_diff); + dst += stride; + WritePaethLine8<0x03020302>(dst, top, left, top_lefts, top_dists, left_dists, + top_left_diff); + dst += stride; + WritePaethLine8<0x05040504>(dst, top, left, top_lefts, top_dists, left_dists, + top_left_diff); + dst += stride; + WritePaethLine8<0x07060706>(dst, top, left, top_lefts, top_dists, left_dists, + top_left_diff); +} + +void Paeth8x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column)); + const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row)); + const auto* const top_ptr = static_cast(top_row); + const __m128i top_lefts = _mm_set1_epi16(top_ptr[-1]); + + // Given that the spec defines "base" as top[x] + left[y] - top[-1], + // pLeft = abs(base - left[y]) = abs(top[x] - top[-1]) + // pTop = abs(base - top[x]) = abs(left[y] - top[-1]) + const __m128i left_dists = _mm_abs_epi16(_mm_sub_epi16(top, top_lefts)); + const __m128i top_dists = _mm_abs_epi16(_mm_sub_epi16(left, top_lefts)); + + const __m128i top_left_x2 = _mm_add_epi16(top_lefts, top_lefts); + const __m128i top_left_diff = _mm_sub_epi16(top, top_left_x2); + auto* dst = static_cast(dest); + WritePaethLine8<0x01000100>(dst, top, left, top_lefts, top_dists, left_dists, + top_left_diff); + dst += stride; + WritePaethLine8<0x03020302>(dst, top, left, top_lefts, top_dists, left_dists, + top_left_diff); + dst += stride; + WritePaethLine8<0x05040504>(dst, top, left, top_lefts, top_dists, left_dists, + top_left_diff); + dst += stride; + WritePaethLine8<0x07060706>(dst, top, left, top_lefts, top_dists, left_dists, + top_left_diff); + dst += stride; + WritePaethLine8<0x09080908>(dst, top, left, top_lefts, top_dists, left_dists, + top_left_diff); + dst += stride; + WritePaethLine8<0x0B0A0B0A>(dst, top, left, top_lefts, top_dists, left_dists, + top_left_diff); + dst += stride; + WritePaethLine8<0x0D0C0D0C>(dst, top, left, top_lefts, top_dists, left_dists, + top_left_diff); + dst += stride; + WritePaethLine8<0x0F0E0F0E>(dst, top, left, top_lefts, top_dists, left_dists, + top_left_diff); +} + +void Paeth8x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const __m128i left = LoadUnaligned16(left_column); + const __m128i left_lo = _mm_cvtepu8_epi16(left); + const __m128i left_hi = _mm_cvtepu8_epi16(_mm_srli_si128(left, 8)); + const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row)); + const auto* const top_ptr = static_cast(top_row); + const __m128i top_lefts = 
_mm_set1_epi16(top_ptr[-1]); + + // Given that the spec defines "base" as top[x] + left[y] - top[-1], + // pLeft = abs(base - left[y]) = abs(top[x] - top[-1]) + // pTop = abs(base - top[x]) = abs(left[y] - top[-1]) + const __m128i left_dists = _mm_abs_epi16(_mm_sub_epi16(top, top_lefts)); + const __m128i top_dists_lo = _mm_abs_epi16(_mm_sub_epi16(left_lo, top_lefts)); + const __m128i top_dists_hi = _mm_abs_epi16(_mm_sub_epi16(left_hi, top_lefts)); + + const __m128i top_left_x2 = _mm_add_epi16(top_lefts, top_lefts); + const __m128i top_left_diff = _mm_sub_epi16(top, top_left_x2); + auto* dst = static_cast(dest); + WritePaethLine8<0x01000100>(dst, top, left_lo, top_lefts, top_dists_lo, + left_dists, top_left_diff); + dst += stride; + WritePaethLine8<0x03020302>(dst, top, left_lo, top_lefts, top_dists_lo, + left_dists, top_left_diff); + dst += stride; + WritePaethLine8<0x05040504>(dst, top, left_lo, top_lefts, top_dists_lo, + left_dists, top_left_diff); + dst += stride; + WritePaethLine8<0x07060706>(dst, top, left_lo, top_lefts, top_dists_lo, + left_dists, top_left_diff); + dst += stride; + WritePaethLine8<0x09080908>(dst, top, left_lo, top_lefts, top_dists_lo, + left_dists, top_left_diff); + dst += stride; + WritePaethLine8<0x0B0A0B0A>(dst, top, left_lo, top_lefts, top_dists_lo, + left_dists, top_left_diff); + dst += stride; + WritePaethLine8<0x0D0C0D0C>(dst, top, left_lo, top_lefts, top_dists_lo, + left_dists, top_left_diff); + dst += stride; + WritePaethLine8<0x0F0E0F0E>(dst, top, left_lo, top_lefts, top_dists_lo, + left_dists, top_left_diff); + dst += stride; + WritePaethLine8<0x01000100>(dst, top, left_hi, top_lefts, top_dists_hi, + left_dists, top_left_diff); + dst += stride; + WritePaethLine8<0x03020302>(dst, top, left_hi, top_lefts, top_dists_hi, + left_dists, top_left_diff); + dst += stride; + WritePaethLine8<0x05040504>(dst, top, left_hi, top_lefts, top_dists_hi, + left_dists, top_left_diff); + dst += stride; + WritePaethLine8<0x07060706>(dst, top, left_hi, top_lefts, top_dists_hi, + left_dists, top_left_diff); + dst += stride; + WritePaethLine8<0x09080908>(dst, top, left_hi, top_lefts, top_dists_hi, + left_dists, top_left_diff); + dst += stride; + WritePaethLine8<0x0B0A0B0A>(dst, top, left_hi, top_lefts, top_dists_hi, + left_dists, top_left_diff); + dst += stride; + WritePaethLine8<0x0D0C0D0C>(dst, top, left_hi, top_lefts, top_dists_hi, + left_dists, top_left_diff); + dst += stride; + WritePaethLine8<0x0F0E0F0E>(dst, top, left_hi, top_lefts, top_dists_hi, + left_dists, top_left_diff); +} + +void Paeth8x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const auto* const left_ptr = static_cast(left_column); + auto* const dst = static_cast(dest); + Paeth8x16_SSE4_1(dst, stride, top_row, left_column); + Paeth8x16_SSE4_1(dst + (stride << 4), stride, top_row, left_ptr + 16); +} + +void Paeth16x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const __m128i left = Load4(left_column); + const __m128i top = LoadUnaligned16(top_row); + const __m128i top_lo = _mm_cvtepu8_epi16(top); + const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8)); + + const auto* const top_ptr = static_cast(top_row); + const __m128i top_lefts16 = _mm_set1_epi16(top_ptr[-1]); + const __m128i top_lefts8 = _mm_set1_epi8(static_cast(top_ptr[-1])); + + // Given that the spec defines "base" 
as top[x] + left[y] - top[-1], + // pLeft = abs(base - left[y]) = abs(top[x] - top[-1]) + // pTop = abs(base - top[x]) = abs(left[y] - top[-1]) + + const __m128i left_dists = _mm_or_si128(_mm_subs_epu8(top, top_lefts8), + _mm_subs_epu8(top_lefts8, top)); + const __m128i left_dists_lo = _mm_cvtepu8_epi16(left_dists); + const __m128i left_dists_hi = + _mm_cvtepu8_epi16(_mm_srli_si128(left_dists, 8)); + const __m128i top_dists = _mm_or_si128(_mm_subs_epu8(left, top_lefts8), + _mm_subs_epu8(top_lefts8, left)); + + const __m128i top_left_x2 = _mm_add_epi16(top_lefts16, top_lefts16); + const __m128i top_left_diff_lo = _mm_sub_epi16(top_lo, top_left_x2); + const __m128i top_left_diff_hi = _mm_sub_epi16(top_hi, top_left_x2); + auto* dst = static_cast(dest); + WritePaethLine16<0>(dst, top, left, top_lefts8, top_dists, left_dists, + left_dists_lo, left_dists_hi, top_left_diff_lo, + top_left_diff_hi); + dst += stride; + WritePaethLine16<0x01010101>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + WritePaethLine16<0x02020202>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + WritePaethLine16<0x03030303>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); +} + +// Inlined for calling with offsets in larger transform sizes, mainly to +// preserve top_left. +inline void WritePaeth16x8(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const uint8_t top_left, const __m128i top, + const __m128i left) { + const __m128i top_lo = _mm_cvtepu8_epi16(top); + const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8)); + + const __m128i top_lefts16 = _mm_set1_epi16(top_left); + const __m128i top_lefts8 = _mm_set1_epi8(static_cast(top_left)); + + // Given that the spec defines "base" as top[x] + left[y] - top_left, + // pLeft = abs(base - left[y]) = abs(top[x] - top[-1]) + // pTop = abs(base - top[x]) = abs(left[y] - top[-1]) + + const __m128i left_dists = _mm_or_si128(_mm_subs_epu8(top, top_lefts8), + _mm_subs_epu8(top_lefts8, top)); + const __m128i left_dists_lo = _mm_cvtepu8_epi16(left_dists); + const __m128i left_dists_hi = + _mm_cvtepu8_epi16(_mm_srli_si128(left_dists, 8)); + const __m128i top_dists = _mm_or_si128(_mm_subs_epu8(left, top_lefts8), + _mm_subs_epu8(top_lefts8, left)); + + const __m128i top_left_x2 = _mm_add_epi16(top_lefts16, top_lefts16); + const __m128i top_left_diff_lo = _mm_sub_epi16(top_lo, top_left_x2); + const __m128i top_left_diff_hi = _mm_sub_epi16(top_hi, top_left_x2); + auto* dst = static_cast(dest); + WritePaethLine16<0>(dst, top, left, top_lefts8, top_dists, left_dists, + left_dists_lo, left_dists_hi, top_left_diff_lo, + top_left_diff_hi); + dst += stride; + WritePaethLine16<0x01010101>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + WritePaethLine16<0x02020202>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + WritePaethLine16<0x03030303>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + WritePaethLine16<0x04040404>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + 
WritePaethLine16<0x05050505>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + WritePaethLine16<0x06060606>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + WritePaethLine16<0x07070707>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); +} + +void Paeth16x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const __m128i top = LoadUnaligned16(top_row); + const __m128i left = LoadLo8(left_column); + const auto* const top_ptr = static_cast(top_row); + WritePaeth16x8(static_cast(dest), stride, top_ptr[-1], top, left); +} + +void WritePaeth16x16(void* const dest, ptrdiff_t stride, const uint8_t top_left, + const __m128i top, const __m128i left) { + const __m128i top_lo = _mm_cvtepu8_epi16(top); + const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8)); + + const __m128i top_lefts16 = _mm_set1_epi16(top_left); + const __m128i top_lefts8 = _mm_set1_epi8(static_cast(top_left)); + + // Given that the spec defines "base" as top[x] + left[y] - top[-1], + // pLeft = abs(base - left[y]) = abs(top[x] - top[-1]) + // pTop = abs(base - top[x]) = abs(left[y] - top[-1]) + + const __m128i left_dists = _mm_or_si128(_mm_subs_epu8(top, top_lefts8), + _mm_subs_epu8(top_lefts8, top)); + const __m128i left_dists_lo = _mm_cvtepu8_epi16(left_dists); + const __m128i left_dists_hi = + _mm_cvtepu8_epi16(_mm_srli_si128(left_dists, 8)); + const __m128i top_dists = _mm_or_si128(_mm_subs_epu8(left, top_lefts8), + _mm_subs_epu8(top_lefts8, left)); + + const __m128i top_left_x2 = _mm_add_epi16(top_lefts16, top_lefts16); + const __m128i top_left_diff_lo = _mm_sub_epi16(top_lo, top_left_x2); + const __m128i top_left_diff_hi = _mm_sub_epi16(top_hi, top_left_x2); + auto* dst = static_cast(dest); + WritePaethLine16<0>(dst, top, left, top_lefts8, top_dists, left_dists, + left_dists_lo, left_dists_hi, top_left_diff_lo, + top_left_diff_hi); + dst += stride; + WritePaethLine16<0x01010101>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + WritePaethLine16<0x02020202>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + WritePaethLine16<0x03030303>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + WritePaethLine16<0x04040404>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + WritePaethLine16<0x05050505>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + WritePaethLine16<0x06060606>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + WritePaethLine16<0x07070707>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + WritePaethLine16<0x08080808>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, 
top_left_diff_hi); + dst += stride; + WritePaethLine16<0x09090909>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + WritePaethLine16<0x0A0A0A0A>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + WritePaethLine16<0x0B0B0B0B>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + WritePaethLine16<0x0C0C0C0C>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + WritePaethLine16<0x0D0D0D0D>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + WritePaethLine16<0x0E0E0E0E>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + WritePaethLine16<0x0F0F0F0F>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); +} + +void Paeth16x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const __m128i left = LoadUnaligned16(left_column); + const __m128i top = LoadUnaligned16(top_row); + const auto* const top_ptr = static_cast(top_row); + WritePaeth16x16(static_cast(dest), stride, top_ptr[-1], top, left); +} + +void Paeth16x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const __m128i left_0 = LoadUnaligned16(left_column); + const __m128i top = LoadUnaligned16(top_row); + const auto* const top_ptr = static_cast(top_row); + const uint8_t top_left = top_ptr[-1]; + auto* const dst = static_cast(dest); + WritePaeth16x16(dst, stride, top_left, top, left_0); + const auto* const left_ptr = static_cast(left_column); + const __m128i left_1 = LoadUnaligned16(left_ptr + 16); + WritePaeth16x16(dst + (stride << 4), stride, top_left, top, left_1); +} + +void Paeth16x64_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const ptrdiff_t stride16 = stride << 4; + const __m128i left_0 = LoadUnaligned16(left_column); + const __m128i top = LoadUnaligned16(top_row); + const auto* const top_ptr = static_cast(top_row); + const uint8_t top_left = top_ptr[-1]; + auto* dst = static_cast(dest); + WritePaeth16x16(dst, stride, top_left, top, left_0); + dst += stride16; + const auto* const left_ptr = static_cast(left_column); + const __m128i left_1 = LoadUnaligned16(left_ptr + 16); + WritePaeth16x16(dst, stride, top_left, top, left_1); + dst += stride16; + const __m128i left_2 = LoadUnaligned16(left_ptr + 32); + WritePaeth16x16(dst, stride, top_left, top, left_2); + dst += stride16; + const __m128i left_3 = LoadUnaligned16(left_ptr + 48); + WritePaeth16x16(dst, stride, top_left, top, left_3); +} + +void Paeth32x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const __m128i left = LoadLo8(left_column); + const auto* const top_ptr = static_cast(top_row); + const __m128i top_0 = LoadUnaligned16(top_row); + const 
uint8_t top_left = top_ptr[-1]; + auto* const dst = static_cast(dest); + WritePaeth16x8(dst, stride, top_left, top_0, left); + const __m128i top_1 = LoadUnaligned16(top_ptr + 16); + WritePaeth16x8(dst + 16, stride, top_left, top_1, left); +} + +void Paeth32x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const __m128i left = LoadUnaligned16(left_column); + const auto* const top_ptr = static_cast(top_row); + const __m128i top_0 = LoadUnaligned16(top_row); + const uint8_t top_left = top_ptr[-1]; + auto* const dst = static_cast(dest); + WritePaeth16x16(dst, stride, top_left, top_0, left); + const __m128i top_1 = LoadUnaligned16(top_ptr + 16); + WritePaeth16x16(dst + 16, stride, top_left, top_1, left); +} + +void Paeth32x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const auto* const left_ptr = static_cast(left_column); + const __m128i left_0 = LoadUnaligned16(left_ptr); + const auto* const top_ptr = static_cast(top_row); + const __m128i top_0 = LoadUnaligned16(top_ptr); + const __m128i left_1 = LoadUnaligned16(left_ptr + 16); + const __m128i top_1 = LoadUnaligned16(top_ptr + 16); + const uint8_t top_left = top_ptr[-1]; + auto* dst = static_cast(dest); + WritePaeth16x16(dst, stride, top_left, top_0, left_0); + WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0); + dst += (stride << 4); + WritePaeth16x16(dst, stride, top_left, top_0, left_1); + WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1); +} + +void Paeth32x64_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const auto* const left_ptr = static_cast(left_column); + const __m128i left_0 = LoadUnaligned16(left_ptr); + const auto* const top_ptr = static_cast(top_row); + const __m128i top_0 = LoadUnaligned16(top_ptr); + const __m128i left_1 = LoadUnaligned16(left_ptr + 16); + const __m128i top_1 = LoadUnaligned16(top_ptr + 16); + const __m128i left_2 = LoadUnaligned16(left_ptr + 32); + const __m128i left_3 = LoadUnaligned16(left_ptr + 48); + const uint8_t top_left = top_ptr[-1]; + auto* dst = static_cast(dest); + WritePaeth16x16(dst, stride, top_left, top_0, left_0); + WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0); + dst += (stride << 4); + WritePaeth16x16(dst, stride, top_left, top_0, left_1); + WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1); + dst += (stride << 4); + WritePaeth16x16(dst, stride, top_left, top_0, left_2); + WritePaeth16x16(dst + 16, stride, top_left, top_1, left_2); + dst += (stride << 4); + WritePaeth16x16(dst, stride, top_left, top_0, left_3); + WritePaeth16x16(dst + 16, stride, top_left, top_1, left_3); +} + +void Paeth64x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const __m128i left = LoadUnaligned16(left_column); + const auto* const top_ptr = static_cast(top_row); + const __m128i top_0 = LoadUnaligned16(top_ptr); + const __m128i top_1 = LoadUnaligned16(top_ptr + 16); + const __m128i top_2 = LoadUnaligned16(top_ptr + 32); + const __m128i top_3 = LoadUnaligned16(top_ptr + 48); + const uint8_t top_left = top_ptr[-1]; + auto* dst = static_cast(dest); + WritePaeth16x16(dst, stride, top_left, top_0, left); + WritePaeth16x16(dst + 16, 
stride, top_left, top_1, left); + WritePaeth16x16(dst + 32, stride, top_left, top_2, left); + WritePaeth16x16(dst + 48, stride, top_left, top_3, left); +} + +void Paeth64x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const auto* const left_ptr = static_cast(left_column); + const __m128i left_0 = LoadUnaligned16(left_ptr); + const __m128i left_1 = LoadUnaligned16(left_ptr + 16); + const auto* const top_ptr = static_cast(top_row); + const __m128i top_0 = LoadUnaligned16(top_ptr); + const __m128i top_1 = LoadUnaligned16(top_ptr + 16); + const __m128i top_2 = LoadUnaligned16(top_ptr + 32); + const __m128i top_3 = LoadUnaligned16(top_ptr + 48); + const uint8_t top_left = top_ptr[-1]; + auto* dst = static_cast(dest); + WritePaeth16x16(dst, stride, top_left, top_0, left_0); + WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0); + WritePaeth16x16(dst + 32, stride, top_left, top_2, left_0); + WritePaeth16x16(dst + 48, stride, top_left, top_3, left_0); + dst += (stride << 4); + WritePaeth16x16(dst, stride, top_left, top_0, left_1); + WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1); + WritePaeth16x16(dst + 32, stride, top_left, top_2, left_1); + WritePaeth16x16(dst + 48, stride, top_left, top_3, left_1); +} + +void Paeth64x64_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column) { + const auto* const left_ptr = static_cast(left_column); + const __m128i left_0 = LoadUnaligned16(left_ptr); + const __m128i left_1 = LoadUnaligned16(left_ptr + 16); + const __m128i left_2 = LoadUnaligned16(left_ptr + 32); + const __m128i left_3 = LoadUnaligned16(left_ptr + 48); + const auto* const top_ptr = static_cast(top_row); + const __m128i top_0 = LoadUnaligned16(top_ptr); + const __m128i top_1 = LoadUnaligned16(top_ptr + 16); + const __m128i top_2 = LoadUnaligned16(top_ptr + 32); + const __m128i top_3 = LoadUnaligned16(top_ptr + 48); + const uint8_t top_left = top_ptr[-1]; + auto* dst = static_cast(dest); + WritePaeth16x16(dst, stride, top_left, top_0, left_0); + WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0); + WritePaeth16x16(dst + 32, stride, top_left, top_2, left_0); + WritePaeth16x16(dst + 48, stride, top_left, top_3, left_0); + dst += (stride << 4); + WritePaeth16x16(dst, stride, top_left, top_0, left_1); + WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1); + WritePaeth16x16(dst + 32, stride, top_left, top_2, left_1); + WritePaeth16x16(dst + 48, stride, top_left, top_3, left_1); + dst += (stride << 4); + WritePaeth16x16(dst, stride, top_left, top_0, left_2); + WritePaeth16x16(dst + 16, stride, top_left, top_1, left_2); + WritePaeth16x16(dst + 32, stride, top_left, top_2, left_2); + WritePaeth16x16(dst + 48, stride, top_left, top_3, left_2); + dst += (stride << 4); + WritePaeth16x16(dst, stride, top_left, top_0, left_3); + WritePaeth16x16(dst + 16, stride, top_left, top_1, left_3); + WritePaeth16x16(dst + 32, stride, top_left, top_2, left_3); + WritePaeth16x16(dst + 48, stride, top_left, top_3, left_3); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + static_cast(dsp); +// These guards check if this version of the function was not superseded by +// a higher optimization level, such as AVX. The corresponding #define also +// prevents the C version from being added to the table. 
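+// (Editorial sketch, not part of the imported source: each guard below is
+// expected to reduce to a comparison against the best enabled implementation
+// for that function, roughly of the form
+//   #define DSP_ENABLED_8BPP_SSE4_1(func) \
+//     (LIBGAV1_Dsp8bpp_##func == LIBGAV1_CPU_SSE4_1)
+// so a table entry is only installed when SSE4.1 is the highest optimization
+// level compiled in for that function.)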
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDcTop) + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] = + DcDefs::_4x4::DcTop; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorDcTop) + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] = + DcDefs::_4x8::DcTop; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorDcTop) + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] = + DcDefs::_4x16::DcTop; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorDcTop) + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] = + DcDefs::_8x4::DcTop; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorDcTop) + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] = + DcDefs::_8x8::DcTop; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorDcTop) + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] = + DcDefs::_8x16::DcTop; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorDcTop) + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] = + DcDefs::_8x32::DcTop; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorDcTop) + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] = + DcDefs::_16x4::DcTop; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorDcTop) + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] = + DcDefs::_16x8::DcTop; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorDcTop) + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] = + DcDefs::_16x16::DcTop; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorDcTop) + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] = + DcDefs::_16x32::DcTop; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorDcTop) + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] = + DcDefs::_16x64::DcTop; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorDcTop) + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] = + DcDefs::_32x8::DcTop; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorDcTop) + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] = + DcDefs::_32x16::DcTop; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorDcTop) + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] = + DcDefs::_32x32::DcTop; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorDcTop) + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] = + DcDefs::_32x64::DcTop; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorDcTop) + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] = + DcDefs::_64x16::DcTop; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorDcTop) + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] = + DcDefs::_64x32::DcTop; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorDcTop) + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] = + DcDefs::_64x64::DcTop; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDcLeft) + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] = + DcDefs::_4x4::DcLeft; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorDcLeft) + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] = + DcDefs::_4x8::DcLeft; +#endif +#if 
DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorDcLeft) + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] = + DcDefs::_4x16::DcLeft; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorDcLeft) + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] = + DcDefs::_8x4::DcLeft; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorDcLeft) + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] = + DcDefs::_8x8::DcLeft; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorDcLeft) + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] = + DcDefs::_8x16::DcLeft; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorDcLeft) + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] = + DcDefs::_8x32::DcLeft; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorDcLeft) + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] = + DcDefs::_16x4::DcLeft; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorDcLeft) + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] = + DcDefs::_16x8::DcLeft; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorDcLeft) + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] = + DcDefs::_16x16::DcLeft; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorDcLeft) + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] = + DcDefs::_16x32::DcLeft; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorDcLeft) + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] = + DcDefs::_16x64::DcLeft; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorDcLeft) + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] = + DcDefs::_32x8::DcLeft; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorDcLeft) + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] = + DcDefs::_32x16::DcLeft; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorDcLeft) + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] = + DcDefs::_32x32::DcLeft; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorDcLeft) + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] = + DcDefs::_32x64::DcLeft; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorDcLeft) + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] = + DcDefs::_64x16::DcLeft; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorDcLeft) + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] = + DcDefs::_64x32::DcLeft; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorDcLeft) + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] = + DcDefs::_64x64::DcLeft; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDc) + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] = + DcDefs::_4x4::Dc; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorDc) + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] = + DcDefs::_4x8::Dc; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorDc) + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] = + DcDefs::_4x16::Dc; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorDc) + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] = + DcDefs::_8x4::Dc; +#endif +#if 
DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorDc) + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] = + DcDefs::_8x8::Dc; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorDc) + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] = + DcDefs::_8x16::Dc; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorDc) + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] = + DcDefs::_8x32::Dc; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorDc) + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] = + DcDefs::_16x4::Dc; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorDc) + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] = + DcDefs::_16x8::Dc; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorDc) + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] = + DcDefs::_16x16::Dc; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorDc) + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] = + DcDefs::_16x32::Dc; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorDc) + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] = + DcDefs::_16x64::Dc; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorDc) + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] = + DcDefs::_32x8::Dc; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorDc) + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] = + DcDefs::_32x16::Dc; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorDc) + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] = + DcDefs::_32x32::Dc; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorDc) + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] = + DcDefs::_32x64::Dc; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorDc) + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] = + DcDefs::_64x16::Dc; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorDc) + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] = + DcDefs::_64x32::Dc; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorDc) + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] = + DcDefs::_64x64::Dc; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorPaeth) + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] = + Paeth4x4_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorPaeth) + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] = + Paeth4x8_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorPaeth) + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] = + Paeth4x16_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorPaeth) + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] = + Paeth8x4_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorPaeth) + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] = + Paeth8x8_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorPaeth) + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] = + Paeth8x16_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorPaeth) + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] = + Paeth8x32_SSE4_1; +#endif +#if 
DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorPaeth) + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] = + Paeth16x4_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorPaeth) + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] = + Paeth16x8_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorPaeth) + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] = + Paeth16x16_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorPaeth) + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] = + Paeth16x32_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorPaeth) + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] = + Paeth16x64_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorPaeth) + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] = + Paeth32x8_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorPaeth) + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] = + Paeth32x16_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorPaeth) + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] = + Paeth32x32_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorPaeth) + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] = + Paeth32x64_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorPaeth) + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] = + Paeth64x16_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorPaeth) + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] = + Paeth64x32_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorPaeth) + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] = + Paeth64x64_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorHorizontal] = + DirDefs::_4x4::Horizontal; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] = + DirDefs::_4x8::Horizontal; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] = + DirDefs::_4x16::Horizontal; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorHorizontal] = + DirDefs::_8x4::Horizontal; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] = + DirDefs::_8x8::Horizontal; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorHorizontal] = + DirDefs::_8x16::Horizontal; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] = + DirDefs::_8x32::Horizontal; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorHorizontal] = + DirDefs::_16x4::Horizontal; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorHorizontal) + 
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
+      DirDefs::_16x8::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorHorizontal] =
+      DirDefs::_16x16::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorHorizontal] =
+      DirDefs::_16x32::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorHorizontal] =
+      DirDefs::_16x64::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorHorizontal] =
+      DirDefs::_32x8::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorHorizontal] =
+      DirDefs::_32x16::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorHorizontal] =
+      DirDefs::_32x32::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
+      DirDefs::_32x64::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorHorizontal] =
+      DirDefs::_64x16::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorHorizontal] =
+      DirDefs::_64x32::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorHorizontal] =
+      DirDefs::_64x64::Horizontal;
+#endif
+}  // NOLINT(readability/fn_size)
+
+}  // namespace
+}  // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+template <int height>
+inline void DcStore4xH_SSE4_1(void* const dest, ptrdiff_t stride,
+                              const __m128i dc) {
+  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0);
+  int y = height - 1;
+  auto* dst = static_cast<uint8_t*>(dest);
+  do {
+    StoreLo8(dst, dc_dup);
+    dst += stride;
+  } while (--y != 0);
+  StoreLo8(dst, dc_dup);
+}
+
+// WriteDuplicateN assumes dup has 4 32-bit "units," each of which comprises 2
+// identical shorts that need N total copies written into dest. The unpacking
+// works the same as in the 8bpp case, except that each 32-bit unit needs twice
+// as many copies.
+inline void WriteDuplicate4x4(void* const dest, ptrdiff_t stride,
+                              const __m128i dup32) {
+  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+  auto* dst = static_cast<uint8_t*>(dest);
+  _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), dup64_lo);
+  dst += stride;
+  _mm_storeh_pi(reinterpret_cast<__m64*>(dst), _mm_castsi128_ps(dup64_lo));
+  dst += stride;
+  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+  _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), dup64_hi);
+  dst += stride;
+  _mm_storeh_pi(reinterpret_cast<__m64*>(dst), _mm_castsi128_ps(dup64_hi));
+}
+
+inline void WriteDuplicate8x4(void* const dest, ptrdiff_t stride,
+                              const __m128i dup32) {
+  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+  dst += stride;
+  const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+  dst += stride;
+  const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+  dst += stride;
+  const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+}
+
+inline void WriteDuplicate16x4(void* const dest, ptrdiff_t stride,
+                               const __m128i dup32) {
+  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_0);
+  dst += stride;
+  const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_1);
+  dst += stride;
+  const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_2);
+  dst += stride;
+  const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_3);
+}
+
+inline void WriteDuplicate32x4(void* const dest, ptrdiff_t stride,
+                               const __m128i dup32) {
+  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_0);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_0);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_0);
+  dst += stride;
+  const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_1);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_1);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_1);
+  dst += stride;
+  const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_2);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_2);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_2);
+  dst += stride;
+  const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_3);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_3);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_3);
+}
+
+inline void WriteDuplicate64x4(void* const dest, ptrdiff_t stride,
+                               const __m128i dup32) {
+  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+  for (int x = 0; x < 128; x += 16) {
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), dup128_0);
+  }
+  dst += stride;
+  const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+  for (int x = 0; x < 128; x += 16) {
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), dup128_1);
+  }
+  dst += stride;
+  const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+  for (int x = 0; x < 128; x += 16) {
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), dup128_2);
+  }
+  dst += stride;
+  const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+  for (int x = 0; x < 128; x += 16) {
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), dup128_3);
+  }
+}
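For reference, a scalar model of the WriteDuplicateWx4 pattern above. The function name and template are illustrative only (not part of the patch); it assumes, as the intrinsics versions do, that |stride| is in bytes and that each 32-bit unit of |dup32| carries two copies of one pixel value.

#include <cstddef>
#include <cstdint>

// Scalar model of WriteDuplicateWx4: the low half of 32-bit unit i of
// |dup32| fills all |width| 16-bit pixels of output row i.
template <int width>
void WriteDuplicateWx4_Scalar(void* const dest, ptrdiff_t stride,
                              const uint32_t dup32[4]) {
  auto* dst = static_cast<uint8_t*>(dest);
  for (int y = 0; y < 4; ++y) {
    const auto value = static_cast<uint16_t>(dup32[y] & 0xffff);
    auto* const row = reinterpret_cast<uint16_t*>(dst);
    for (int x = 0; x < width; ++x) row[x] = value;
    dst += stride;  // byte stride, exactly as in the SSE4 versions
  }
}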
+// ColStoreN copies each of the |height| values in |column| across its
+// corresponding row in dest.
+template <void (*writefn)(void* dest, ptrdiff_t stride, const __m128i dup32)>
+inline void ColStore4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                             ptrdiff_t stride,
+                             const void* LIBGAV1_RESTRICT const column) {
+  const __m128i col_data = LoadLo8(column);
+  const __m128i col_dup32 = _mm_unpacklo_epi16(col_data, col_data);
+  writefn(dest, stride, col_dup32);
+}
+
+template <void (*writefn)(void* dest, ptrdiff_t stride, const __m128i dup32)>
+inline void ColStore8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                             ptrdiff_t stride,
+                             const void* LIBGAV1_RESTRICT const column) {
+  const __m128i col_data = LoadUnaligned16(column);
+  const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
+  const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
+  auto* dst = static_cast<uint8_t*>(dest);
+  writefn(dst, stride, col_dup32_lo);
+  const ptrdiff_t stride4 = stride << 2;
+  dst += stride4;
+  writefn(dst, stride, col_dup32_hi);
+}
+
+template <void (*writefn)(void* dest, ptrdiff_t stride, const __m128i dup32)>
+inline void ColStore16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                              ptrdiff_t stride,
+                              const void* LIBGAV1_RESTRICT const column) {
+  const ptrdiff_t stride4 = stride << 2;
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y = 0; y < 32; y += 16) {
+    const __m128i col_data =
+        LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
+    const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
+    const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
+    writefn(dst, stride, col_dup32_lo);
+    dst += stride4;
+    writefn(dst, stride, col_dup32_hi);
+    dst += stride4;
+  }
+}
+
+template <void (*writefn)(void* dest, ptrdiff_t stride, const __m128i dup32)>
+inline void ColStore32_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                              ptrdiff_t stride,
+                              const void* LIBGAV1_RESTRICT const column) {
+  const ptrdiff_t stride4 = stride << 2;
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y = 0; y < 64; y += 16) {
+    const __m128i col_data =
+        LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
+    const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
+    const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
+    writefn(dst, stride, col_dup32_lo);
+    dst += stride4;
+    writefn(dst, stride, col_dup32_hi);
+    dst += stride4;
+  }
+}
+
+template <void (*writefn)(void* dest, ptrdiff_t stride, const __m128i dup32)>
+inline void ColStore64_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+                              ptrdiff_t stride,
+                              const void* LIBGAV1_RESTRICT const column) {
+  const ptrdiff_t stride4 = stride << 2;
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y = 0; y < 128; y += 16) {
+    const __m128i col_data =
+        LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
+    const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
+    const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
+    writefn(dst, stride, col_dup32_lo);
+    dst += stride4;
+    writefn(dst, stride, col_dup32_hi);
+    dst += stride4;
+  }
+}
+
+// |ref| points to 8 bytes containing 4 packed int16 values.
+inline __m128i DcSum4_SSE4_1(const void* ref) {
+  const __m128i vals = _mm_loadl_epi64(static_cast<const __m128i*>(ref));
+  const __m128i ones = _mm_set1_epi16(1);
+
+  // half_sum[31:0]  = a1+a2
+  // half_sum[63:32] = a3+a4
+  const __m128i half_sum = _mm_madd_epi16(vals, ones);
+  // Place half_sum[63:32] in shift_sum[31:0].
+  const __m128i shift_sum = _mm_srli_si128(half_sum, 4);
+  return _mm_add_epi32(half_sum, shift_sum);
+}
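The madd-by-ones step in DcSum4_SSE4_1 is the usual SSE idiom for pairwise horizontal addition: multiplying int16 lanes by 1 lets _mm_madd_epi16 fold adjacent pairs into int32 lanes. A scalar model of the whole sum (the function name is illustrative):

#include <cstdint>

// Scalar model of DcSum4_SSE4_1. madd-by-ones produces the two pair sums;
// the 4-byte shift lines lane 1 up under lane 0 so one add yields the total.
inline int32_t DcSum4_Scalar(const int16_t ref[4]) {
  const int32_t half_sum_lo = ref[0] + ref[1];  // madd result, lane 0
  const int32_t half_sum_hi = ref[2] + ref[3];  // madd result, lane 1
  return half_sum_lo + half_sum_hi;             // add after _mm_srli_si128
}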
+struct DcDefs {
+  DcDefs() = delete;
+
+  using _4x4 = DcPredFuncs_SSE4_1<2, 2, DcSum4_SSE4_1, DcSum4_SSE4_1,
+                                  DcStore4xH_SSE4_1<4>, 0, 0>;
+};
+
+struct DirDefs {
+  DirDefs() = delete;
+
+  using _4x4 = DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate4x4>>;
+  using _4x8 = DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate4x4>>;
+  using _4x16 =
+      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate4x4>>;
+  using _8x4 = DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate8x4>>;
+  using _8x8 = DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate8x4>>;
+  using _8x16 =
+      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate8x4>>;
+  using _8x32 =
+      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate8x4>>;
+  using _16x4 =
+      DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate16x4>>;
+  using _16x8 =
+      DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate16x4>>;
+  using _16x16 =
+      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate16x4>>;
+  using _16x32 =
+      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate16x4>>;
+  using _16x64 =
+      DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate16x4>>;
+  using _32x8 =
+      DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate32x4>>;
+  using _32x16 =
+      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate32x4>>;
+  using _32x32 =
+      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate32x4>>;
+  using _32x64 =
+      DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate32x4>>;
+  using _64x16 =
+      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate64x4>>;
+  using _64x32 =
+      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate64x4>>;
+  using _64x64 =
+      DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate64x4>>;
+};
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+  static_cast<void>(dsp);
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorDcTop)
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
+      DcDefs::_4x4::DcTop;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorDcLeft)
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
+      DcDefs::_4x4::DcLeft;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorDc)
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
+      DcDefs::_4x4::Dc;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorHorizontal] =
+      DirDefs::_4x4::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
+      DirDefs::_4x8::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
+      DirDefs::_4x16::Horizontal;
+#endif
+#if
DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorHorizontal] = + DirDefs::_8x4::Horizontal; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] = + DirDefs::_8x8::Horizontal; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorHorizontal] = + DirDefs::_8x16::Horizontal; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] = + DirDefs::_8x32::Horizontal; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorHorizontal] = + DirDefs::_16x4::Horizontal; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] = + DirDefs::_16x8::Horizontal; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorHorizontal] = + DirDefs::_16x16::Horizontal; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorHorizontal] = + DirDefs::_16x32::Horizontal; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x64_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorHorizontal] = + DirDefs::_16x64::Horizontal; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorHorizontal] = + DirDefs::_32x8::Horizontal; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorHorizontal] = + DirDefs::_32x16::Horizontal; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorHorizontal] = + DirDefs::_32x32::Horizontal; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x64_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] = + DirDefs::_32x64::Horizontal; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize64x16_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorHorizontal] = + DirDefs::_64x16::Horizontal; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize64x32_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorHorizontal] = + DirDefs::_64x32::Horizontal; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize64x64_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorHorizontal] = + DirDefs::_64x64::Horizontal; +#endif +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void IntraPredInit_SSE4_1() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_TARGETING_SSE4_1 +namespace libgav1 { +namespace dsp { + +void IntraPredInit_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_SSE4_1 diff --git a/src/dsp/x86/intrapred_sse4.h b/src/dsp/x86/intrapred_sse4.h new file mode 100644 index 0000000..1f6f30a 
--- /dev/null +++ b/src/dsp/x86/intrapred_sse4.h @@ -0,0 +1,591 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_INTRAPRED_SSE4_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::intra_predictors. See the defines below for specifics. +// These functions are not thread-safe. +void IntraPredInit_SSE4_1(); + +} // namespace dsp +} // namespace libgav1 + +// If sse4 is enabled and the baseline isn't set due to a higher level of +// optimization being enabled, signal the sse4 implementation should be used. +#if LIBGAV1_TARGETING_SSE4_1 +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcTop +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcTop +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcTop +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcTop +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcTop +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcTop +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcTop +#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcTop +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcTop +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcTop \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcTop +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcTop \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcTop +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcTop \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcTop +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcTop +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcTop \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcTop +#define 
LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcTop \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcTop +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcTop \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcTop +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcTop \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcTop +#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcTop \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcTop +#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcTop \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcLeft +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcLeft LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcLeft +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcLeft \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcLeft +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcLeft LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcLeft +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcLeft LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcLeft +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcLeft \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcLeft +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcLeft \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcLeft +#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcLeft \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcLeft +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcLeft \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcLeft +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcLeft \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcLeft +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcLeft \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcLeft +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcLeft \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcLeft +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcLeft \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcLeft +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcLeft \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcLeft +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcLeft \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcLeft +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcLeft \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcLeft +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcLeft \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcLeft +#define 
LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcLeft \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcLeft +#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcLeft \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDc +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDc +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDc +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDc +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDc +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDc +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDc +#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDc +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDc +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDc +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDc +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDc +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDc +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDc +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDc +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDc +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDc +#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDc +#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorPaeth +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorPaeth +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorPaeth +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1 +#endif + 
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorPaeth +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorPaeth +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorPaeth +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorPaeth +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorPaeth +#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorPaeth +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorPaeth +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorPaeth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorPaeth +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorPaeth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorPaeth +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorPaeth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorPaeth +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorPaeth +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorPaeth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorPaeth +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorPaeth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorPaeth +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorPaeth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorPaeth +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorPaeth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorPaeth +#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorPaeth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorPaeth +#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorPaeth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef 
LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+//------------------------------------------------------------------------------
+// 10bpp
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcTop
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcLeft
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcLeft \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDc
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_INTRAPRED_SSE4_H_
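The blocks above all follow this header's claim-and-dispatch convention: each LIBGAV1_Dsp{8,10}bpp_* macro records which CPU tier owns a function table entry, and the #ifndef guard lets a stronger header included earlier (for example the AVX2 one) claim the slot first. A minimal sketch of the mechanism with stand-in names; the real DSP_ENABLED_* helpers live in src/dsp/dsp.h, and the exact comparison form shown here is an assumption:

// Stand-in macros, for illustration only.
#define MY_CPU_SSE4_1 1

// A header claims a table entry only if nothing claimed it first:
#ifndef MY_Dsp8bpp_TransformSize4x4_IntraPredictorDc
#define MY_Dsp8bpp_TransformSize4x4_IntraPredictorDc MY_CPU_SSE4_1
#endif

// The matching .cc then compiles its assignment only when this header won:
#if MY_Dsp8bpp_TransformSize4x4_IntraPredictorDc == MY_CPU_SSE4_1
// dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] = ...;
#endif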
diff --git a/src/dsp/x86/inverse_transform_sse4.cc b/src/dsp/x86/inverse_transform_sse4.cc
new file mode 100644
index 0000000..e9ceb87
--- /dev/null
+++ b/src/dsp/x86/inverse_transform_sse4.cc
@@ -0,0 +1,3053 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/inverse_transform.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Include the constants and utility functions inside the anonymous namespace.
+#include "src/dsp/inverse_transform.inc"
+
+template <int store_width, int store_count>
+LIBGAV1_ALWAYS_INLINE void StoreDst(int16_t* LIBGAV1_RESTRICT dst,
+                                    int32_t stride, int32_t idx,
+                                    const __m128i* s) {
+  // NOTE: It is expected that the compiler will unroll these loops.
+  if (store_width == 16) {
+    for (int i = 0; i < store_count; i += 4) {
+      StoreUnaligned16(&dst[i * stride + idx], s[i]);
+      StoreUnaligned16(&dst[(i + 1) * stride + idx], s[i + 1]);
+      StoreUnaligned16(&dst[(i + 2) * stride + idx], s[i + 2]);
+      StoreUnaligned16(&dst[(i + 3) * stride + idx], s[i + 3]);
+    }
+  }
+  if (store_width == 8) {
+    for (int i = 0; i < store_count; i += 4) {
+      StoreLo8(&dst[i * stride + idx], s[i]);
+      StoreLo8(&dst[(i + 1) * stride + idx], s[i + 1]);
+      StoreLo8(&dst[(i + 2) * stride + idx], s[i + 2]);
+      StoreLo8(&dst[(i + 3) * stride + idx], s[i + 3]);
+    }
+  }
+}
+
+template <int load_width, int load_count>
+LIBGAV1_ALWAYS_INLINE void LoadSrc(const int16_t* LIBGAV1_RESTRICT src,
+                                   int32_t stride, int32_t idx, __m128i* x) {
+  // NOTE: It is expected that the compiler will unroll these loops.
+  if (load_width == 16) {
+    for (int i = 0; i < load_count; i += 4) {
+      x[i] = LoadUnaligned16(&src[i * stride + idx]);
+      x[i + 1] = LoadUnaligned16(&src[(i + 1) * stride + idx]);
+      x[i + 2] = LoadUnaligned16(&src[(i + 2) * stride + idx]);
+      x[i + 3] = LoadUnaligned16(&src[(i + 3) * stride + idx]);
+    }
+  }
+  if (load_width == 8) {
+    for (int i = 0; i < load_count; i += 4) {
+      x[i] = LoadLo8(&src[i * stride + idx]);
+      x[i + 1] = LoadLo8(&src[(i + 1) * stride + idx]);
+      x[i + 2] = LoadLo8(&src[(i + 2) * stride + idx]);
+      x[i + 3] = LoadLo8(&src[(i + 3) * stride + idx]);
+    }
+  }
+}
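A usage sketch for the two helpers above; it assumes <smmintrin.h> and the definitions above are in scope and that, as in the callers later in this file, a 16-byte width means eight int16 coefficients per row. The function and buffer names are illustrative:

#include <cstdint>

// Round-trips an 8x8 coefficient block through LoadSrc/StoreDst.
inline void RoundTrip8x8(int16_t* block) {
  __m128i x[8];
  LoadSrc<16, 8>(block, /*stride=*/8, /*idx=*/0, x);   // 8 rows, 16 bytes each
  StoreDst<16, 8>(block, /*stride=*/8, /*idx=*/0, x);  // write them back
}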
+// Butterfly rotate 4 values.
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_4(__m128i* a, __m128i* b,
+                                               const int angle,
+                                               const bool flip) {
+  const int16_t cos128 = Cos128(angle);
+  const int16_t sin128 = Sin128(angle);
+  const __m128i psin_pcos = _mm_set1_epi32(
+      static_cast<uint16_t>(cos128) | (static_cast<uint32_t>(sin128) << 16));
+  const __m128i ba = _mm_unpacklo_epi16(*a, *b);
+  const __m128i ab = _mm_unpacklo_epi16(*b, *a);
+  const __m128i sign = _mm_set1_epi32(static_cast<int>(0x80000001));
+  // -sin cos, -sin cos, -sin cos, -sin cos
+  const __m128i msin_pcos = _mm_sign_epi16(psin_pcos, sign);
+  const __m128i x0 = _mm_madd_epi16(ba, msin_pcos);
+  const __m128i y0 = _mm_madd_epi16(ab, psin_pcos);
+  const __m128i x1 = RightShiftWithRounding_S32(x0, 12);
+  const __m128i y1 = RightShiftWithRounding_S32(y0, 12);
+  const __m128i x = _mm_packs_epi32(x1, x1);
+  const __m128i y = _mm_packs_epi32(y1, y1);
+  if (flip) {
+    *a = y;
+    *b = x;
+  } else {
+    *a = x;
+    *b = y;
+  }
+}
+
+// Butterfly rotate 8 values.
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_8(__m128i* a, __m128i* b,
+                                               const int angle,
+                                               const bool flip) {
+  const int16_t cos128 = Cos128(angle);
+  const int16_t sin128 = Sin128(angle);
+  const __m128i psin_pcos = _mm_set1_epi32(
+      static_cast<uint16_t>(cos128) | (static_cast<uint32_t>(sin128) << 16));
+  const __m128i sign = _mm_set1_epi32(static_cast<int>(0x80000001));
+  // -sin cos, -sin cos, -sin cos, -sin cos
+  const __m128i msin_pcos = _mm_sign_epi16(psin_pcos, sign);
+  const __m128i ba = _mm_unpacklo_epi16(*a, *b);
+  const __m128i ab = _mm_unpacklo_epi16(*b, *a);
+  const __m128i ba_hi = _mm_unpackhi_epi16(*a, *b);
+  const __m128i ab_hi = _mm_unpackhi_epi16(*b, *a);
+  const __m128i x0 = _mm_madd_epi16(ba, msin_pcos);
+  const __m128i y0 = _mm_madd_epi16(ab, psin_pcos);
+  const __m128i x0_hi = _mm_madd_epi16(ba_hi, msin_pcos);
+  const __m128i y0_hi = _mm_madd_epi16(ab_hi, psin_pcos);
+  const __m128i x1 = RightShiftWithRounding_S32(x0, 12);
+  const __m128i y1 = RightShiftWithRounding_S32(y0, 12);
+  const __m128i x1_hi = RightShiftWithRounding_S32(x0_hi, 12);
+  const __m128i y1_hi = RightShiftWithRounding_S32(y0_hi, 12);
+  const __m128i x = _mm_packs_epi32(x1, x1_hi);
+  const __m128i y = _mm_packs_epi32(y1, y1_hi);
+  if (flip) {
+    *a = y;
+    *b = x;
+  } else {
+    *a = x;
+    *b = y;
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_FirstIsZero(__m128i* a, __m128i* b,
+                                                         const int angle,
+                                                         const bool flip) {
+  const int16_t cos128 = Cos128(angle);
+  const int16_t sin128 = Sin128(angle);
+  const __m128i pcos = _mm_set1_epi16(cos128 << 3);
+  const __m128i psin = _mm_set1_epi16(-(sin128 << 3));
+  const __m128i x = _mm_mulhrs_epi16(*b, psin);
+  const __m128i y = _mm_mulhrs_epi16(*b, pcos);
+  if (flip) {
+    *a = y;
+    *b = x;
+  } else {
+    *a = x;
+    *b = y;
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_SecondIsZero(__m128i* a,
+                                                          __m128i* b,
+                                                          const int angle,
+                                                          const bool flip) {
+  const int16_t cos128 = Cos128(angle);
+  const int16_t sin128 = Sin128(angle);
+  const __m128i pcos = _mm_set1_epi16(cos128 << 3);
+  const __m128i psin = _mm_set1_epi16(sin128 << 3);
+  const __m128i x = _mm_mulhrs_epi16(*a, pcos);
+  const __m128i y = _mm_mulhrs_epi16(*a, psin);
+  if (flip) {
+    *a = y;
+    *b = x;
+  } else {
+    *a = x;
+    *b = y;
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void HadamardRotation(__m128i* a, __m128i* b, bool flip) {
+  __m128i x, y;
+  if (flip) {
+    y = _mm_adds_epi16(*b, *a);
+    x = _mm_subs_epi16(*b, *a);
+  } else {
+    x = _mm_adds_epi16(*a, *b);
+    y = _mm_subs_epi16(*a, *b);
+  }
+  *a = x;
+  *b = y;
+}
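All four rotation variants compute the same 12-bit fixed-point butterfly: Cos128(angle) and Sin128(angle) return cos/sin of angle*pi/128 scaled by 4096, and the madd results are rescaled with a rounding shift by 12. A scalar model of one lane pair, with illustrative names; the library uses a lookup table rather than libm, approximated here:

#include <cmath>
#include <cstdint>

// Scalar model of ButterflyRotation_4/_8 for a single (a, b) pair,
// ignoring the int16 saturation of _mm_packs_epi32.
inline void ButterflyRotationScalar(int16_t* a, int16_t* b, int angle,
                                    bool flip) {
  const double kPi = 3.141592653589793;
  const auto cos128 = static_cast<int32_t>(
      std::lround(std::cos(angle * kPi / 128) * 4096));
  const auto sin128 = static_cast<int32_t>(
      std::lround(std::sin(angle * kPi / 128) * 4096));
  // x = a*cos - b*sin and y = a*sin + b*cos, rounded back from 4096 scale.
  const int32_t x = (*a * cos128 - *b * sin128 + 2048) >> 12;
  const int32_t y = (*a * sin128 + *b * cos128 + 2048) >> 12;
  *a = static_cast<int16_t>(flip ? y : x);
  *b = static_cast<int16_t>(flip ? x : y);
}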
+using ButterflyRotationFunc = void (*)(__m128i* a, __m128i* b, int angle,
+                                       bool flip);
+
+LIBGAV1_ALWAYS_INLINE __m128i ShiftResidual(const __m128i residual,
+                                            const __m128i v_row_shift_add,
+                                            const __m128i v_row_shift) {
+  const __m128i k7ffd = _mm_set1_epi16(0x7ffd);
+  // The max row_shift is 2, so int16_t values greater than 0x7ffd may
+  // overflow.  Generate a mask for this case.
+  const __m128i mask = _mm_cmpgt_epi16(residual, k7ffd);
+  const __m128i x = _mm_add_epi16(residual, v_row_shift_add);
+  // Assume int16_t values.
+  const __m128i a = _mm_sra_epi16(x, v_row_shift);
+  // Assume uint16_t values.
+  const __m128i b = _mm_srl_epi16(x, v_row_shift);
+  // Select the correct shifted value.
+  return _mm_blendv_epi8(a, b, mask);
+}
+
+//------------------------------------------------------------------------------
+// Discrete Cosine Transforms (DCT).
+
+template <int width>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, int adjusted_tx_height,
+                                     bool should_round, int row_shift) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  const __m128i v_src_lo = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0);
+  const __m128i v_src =
+      (width == 4) ? v_src_lo : _mm_shuffle_epi32(v_src_lo, 0);
+  const __m128i v_mask =
+      _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
+  const __m128i v_kTransformRowMultiplier =
+      _mm_set1_epi16(kTransformRowMultiplier << 3);
+  const __m128i v_src_round =
+      _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
+  const __m128i s0 = _mm_blendv_epi8(v_src, v_src_round, v_mask);
+  const int16_t cos128 = Cos128(32);
+  const __m128i xy = _mm_mulhrs_epi16(s0, _mm_set1_epi16(cos128 << 3));
+
+  // Expand to 32 bits to prevent int16_t overflows during the shift add.
+  const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
+  const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
+  const __m128i a = _mm_cvtepi16_epi32(xy);
+  const __m128i a1 = _mm_cvtepi16_epi32(_mm_srli_si128(xy, 8));
+  const __m128i b = _mm_add_epi32(a, v_row_shift_add);
+  const __m128i b1 = _mm_add_epi32(a1, v_row_shift_add);
+  const __m128i c = _mm_sra_epi32(b, v_row_shift);
+  const __m128i c1 = _mm_sra_epi32(b1, v_row_shift);
+  const __m128i xy_shifted = _mm_packs_epi32(c, c1);
+
+  if (width == 4) {
+    StoreLo8(dst, xy_shifted);
+  } else {
+    for (int i = 0; i < width; i += 8) {
+      StoreUnaligned16(dst, xy_shifted);
+      dst += 8;
+    }
+  }
+  return true;
+}
+
+template <int height>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, int adjusted_tx_height,
+                                           int width) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  const int16_t cos128 = Cos128(32);
+
+  // Calculate dc values for first row.
+  if (width == 4) {
+    const __m128i v_src = LoadLo8(dst);
+    const __m128i xy = _mm_mulhrs_epi16(v_src, _mm_set1_epi16(cos128 << 3));
+    StoreLo8(dst, xy);
+  } else {
+    int i = 0;
+    do {
+      const __m128i v_src = LoadUnaligned16(&dst[i]);
+      const __m128i xy = _mm_mulhrs_epi16(v_src, _mm_set1_epi16(cos128 << 3));
+      StoreUnaligned16(&dst[i], xy);
+      i += 8;
+    } while (i < width);
+  }
+
+  // Copy first row to the rest of the block.
+  for (int y = 1; y < height; ++y) {
+    memcpy(&dst[y * width], dst, width * sizeof(dst[0]));
+  }
+  return true;
+}
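Both DC-only paths rely on the _mm_mulhrs_epi16 scaling idiom used throughout this file: mulhrs computes (v * k + 2^14) >> 15, so shifting a 4096-scaled trig constant left by 3 first turns it into a plain rounded fixed-point multiply. A scalar model (the name is illustrative):

#include <cstdint>

// Scalar model of _mm_mulhrs_epi16(v, k4096 << 3): equivalent to
// round(v * k4096 / 4096), i.e. multiplication by cos(angle*pi/128) etc.
inline int16_t MulhrsScalar(int16_t v, int16_t k4096) {
  const int32_t k32768 = k4096 << 3;  // rescale 4096 -> 32768
  return static_cast<int16_t>((v * k32768 + (1 << 14)) >> 15);
}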
+template <ButterflyRotationFunc butterfly_rotation,
+          bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct4Stages(__m128i* s) {
+  // stage 12.
+  if (is_fast_butterfly) {
+    ButterflyRotation_SecondIsZero(&s[0], &s[1], 32, true);
+    ButterflyRotation_SecondIsZero(&s[2], &s[3], 48, false);
+  } else {
+    butterfly_rotation(&s[0], &s[1], 32, true);
+    butterfly_rotation(&s[2], &s[3], 48, false);
+  }
+
+  // stage 17.
+  HadamardRotation(&s[0], &s[3], false);
+  HadamardRotation(&s[1], &s[2], false);
+}
+
+// Process 4 dct4 rows or columns, depending on the transpose flag.
+template <ButterflyRotationFunc butterfly_rotation,
+          bool stage_is_rectangular = false>
+LIBGAV1_ALWAYS_INLINE void Dct4_SSE4_1(void* dest, int32_t step,
+                                       bool transpose) {
+  auto* const dst = static_cast<int16_t*>(dest);
+  __m128i s[4], x[4];
+
+  if (stage_is_rectangular) {
+    if (transpose) {
+      __m128i input[8];
+      LoadSrc<8, 8>(dst, step, 0, input);
+      Transpose4x8To8x4_U16(input, x);
+    } else {
+      LoadSrc<16, 4>(dst, step, 0, x);
+    }
+  } else {
+    LoadSrc<8, 4>(dst, step, 0, x);
+    if (transpose) {
+      Transpose4x4_U16(x, x);
+    }
+  }
+  // stage 1.
+  // kBitReverseLookup 0, 2, 1, 3
+  s[0] = x[0];
+  s[1] = x[2];
+  s[2] = x[1];
+  s[3] = x[3];
+
+  Dct4Stages<butterfly_rotation>(s);
+
+  if (stage_is_rectangular) {
+    if (transpose) {
+      __m128i output[8];
+      Transpose8x4To4x8_U16(s, output);
+      StoreDst<8, 8>(dst, step, 0, output);
+    } else {
+      StoreDst<16, 4>(dst, step, 0, s);
+    }
+  } else {
+    if (transpose) {
+      Transpose4x4_U16(s, s);
+    }
+    StoreDst<8, 4>(dst, step, 0, s);
+  }
+}
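Dct4Stages is the 4-point slice of the shared inverse-DCT network: one rotation pair followed by two butterflies. A scalar model of the same data flow, reusing the ButterflyRotationScalar sketch above (names illustrative; the saturating adds of _mm_adds_epi16 are modeled as plain adds):

#include <cstdint>

// Scalar model of Dct4Stages on bit-reversed inputs s[0..3].
inline void Dct4Scalar(int16_t s[4]) {
  // stage 12: two rotations.
  ButterflyRotationScalar(&s[0], &s[1], 32, /*flip=*/true);
  ButterflyRotationScalar(&s[2], &s[3], 48, /*flip=*/false);
  // stage 17: butterflies, i.e. HadamardRotation with flip == false.
  const auto t0 = static_cast<int16_t>(s[0] + s[3]);
  const auto t3 = static_cast<int16_t>(s[0] - s[3]);
  const auto t1 = static_cast<int16_t>(s[1] + s[2]);
  const auto t2 = static_cast<int16_t>(s[1] - s[2]);
  s[0] = t0;
  s[1] = t1;
  s[2] = t2;
  s[3] = t3;
}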
+template <ButterflyRotationFunc butterfly_rotation,
+          bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct8Stages(__m128i* s) {
+  // stage 8.
+  if (is_fast_butterfly) {
+    ButterflyRotation_SecondIsZero(&s[4], &s[7], 56, false);
+    ButterflyRotation_FirstIsZero(&s[5], &s[6], 24, false);
+  } else {
+    butterfly_rotation(&s[4], &s[7], 56, false);
+    butterfly_rotation(&s[5], &s[6], 24, false);
+  }
+
+  // stage 13.
+  HadamardRotation(&s[4], &s[5], false);
+  HadamardRotation(&s[6], &s[7], true);
+
+  // stage 18.
+  butterfly_rotation(&s[6], &s[5], 32, true);
+
+  // stage 22.
+  HadamardRotation(&s[0], &s[7], false);
+  HadamardRotation(&s[1], &s[6], false);
+  HadamardRotation(&s[2], &s[5], false);
+  HadamardRotation(&s[3], &s[4], false);
+}
+
+// Process dct8 rows or columns, depending on the transpose flag.
+template <ButterflyRotationFunc butterfly_rotation,
+          bool stage_is_rectangular = false>
+LIBGAV1_ALWAYS_INLINE void Dct8_SSE4_1(void* dest, int32_t step,
+                                       bool transpose) {
+  auto* const dst = static_cast<int16_t*>(dest);
+  __m128i s[8], x[8];
+
+  if (stage_is_rectangular) {
+    if (transpose) {
+      __m128i input[4];
+      LoadSrc<16, 4>(dst, step, 0, input);
+      Transpose8x4To4x8_U16(input, x);
+    } else {
+      LoadSrc<8, 8>(dst, step, 0, x);
+    }
+  } else {
+    if (transpose) {
+      __m128i input[8];
+      LoadSrc<16, 8>(dst, step, 0, input);
+      Transpose8x8_U16(input, x);
+    } else {
+      LoadSrc<16, 8>(dst, step, 0, x);
+    }
+  }
+
+  // stage 1.
+  // kBitReverseLookup 0, 4, 2, 6, 1, 5, 3, 7,
+  s[0] = x[0];
+  s[1] = x[4];
+  s[2] = x[2];
+  s[3] = x[6];
+  s[4] = x[1];
+  s[5] = x[5];
+  s[6] = x[3];
+  s[7] = x[7];
+
+  Dct4Stages<butterfly_rotation>(s);
+  Dct8Stages<butterfly_rotation>(s);
+
+  if (stage_is_rectangular) {
+    if (transpose) {
+      __m128i output[4];
+      Transpose4x8To8x4_U16(s, output);
+      StoreDst<16, 4>(dst, step, 0, output);
+    } else {
+      StoreDst<8, 8>(dst, step, 0, s);
+    }
+  } else {
+    if (transpose) {
+      __m128i output[8];
+      Transpose8x8_U16(s, output);
+      StoreDst<16, 8>(dst, step, 0, output);
+    } else {
+      StoreDst<16, 8>(dst, step, 0, s);
+    }
+  }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+          bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct16Stages(__m128i* s) {
+  // stage 5.
+  if (is_fast_butterfly) {
+    ButterflyRotation_SecondIsZero(&s[8], &s[15], 60, false);
+    ButterflyRotation_FirstIsZero(&s[9], &s[14], 28, false);
+    ButterflyRotation_SecondIsZero(&s[10], &s[13], 44, false);
+    ButterflyRotation_FirstIsZero(&s[11], &s[12], 12, false);
+  } else {
+    butterfly_rotation(&s[8], &s[15], 60, false);
+    butterfly_rotation(&s[9], &s[14], 28, false);
+    butterfly_rotation(&s[10], &s[13], 44, false);
+    butterfly_rotation(&s[11], &s[12], 12, false);
+  }
+
+  // stage 9.
+  HadamardRotation(&s[8], &s[9], false);
+  HadamardRotation(&s[10], &s[11], true);
+  HadamardRotation(&s[12], &s[13], false);
+  HadamardRotation(&s[14], &s[15], true);
+
+  // stage 14.
+  butterfly_rotation(&s[14], &s[9], 48, true);
+  butterfly_rotation(&s[13], &s[10], 112, true);
+
+  // stage 19.
+  HadamardRotation(&s[8], &s[11], false);
+  HadamardRotation(&s[9], &s[10], false);
+  HadamardRotation(&s[12], &s[15], true);
+  HadamardRotation(&s[13], &s[14], true);
+
+  // stage 23.
+  butterfly_rotation(&s[13], &s[10], 32, true);
+  butterfly_rotation(&s[12], &s[11], 32, true);
+
+  // stage 26.
+  HadamardRotation(&s[0], &s[15], false);
+  HadamardRotation(&s[1], &s[14], false);
+  HadamardRotation(&s[2], &s[13], false);
+  HadamardRotation(&s[3], &s[12], false);
+  HadamardRotation(&s[4], &s[11], false);
+  HadamardRotation(&s[5], &s[10], false);
+  HadamardRotation(&s[6], &s[9], false);
+  HadamardRotation(&s[7], &s[8], false);
+}
+
+// Process dct16 rows or columns, depending on the transpose flag.
+template <ButterflyRotationFunc butterfly_rotation,
+          bool stage_is_rectangular = false>
+LIBGAV1_ALWAYS_INLINE void Dct16_SSE4_1(void* dest, int32_t step,
+                                        bool transpose) {
+  auto* const dst = static_cast<int16_t*>(dest);
+  __m128i s[16], x[16];
+
+  if (stage_is_rectangular) {
+    if (transpose) {
+      __m128i input[4];
+      LoadSrc<16, 4>(dst, step, 0, input);
+      Transpose8x4To4x8_U16(input, x);
+      LoadSrc<16, 4>(dst, step, 8, input);
+      Transpose8x4To4x8_U16(input, &x[8]);
+    } else {
+      LoadSrc<8, 16>(dst, step, 0, x);
+    }
+  } else {
+    if (transpose) {
+      for (int idx = 0; idx < 16; idx += 8) {
+        __m128i input[8];
+        LoadSrc<16, 8>(dst, step, idx, input);
+        Transpose8x8_U16(input, &x[idx]);
+      }
+    } else {
+      LoadSrc<16, 16>(dst, step, 0, x);
+    }
+  }
+
+  // stage 1
+  // kBitReverseLookup 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
+  s[0] = x[0];
+  s[1] = x[8];
+  s[2] = x[4];
+  s[3] = x[12];
+  s[4] = x[2];
+  s[5] = x[10];
+  s[6] = x[6];
+  s[7] = x[14];
+  s[8] = x[1];
+  s[9] = x[9];
+  s[10] = x[5];
+  s[11] = x[13];
+  s[12] = x[3];
+  s[13] = x[11];
+  s[14] = x[7];
+  s[15] = x[15];
+
+  Dct4Stages<butterfly_rotation>(s);
+  Dct8Stages<butterfly_rotation>(s);
+  Dct16Stages<butterfly_rotation>(s);
+
+  if (stage_is_rectangular) {
+    if (transpose) {
+      __m128i output[4];
+      Transpose4x8To8x4_U16(s, output);
+      StoreDst<16, 4>(dst, step, 0, output);
+      Transpose4x8To8x4_U16(&s[8], output);
+      StoreDst<16, 4>(dst, step, 8, output);
+    } else {
+      StoreDst<8, 16>(dst, step, 0, s);
+    }
+  } else {
+    if (transpose) {
+      for (int idx = 0; idx < 16; idx += 8) {
+        __m128i output[8];
+        Transpose8x8_U16(&s[idx], output);
+        StoreDst<16, 8>(dst, step, idx, output);
+      }
+    } else {
+      StoreDst<16, 16>(dst, step, 0, s);
+    }
+  }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+          bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct32Stages(__m128i* s) {
+  // stage 3
+  if (is_fast_butterfly) {
+    ButterflyRotation_SecondIsZero(&s[16], &s[31], 62, false);
+    ButterflyRotation_FirstIsZero(&s[17], &s[30], 30, false);
+    ButterflyRotation_SecondIsZero(&s[18], &s[29], 46, false);
+    ButterflyRotation_FirstIsZero(&s[19], &s[28], 14, false);
+    ButterflyRotation_SecondIsZero(&s[20], &s[27], 54, false);
+    ButterflyRotation_FirstIsZero(&s[21], &s[26], 22, false);
+    ButterflyRotation_SecondIsZero(&s[22], &s[25], 38, false);
+    ButterflyRotation_FirstIsZero(&s[23], &s[24], 6, false);
+  } else {
+    butterfly_rotation(&s[16], &s[31], 62, false);
+    butterfly_rotation(&s[17], &s[30], 30, false);
+    butterfly_rotation(&s[18], &s[29], 46, false);
+    butterfly_rotation(&s[19], &s[28], 14, false);
+    butterfly_rotation(&s[20], &s[27], 54, false);
+    butterfly_rotation(&s[21], &s[26], 22, false);
+    butterfly_rotation(&s[22], &s[25], 38, false);
+    butterfly_rotation(&s[23],
&s[24], 6, false);
+  }
+  // stage 6.
+  HadamardRotation(&s[16], &s[17], false);
+  HadamardRotation(&s[18], &s[19], true);
+  HadamardRotation(&s[20], &s[21], false);
+  HadamardRotation(&s[22], &s[23], true);
+  HadamardRotation(&s[24], &s[25], false);
+  HadamardRotation(&s[26], &s[27], true);
+  HadamardRotation(&s[28], &s[29], false);
+  HadamardRotation(&s[30], &s[31], true);
+
+  // stage 10.
+  butterfly_rotation(&s[30], &s[17], 24 + 32, true);
+  butterfly_rotation(&s[29], &s[18], 24 + 64 + 32, true);
+  butterfly_rotation(&s[26], &s[21], 24, true);
+  butterfly_rotation(&s[25], &s[22], 24 + 64, true);
+
+  // stage 15.
+  HadamardRotation(&s[16], &s[19], false);
+  HadamardRotation(&s[17], &s[18], false);
+  HadamardRotation(&s[20], &s[23], true);
+  HadamardRotation(&s[21], &s[22], true);
+  HadamardRotation(&s[24], &s[27], false);
+  HadamardRotation(&s[25], &s[26], false);
+  HadamardRotation(&s[28], &s[31], true);
+  HadamardRotation(&s[29], &s[30], true);
+
+  // stage 20.
+  butterfly_rotation(&s[29], &s[18], 48, true);
+  butterfly_rotation(&s[28], &s[19], 48, true);
+  butterfly_rotation(&s[27], &s[20], 48 + 64, true);
+  butterfly_rotation(&s[26], &s[21], 48 + 64, true);
+
+  // stage 24.
+  HadamardRotation(&s[16], &s[23], false);
+  HadamardRotation(&s[17], &s[22], false);
+  HadamardRotation(&s[18], &s[21], false);
+  HadamardRotation(&s[19], &s[20], false);
+  HadamardRotation(&s[24], &s[31], true);
+  HadamardRotation(&s[25], &s[30], true);
+  HadamardRotation(&s[26], &s[29], true);
+  HadamardRotation(&s[27], &s[28], true);
+
+  // stage 27.
+  butterfly_rotation(&s[27], &s[20], 32, true);
+  butterfly_rotation(&s[26], &s[21], 32, true);
+  butterfly_rotation(&s[25], &s[22], 32, true);
+  butterfly_rotation(&s[24], &s[23], 32, true);
+
+  // stage 29.
+  HadamardRotation(&s[0], &s[31], false);
+  HadamardRotation(&s[1], &s[30], false);
+  HadamardRotation(&s[2], &s[29], false);
+  HadamardRotation(&s[3], &s[28], false);
+  HadamardRotation(&s[4], &s[27], false);
+  HadamardRotation(&s[5], &s[26], false);
+  HadamardRotation(&s[6], &s[25], false);
+  HadamardRotation(&s[7], &s[24], false);
+  HadamardRotation(&s[8], &s[23], false);
+  HadamardRotation(&s[9], &s[22], false);
+  HadamardRotation(&s[10], &s[21], false);
+  HadamardRotation(&s[11], &s[20], false);
+  HadamardRotation(&s[12], &s[19], false);
+  HadamardRotation(&s[13], &s[18], false);
+  HadamardRotation(&s[14], &s[17], false);
+  HadamardRotation(&s[15], &s[16], false);
+}
+
+// Process dct32 rows or columns, depending on the transpose flag.
+LIBGAV1_ALWAYS_INLINE void Dct32_SSE4_1(void* dest, const int32_t step,
+                                        const bool transpose) {
+  auto* const dst = static_cast<int16_t*>(dest);
+  __m128i s[32], x[32];
+
+  if (transpose) {
+    for (int idx = 0; idx < 32; idx += 8) {
+      __m128i input[8];
+      LoadSrc<16, 8>(dst, step, idx, input);
+      Transpose8x8_U16(input, &x[idx]);
+    }
+  } else {
+    LoadSrc<16, 32>(dst, step, 0, x);
+  }
+
+  // stage 1
+  // kBitReverseLookup
+  // 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
+  s[0] = x[0];
+  s[1] = x[16];
+  s[2] = x[8];
+  s[3] = x[24];
+  s[4] = x[4];
+  s[5] = x[20];
+  s[6] = x[12];
+  s[7] = x[28];
+  s[8] = x[2];
+  s[9] = x[18];
+  s[10] = x[10];
+  s[11] = x[26];
+  s[12] = x[6];
+  s[13] = x[22];
+  s[14] = x[14];
+  s[15] = x[30];
+
+  // 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31,
+  s[16] = x[1];
+  s[17] = x[17];
+  s[18] = x[9];
+  s[19] = x[25];
+  s[20] = x[5];
+  s[21] = x[21];
+  s[22] = x[13];
+  s[23] = x[29];
+  s[24] = x[3];
+  s[25] = x[19];
+  s[26] = x[11];
+  s[27] = x[27];
+  s[28] = x[7];
+  s[29] = x[23];
+  s[30] = x[15];
+  s[31] = x[31];
+
+  Dct4Stages<ButterflyRotation_8>(s);
+  Dct8Stages<ButterflyRotation_8>(s);
+  Dct16Stages<ButterflyRotation_8>(s);
+  Dct32Stages<ButterflyRotation_8>(s);
+
+  if (transpose) {
+    for (int idx = 0; idx < 32; idx += 8) {
+      __m128i output[8];
+      Transpose8x8_U16(&s[idx], output);
+      StoreDst<16, 8>(dst, step, idx, output);
+    }
+  } else {
+    StoreDst<16, 32>(dst, step, 0, s);
+  }
+}
+
+// Allow the compiler to call this function instead of force inlining. Tests
+// show the performance is slightly faster.
+void Dct64_SSE4_1(void* dest, int32_t step, bool transpose) {
+  auto* const dst = static_cast<int16_t*>(dest);
+  __m128i s[64], x[32];
+
+  if (transpose) {
+    // The last 32 values of every row are always zero if the |tx_width| is
+    // 64.
+    for (int idx = 0; idx < 32; idx += 8) {
+      __m128i input[8];
+      LoadSrc<16, 8>(dst, step, idx, input);
+      Transpose8x8_U16(input, &x[idx]);
+    }
+  } else {
+    // The last 32 values of every column are always zero if the |tx_height|
+    // is 64.
+    LoadSrc<16, 32>(dst, step, 0, x);
+  }
+
+  // stage 1
+  // kBitReverseLookup
+  // 0, 32, 16, 48, 8, 40, 24, 56, 4, 36, 20, 52, 12, 44, 28, 60,
+  s[0] = x[0];
+  s[2] = x[16];
+  s[4] = x[8];
+  s[6] = x[24];
+  s[8] = x[4];
+  s[10] = x[20];
+  s[12] = x[12];
+  s[14] = x[28];
+
+  // 2, 34, 18, 50, 10, 42, 26, 58, 6, 38, 22, 54, 14, 46, 30, 62,
+  s[16] = x[2];
+  s[18] = x[18];
+  s[20] = x[10];
+  s[22] = x[26];
+  s[24] = x[6];
+  s[26] = x[22];
+  s[28] = x[14];
+  s[30] = x[30];
+
+  // 1, 33, 17, 49, 9, 41, 25, 57, 5, 37, 21, 53, 13, 45, 29, 61,
+  s[32] = x[1];
+  s[34] = x[17];
+  s[36] = x[9];
+  s[38] = x[25];
+  s[40] = x[5];
+  s[42] = x[21];
+  s[44] = x[13];
+  s[46] = x[29];
+
+  // 3, 35, 19, 51, 11, 43, 27, 59, 7, 39, 23, 55, 15, 47, 31, 63
+  s[48] = x[3];
+  s[50] = x[19];
+  s[52] = x[11];
+  s[54] = x[27];
+  s[56] = x[7];
+  s[58] = x[23];
+  s[60] = x[15];
+  s[62] = x[31];
+
+  Dct4Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+  Dct8Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+  Dct16Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+  Dct32Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+
+  //-- start dct 64 stages
+  // stage 2.
+ ButterflyRotation_SecondIsZero(&s[32], &s[63], 63 - 0, false); + ButterflyRotation_FirstIsZero(&s[33], &s[62], 63 - 32, false); + ButterflyRotation_SecondIsZero(&s[34], &s[61], 63 - 16, false); + ButterflyRotation_FirstIsZero(&s[35], &s[60], 63 - 48, false); + ButterflyRotation_SecondIsZero(&s[36], &s[59], 63 - 8, false); + ButterflyRotation_FirstIsZero(&s[37], &s[58], 63 - 40, false); + ButterflyRotation_SecondIsZero(&s[38], &s[57], 63 - 24, false); + ButterflyRotation_FirstIsZero(&s[39], &s[56], 63 - 56, false); + ButterflyRotation_SecondIsZero(&s[40], &s[55], 63 - 4, false); + ButterflyRotation_FirstIsZero(&s[41], &s[54], 63 - 36, false); + ButterflyRotation_SecondIsZero(&s[42], &s[53], 63 - 20, false); + ButterflyRotation_FirstIsZero(&s[43], &s[52], 63 - 52, false); + ButterflyRotation_SecondIsZero(&s[44], &s[51], 63 - 12, false); + ButterflyRotation_FirstIsZero(&s[45], &s[50], 63 - 44, false); + ButterflyRotation_SecondIsZero(&s[46], &s[49], 63 - 28, false); + ButterflyRotation_FirstIsZero(&s[47], &s[48], 63 - 60, false); + + // stage 4. + HadamardRotation(&s[32], &s[33], false); + HadamardRotation(&s[34], &s[35], true); + HadamardRotation(&s[36], &s[37], false); + HadamardRotation(&s[38], &s[39], true); + HadamardRotation(&s[40], &s[41], false); + HadamardRotation(&s[42], &s[43], true); + HadamardRotation(&s[44], &s[45], false); + HadamardRotation(&s[46], &s[47], true); + HadamardRotation(&s[48], &s[49], false); + HadamardRotation(&s[50], &s[51], true); + HadamardRotation(&s[52], &s[53], false); + HadamardRotation(&s[54], &s[55], true); + HadamardRotation(&s[56], &s[57], false); + HadamardRotation(&s[58], &s[59], true); + HadamardRotation(&s[60], &s[61], false); + HadamardRotation(&s[62], &s[63], true); + + // stage 7. + ButterflyRotation_8(&s[62], &s[33], 60 - 0, true); + ButterflyRotation_8(&s[61], &s[34], 60 - 0 + 64, true); + ButterflyRotation_8(&s[58], &s[37], 60 - 32, true); + ButterflyRotation_8(&s[57], &s[38], 60 - 32 + 64, true); + ButterflyRotation_8(&s[54], &s[41], 60 - 16, true); + ButterflyRotation_8(&s[53], &s[42], 60 - 16 + 64, true); + ButterflyRotation_8(&s[50], &s[45], 60 - 48, true); + ButterflyRotation_8(&s[49], &s[46], 60 - 48 + 64, true); + + // stage 11. + HadamardRotation(&s[32], &s[35], false); + HadamardRotation(&s[33], &s[34], false); + HadamardRotation(&s[36], &s[39], true); + HadamardRotation(&s[37], &s[38], true); + HadamardRotation(&s[40], &s[43], false); + HadamardRotation(&s[41], &s[42], false); + HadamardRotation(&s[44], &s[47], true); + HadamardRotation(&s[45], &s[46], true); + HadamardRotation(&s[48], &s[51], false); + HadamardRotation(&s[49], &s[50], false); + HadamardRotation(&s[52], &s[55], true); + HadamardRotation(&s[53], &s[54], true); + HadamardRotation(&s[56], &s[59], false); + HadamardRotation(&s[57], &s[58], false); + HadamardRotation(&s[60], &s[63], true); + HadamardRotation(&s[61], &s[62], true); + + // stage 16. + ButterflyRotation_8(&s[61], &s[34], 56, true); + ButterflyRotation_8(&s[60], &s[35], 56, true); + ButterflyRotation_8(&s[59], &s[36], 56 + 64, true); + ButterflyRotation_8(&s[58], &s[37], 56 + 64, true); + ButterflyRotation_8(&s[53], &s[42], 56 - 32, true); + ButterflyRotation_8(&s[52], &s[43], 56 - 32, true); + ButterflyRotation_8(&s[51], &s[44], 56 - 32 + 64, true); + ButterflyRotation_8(&s[50], &s[45], 56 - 32 + 64, true); + + // stage 21. 
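The dct64 stages above lean heavily on the butterfly primitives. As a reading aid, here is a minimal scalar model of what each SIMD butterfly computes, assuming the AV1 Q12 convention cos128(a) = round(4096 * cos(a*pi/128)); the names Cos128Scalar, ButterflyRotationScalar and the SecondIsZero variant below are illustrative, not the library's.

```c++
#include <cmath>
#include <cstdint>

constexpr double kPi = 3.141592653589793;

// Q12 trig values, per the AV1 spec: cos128(a) ~= 4096 * cos(a * pi / 128).
int32_t Cos128Scalar(int angle) {
  return static_cast<int32_t>(
      std::lround(4096.0 * std::cos(angle * kPi / 128)));
}
int32_t Sin128Scalar(int angle) { return Cos128Scalar(64 - angle); }

// One butterfly: a plane rotation followed by a rounded Q12 down-shift. The
// SIMD versions run the same math on 4 or 8 lanes; |flip| swaps the outputs.
void ButterflyRotationScalar(int32_t* a, int32_t* b, int angle, bool flip) {
  const int32_t cos128 = Cos128Scalar(angle);
  const int32_t sin128 = Sin128Scalar(angle);
  const int32_t x = (*a * cos128 - *b * sin128 + 2048) >> 12;
  const int32_t y = (*a * sin128 + *b * cos128 + 2048) >> 12;
  *a = flip ? y : x;
  *b = flip ? x : y;
}

// When one operand is known to be zero (the odd halves of a dct64 input),
// the rotation collapses to two multiplies; that is the saving behind the
// ButterflyRotation_FirstIsZero/_SecondIsZero helpers used above.
void ButterflyRotationSecondIsZeroScalar(int32_t* a, int32_t* b, int angle) {
  const int32_t x = (*a * Cos128Scalar(angle) + 2048) >> 12;
  const int32_t y = (*a * Sin128Scalar(angle) + 2048) >> 12;
  *a = x;
  *b = y;
}
```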
+ HadamardRotation(&s[32], &s[39], false); + HadamardRotation(&s[33], &s[38], false); + HadamardRotation(&s[34], &s[37], false); + HadamardRotation(&s[35], &s[36], false); + HadamardRotation(&s[40], &s[47], true); + HadamardRotation(&s[41], &s[46], true); + HadamardRotation(&s[42], &s[45], true); + HadamardRotation(&s[43], &s[44], true); + HadamardRotation(&s[48], &s[55], false); + HadamardRotation(&s[49], &s[54], false); + HadamardRotation(&s[50], &s[53], false); + HadamardRotation(&s[51], &s[52], false); + HadamardRotation(&s[56], &s[63], true); + HadamardRotation(&s[57], &s[62], true); + HadamardRotation(&s[58], &s[61], true); + HadamardRotation(&s[59], &s[60], true); + + // stage 25. + ButterflyRotation_8(&s[59], &s[36], 48, true); + ButterflyRotation_8(&s[58], &s[37], 48, true); + ButterflyRotation_8(&s[57], &s[38], 48, true); + ButterflyRotation_8(&s[56], &s[39], 48, true); + ButterflyRotation_8(&s[55], &s[40], 112, true); + ButterflyRotation_8(&s[54], &s[41], 112, true); + ButterflyRotation_8(&s[53], &s[42], 112, true); + ButterflyRotation_8(&s[52], &s[43], 112, true); + + // stage 28. + HadamardRotation(&s[32], &s[47], false); + HadamardRotation(&s[33], &s[46], false); + HadamardRotation(&s[34], &s[45], false); + HadamardRotation(&s[35], &s[44], false); + HadamardRotation(&s[36], &s[43], false); + HadamardRotation(&s[37], &s[42], false); + HadamardRotation(&s[38], &s[41], false); + HadamardRotation(&s[39], &s[40], false); + HadamardRotation(&s[48], &s[63], true); + HadamardRotation(&s[49], &s[62], true); + HadamardRotation(&s[50], &s[61], true); + HadamardRotation(&s[51], &s[60], true); + HadamardRotation(&s[52], &s[59], true); + HadamardRotation(&s[53], &s[58], true); + HadamardRotation(&s[54], &s[57], true); + HadamardRotation(&s[55], &s[56], true); + + // stage 30. + ButterflyRotation_8(&s[55], &s[40], 32, true); + ButterflyRotation_8(&s[54], &s[41], 32, true); + ButterflyRotation_8(&s[53], &s[42], 32, true); + ButterflyRotation_8(&s[52], &s[43], 32, true); + ButterflyRotation_8(&s[51], &s[44], 32, true); + ButterflyRotation_8(&s[50], &s[45], 32, true); + ButterflyRotation_8(&s[49], &s[46], 32, true); + ButterflyRotation_8(&s[48], &s[47], 32, true); + + // stage 31. + for (int i = 0; i < 32; i += 4) { + HadamardRotation(&s[i], &s[63 - i], false); + HadamardRotation(&s[i + 1], &s[63 - i - 1], false); + HadamardRotation(&s[i + 2], &s[63 - i - 2], false); + HadamardRotation(&s[i + 3], &s[63 - i - 3], false); + } + //-- end dct 64 stages + + if (transpose) { + for (int idx = 0; idx < 64; idx += 8) { + __m128i output[8]; + Transpose8x8_U16(&s[idx], output); + StoreDst<16, 8>(dst, step, idx, output); + } + } else { + StoreDst<16, 64>(dst, step, 0, s); + } +} + +//------------------------------------------------------------------------------ +// Asymmetric Discrete Sine Transforms (ADST). 
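For reference, the ADST4 dataflow implemented below, written out in scalar form. kAdst4Multiplier holds the Q12 sinpi constants {1321, 2482, 3344, 3803}; this is a hedged transcription matching the stage comments in Adst4_SSE4_1 (Round2 and Adst4Scalar are illustrative names).

```c++
#include <cstdint>

constexpr int32_t kSinPi[4] = {1321, 2482, 3344, 3803};  // Q12; 1321+2482==3803

int32_t Round2(int32_t v, int bits) { return (v + (1 << (bits - 1))) >> bits; }

void Adst4Scalar(int32_t x[4]) {
  const int32_t s0 = kSinPi[0] * x[0];
  const int32_t s1 = kSinPi[1] * x[0];
  const int32_t s2 = kSinPi[2] * x[1];
  const int32_t s3 = kSinPi[3] * x[2];
  const int32_t s4 = kSinPi[0] * x[2];
  const int32_t s5 = kSinPi[1] * x[3];
  const int32_t s6 = kSinPi[3] * x[3];
  // "((src[0] - src[2]) + src[3]) * kAdst4Multiplier[2]": the b7 term.
  const int32_t b7 = kSinPi[2] * (x[0] - x[2] + x[3]);
  const int32_t t0 = s0 + s3 + s5;  // x0*k0 + x2*k3 + x3*k1
  const int32_t t1 = s1 - s4 - s6;  // x0*k1 - x2*k0 - x3*k3
  const int32_t t2 = b7;
  const int32_t t3 = s2;            // x1*k2
  x[0] = Round2(t0 + t3, 12);
  x[1] = Round2(t1 + t3, 12);
  x[2] = Round2(t2, 12);
  x[3] = Round2(t0 + t1 - t3, 12);
}
```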
+ +template +LIBGAV1_ALWAYS_INLINE void Adst4_SSE4_1(void* dest, int32_t step, + bool transpose) { + auto* const dst = static_cast(dest); + __m128i s[8], x[4]; + + if (stage_is_rectangular) { + if (transpose) { + __m128i input[8]; + LoadSrc<8, 8>(dst, step, 0, input); + Transpose4x8To8x4_U16(input, x); + } else { + LoadSrc<16, 4>(dst, step, 0, x); + } + } else { + LoadSrc<8, 4>(dst, step, 0, x); + if (transpose) { + Transpose4x4_U16(x, x); + } + } + + const __m128i kAdst4Multiplier_1 = _mm_set1_epi16(kAdst4Multiplier[1]); + const __m128i kAdst4Multiplier_2 = _mm_set1_epi16(kAdst4Multiplier[2]); + const __m128i kAdst4Multiplier_3 = _mm_set1_epi16(kAdst4Multiplier[3]); + const __m128i kAdst4Multiplier_m0_1 = + _mm_set1_epi32(static_cast(kAdst4Multiplier[1]) | + (static_cast(-kAdst4Multiplier[0]) << 16)); + const __m128i kAdst4Multiplier_3_0 = + _mm_set1_epi32(static_cast(kAdst4Multiplier[0]) | + (static_cast(kAdst4Multiplier[3]) << 16)); + + // stage 1. + const __m128i x3_x0 = _mm_unpacklo_epi16(x[0], x[3]); + const __m128i x2_x0 = _mm_unpacklo_epi16(x[0], x[2]); + const __m128i zero_x1 = _mm_cvtepu16_epi32(x[1]); + const __m128i zero_x2 = _mm_cvtepu16_epi32(x[2]); + const __m128i zero_x3 = _mm_cvtepu16_epi32(x[3]); + + s[5] = _mm_madd_epi16(zero_x3, kAdst4Multiplier_1); + s[6] = _mm_madd_epi16(zero_x3, kAdst4Multiplier_3); + + // stage 2. + // ((src[0] - src[2]) + src[3]) * kAdst4Multiplier[2] + const __m128i k2_x3_x0 = _mm_madd_epi16(x3_x0, kAdst4Multiplier_2); + const __m128i k2_zero_x2 = _mm_madd_epi16(zero_x2, kAdst4Multiplier_2); + const __m128i b7 = _mm_sub_epi32(k2_x3_x0, k2_zero_x2); + + // stage 3. + s[0] = _mm_madd_epi16(x2_x0, kAdst4Multiplier_3_0); + s[1] = _mm_madd_epi16(x2_x0, kAdst4Multiplier_m0_1); + s[2] = b7; + s[3] = _mm_madd_epi16(zero_x1, kAdst4Multiplier_2); + + // stage 4. + s[0] = _mm_add_epi32(s[0], s[5]); + s[1] = _mm_sub_epi32(s[1], s[6]); + + // stages 5 and 6. + x[0] = _mm_add_epi32(s[0], s[3]); + x[1] = _mm_add_epi32(s[1], s[3]); + x[2] = _mm_add_epi32(s[0], s[1]); + x[3] = _mm_sub_epi32(x[2], s[3]); + + x[0] = RightShiftWithRounding_S32(x[0], 12); + x[1] = RightShiftWithRounding_S32(x[1], 12); + x[2] = RightShiftWithRounding_S32(s[2], 12); + x[3] = RightShiftWithRounding_S32(x[3], 12); + + x[0] = _mm_packs_epi32(x[0], x[1]); + x[2] = _mm_packs_epi32(x[2], x[3]); + x[1] = _mm_srli_si128(x[0], 8); + x[3] = _mm_srli_si128(x[2], 8); + + if (stage_is_rectangular) { + if (transpose) { + __m128i output[8]; + Transpose8x4To4x8_U16(x, output); + StoreDst<8, 8>(dst, step, 0, output); + } else { + StoreDst<16, 4>(dst, step, 0, x); + } + } else { + if (transpose) { + Transpose4x4_U16(x, x); + } + StoreDst<8, 4>(dst, step, 0, x); + } +} + +constexpr int16_t kAdst4DcOnlyMultiplier[8] = {1321, 0, 2482, 0, + 3344, 0, 2482, 1321}; + +LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, int adjusted_tx_height, + bool should_round, int row_shift) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast(dest); + const __m128i v_src = + _mm_shuffle_epi32(_mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0), 0); + const __m128i v_mask = + _mm_set1_epi16(should_round ? 
static_cast(0xffff) : 0); + const __m128i v_kTransformRowMultiplier = + _mm_set1_epi16(kTransformRowMultiplier << 3); + const __m128i v_src_round = + _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier); + const __m128i s0 = _mm_blendv_epi8(v_src, v_src_round, v_mask); + const __m128i v_kAdst4DcOnlyMultipliers = + LoadUnaligned16(kAdst4DcOnlyMultiplier); + // s0*k0 s0*k1 s0*k2 s0*k1 + // + + // s0*0 s0*0 s0*0 s0*k0 + const __m128i x3 = _mm_madd_epi16(s0, v_kAdst4DcOnlyMultipliers); + const __m128i dst_0 = RightShiftWithRounding_S32(x3, 12); + const __m128i v_row_shift_add = _mm_set1_epi32(row_shift); + const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add); + const __m128i a = _mm_add_epi32(dst_0, v_row_shift_add); + const __m128i b = _mm_sra_epi32(a, v_row_shift); + const __m128i c = _mm_packs_epi32(b, b); + StoreLo8(dst, c); + + return true; +} + +LIBGAV1_ALWAYS_INLINE bool Adst4DcOnlyColumn(void* dest, int adjusted_tx_height, + int width) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast(dest); + int i = 0; + do { + const __m128i v_src = _mm_cvtepi16_epi32(LoadLo8(&dst[i])); + const __m128i kAdst4Multiplier_0 = _mm_set1_epi32(kAdst4Multiplier[0]); + const __m128i kAdst4Multiplier_1 = _mm_set1_epi32(kAdst4Multiplier[1]); + const __m128i kAdst4Multiplier_2 = _mm_set1_epi32(kAdst4Multiplier[2]); + const __m128i s0 = _mm_mullo_epi32(kAdst4Multiplier_0, v_src); + const __m128i s1 = _mm_mullo_epi32(kAdst4Multiplier_1, v_src); + const __m128i s2 = _mm_mullo_epi32(kAdst4Multiplier_2, v_src); + const __m128i x0 = s0; + const __m128i x1 = s1; + const __m128i x2 = s2; + const __m128i x3 = _mm_add_epi32(s0, s1); + const __m128i dst_0 = RightShiftWithRounding_S32(x0, 12); + const __m128i dst_1 = RightShiftWithRounding_S32(x1, 12); + const __m128i dst_2 = RightShiftWithRounding_S32(x2, 12); + const __m128i dst_3 = RightShiftWithRounding_S32(x3, 12); + const __m128i dst_0_1 = _mm_packs_epi32(dst_0, dst_1); + const __m128i dst_2_3 = _mm_packs_epi32(dst_2, dst_3); + StoreLo8(&dst[i], dst_0_1); + StoreHi8(&dst[i + width * 1], dst_0_1); + StoreLo8(&dst[i + width * 2], dst_2_3); + StoreHi8(&dst[i + width * 3], dst_2_3); + i += 4; + } while (i < width); + + return true; +} + +template +LIBGAV1_ALWAYS_INLINE void Adst8_SSE4_1(void* dest, int32_t step, + bool transpose) { + auto* const dst = static_cast(dest); + __m128i s[8], x[8]; + + if (stage_is_rectangular) { + if (transpose) { + __m128i input[4]; + LoadSrc<16, 4>(dst, step, 0, input); + Transpose8x4To4x8_U16(input, x); + } else { + LoadSrc<8, 8>(dst, step, 0, x); + } + } else { + if (transpose) { + __m128i input[8]; + LoadSrc<16, 8>(dst, step, 0, input); + Transpose8x8_U16(input, x); + } else { + LoadSrc<16, 8>(dst, step, 0, x); + } + } + + // stage 1. + s[0] = x[7]; + s[1] = x[0]; + s[2] = x[5]; + s[3] = x[2]; + s[4] = x[3]; + s[5] = x[4]; + s[6] = x[1]; + s[7] = x[6]; + + // stage 2. + butterfly_rotation(&s[0], &s[1], 60 - 0, true); + butterfly_rotation(&s[2], &s[3], 60 - 16, true); + butterfly_rotation(&s[4], &s[5], 60 - 32, true); + butterfly_rotation(&s[6], &s[7], 60 - 48, true); + + // stage 3. + HadamardRotation(&s[0], &s[4], false); + HadamardRotation(&s[1], &s[5], false); + HadamardRotation(&s[2], &s[6], false); + HadamardRotation(&s[3], &s[7], false); + + // stage 4. + butterfly_rotation(&s[4], &s[5], 48 - 0, true); + butterfly_rotation(&s[7], &s[6], 48 - 32, true); + + // stage 5. 
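Adst4DcOnly above is the degenerate case: with adjusted_tx_height == 1 only the DC coefficient survives, so the whole 4-point transform reduces to four products. A scalar sketch follows, assuming kTransformRowMultiplier = 5793 (round(2^12 * sqrt(2))); note the bias trick at the end: row_shift is at most 2 here, so adding row_shift itself supplies the same rounding bias as the conventional 1 << (row_shift - 1), and 0 when the shift is 0.

```c++
#include <cstdint>

// Illustrative scalar model of Adst4DcOnly; names and layout are not the
// library's. dst[0] is the lone nonzero coefficient on entry.
void Adst4DcOnlyScalar(int16_t* dst, bool should_round, int row_shift) {
  constexpr int32_t kMul[4] = {1321, 2482, 3344, 3803};
  int32_t s0 = dst[0];
  if (should_round) {
    // _mm_mulhrs_epi16 with (5793 << 3) is exactly (v * 5793 + 2048) >> 12.
    s0 = (s0 * 5793 + 2048) >> 12;
  }
  // With x1 == x2 == x3 == 0 the four outputs are scaled copies of s0; the
  // last one reuses kMul[0] + kMul[1] (== kMul[3]) via the packed madd table.
  const int32_t out[4] = {s0 * kMul[0], s0 * kMul[1], s0 * kMul[2],
                          s0 * (kMul[0] + kMul[1])};
  for (int i = 0; i < 4; ++i) {
    const int32_t v = (out[i] + 2048) >> 12;  // Q12 descale, round to nearest
    // row_shift <= 2, so "+ row_shift" doubles as the rounding bias.
    dst[i] = static_cast<int16_t>((v + row_shift) >> row_shift);
  }
}
```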
+ HadamardRotation(&s[0], &s[2], false); + HadamardRotation(&s[4], &s[6], false); + HadamardRotation(&s[1], &s[3], false); + HadamardRotation(&s[5], &s[7], false); + + // stage 6. + butterfly_rotation(&s[2], &s[3], 32, true); + butterfly_rotation(&s[6], &s[7], 32, true); + + // stage 7. + const __m128i v_zero = _mm_setzero_si128(); + x[0] = s[0]; + x[1] = _mm_subs_epi16(v_zero, s[4]); + x[2] = s[6]; + x[3] = _mm_subs_epi16(v_zero, s[2]); + x[4] = s[3]; + x[5] = _mm_subs_epi16(v_zero, s[7]); + x[6] = s[5]; + x[7] = _mm_subs_epi16(v_zero, s[1]); + + if (stage_is_rectangular) { + if (transpose) { + __m128i output[4]; + Transpose4x8To8x4_U16(x, output); + StoreDst<16, 4>(dst, step, 0, output); + } else { + StoreDst<8, 8>(dst, step, 0, x); + } + } else { + if (transpose) { + __m128i output[8]; + Transpose8x8_U16(x, output); + StoreDst<16, 8>(dst, step, 0, output); + } else { + StoreDst<16, 8>(dst, step, 0, x); + } + } +} + +LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, int adjusted_tx_height, + bool should_round, int row_shift) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast(dest); + __m128i s[8]; + + const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0); + const __m128i v_mask = + _mm_set1_epi16(should_round ? static_cast(0xffff) : 0); + const __m128i v_kTransformRowMultiplier = + _mm_set1_epi16(kTransformRowMultiplier << 3); + const __m128i v_src_round = + _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier); + // stage 1. + s[1] = _mm_blendv_epi8(v_src, v_src_round, v_mask); + + // stage 2. + ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true); + + // stage 3. + s[4] = s[0]; + s[5] = s[1]; + + // stage 4. + ButterflyRotation_4(&s[4], &s[5], 48, true); + + // stage 5. + s[2] = s[0]; + s[3] = s[1]; + s[6] = s[4]; + s[7] = s[5]; + + // stage 6. + ButterflyRotation_4(&s[2], &s[3], 32, true); + ButterflyRotation_4(&s[6], &s[7], 32, true); + + // stage 7. + __m128i x[8]; + const __m128i v_zero = _mm_setzero_si128(); + x[0] = s[0]; + x[1] = _mm_subs_epi16(v_zero, s[4]); + x[2] = s[6]; + x[3] = _mm_subs_epi16(v_zero, s[2]); + x[4] = s[3]; + x[5] = _mm_subs_epi16(v_zero, s[7]); + x[6] = s[5]; + x[7] = _mm_subs_epi16(v_zero, s[1]); + + const __m128i x1_x0 = _mm_unpacklo_epi16(x[0], x[1]); + const __m128i x3_x2 = _mm_unpacklo_epi16(x[2], x[3]); + const __m128i x5_x4 = _mm_unpacklo_epi16(x[4], x[5]); + const __m128i x7_x6 = _mm_unpacklo_epi16(x[6], x[7]); + const __m128i x3_x0 = _mm_unpacklo_epi32(x1_x0, x3_x2); + const __m128i x7_x4 = _mm_unpacklo_epi32(x5_x4, x7_x6); + + const __m128i v_row_shift_add = _mm_set1_epi32(row_shift); + const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add); + const __m128i a = _mm_add_epi32(_mm_cvtepi16_epi32(x3_x0), v_row_shift_add); + const __m128i a1 = _mm_add_epi32(_mm_cvtepi16_epi32(x7_x4), v_row_shift_add); + const __m128i b = _mm_sra_epi32(a, v_row_shift); + const __m128i b1 = _mm_sra_epi32(a1, v_row_shift); + StoreUnaligned16(dst, _mm_packs_epi32(b, b1)); + + return true; +} + +LIBGAV1_ALWAYS_INLINE bool Adst8DcOnlyColumn(void* dest, int adjusted_tx_height, + int width) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast(dest); + __m128i s[8]; + + int i = 0; + do { + const __m128i v_src = LoadLo8(dst); + // stage 1. + s[1] = v_src; + + // stage 2. + ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true); + + // stage 3. + s[4] = s[0]; + s[5] = s[1]; + + // stage 4. + ButterflyRotation_4(&s[4], &s[5], 48, true); + + // stage 5. 
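Stage 7 above negates half the outputs with a saturating subtract from zero. The two's-complement footnote is worth a standalone check: a plain _mm_sub_epi16 would leave -32768 unchanged (its negation is unrepresentable in int16), while the saturating form clamps to +32767.

```c++
#include <smmintrin.h>

#include <cstdint>
#include <cstdio>

int main() {
  const __m128i v = _mm_set_epi16(100, -1, 1, 0, -32768, 32767, -3, 3);
  // Saturating negation, as in "x[1] = _mm_subs_epi16(v_zero, s[4])" above.
  const __m128i negated = _mm_subs_epi16(_mm_setzero_si128(), v);
  int16_t out[8];
  _mm_storeu_si128(reinterpret_cast<__m128i*>(out), negated);
  for (const int16_t o : out) std::printf("%d ", o);  // -32768 maps to 32767
  std::printf("\n");
  return 0;
}
```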
+ s[2] = s[0]; + s[3] = s[1]; + s[6] = s[4]; + s[7] = s[5]; + + // stage 6. + ButterflyRotation_4(&s[2], &s[3], 32, true); + ButterflyRotation_4(&s[6], &s[7], 32, true); + + // stage 7. + __m128i x[8]; + const __m128i v_zero = _mm_setzero_si128(); + x[0] = s[0]; + x[1] = _mm_subs_epi16(v_zero, s[4]); + x[2] = s[6]; + x[3] = _mm_subs_epi16(v_zero, s[2]); + x[4] = s[3]; + x[5] = _mm_subs_epi16(v_zero, s[7]); + x[6] = s[5]; + x[7] = _mm_subs_epi16(v_zero, s[1]); + + for (int j = 0; j < 8; ++j) { + StoreLo8(&dst[j * width], x[j]); + } + i += 4; + dst += 4; + } while (i < width); + + return true; +} + +template +LIBGAV1_ALWAYS_INLINE void Adst16_SSE4_1(void* dest, int32_t step, + bool transpose) { + auto* const dst = static_cast(dest); + __m128i s[16], x[16]; + + if (stage_is_rectangular) { + if (transpose) { + __m128i input[4]; + LoadSrc<16, 4>(dst, step, 0, input); + Transpose8x4To4x8_U16(input, x); + LoadSrc<16, 4>(dst, step, 8, input); + Transpose8x4To4x8_U16(input, &x[8]); + } else { + LoadSrc<8, 16>(dst, step, 0, x); + } + } else { + if (transpose) { + for (int idx = 0; idx < 16; idx += 8) { + __m128i input[8]; + LoadSrc<16, 8>(dst, step, idx, input); + Transpose8x8_U16(input, &x[idx]); + } + } else { + LoadSrc<16, 16>(dst, step, 0, x); + } + } + + // stage 1. + s[0] = x[15]; + s[1] = x[0]; + s[2] = x[13]; + s[3] = x[2]; + s[4] = x[11]; + s[5] = x[4]; + s[6] = x[9]; + s[7] = x[6]; + s[8] = x[7]; + s[9] = x[8]; + s[10] = x[5]; + s[11] = x[10]; + s[12] = x[3]; + s[13] = x[12]; + s[14] = x[1]; + s[15] = x[14]; + + // stage 2. + butterfly_rotation(&s[0], &s[1], 62 - 0, true); + butterfly_rotation(&s[2], &s[3], 62 - 8, true); + butterfly_rotation(&s[4], &s[5], 62 - 16, true); + butterfly_rotation(&s[6], &s[7], 62 - 24, true); + butterfly_rotation(&s[8], &s[9], 62 - 32, true); + butterfly_rotation(&s[10], &s[11], 62 - 40, true); + butterfly_rotation(&s[12], &s[13], 62 - 48, true); + butterfly_rotation(&s[14], &s[15], 62 - 56, true); + + // stage 3. + HadamardRotation(&s[0], &s[8], false); + HadamardRotation(&s[1], &s[9], false); + HadamardRotation(&s[2], &s[10], false); + HadamardRotation(&s[3], &s[11], false); + HadamardRotation(&s[4], &s[12], false); + HadamardRotation(&s[5], &s[13], false); + HadamardRotation(&s[6], &s[14], false); + HadamardRotation(&s[7], &s[15], false); + + // stage 4. + butterfly_rotation(&s[8], &s[9], 56 - 0, true); + butterfly_rotation(&s[13], &s[12], 8 + 0, true); + butterfly_rotation(&s[10], &s[11], 56 - 32, true); + butterfly_rotation(&s[15], &s[14], 8 + 32, true); + + // stage 5. + HadamardRotation(&s[0], &s[4], false); + HadamardRotation(&s[8], &s[12], false); + HadamardRotation(&s[1], &s[5], false); + HadamardRotation(&s[9], &s[13], false); + HadamardRotation(&s[2], &s[6], false); + HadamardRotation(&s[10], &s[14], false); + HadamardRotation(&s[3], &s[7], false); + HadamardRotation(&s[11], &s[15], false); + + // stage 6. + butterfly_rotation(&s[4], &s[5], 48 - 0, true); + butterfly_rotation(&s[12], &s[13], 48 - 0, true); + butterfly_rotation(&s[7], &s[6], 48 - 32, true); + butterfly_rotation(&s[15], &s[14], 48 - 32, true); + + // stage 7. + HadamardRotation(&s[0], &s[2], false); + HadamardRotation(&s[4], &s[6], false); + HadamardRotation(&s[8], &s[10], false); + HadamardRotation(&s[12], &s[14], false); + HadamardRotation(&s[1], &s[3], false); + HadamardRotation(&s[5], &s[7], false); + HadamardRotation(&s[9], &s[11], false); + HadamardRotation(&s[13], &s[15], false); + + // stage 8. 
+ butterfly_rotation(&s[2], &s[3], 32, true); + butterfly_rotation(&s[6], &s[7], 32, true); + butterfly_rotation(&s[10], &s[11], 32, true); + butterfly_rotation(&s[14], &s[15], 32, true); + + // stage 9. + const __m128i v_zero = _mm_setzero_si128(); + x[0] = s[0]; + x[1] = _mm_subs_epi16(v_zero, s[8]); + x[2] = s[12]; + x[3] = _mm_subs_epi16(v_zero, s[4]); + x[4] = s[6]; + x[5] = _mm_subs_epi16(v_zero, s[14]); + x[6] = s[10]; + x[7] = _mm_subs_epi16(v_zero, s[2]); + x[8] = s[3]; + x[9] = _mm_subs_epi16(v_zero, s[11]); + x[10] = s[15]; + x[11] = _mm_subs_epi16(v_zero, s[7]); + x[12] = s[5]; + x[13] = _mm_subs_epi16(v_zero, s[13]); + x[14] = s[9]; + x[15] = _mm_subs_epi16(v_zero, s[1]); + + if (stage_is_rectangular) { + if (transpose) { + __m128i output[4]; + Transpose4x8To8x4_U16(x, output); + StoreDst<16, 4>(dst, step, 0, output); + Transpose4x8To8x4_U16(&x[8], output); + StoreDst<16, 4>(dst, step, 8, output); + } else { + StoreDst<8, 16>(dst, step, 0, x); + } + } else { + if (transpose) { + for (int idx = 0; idx < 16; idx += 8) { + __m128i output[8]; + Transpose8x8_U16(&x[idx], output); + StoreDst<16, 8>(dst, step, idx, output); + } + } else { + StoreDst<16, 16>(dst, step, 0, x); + } + } +} + +LIBGAV1_ALWAYS_INLINE void Adst16DcOnlyInternal(__m128i* s, __m128i* x) { + // stage 2. + ButterflyRotation_FirstIsZero(&s[0], &s[1], 62, true); + + // stage 3. + s[8] = s[0]; + s[9] = s[1]; + + // stage 4. + ButterflyRotation_4(&s[8], &s[9], 56, true); + + // stage 5. + s[4] = s[0]; + s[12] = s[8]; + s[5] = s[1]; + s[13] = s[9]; + + // stage 6. + ButterflyRotation_4(&s[4], &s[5], 48, true); + ButterflyRotation_4(&s[12], &s[13], 48, true); + + // stage 7. + s[2] = s[0]; + s[6] = s[4]; + s[10] = s[8]; + s[14] = s[12]; + s[3] = s[1]; + s[7] = s[5]; + s[11] = s[9]; + s[15] = s[13]; + + // stage 8. + ButterflyRotation_4(&s[2], &s[3], 32, true); + ButterflyRotation_4(&s[6], &s[7], 32, true); + ButterflyRotation_4(&s[10], &s[11], 32, true); + ButterflyRotation_4(&s[14], &s[15], 32, true); + + // stage 9. + const __m128i v_zero = _mm_setzero_si128(); + x[0] = s[0]; + x[1] = _mm_subs_epi16(v_zero, s[8]); + x[2] = s[12]; + x[3] = _mm_subs_epi16(v_zero, s[4]); + x[4] = s[6]; + x[5] = _mm_subs_epi16(v_zero, s[14]); + x[6] = s[10]; + x[7] = _mm_subs_epi16(v_zero, s[2]); + x[8] = s[3]; + x[9] = _mm_subs_epi16(v_zero, s[11]); + x[10] = s[15]; + x[11] = _mm_subs_epi16(v_zero, s[7]); + x[12] = s[5]; + x[13] = _mm_subs_epi16(v_zero, s[13]); + x[14] = s[9]; + x[15] = _mm_subs_epi16(v_zero, s[1]); +} + +LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, int adjusted_tx_height, + bool should_round, int row_shift) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast(dest); + __m128i s[16]; + __m128i x[16]; + + const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0); + const __m128i v_mask = + _mm_set1_epi16(should_round ? static_cast(0xffff) : 0); + const __m128i v_kTransformRowMultiplier = + _mm_set1_epi16(kTransformRowMultiplier << 3); + const __m128i v_src_round = + _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier); + // stage 1. 
+ s[1] = _mm_blendv_epi8(v_src, v_src_round, v_mask); + + Adst16DcOnlyInternal(s, x); + + for (int i = 0; i < 2; ++i) { + const __m128i x1_x0 = _mm_unpacklo_epi16(x[0 + i * 8], x[1 + i * 8]); + const __m128i x3_x2 = _mm_unpacklo_epi16(x[2 + i * 8], x[3 + i * 8]); + const __m128i x5_x4 = _mm_unpacklo_epi16(x[4 + i * 8], x[5 + i * 8]); + const __m128i x7_x6 = _mm_unpacklo_epi16(x[6 + i * 8], x[7 + i * 8]); + const __m128i x3_x0 = _mm_unpacklo_epi32(x1_x0, x3_x2); + const __m128i x7_x4 = _mm_unpacklo_epi32(x5_x4, x7_x6); + + const __m128i v_row_shift_add = _mm_set1_epi32(row_shift); + const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add); + const __m128i a = _mm_add_epi32(_mm_cvtepi16_epi32(x3_x0), v_row_shift_add); + const __m128i a1 = + _mm_add_epi32(_mm_cvtepi16_epi32(x7_x4), v_row_shift_add); + const __m128i b = _mm_sra_epi32(a, v_row_shift); + const __m128i b1 = _mm_sra_epi32(a1, v_row_shift); + StoreUnaligned16(&dst[i * 8], _mm_packs_epi32(b, b1)); + } + return true; +} + +LIBGAV1_ALWAYS_INLINE bool Adst16DcOnlyColumn(void* dest, + int adjusted_tx_height, + int width) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast(dest); + int i = 0; + do { + __m128i s[16]; + __m128i x[16]; + const __m128i v_src = LoadUnaligned16(dst); + // stage 1. + s[1] = v_src; + + Adst16DcOnlyInternal(s, x); + + for (int j = 0; j < 16; ++j) { + StoreLo8(&dst[j * width], x[j]); + } + i += 4; + dst += 4; + } while (i < width); + + return true; +} + +//------------------------------------------------------------------------------ +// Identity Transforms. + +template +LIBGAV1_ALWAYS_INLINE void Identity4_SSE4_1(void* dest, int32_t step) { + auto* const dst = static_cast(dest); + + if (is_row_shift) { + const int shift = 1; + const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11); + const __m128i v_multiplier_one = + _mm_set1_epi32((kIdentity4Multiplier << 16) | 0x0001); + for (int i = 0; i < 4; i += 2) { + const __m128i v_src = LoadUnaligned16(&dst[i * step]); + const __m128i v_src_round = _mm_unpacklo_epi16(v_dual_round, v_src); + const __m128i v_src_round_hi = _mm_unpackhi_epi16(v_dual_round, v_src); + const __m128i a = _mm_madd_epi16(v_src_round, v_multiplier_one); + const __m128i a_hi = _mm_madd_epi16(v_src_round_hi, v_multiplier_one); + const __m128i b = _mm_srai_epi32(a, 12 + shift); + const __m128i b_hi = _mm_srai_epi32(a_hi, 12 + shift); + StoreUnaligned16(&dst[i * step], _mm_packs_epi32(b, b_hi)); + } + } else { + const __m128i v_multiplier = + _mm_set1_epi16(kIdentity4MultiplierFraction << 3); + for (int i = 0; i < 4; i += 2) { + const __m128i v_src = LoadUnaligned16(&dst[i * step]); + const __m128i a = _mm_mulhrs_epi16(v_src, v_multiplier); + const __m128i b = _mm_adds_epi16(a, v_src); + StoreUnaligned16(&dst[i * step], b); + } + } +} + +LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height, + bool should_round, int tx_height) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast(dest); + const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]); + const __m128i v_mask = + _mm_set1_epi16(should_round ? static_cast(0xffff) : 0); + const __m128i v_kTransformRowMultiplier = + _mm_set1_epi16(kTransformRowMultiplier << 3); + const __m128i v_src_round = + _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier); + const __m128i v_src = _mm_blendv_epi8(v_src0, v_src_round, v_mask); + + const int shift = (tx_height < 16) ? 
0 : 1; + const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11); + const __m128i v_multiplier_one = + _mm_set1_epi32((kIdentity4Multiplier << 16) | 0x0001); + const __m128i v_src_round_lo = _mm_unpacklo_epi16(v_dual_round, v_src); + const __m128i a = _mm_madd_epi16(v_src_round_lo, v_multiplier_one); + const __m128i b = _mm_srai_epi32(a, 12 + shift); + dst[0] = _mm_extract_epi16(_mm_packs_epi32(b, b), 0); + return true; +} + +LIBGAV1_ALWAYS_INLINE void Identity4ColumnStoreToFrame( + Array2DView frame, const int start_x, const int start_y, + const int tx_width, const int tx_height, + const int16_t* LIBGAV1_RESTRICT source) { + const int stride = frame.columns(); + uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x; + + const __m128i v_multiplier_fraction = + _mm_set1_epi16(static_cast(kIdentity4MultiplierFraction << 3)); + const __m128i v_eight = _mm_set1_epi16(8); + + if (tx_width == 4) { + int i = 0; + do { + const __m128i v_src = LoadLo8(&source[i * tx_width]); + const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier_fraction); + const __m128i frame_data = Load4(dst); + const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_src); + const __m128i a = _mm_adds_epi16(v_dst_i, v_eight); + const __m128i b = _mm_srai_epi16(a, 4); + const __m128i c = _mm_cvtepu8_epi16(frame_data); + const __m128i d = _mm_adds_epi16(c, b); + Store4(dst, _mm_packus_epi16(d, d)); + dst += stride; + } while (++i < tx_height); + } else { + int i = 0; + do { + const int row = i * tx_width; + int j = 0; + do { + const __m128i v_src = LoadUnaligned16(&source[row + j]); + const __m128i v_src_mult = + _mm_mulhrs_epi16(v_src, v_multiplier_fraction); + const __m128i frame_data = LoadLo8(dst + j); + const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_src); + const __m128i a = _mm_adds_epi16(v_dst_i, v_eight); + const __m128i b = _mm_srai_epi16(a, 4); + const __m128i c = _mm_cvtepu8_epi16(frame_data); + const __m128i d = _mm_adds_epi16(c, b); + StoreLo8(dst + j, _mm_packus_epi16(d, d)); + j += 8; + } while (j < tx_width); + dst += stride; + } while (++i < tx_height); + } +} + +LIBGAV1_ALWAYS_INLINE void Identity4RowColumnStoreToFrame( + Array2DView frame, const int start_x, const int start_y, + const int tx_width, const int tx_height, + const int16_t* LIBGAV1_RESTRICT source) { + const int stride = frame.columns(); + uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x; + + const __m128i v_multiplier_fraction = + _mm_set1_epi16(static_cast(kIdentity4MultiplierFraction << 3)); + const __m128i v_eight = _mm_set1_epi16(8); + const __m128i v_kTransformRowMultiplier = + _mm_set1_epi16(kTransformRowMultiplier << 3); + + if (tx_width == 4) { + int i = 0; + do { + const __m128i v_src = LoadLo8(&source[i * tx_width]); + const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier_fraction); + const __m128i frame_data = Load4(dst); + const __m128i v_dst_row = _mm_adds_epi16(v_src_mult, v_src); + const __m128i v_src_mult2 = + _mm_mulhrs_epi16(v_dst_row, v_multiplier_fraction); + const __m128i frame_data16 = _mm_cvtepu8_epi16(frame_data); + const __m128i v_dst_col = _mm_adds_epi16(v_src_mult2, v_dst_row); + const __m128i a = _mm_adds_epi16(v_dst_col, v_eight); + const __m128i b = _mm_srai_epi16(a, 4); + const __m128i c = _mm_adds_epi16(frame_data16, b); + Store4(dst, _mm_packus_epi16(c, c)); + dst += stride; + } while (++i < tx_height); + } else { + int i = 0; + do { + const int row = i * tx_width; + int j = 0; + do { + const __m128i v_src = LoadUnaligned16(&source[row + j]); + const __m128i 
v_src_round = + _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier); + const __m128i v_dst_row = _mm_adds_epi16(v_src_round, v_src_round); + const __m128i v_src_mult2 = + _mm_mulhrs_epi16(v_dst_row, v_multiplier_fraction); + const __m128i frame_data = LoadLo8(dst + j); + const __m128i frame_data16 = _mm_cvtepu8_epi16(frame_data); + const __m128i v_dst_col = _mm_adds_epi16(v_src_mult2, v_dst_row); + const __m128i a = _mm_adds_epi16(v_dst_col, v_eight); + const __m128i b = _mm_srai_epi16(a, 4); + const __m128i c = _mm_adds_epi16(frame_data16, b); + StoreLo8(dst + j, _mm_packus_epi16(c, c)); + j += 8; + } while (j < tx_width); + dst += stride; + } while (++i < tx_height); + } +} + +LIBGAV1_ALWAYS_INLINE void Identity8Row32_SSE4_1(void* dest, int32_t step) { + auto* const dst = static_cast(dest); + + // When combining the identity8 multiplier with the row shift, the + // calculations for tx_height equal to 32 can be simplified from + // ((A * 2) + 2) >> 2) to ((A + 1) >> 1). + const __m128i v_row_multiplier = _mm_set1_epi16(1 << 14); + for (int h = 0; h < 4; ++h) { + const __m128i v_src = LoadUnaligned16(&dst[h * step]); + const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_row_multiplier); + StoreUnaligned16(&dst[h * step], v_src_mult); + } +} + +LIBGAV1_ALWAYS_INLINE void Identity8Row4_SSE4_1(void* dest, int32_t step) { + auto* const dst = static_cast(dest); + + for (int h = 0; h < 4; ++h) { + const __m128i v_src = LoadUnaligned16(&dst[h * step]); + // For bitdepth == 8, the identity row clamps to a signed 16bit value, so + // saturating add here is ok. + const __m128i a = _mm_adds_epi16(v_src, v_src); + StoreUnaligned16(&dst[h * step], a); + } +} + +LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height, + bool should_round, int row_shift) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast(dest); + const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]); + const __m128i v_mask = + _mm_set1_epi16(should_round ? 
static_cast(0xffff) : 0); + const __m128i v_kTransformRowMultiplier = + _mm_set1_epi16(kTransformRowMultiplier << 3); + const __m128i v_src_round = + _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier); + const __m128i v_src = + _mm_cvtepi16_epi32(_mm_blendv_epi8(v_src0, v_src_round, v_mask)); + const __m128i v_srcx2 = _mm_add_epi32(v_src, v_src); + const __m128i v_row_shift_add = _mm_set1_epi32(row_shift); + const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add); + const __m128i a = _mm_add_epi32(v_srcx2, v_row_shift_add); + const __m128i b = _mm_sra_epi32(a, v_row_shift); + dst[0] = _mm_extract_epi16(_mm_packs_epi32(b, b), 0); + return true; +} + +LIBGAV1_ALWAYS_INLINE void Identity8ColumnStoreToFrame_SSE4_1( + Array2DView frame, const int start_x, const int start_y, + const int tx_width, const int tx_height, + const int16_t* LIBGAV1_RESTRICT source) { + const int stride = frame.columns(); + uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x; + const __m128i v_eight = _mm_set1_epi16(8); + if (tx_width == 4) { + int i = 0; + do { + const int row = i * tx_width; + const __m128i v_src = LoadLo8(&source[row]); + const __m128i v_dst_i = _mm_adds_epi16(v_src, v_src); + const __m128i frame_data = Load4(dst); + const __m128i a = _mm_adds_epi16(v_dst_i, v_eight); + const __m128i b = _mm_srai_epi16(a, 4); + const __m128i c = _mm_cvtepu8_epi16(frame_data); + const __m128i d = _mm_adds_epi16(c, b); + Store4(dst, _mm_packus_epi16(d, d)); + dst += stride; + } while (++i < tx_height); + } else { + int i = 0; + do { + const int row = i * tx_width; + int j = 0; + do { + const __m128i v_src = LoadUnaligned16(&source[row + j]); + const __m128i v_dst_i = _mm_adds_epi16(v_src, v_src); + const __m128i frame_data = LoadLo8(dst + j); + const __m128i a = _mm_adds_epi16(v_dst_i, v_eight); + const __m128i b = _mm_srai_epi16(a, 4); + const __m128i c = _mm_cvtepu8_epi16(frame_data); + const __m128i d = _mm_adds_epi16(c, b); + StoreLo8(dst + j, _mm_packus_epi16(d, d)); + j += 8; + } while (j < tx_width); + dst += stride; + } while (++i < tx_height); + } +} + +LIBGAV1_ALWAYS_INLINE void Identity16Row_SSE4_1(void* dest, int32_t step, + int shift) { + auto* const dst = static_cast(dest); + + const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11); + const __m128i v_multiplier_one = + _mm_set1_epi32((kIdentity16Multiplier << 16) | 0x0001); + const __m128i v_shift = _mm_set_epi64x(0, 12 + shift); + + for (int h = 0; h < 4; ++h) { + const __m128i v_src = LoadUnaligned16(&dst[h * step]); + const __m128i v_src2 = LoadUnaligned16(&dst[h * step + 8]); + const __m128i v_src_round0 = _mm_unpacklo_epi16(v_dual_round, v_src); + const __m128i v_src_round1 = _mm_unpackhi_epi16(v_dual_round, v_src); + const __m128i v_src2_round0 = _mm_unpacklo_epi16(v_dual_round, v_src2); + const __m128i v_src2_round1 = _mm_unpackhi_epi16(v_dual_round, v_src2); + const __m128i madd0 = _mm_madd_epi16(v_src_round0, v_multiplier_one); + const __m128i madd1 = _mm_madd_epi16(v_src_round1, v_multiplier_one); + const __m128i madd20 = _mm_madd_epi16(v_src2_round0, v_multiplier_one); + const __m128i madd21 = _mm_madd_epi16(v_src2_round1, v_multiplier_one); + const __m128i shift0 = _mm_sra_epi32(madd0, v_shift); + const __m128i shift1 = _mm_sra_epi32(madd1, v_shift); + const __m128i shift20 = _mm_sra_epi32(madd20, v_shift); + const __m128i shift21 = _mm_sra_epi32(madd21, v_shift); + StoreUnaligned16(&dst[h * step], _mm_packs_epi32(shift0, shift1)); + StoreUnaligned16(&dst[h * step + 8], _mm_packs_epi32(shift20, shift21)); + } +} + 
+LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height, + bool should_round, int shift) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast(dest); + const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]); + const __m128i v_mask = + _mm_set1_epi16(should_round ? static_cast(0xffff) : 0); + const __m128i v_kTransformRowMultiplier = + _mm_set1_epi16(kTransformRowMultiplier << 3); + const __m128i v_src_round0 = + _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier); + const __m128i v_src = _mm_blendv_epi8(v_src0, v_src_round0, v_mask); + const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11); + const __m128i v_multiplier_one = + _mm_set1_epi32((kIdentity16Multiplier << 16) | 0x0001); + const __m128i v_shift = _mm_set_epi64x(0, 12 + shift); + const __m128i v_src_round = _mm_unpacklo_epi16(v_dual_round, v_src); + const __m128i a = _mm_madd_epi16(v_src_round, v_multiplier_one); + const __m128i b = _mm_sra_epi32(a, v_shift); + dst[0] = _mm_extract_epi16(_mm_packs_epi32(b, b), 0); + return true; +} + +LIBGAV1_ALWAYS_INLINE void Identity16ColumnStoreToFrame_SSE4_1( + Array2DView frame, const int start_x, const int start_y, + const int tx_width, const int tx_height, + const int16_t* LIBGAV1_RESTRICT source) { + const int stride = frame.columns(); + uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x; + const __m128i v_eight = _mm_set1_epi16(8); + const __m128i v_multiplier = + _mm_set1_epi16(static_cast(kIdentity4MultiplierFraction << 4)); + + if (tx_width == 4) { + int i = 0; + do { + const __m128i v_src = LoadLo8(&source[i * tx_width]); + const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier); + const __m128i frame_data = Load4(dst); + const __m128i v_srcx2 = _mm_adds_epi16(v_src, v_src); + const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_srcx2); + const __m128i a = _mm_adds_epi16(v_dst_i, v_eight); + const __m128i b = _mm_srai_epi16(a, 4); + const __m128i c = _mm_cvtepu8_epi16(frame_data); + const __m128i d = _mm_adds_epi16(c, b); + Store4(dst, _mm_packus_epi16(d, d)); + dst += stride; + } while (++i < tx_height); + } else { + int i = 0; + do { + const int row = i * tx_width; + int j = 0; + do { + const __m128i v_src = LoadUnaligned16(&source[row + j]); + const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier); + const __m128i frame_data = LoadLo8(dst + j); + const __m128i v_srcx2 = _mm_adds_epi16(v_src, v_src); + const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_srcx2); + const __m128i a = _mm_adds_epi16(v_dst_i, v_eight); + const __m128i b = _mm_srai_epi16(a, 4); + const __m128i c = _mm_cvtepu8_epi16(frame_data); + const __m128i d = _mm_adds_epi16(c, b); + StoreLo8(dst + j, _mm_packus_epi16(d, d)); + j += 8; + } while (j < tx_width); + dst += stride; + } while (++i < tx_height); + } +} + +LIBGAV1_ALWAYS_INLINE void Identity32Row16_SSE4_1(void* dest, + const int32_t step) { + auto* const dst = static_cast(dest); + + // When combining the identity32 multiplier with the row shift, the + // calculation for tx_height equal to 16 can be simplified from + // ((A * 4) + 1) >> 1) to (A * 2). + for (int h = 0; h < 4; ++h) { + for (int i = 0; i < 32; i += 8) { + const __m128i v_src = LoadUnaligned16(&dst[h * step + i]); + // For bitdepth == 8, the identity row clamps to a signed 16bit value, so + // saturating add here is ok. 
+ const __m128i v_dst_i = _mm_adds_epi16(v_src, v_src); + StoreUnaligned16(&dst[h * step + i], v_dst_i); + } + } +} + +LIBGAV1_ALWAYS_INLINE bool Identity32DcOnly(void* dest, + int adjusted_tx_height) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast(dest); + const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]); + const __m128i v_kTransformRowMultiplier = + _mm_set1_epi16(kTransformRowMultiplier << 3); + const __m128i v_src = _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier); + + // When combining the identity32 multiplier with the row shift, the + // calculation for tx_height equal to 16 can be simplified from + // ((A * 4) + 1) >> 1) to (A * 2). + const __m128i v_dst_0 = _mm_adds_epi16(v_src, v_src); + dst[0] = _mm_extract_epi16(v_dst_0, 0); + return true; +} + +LIBGAV1_ALWAYS_INLINE void Identity32ColumnStoreToFrame( + Array2DView frame, const int start_x, const int start_y, + const int tx_width, const int tx_height, + const int16_t* LIBGAV1_RESTRICT source) { + const int stride = frame.columns(); + uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x; + const __m128i v_two = _mm_set1_epi16(2); + + int i = 0; + do { + const int row = i * tx_width; + int j = 0; + do { + const __m128i v_dst_i = LoadUnaligned16(&source[row + j]); + const __m128i frame_data = LoadLo8(dst + j); + const __m128i a = _mm_adds_epi16(v_dst_i, v_two); + const __m128i b = _mm_srai_epi16(a, 2); + const __m128i c = _mm_cvtepu8_epi16(frame_data); + const __m128i d = _mm_adds_epi16(c, b); + StoreLo8(dst + j, _mm_packus_epi16(d, d)); + j += 8; + } while (j < tx_width); + dst += stride; + } while (++i < tx_height); +} + +//------------------------------------------------------------------------------ +// Walsh Hadamard Transform. + +// Process 4 wht4 rows and columns. +LIBGAV1_ALWAYS_INLINE void Wht4_SSE4_1(Array2DView frame, + const int start_x, const int start_y, + const void* LIBGAV1_RESTRICT source, + const int adjusted_tx_height) { + const auto* const src = static_cast(source); + __m128i s[4], x[4]; + + if (adjusted_tx_height == 1) { + // Special case: only src[0] is nonzero. + // src[0] 0 0 0 + // 0 0 0 0 + // 0 0 0 0 + // 0 0 0 0 + // + // After the row and column transforms are applied, we have: + // f h h h + // g i i i + // g i i i + // g i i i + // where f, g, h, i are computed as follows. + int16_t f = (src[0] >> 2) - (src[0] >> 3); + const int16_t g = f >> 1; + f = f - (f >> 1); + const int16_t h = (src[0] >> 3) - (src[0] >> 4); + const int16_t i = (src[0] >> 4); + s[0] = _mm_set1_epi16(h); + s[0] = _mm_insert_epi16(s[0], f, 0); + s[1] = _mm_set1_epi16(i); + s[1] = _mm_insert_epi16(s[1], g, 0); + s[2] = s[3] = s[1]; + } else { + x[0] = LoadLo8(&src[0 * 4]); + x[2] = LoadLo8(&src[1 * 4]); + x[3] = LoadLo8(&src[2 * 4]); + x[1] = LoadLo8(&src[3 * 4]); + + // Row transforms. + Transpose4x4_U16(x, x); + s[0] = _mm_srai_epi16(x[0], 2); + s[2] = _mm_srai_epi16(x[1], 2); + s[3] = _mm_srai_epi16(x[2], 2); + s[1] = _mm_srai_epi16(x[3], 2); + s[0] = _mm_add_epi16(s[0], s[2]); + s[3] = _mm_sub_epi16(s[3], s[1]); + __m128i e = _mm_sub_epi16(s[0], s[3]); + e = _mm_srai_epi16(e, 1); + s[1] = _mm_sub_epi16(e, s[1]); + s[2] = _mm_sub_epi16(e, s[2]); + s[0] = _mm_sub_epi16(s[0], s[1]); + s[3] = _mm_add_epi16(s[3], s[2]); + Transpose4x4_U16(s, s); + + // Column transforms. 
+ s[0] = _mm_add_epi16(s[0], s[2]); + s[3] = _mm_sub_epi16(s[3], s[1]); + e = _mm_sub_epi16(s[0], s[3]); + e = _mm_srai_epi16(e, 1); + s[1] = _mm_sub_epi16(e, s[1]); + s[2] = _mm_sub_epi16(e, s[2]); + s[0] = _mm_sub_epi16(s[0], s[1]); + s[3] = _mm_add_epi16(s[3], s[2]); + } + + // Store to frame. + const int stride = frame.columns(); + uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x; + for (int row = 0; row < 4; ++row) { + const __m128i frame_data = Load4(dst); + const __m128i a = _mm_cvtepu8_epi16(frame_data); + const __m128i b = _mm_add_epi16(a, s[row]); + Store4(dst, _mm_packus_epi16(b, b)); + dst += stride; + } +} + +//------------------------------------------------------------------------------ +// row/column transform loops + +template +LIBGAV1_ALWAYS_INLINE void StoreToFrameWithRound( + Array2DView frame, const int start_x, const int start_y, + const int tx_width, const int tx_height, + const int16_t* LIBGAV1_RESTRICT source, TransformType tx_type) { + const bool flip_rows = + enable_flip_rows ? kTransformFlipRowsMask.Contains(tx_type) : false; + const __m128i v_eight = _mm_set1_epi16(8); + const int stride = frame.columns(); + uint8_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x; + if (tx_width == 4) { + for (int i = 0; i < tx_height; ++i) { + const int row = flip_rows ? (tx_height - i - 1) * 4 : i * 4; + const __m128i residual = LoadLo8(&source[row]); + const __m128i frame_data = Load4(dst); + // Saturate to prevent overflowing int16_t + const __m128i a = _mm_adds_epi16(residual, v_eight); + const __m128i b = _mm_srai_epi16(a, 4); + const __m128i c = _mm_cvtepu8_epi16(frame_data); + const __m128i d = _mm_adds_epi16(c, b); + Store4(dst, _mm_packus_epi16(d, d)); + dst += stride; + } + } else if (tx_width == 8) { + for (int i = 0; i < tx_height; ++i) { + const int row = flip_rows ? (tx_height - i - 1) * 8 : i * 8; + const __m128i residual = LoadUnaligned16(&source[row]); + const __m128i frame_data = LoadLo8(dst); + // Saturate to prevent overflowing int16_t + const __m128i b = _mm_adds_epi16(residual, v_eight); + const __m128i c = _mm_srai_epi16(b, 4); + const __m128i d = _mm_cvtepu8_epi16(frame_data); + const __m128i e = _mm_adds_epi16(d, c); + StoreLo8(dst, _mm_packus_epi16(e, e)); + dst += stride; + } + } else { + for (int i = 0; i < tx_height; ++i) { + const int y = start_y + i; + const int row = flip_rows ? 
(tx_height - i - 1) * tx_width : i * tx_width;
+      int j = 0;
+      do {
+        const int x = start_x + j;
+        const __m128i residual = LoadUnaligned16(&source[row + j]);
+        const __m128i residual_hi = LoadUnaligned16(&source[row + j + 8]);
+        const __m128i frame_data = LoadUnaligned16(frame[y] + x);
+        const __m128i b = _mm_adds_epi16(residual, v_eight);
+        const __m128i b_hi = _mm_adds_epi16(residual_hi, v_eight);
+        const __m128i c = _mm_srai_epi16(b, 4);
+        const __m128i c_hi = _mm_srai_epi16(b_hi, 4);
+        const __m128i d = _mm_cvtepu8_epi16(frame_data);
+        const __m128i d_hi = _mm_cvtepu8_epi16(_mm_srli_si128(frame_data, 8));
+        const __m128i e = _mm_adds_epi16(d, c);
+        const __m128i e_hi = _mm_adds_epi16(d_hi, c_hi);
+        StoreUnaligned16(frame[y] + x, _mm_packus_epi16(e, e_hi));
+        j += 16;
+      } while (j < tx_width);
+    }
+  }
+}
+
+template <int tx_height>
+LIBGAV1_ALWAYS_INLINE void FlipColumns(int16_t* source, int tx_width) {
+  const __m128i word_reverse_8 =
+      _mm_set_epi32(0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e);
+  if (tx_width >= 16) {
+    int i = 0;
+    do {
+      // read 16 shorts
+      const __m128i v3210 = LoadUnaligned16(&source[i]);
+      const __m128i v7654 = LoadUnaligned16(&source[i + 8]);
+      const __m128i v0123 = _mm_shuffle_epi8(v3210, word_reverse_8);
+      const __m128i v4567 = _mm_shuffle_epi8(v7654, word_reverse_8);
+      StoreUnaligned16(&source[i], v4567);
+      StoreUnaligned16(&source[i + 8], v0123);
+      i += 16;
+    } while (i < tx_width * tx_height);
+  } else if (tx_width == 8) {
+    for (int i = 0; i < 8 * tx_height; i += 8) {
+      const __m128i a = LoadUnaligned16(&source[i]);
+      const __m128i b = _mm_shuffle_epi8(a, word_reverse_8);
+      StoreUnaligned16(&source[i], b);
+    }
+  } else {
+    const __m128i dual_word_reverse_4 =
+        _mm_set_epi32(0x09080b0a, 0x0d0c0f0e, 0x01000302, 0x05040706);
+    // Process two rows per iteration.
+    for (int i = 0; i < 4 * tx_height; i += 8) {
+      const __m128i a = LoadUnaligned16(&source[i]);
+      const __m128i b = _mm_shuffle_epi8(a, dual_word_reverse_4);
+      StoreUnaligned16(&source[i], b);
+    }
+  }
+}
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void ApplyRounding(int16_t* source, int num_rows) {
+  const __m128i v_kTransformRowMultiplier =
+      _mm_set1_epi16(kTransformRowMultiplier << 3);
+  if (tx_width == 4) {
+    // Process two rows per iteration.
+    int i = 0;
+    do {
+      const __m128i a = LoadUnaligned16(&source[i]);
+      const __m128i b = _mm_mulhrs_epi16(a, v_kTransformRowMultiplier);
+      StoreUnaligned16(&source[i], b);
+      i += 8;
+    } while (i < tx_width * num_rows);
+  } else {
+    int i = 0;
+    do {
+      // The last 32 values of every row are always zero if the |tx_width| is
+      // 64.
+      const int non_zero_width = (tx_width < 64) ? tx_width : 32;
+      int j = 0;
+      do {
+        const __m128i a = LoadUnaligned16(&source[i * tx_width + j]);
+        const __m128i b = _mm_mulhrs_epi16(a, v_kTransformRowMultiplier);
+        StoreUnaligned16(&source[i * tx_width + j], b);
+        j += 8;
+      } while (j < non_zero_width);
+    } while (++i < num_rows);
+  }
+}
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void RowShift(int16_t* source, int num_rows,
+                                    int row_shift) {
+  const __m128i v_row_shift_add = _mm_set1_epi16(row_shift);
+  const __m128i v_row_shift = _mm_cvtepu16_epi64(v_row_shift_add);
+  if (tx_width == 4) {
+    // Process two rows per iteration.
+ int i = 0; + do { + const __m128i residual = LoadUnaligned16(&source[i]); + const __m128i shifted_residual = + ShiftResidual(residual, v_row_shift_add, v_row_shift); + StoreUnaligned16(&source[i], shifted_residual); + i += 8; + } while (i < tx_width * num_rows); + } else { + int i = 0; + do { + for (int j = 0; j < tx_width; j += 8) { + const __m128i residual = LoadUnaligned16(&source[i * tx_width + j]); + const __m128i shifted_residual = + ShiftResidual(residual, v_row_shift_add, v_row_shift); + StoreUnaligned16(&source[i * tx_width + j], shifted_residual); + } + } while (++i < num_rows); + } +} + +void Dct4TransformLoopRow_SSE4_1(TransformType /*tx_type*/, + TransformSize tx_size, int adjusted_tx_height, + void* src_buffer, int /*start_x*/, + int /*start_y*/, void* /*dst_frame*/) { + auto* src = static_cast(src_buffer); + const int tx_height = kTransformHeight[tx_size]; + const bool should_round = (tx_height == 8); + const int row_shift = static_cast(tx_height == 16); + + if (DctDcOnly<4>(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<4>(src, adjusted_tx_height); + } + + if (adjusted_tx_height <= 4) { + // Process 4 1d dct4 rows in parallel. + Dct4_SSE4_1(src, /*step=*/4, + /*transpose=*/true); + } else { + // Process 8 1d dct4 rows in parallel per iteration. + int i = 0; + do { + Dct4_SSE4_1(&src[i * 4], /*step=*/4, + /*transpose=*/true); + i += 8; + } while (i < adjusted_tx_height); + } + if (tx_height == 16) { + RowShift<4>(src, adjusted_tx_height, 1); + } +} + +void Dct4TransformLoopColumn_SSE4_1(TransformType tx_type, + TransformSize tx_size, + int adjusted_tx_height, + void* LIBGAV1_RESTRICT src_buffer, + int start_x, int start_y, + void* LIBGAV1_RESTRICT dst_frame) { + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<4>(src, tx_width); + } + + if (!DctDcOnlyColumn<4>(src, adjusted_tx_height, tx_width)) { + if (tx_width == 4) { + // Process 4 1d dct4 columns in parallel. + Dct4_SSE4_1(src, tx_width, + /*transpose=*/false); + } else { + // Process 8 1d dct4 columns in parallel per iteration. + int i = 0; + do { + Dct4_SSE4_1(&src[i], tx_width, + /*transpose=*/false); + i += 8; + } while (i < tx_width); + } + } + auto& frame = *static_cast*>(dst_frame); + StoreToFrameWithRound(frame, start_x, start_y, tx_width, 4, src, tx_type); +} + +void Dct8TransformLoopRow_SSE4_1(TransformType /*tx_type*/, + TransformSize tx_size, int adjusted_tx_height, + void* src_buffer, int /*start_x*/, + int /*start_y*/, void* /*dst_frame*/) { + auto* src = static_cast(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (DctDcOnly<8>(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<8>(src, adjusted_tx_height); + } + + if (adjusted_tx_height <= 4) { + // Process 4 1d dct8 rows in parallel. + Dct8_SSE4_1(src, /*step=*/8, /*transpose=*/true); + } else { + // Process 8 1d dct8 rows in parallel per iteration. 
+ int i = 0; + do { + Dct8_SSE4_1(&src[i * 8], /*step=*/8, + /*transpose=*/true); + i += 8; + } while (i < adjusted_tx_height); + } + if (row_shift > 0) { + RowShift<8>(src, adjusted_tx_height, row_shift); + } +} + +void Dct8TransformLoopColumn_SSE4_1(TransformType tx_type, + TransformSize tx_size, + int adjusted_tx_height, + void* LIBGAV1_RESTRICT src_buffer, + int start_x, int start_y, + void* LIBGAV1_RESTRICT dst_frame) { + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<8>(src, tx_width); + } + + if (!DctDcOnlyColumn<8>(src, adjusted_tx_height, tx_width)) { + if (tx_width == 4) { + // Process 4 1d dct8 columns in parallel. + Dct8_SSE4_1(src, 4, /*transpose=*/false); + } else { + // Process 8 1d dct8 columns in parallel per iteration. + int i = 0; + do { + Dct8_SSE4_1(&src[i], tx_width, + /*transpose=*/false); + i += 8; + } while (i < tx_width); + } + } + auto& frame = *static_cast*>(dst_frame); + StoreToFrameWithRound(frame, start_x, start_y, tx_width, 8, src, tx_type); +} + +void Dct16TransformLoopRow_SSE4_1(TransformType /*tx_type*/, + TransformSize tx_size, int adjusted_tx_height, + void* src_buffer, int /*start_x*/, + int /*start_y*/, void* /*dst_frame*/) { + auto* src = static_cast(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (DctDcOnly<16>(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<16>(src, adjusted_tx_height); + } + + if (adjusted_tx_height <= 4) { + // Process 4 1d dct16 rows in parallel. + Dct16_SSE4_1(src, 16, /*transpose=*/true); + } else { + int i = 0; + do { + // Process 8 1d dct16 rows in parallel per iteration. + Dct16_SSE4_1(&src[i * 16], 16, + /*transpose=*/true); + i += 8; + } while (i < adjusted_tx_height); + } + // row_shift is always non zero here. + RowShift<16>(src, adjusted_tx_height, row_shift); +} + +void Dct16TransformLoopColumn_SSE4_1(TransformType tx_type, + TransformSize tx_size, + int adjusted_tx_height, + void* LIBGAV1_RESTRICT src_buffer, + int start_x, int start_y, + void* LIBGAV1_RESTRICT dst_frame) { + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<16>(src, tx_width); + } + + if (!DctDcOnlyColumn<16>(src, adjusted_tx_height, tx_width)) { + if (tx_width == 4) { + // Process 4 1d dct16 columns in parallel. + Dct16_SSE4_1(src, 4, /*transpose=*/false); + } else { + int i = 0; + do { + // Process 8 1d dct16 columns in parallel per iteration. + Dct16_SSE4_1(&src[i], tx_width, + /*transpose=*/false); + i += 8; + } while (i < tx_width); + } + } + auto& frame = *static_cast*>(dst_frame); + StoreToFrameWithRound(frame, start_x, start_y, tx_width, 16, src, tx_type); +} + +void Dct32TransformLoopRow_SSE4_1(TransformType /*tx_type*/, + TransformSize tx_size, int adjusted_tx_height, + void* src_buffer, int /*start_x*/, + int /*start_y*/, void* /*dst_frame*/) { + auto* src = static_cast(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (DctDcOnly<32>(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<32>(src, adjusted_tx_height); + } + // Process 8 1d dct32 rows in parallel per iteration. 
+  int i = 0;
+  do {
+    Dct32_SSE4_1(&src[i * 32], 32, /*transpose=*/true);
+    i += 8;
+  } while (i < adjusted_tx_height);
+  // row_shift is always non zero here.
+  RowShift<32>(src, adjusted_tx_height, row_shift);
+}
+
+void Dct32TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                     TransformSize tx_size,
+                                     int adjusted_tx_height,
+                                     void* LIBGAV1_RESTRICT src_buffer,
+                                     int start_x, int start_y,
+                                     void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (!DctDcOnlyColumn<32>(src, adjusted_tx_height, tx_width)) {
+    // Process 8 1d dct32 columns in parallel per iteration.
+    int i = 0;
+    do {
+      Dct32_SSE4_1(&src[i], tx_width, /*transpose=*/false);
+      i += 8;
+    } while (i < tx_width);
+  }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  StoreToFrameWithRound(frame, start_x, start_y, tx_width, 32, src, tx_type);
+}
+
+void Dct64TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+                                  TransformSize tx_size, int adjusted_tx_height,
+                                  void* src_buffer, int /*start_x*/,
+                                  int /*start_y*/, void* /*dst_frame*/) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (DctDcOnly<64>(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<64>(src, adjusted_tx_height);
+  }
+  // Process 8 1d dct64 rows in parallel per iteration.
+  int i = 0;
+  do {
+    Dct64_SSE4_1(&src[i * 64], 64, /*transpose=*/true);
+    i += 8;
+  } while (i < adjusted_tx_height);
+  // row_shift is always non zero here.
+  RowShift<64>(src, adjusted_tx_height, row_shift);
+}
+
+void Dct64TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                     TransformSize tx_size,
+                                     int adjusted_tx_height,
+                                     void* LIBGAV1_RESTRICT src_buffer,
+                                     int start_x, int start_y,
+                                     void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (!DctDcOnlyColumn<64>(src, adjusted_tx_height, tx_width)) {
+    // Process 8 1d dct64 columns in parallel per iteration.
+    int i = 0;
+    do {
+      Dct64_SSE4_1(&src[i], tx_width, /*transpose=*/false);
+      i += 8;
+    } while (i < tx_width);
+  }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  StoreToFrameWithRound(frame, start_x, start_y, tx_width, 64, src, tx_type);
+}
+
+void Adst4TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+                                  TransformSize tx_size, int adjusted_tx_height,
+                                  void* src_buffer, int /*start_x*/,
+                                  int /*start_y*/, void* /*dst_frame*/) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_height = kTransformHeight[tx_size];
+  const int row_shift = static_cast<int>(tx_height == 16);
+  const bool should_round = (tx_height == 8);
+
+  if (Adst4DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<4>(src, adjusted_tx_height);
+  }
+
+  // Process 4 1d adst4 rows in parallel per iteration.
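+  // The 4-point ADST advances four rows at a time, so one iteration covers a
+  // 4x4 tile (two __m128i registers).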
+  int i = 0;
+  do {
+    Adst4_SSE4_1<false>(&src[i * 4], /*step=*/4, /*transpose=*/true);
+    i += 4;
+  } while (i < adjusted_tx_height);
+
+  if (row_shift != 0) {
+    RowShift<4>(src, adjusted_tx_height, 1);
+  }
+}
+
+void Adst4TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                     TransformSize tx_size,
+                                     int adjusted_tx_height,
+                                     void* LIBGAV1_RESTRICT src_buffer,
+                                     int start_x, int start_y,
+                                     void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<4>(src, tx_width);
+  }
+
+  if (!Adst4DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+    // Process 4 1d adst4 columns in parallel per iteration.
+    int i = 0;
+    do {
+      Adst4_SSE4_1<false>(&src[i], tx_width, /*transpose=*/false);
+      i += 4;
+    } while (i < tx_width);
+  }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  StoreToFrameWithRound</*enable_flip=*/true>(frame, start_x, start_y,
+                                              tx_width, 4, src, tx_type);
+}
+
+void Adst8TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+                                  TransformSize tx_size, int adjusted_tx_height,
+                                  void* src_buffer, int /*start_x*/,
+                                  int /*start_y*/, void* /*dst_frame*/) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (Adst8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<8>(src, adjusted_tx_height);
+  }
+
+  if (adjusted_tx_height <= 4) {
+    // Process 4 1d adst8 rows in parallel.
+    Adst8_SSE4_1<ButterflyRotation_4, true>(src, /*step=*/8,
+                                            /*transpose=*/true);
+  } else {
+    // Process 8 1d adst8 rows in parallel per iteration.
+    int i = 0;
+    do {
+      Adst8_SSE4_1<ButterflyRotation_8, false>(&src[i * 8], /*step=*/8,
+                                               /*transpose=*/true);
+      i += 8;
+    } while (i < adjusted_tx_height);
+  }
+  if (row_shift > 0) {
+    RowShift<8>(src, adjusted_tx_height, row_shift);
+  }
+}
+
+void Adst8TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                     TransformSize tx_size,
+                                     int adjusted_tx_height,
+                                     void* LIBGAV1_RESTRICT src_buffer,
+                                     int start_x, int start_y,
+                                     void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<8>(src, tx_width);
+  }
+
+  if (!Adst8DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+    if (tx_width == 4) {
+      // Process 4 1d adst8 columns in parallel.
+      Adst8_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
+    } else {
+      // Process 8 1d adst8 columns in parallel per iteration.
+      int i = 0;
+      do {
+        Adst8_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
+                                                 /*transpose=*/false);
+        i += 8;
+      } while (i < tx_width);
+    }
+  }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  StoreToFrameWithRound</*enable_flip=*/true>(frame, start_x, start_y,
+                                              tx_width, 8, src, tx_type);
+}
+
+void Adst16TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+                                   TransformSize tx_size,
+                                   int adjusted_tx_height, void* src_buffer,
+                                   int /*start_x*/, int /*start_y*/,
+                                   void* /*dst_frame*/) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (Adst16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<16>(src, adjusted_tx_height);
+  }
+
+  if (adjusted_tx_height <= 4) {
+    // Process 4 1d adst16 rows in parallel.
+    Adst16_SSE4_1<ButterflyRotation_4, true>(src, 16, /*transpose=*/true);
+  } else {
+    int i = 0;
+    do {
+      // Process 8 1d adst16 rows in parallel per iteration.
+      Adst16_SSE4_1<ButterflyRotation_8, false>(&src[i * 16], 16,
+                                                /*transpose=*/true);
+      i += 8;
+    } while (i < adjusted_tx_height);
+  }
+  // row_shift is always non zero here.
+  RowShift<16>(src, adjusted_tx_height, row_shift);
+}
+
+void Adst16TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                      TransformSize tx_size,
+                                      int adjusted_tx_height,
+                                      void* LIBGAV1_RESTRICT src_buffer,
+                                      int start_x, int start_y,
+                                      void* LIBGAV1_RESTRICT dst_frame) {
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<16>(src, tx_width);
+  }
+
+  if (!Adst16DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+    if (tx_width == 4) {
+      // Process 4 1d adst16 columns in parallel.
+      Adst16_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
+    } else {
+      int i = 0;
+      do {
+        // Process 8 1d adst16 columns in parallel per iteration.
+        Adst16_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
+                                                  /*transpose=*/false);
+        i += 8;
+      } while (i < tx_width);
+    }
+  }
+  StoreToFrameWithRound</*enable_flip=*/true>(frame, start_x, start_y,
+                                              tx_width, 16, src, tx_type);
+}
+
+void Identity4TransformLoopRow_SSE4_1(TransformType tx_type,
+                                      TransformSize tx_size,
+                                      int adjusted_tx_height, void* src_buffer,
+                                      int /*start_x*/, int /*start_y*/,
+                                      void* /*dst_frame*/) {
+  // Special case: Process row calculations during column transform call.
+  // Improves performance.
+  if (tx_type == kTransformTypeIdentityIdentity &&
+      tx_size == kTransformSize4x4) {
+    return;
+  }
+
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = (tx_height == 8);
+  if (Identity4DcOnly(src, adjusted_tx_height, should_round, tx_height)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<4>(src, adjusted_tx_height);
+  }
+  if (tx_height < 16) {
+    int i = 0;
+    do {
+      Identity4_SSE4_1<false>(&src[i * 4], /*step=*/4);
+      i += 4;
+    } while (i < adjusted_tx_height);
+  } else {
+    int i = 0;
+    do {
+      Identity4_SSE4_1<true>(&src[i * 4], /*step=*/4);
+      i += 4;
+    } while (i < adjusted_tx_height);
+  }
+}
+
+void Identity4TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                         TransformSize tx_size,
+                                         int adjusted_tx_height,
+                                         void* LIBGAV1_RESTRICT src_buffer,
+                                         int start_x, int start_y,
+                                         void* LIBGAV1_RESTRICT dst_frame) {
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  // Special case: Process row calculations during column transform call.
+  if (tx_type == kTransformTypeIdentityIdentity &&
+      (tx_size == kTransformSize4x4 || tx_size == kTransformSize8x4)) {
+    Identity4RowColumnStoreToFrame(frame, start_x, start_y, tx_width,
+                                   adjusted_tx_height, src);
+    return;
+  }
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<4>(src, tx_width);
+  }
+
+  Identity4ColumnStoreToFrame(frame, start_x, start_y, tx_width,
+                              adjusted_tx_height, src);
+}
+
+void Identity8TransformLoopRow_SSE4_1(TransformType tx_type,
+                                      TransformSize tx_size,
+                                      int adjusted_tx_height, void* src_buffer,
+                                      int /*start_x*/, int /*start_y*/,
+                                      void* /*dst_frame*/) {
+  // Special case: Process row calculations during column transform call.
+  // Improves performance.
+  if (tx_type == kTransformTypeIdentityIdentity &&
+      tx_size == kTransformSize8x4) {
+    return;
+  }
+
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+  if (Identity8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<8>(src, adjusted_tx_height);
+  }
+
+  // When combining the identity8 multiplier with the row shift, the
+  // calculations for tx_height == 8 and tx_height == 16 can be simplified
+  // from ((A * 2) + 1) >> 1 to A.
+  if ((tx_height & 0x18) != 0) {
+    return;
+  }
+  if (tx_height == 32) {
+    int i = 0;
+    do {
+      Identity8Row32_SSE4_1(&src[i * 8], /*step=*/8);
+      i += 4;
+    } while (i < adjusted_tx_height);
+    return;
+  }
+
+  assert(tx_size == kTransformSize8x4);
+  int i = 0;
+  do {
+    Identity8Row4_SSE4_1(&src[i * 8], /*step=*/8);
+    i += 4;
+  } while (i < adjusted_tx_height);
+}
+
+void Identity8TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                         TransformSize tx_size,
+                                         int adjusted_tx_height,
+                                         void* LIBGAV1_RESTRICT src_buffer,
+                                         int start_x, int start_y,
+                                         void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<8>(src, tx_width);
+  }
+
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  Identity8ColumnStoreToFrame_SSE4_1(frame, start_x, start_y, tx_width,
+                                     adjusted_tx_height, src);
+}
+
+void Identity16TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+                                       TransformSize tx_size,
+                                       int adjusted_tx_height, void* src_buffer,
+                                       int /*start_x*/, int /*start_y*/,
+                                       void* /*dst_frame*/) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+  if (Identity16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<16>(src, adjusted_tx_height);
+  }
+  int i = 0;
+  do {
+    Identity16Row_SSE4_1(&src[i * 16], /*step=*/16,
+                         kTransformRowShift[tx_size]);
+    i += 4;
+  } while (i < adjusted_tx_height);
+}
+
+void Identity16TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                          TransformSize tx_size,
+                                          int adjusted_tx_height,
+                                          void* LIBGAV1_RESTRICT src_buffer,
+                                          int start_x, int start_y,
+                                          void* LIBGAV1_RESTRICT dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<16>(src, tx_width);
+  }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  Identity16ColumnStoreToFrame_SSE4_1(frame, start_x, start_y, tx_width,
+                                      adjusted_tx_height, src);
+}
+
+void Identity32TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+                                       TransformSize tx_size,
+                                       int adjusted_tx_height, void* src_buffer,
+                                       int /*start_x*/, int /*start_y*/,
+                                       void* /*dst_frame*/) {
+  const int tx_height = kTransformHeight[tx_size];
+  // When combining the identity32 multiplier with the row shift, the
+  // calculations for tx_height == 8 and tx_height == 32 can be simplified
+  // from ((A * 4) + 2) >> 2 to A.
+  if ((tx_height & 0x28) != 0) {
+    return;
+  }
+
+  // Process kTransformSize32x16. The src is always rounded before the
+  // identity transform and shifted by 1 afterwards.
+  auto* src = static_cast<int16_t*>(src_buffer);
+  if (Identity32DcOnly(src, adjusted_tx_height)) {
+    return;
+  }
+
+  assert(tx_size == kTransformSize32x16);
+  ApplyRounding<32>(src, adjusted_tx_height);
+  int i = 0;
+  do {
+    Identity32Row16_SSE4_1(&src[i * 32], /*step=*/32);
+    i += 4;
+  } while (i < adjusted_tx_height);
+}
+
+void Identity32TransformLoopColumn_SSE4_1(TransformType /*tx_type*/,
+                                          TransformSize tx_size,
+                                          int adjusted_tx_height,
+                                          void* LIBGAV1_RESTRICT src_buffer,
+                                          int start_x, int start_y,
+                                          void* LIBGAV1_RESTRICT dst_frame) {
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  Identity32ColumnStoreToFrame(frame, start_x, start_y, tx_width,
+                               adjusted_tx_height, src);
+}
+
+void Wht4TransformLoopRow_SSE4_1(TransformType tx_type, TransformSize tx_size,
+                                 int /*adjusted_tx_height*/,
+                                 void* /*src_buffer*/, int /*start_x*/,
+                                 int /*start_y*/, void* /*dst_frame*/) {
+  assert(tx_type == kTransformTypeDctDct);
+  assert(tx_size == kTransformSize4x4);
+  static_cast<void>(tx_type);
+  static_cast<void>(tx_size);
+  // Do both row and column transforms in the column-transform pass.
+}
+
+void Wht4TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                    TransformSize tx_size,
+                                    int adjusted_tx_height,
+                                    void* LIBGAV1_RESTRICT src_buffer,
+                                    int start_x, int start_y,
+                                    void* LIBGAV1_RESTRICT dst_frame) {
+  assert(tx_type == kTransformTypeDctDct);
+  assert(tx_size == kTransformSize4x4);
+  static_cast<void>(tx_type);
+  static_cast<void>(tx_size);
+
+  // Do both row and column transforms in the column-transform pass.
+  // Process 4 1d wht4 rows and columns in parallel.
+  const auto* src = static_cast<const int16_t*>(src_buffer);
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  Wht4_SSE4_1(frame, start_x, start_y, src, adjusted_tx_height);
+}
+
+//------------------------------------------------------------------------------
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+
+  // Maximum transform size for Dct is 64.
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize4_Transform1dDct)
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kRow] =
+      Dct4TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kColumn] =
+      Dct4TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize8_Transform1dDct)
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kRow] =
+      Dct8TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kColumn] =
+      Dct8TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize16_Transform1dDct)
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kRow] =
+      Dct16TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kColumn] =
+      Dct16TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize32_Transform1dDct)
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kRow] =
+      Dct32TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kColumn] =
+      Dct32TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize64_Transform1dDct)
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kRow] =
+      Dct64TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kColumn] =
+      Dct64TransformLoopColumn_SSE4_1;
+#endif
+
+  // Maximum transform size for Adst is 16.
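+  // Each enabled block installs a row/column function pair; entries whose
+  // DSP_ENABLED_8BPP_SSE4_1 check is compiled out are left to the portable C
+  // implementations already registered in the table.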
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize4_Transform1dAdst)
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kRow] =
+      Adst4TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kColumn] =
+      Adst4TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize8_Transform1dAdst)
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kRow] =
+      Adst8TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kColumn] =
+      Adst8TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize16_Transform1dAdst)
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kRow] =
+      Adst16TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kColumn] =
+      Adst16TransformLoopColumn_SSE4_1;
+#endif
+
+  // Maximum transform size for Identity transform is 32.
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize4_Transform1dIdentity)
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kRow] =
+      Identity4TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kColumn] =
+      Identity4TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize8_Transform1dIdentity)
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kRow] =
+      Identity8TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kColumn] =
+      Identity8TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize16_Transform1dIdentity)
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kRow] =
+      Identity16TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kColumn] =
+      Identity16TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize32_Transform1dIdentity)
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kRow] =
+      Identity32TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kColumn] =
+      Identity32TransformLoopColumn_SSE4_1;
+#endif
+
+  // Maximum transform size for Wht is 4.
+#if DSP_ENABLED_8BPP_SSE4_1(Transform1dSize4_Transform1dWht)
+  dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kRow] =
+      Wht4TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kColumn] =
+      Wht4TransformLoopColumn_SSE4_1;
+#endif
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+void InverseTransformInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+#else   // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void InverseTransformInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/inverse_transform_sse4.h b/src/dsp/x86/inverse_transform_sse4.h
new file mode 100644
index 0000000..c31e88b
--- /dev/null
+++ b/src/dsp/x86/inverse_transform_sse4.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_INVERSE_TRANSFORM_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_INVERSE_TRANSFORM_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::inverse_transforms, see the defines below for specifics.
+// This function is not thread-safe.
+void InverseTransformInit_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the sse4 implementation should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dDct
+#define LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dDct LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dDct
+#define LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dDct LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dDct
+#define LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dDct LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize32_Transform1dDct
+#define LIBGAV1_Dsp8bpp_Transform1dSize32_Transform1dDct LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize64_Transform1dDct
+#define LIBGAV1_Dsp8bpp_Transform1dSize64_Transform1dDct LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dAdst
+#define LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dAdst LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dAdst
+#define LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dAdst LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dAdst
+#define LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dAdst LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dIdentity
+#define LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dIdentity LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dIdentity
+#define LIBGAV1_Dsp8bpp_Transform1dSize8_Transform1dIdentity LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dIdentity
+#define LIBGAV1_Dsp8bpp_Transform1dSize16_Transform1dIdentity LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize32_Transform1dIdentity
+#define LIBGAV1_Dsp8bpp_Transform1dSize32_Transform1dIdentity LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dWht
+#define LIBGAV1_Dsp8bpp_Transform1dSize4_Transform1dWht LIBGAV1_CPU_SSE4_1
+#endif
+#endif  // LIBGAV1_TARGETING_SSE4_1
+#endif  // LIBGAV1_SRC_DSP_X86_INVERSE_TRANSFORM_SSE4_H_
diff --git a/src/dsp/x86/loop_filter_sse4.cc b/src/dsp/x86/loop_filter_sse4.cc
new file mode 100644
index 0000000..b9da2d5
--- /dev/null
+++ b/src/dsp/x86/loop_filter_sse4.cc
@@ -0,0 +1,2252 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_filter.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+inline __m128i FilterAdd2Sub2(const __m128i& total, const __m128i& a1,
+                              const __m128i& a2, const __m128i& s1,
+                              const __m128i& s2) {
+  __m128i x = _mm_add_epi16(a1, total);
+  x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(s1, s2)), a2);
+  return x;
+}
+
+}  // namespace
+
+namespace low_bitdepth {
+namespace {
+
+inline __m128i AbsDiff(const __m128i& a, const __m128i& b) {
+  return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
+}
+
+inline __m128i CheckOuterThreshF4(const __m128i& q1q0, const __m128i& p1p0,
+                                  const __m128i& outer_thresh) {
+  const __m128i fe = _mm_set1_epi8(static_cast<char>(0xfe));
+  // abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh;
+  const __m128i abs_pmq = AbsDiff(p1p0, q1q0);
+  const __m128i a = _mm_adds_epu8(abs_pmq, abs_pmq);
+  const __m128i b = _mm_srli_epi16(_mm_and_si128(abs_pmq, fe), 1);
+  const __m128i c = _mm_adds_epu8(a, _mm_srli_si128(b, 4));
+  return _mm_subs_epu8(c, outer_thresh);
+}
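+// Example with p1 = 70, p0 = 68, q0 = 60, q1 = 62:
+//   |p0 - q0| * 2 + |p1 - q1| / 2 = 8 * 2 + 8 / 2 = 20,
+// so the saturating subtraction yields zero (edge passes the outer test) only
+// when outer_thresh >= 20.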
+
+inline __m128i Hev(const __m128i& qp1, const __m128i& qp0,
+                   const __m128i& hev_thresh) {
+  const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+  const __m128i max_pq =
+      _mm_max_epu8(abs_qp1mqp0, _mm_srli_si128(abs_qp1mqp0, 4));
+  const __m128i hev_mask0 = _mm_cvtepu8_epi16(max_pq);
+  const __m128i hev_mask1 = _mm_cmpgt_epi16(hev_mask0, hev_thresh);
+  const __m128i hev_mask = _mm_packs_epi16(hev_mask1, hev_mask1);
+  return hev_mask;
+}
+
+inline __m128i AddShift3(const __m128i& a, const __m128i& b) {
+  const __m128i c = _mm_adds_epi8(a, b);
+  const __m128i d = _mm_unpacklo_epi8(c, c);
+  const __m128i e = _mm_srai_epi16(d, 11); /* >> 3 */
+  return _mm_packs_epi16(e, e);
+}
+
+inline __m128i AddShift1(const __m128i& a, const __m128i& b) {
+  const __m128i c = _mm_adds_epi8(a, b);
+  const __m128i d = _mm_unpacklo_epi8(c, c);
+  const __m128i e = _mm_srai_epi16(d, 9); /* >> 1 */
+  return _mm_packs_epi16(e, e);
+}
+
+//------------------------------------------------------------------------------
+// 4-tap filters
+
+inline __m128i NeedsFilter4(const __m128i& q1q0, const __m128i& p1p0,
+                            const __m128i& qp1, const __m128i& qp0,
+                            const __m128i& outer_thresh,
+                            const __m128i& inner_thresh) {
+  const __m128i outer_mask = CheckOuterThreshF4(q1q0, p1p0, outer_thresh);
+  const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+  const __m128i inner_mask = _mm_subs_epu8(
+      _mm_max_epu8(abs_qp1mqp0, _mm_srli_si128(abs_qp1mqp0, 4)), inner_thresh);
+  // ~mask
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i a = _mm_or_si128(outer_mask, inner_mask);
+  const __m128i b = _mm_cmpeq_epi8(a, zero);
+  return b;
+}
+
+inline void Filter4(const __m128i& qp1, const __m128i& qp0, __m128i* oqp1,
+                    __m128i* oqp0, const __m128i& mask, const __m128i& hev) {
+  const __m128i t80 = _mm_set1_epi8(static_cast<char>(0x80));
+  const __m128i t1 = _mm_set1_epi8(0x1);
+  const __m128i qp1qp0 = _mm_unpacklo_epi64(qp0, qp1);
+  const __m128i qps1qps0 = _mm_xor_si128(qp1qp0, t80);
+  const __m128i ps1qs0 = _mm_shuffle_epi32(qps1qps0, 0x09);
+  const __m128i qs1ps0 = _mm_shuffle_epi32(qps1qps0, 0x0c);
+  const __m128i _hev = _mm_unpacklo_epi32(hev, hev);
+  const __m128i x = _mm_subs_epi8(ps1qs0, qs1ps0);
+  __m128i a = _mm_and_si128(_mm_srli_si128(x, 4), _hev);
+
+  a = _mm_adds_epi8(a, x);
+  a = _mm_adds_epi8(a, x);
+  a = _mm_adds_epi8(a, x);
+  a = _mm_and_si128(a, mask);
+  a = _mm_unpacklo_epi32(a, a);
+
+  const __m128i t4t3 = _mm_set_epi32(0x0, 0x0, 0x04040404, 0x03030303);
+  const __m128i a1a2 = AddShift3(a, t4t3);
+  const __m128i a1a1 = _mm_shuffle_epi32(a1a2, 0x55);
+  const __m128i a3a3 = _mm_andnot_si128(_hev, AddShift1(a1a1, t1));
+  // -1 -1 -1 -1 1 1 1 1 -1 -1 -1 -1 1 1 1 1
+  const __m128i adjust_sign_for_add =
+      _mm_unpacklo_epi32(t1, _mm_cmpeq_epi8(t1, t1));
+
+  const __m128i a3a3a1a2 = _mm_unpacklo_epi64(a1a2, a3a3);
+  const __m128i ma3a3ma1a2 = _mm_sign_epi8(a3a3a1a2, adjust_sign_for_add);
+
+  const __m128i b = _mm_adds_epi8(qps1qps0, ma3a3ma1a2);
+  const __m128i c = _mm_xor_si128(b, t80);
+
+  *oqp0 = c;
+  *oqp1 = _mm_srli_si128(c, 8);
+}
+
+void Horizontal4(void* dest, ptrdiff_t stride, int outer_thresh,
+                 int inner_thresh, int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i v_outer_thresh =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+  const __m128i v_inner_thresh =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+  const __m128i v_hev_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh), 0);
+
+  const __m128i p1 = Load4(dst - 2 * stride);
+  const __m128i p0 = Load4(dst - 1 * stride);
+  const __m128i q0 = Load4(dst + 0 * stride);
+  const __m128i q1 = Load4(dst + 1 * stride);
+  const __m128i qp1 = _mm_unpacklo_epi32(p1, q1);
+  const __m128i qp0 = _mm_unpacklo_epi32(p0, q0);
+  const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1);
+  const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1);
+
+  const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+  const __m128i v_needs_mask =
+      NeedsFilter4(q1q0, p1p0, qp1, qp0, v_outer_thresh, v_inner_thresh);
+
+  __m128i oqp1;
+  __m128i oqp0;
+  Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+  Store4(dst - 2 * stride, oqp1);
+  Store4(dst - 1 * stride, oqp0);
+  Store4(dst + 0 * stride, _mm_srli_si128(oqp0, 4));
+  Store4(dst + 1 * stride, _mm_srli_si128(oqp1, 4));
+}
+
+inline void Transpose4x4(const __m128i& x0, const __m128i& x1,
+                         const __m128i& x2, const __m128i& x3, __m128i* d0,
+                         __m128i* d1, __m128i* d2, __m128i* d3) {
+  // input
+  // x0   00 01 02 03 xx xx xx xx xx xx xx xx xx xx xx xx
+  // x1   10 11 12 13 xx xx xx xx xx xx xx xx xx xx xx xx
+  // x2   20 21 22 23 xx xx xx xx xx xx xx xx xx xx xx xx
+  // x3   30 31 32 33 xx xx xx xx xx xx xx xx xx xx xx xx
+  // output
+  // d0   00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+  // d1   01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+  // d2   02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+  // d3   03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+
+  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+  const __m128i w0 = _mm_unpacklo_epi8(x0, x1);
+  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+  const __m128i w1 = _mm_unpacklo_epi8(x2, x3);
+
+  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  *d0 = _mm_unpacklo_epi16(w0, w1);
+  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d1 = _mm_srli_si128(*d0, 4);
+  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d2 = _mm_srli_si128(*d0, 8);
+  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d3 = _mm_srli_si128(*d0, 12);
+}
+
+void Vertical4(void* dest, ptrdiff_t stride, int outer_thresh,
+               int inner_thresh, int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i v_outer_thresh =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+  const __m128i v_inner_thresh =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+  const __m128i v_hev_thresh0 =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+  const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+  __m128i x0 = Load4(dst - 2 + 0 * stride);
+  __m128i x1 = Load4(dst - 2 + 1 * stride);
+  __m128i x2 = Load4(dst - 2 + 2 * stride);
+  __m128i x3 = Load4(dst - 2 + 3 * stride);
+
+  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+  const __m128i w0 = _mm_unpacklo_epi8(x0, x1);
+  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+  const __m128i w1 = _mm_unpacklo_epi8(x2, x3);
+  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  const __m128i d0 = _mm_unpacklo_epi16(w0, w1);
+  const __m128i qp1 = _mm_shuffle_epi32(d0, 0xc);
+  const __m128i qp0 = _mm_srli_si128(d0, 4);
+  const __m128i q1q0 = _mm_srli_si128(d0, 8);
+  const __m128i p1p0 = _mm_shuffle_epi32(d0, 0x1);
+
+  const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+  const __m128i v_needs_mask =
+      NeedsFilter4(q1q0, p1p0, qp1, qp0, v_outer_thresh, v_inner_thresh);
+
+  __m128i oqp1;
+  __m128i oqp0;
+  Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+  const __m128i p1 = oqp1;
+  const __m128i p0 = oqp0;
+  const __m128i q0 = _mm_srli_si128(oqp0, 4);
+  const __m128i q1 = _mm_srli_si128(oqp1, 4);
+
+  Transpose4x4(p1, p0, q0, q1, &x0, &x1, &x2, &x3);
+
+  Store4(dst - 2 + 0 * stride, x0);
+  Store4(dst - 2 + 1 * stride, x1);
+  Store4(dst - 2 + 2 * stride, x2);
+  Store4(dst - 2 + 3 * stride, x3);
+}
+
+//------------------------------------------------------------------------------
+// 5-tap (chroma) filters
+
+inline __m128i NeedsFilter6(const __m128i& q1q0, const __m128i& p1p0,
+                            const __m128i& qp2, const __m128i& qp1,
+                            const __m128i& qp0, const __m128i& outer_thresh,
+                            const __m128i& inner_thresh) {
+  const __m128i outer_mask = CheckOuterThreshF4(q1q0, p1p0, outer_thresh);
+  const __m128i abs_qp2mqp1 = AbsDiff(qp2, qp1);
+  const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+  const __m128i max_pq = _mm_max_epu8(abs_qp2mqp1, abs_qp1mqp0);
+  const __m128i inner_mask = _mm_subs_epu8(
+      _mm_max_epu8(max_pq, _mm_srli_si128(max_pq, 4)), inner_thresh);
+  // ~mask
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i a = _mm_or_si128(outer_mask, inner_mask);
+  const __m128i b = _mm_cmpeq_epi8(a, zero);
+  return b;
+}
+
+inline __m128i IsFlat3(const __m128i& qp2, const __m128i& qp1,
+                       const __m128i& qp0, const __m128i& flat_thresh) {
+  const __m128i abs_pq2mpq0 = AbsDiff(qp2, qp0);
+  const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+  const __m128i max_pq = _mm_max_epu8(abs_pq2mpq0, abs_qp1mqp0);
+  const __m128i flat_mask = _mm_subs_epu8(
+      _mm_max_epu8(max_pq, _mm_srli_si128(max_pq, 4)), flat_thresh);
+  // ~mask
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i a = _mm_cmpeq_epi8(flat_mask, zero);
+  return a;
+}
+
+inline void Filter6(const __m128i& qp2, const __m128i& qp1, const __m128i& qp0,
+                    __m128i* oqp1, __m128i* oqp0) {
+  const __m128i four = _mm_set1_epi16(4);
+  const __m128i qp2_lo = _mm_cvtepu8_epi16(qp2);
+  const __m128i qp1_lo = _mm_cvtepu8_epi16(qp1);
+  const __m128i qp0_lo = _mm_cvtepu8_epi16(qp0);
+  const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e);
+  const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e);
+
+  __m128i f6_lo =
+      _mm_add_epi16(_mm_add_epi16(qp2_lo, four), _mm_add_epi16(qp2_lo, qp2_lo));
+
+  f6_lo = _mm_add_epi16(_mm_add_epi16(f6_lo, qp1_lo), qp1_lo);
+
+  f6_lo = _mm_add_epi16(_mm_add_epi16(f6_lo, qp0_lo),
+                        _mm_add_epi16(qp0_lo, pq0_lo));
+
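+  // f6_lo now holds p2 * 3 + p1 * 2 + p0 * 2 + q0 plus the rounding term 4 on
+  // the p side, with the mirrored q-side sum in the upper lanes. For a flat
+  // edge with p2 = p1 = p0 = q0 = 10: (30 + 20 + 20 + 10 + 4) >> 3 == 10, so
+  // the smoothing filter leaves flat regions unchanged.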
+  // p2 * 3 + p1 * 2 + p0 * 2 + q0
+  // q2 * 3 + q1 * 2 + q0 * 2 + p0
+  *oqp1 = _mm_srli_epi16(f6_lo, 3);
+  *oqp1 = _mm_packus_epi16(*oqp1, *oqp1);
+
+  // p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1
+  // q2 + q1 * 2 + q0 * 2 + p0 * 2 + p1
+  f6_lo = FilterAdd2Sub2(f6_lo, pq0_lo, pq1_lo, qp2_lo, qp2_lo);
+  *oqp0 = _mm_srli_epi16(f6_lo, 3);
+  *oqp0 = _mm_packus_epi16(*oqp0, *oqp0);
+}
+
+void Horizontal6(void* dest, ptrdiff_t stride, int outer_thresh,
+                 int inner_thresh, int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i v_flat_thresh = _mm_set1_epi8(1);
+  const __m128i v_outer_thresh =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+  const __m128i v_inner_thresh =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+  const __m128i v_hev_thresh0 =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+  const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+  const __m128i p2 = Load4(dst - 3 * stride);
+  const __m128i p1 = Load4(dst - 2 * stride);
+  const __m128i p0 = Load4(dst - 1 * stride);
+  const __m128i q0 = Load4(dst + 0 * stride);
+  const __m128i q1 = Load4(dst + 1 * stride);
+  const __m128i q2 = Load4(dst + 2 * stride);
+  const __m128i qp2 = _mm_unpacklo_epi32(p2, q2);
+  const __m128i qp1 = _mm_unpacklo_epi32(p1, q1);
+  const __m128i qp0 = _mm_unpacklo_epi32(p0, q0);
+  const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1);
+  const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1);
+
+  const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+  const __m128i v_needs_mask =
+      NeedsFilter6(q1q0, p1p0, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+  __m128i oqp1;
+  __m128i oqp0;
+
+  Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+  const __m128i v_isflat3_mask = IsFlat3(qp2, qp1, qp0, v_flat_thresh);
+  const __m128i v_mask =
+      _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat3_mask), 0);
+
+  if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
+    __m128i oqp1_f6;
+    __m128i oqp0_f6;
+
+    Filter6(qp2, qp1, qp0, &oqp1_f6, &oqp0_f6);
+
+    oqp1 = _mm_blendv_epi8(oqp1, oqp1_f6, v_mask);
+    oqp0 = _mm_blendv_epi8(oqp0, oqp0_f6, v_mask);
+  }
+
+  Store4(dst - 2 * stride, oqp1);
+  Store4(dst - 1 * stride, oqp0);
+  Store4(dst + 0 * stride, _mm_srli_si128(oqp0, 4));
+  Store4(dst + 1 * stride, _mm_srli_si128(oqp1, 4));
+}
+
+inline void Transpose8x4To4x8(const __m128i& x0, const __m128i& x1,
+                              const __m128i& x2, const __m128i& x3, __m128i* d0,
+                              __m128i* d1, __m128i* d2, __m128i* d3,
+                              __m128i* d4, __m128i* d5, __m128i* d6,
+                              __m128i* d7) {
+  // input
+  // x0   00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
+  // x1   10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
+  // x2   20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
+  // x3   30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
+  // output
+  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
+
+  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+  const __m128i w0 = _mm_unpacklo_epi8(x0, x1);
+  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+  const __m128i w1 = _mm_unpacklo_epi8(x2, x3);
+  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  const __m128i ww0 = _mm_unpacklo_epi16(w0, w1);
+  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+  const __m128i ww1 = _mm_unpackhi_epi16(w0, w1);
+
+  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d0 = ww0;
+  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d1 = _mm_srli_si128(ww0, 4);
+  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d2 = _mm_srli_si128(ww0, 8);
+  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d3 = _mm_srli_si128(ww0, 12);
+  // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d4 = ww1;
+  // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d5 = _mm_srli_si128(ww1, 4);
+  // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d6 = _mm_srli_si128(ww1, 8);
+  // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d7 = _mm_srli_si128(ww1, 12);
+}
+
+void Vertical6(void* dest, ptrdiff_t stride, int outer_thresh,
+               int inner_thresh, int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i v_flat_thresh = _mm_set1_epi8(1);
+  const __m128i v_outer_thresh =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+  const __m128i v_inner_thresh =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+  const __m128i v_hev_thresh0 =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+  const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+  __m128i x0 = LoadLo8(dst - 3 + 0 * stride);
+  __m128i x1 = LoadLo8(dst - 3 + 1 * stride);
+  __m128i x2 = LoadLo8(dst - 3 + 2 * stride);
+  __m128i x3 = LoadLo8(dst - 3 + 3 * stride);
+
+  __m128i p2, p1, p0, q0, q1, q2;
+  __m128i z0, z1;  // not used
+
+  Transpose8x4To4x8(x0, x1, x2, x3, &p2, &p1, &p0, &q0, &q1, &q2, &z0, &z1);
+
+  const __m128i qp2 = _mm_unpacklo_epi32(p2, q2);
+  const __m128i qp1 = _mm_unpacklo_epi32(p1, q1);
+  const __m128i qp0 = _mm_unpacklo_epi32(p0, q0);
+  const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1);
+  const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1);
+
+  const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+  const __m128i v_needs_mask =
+      NeedsFilter6(q1q0, p1p0, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+  __m128i oqp1;
+  __m128i oqp0;
+
+  Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+  const __m128i v_isflat3_mask = IsFlat3(qp2, qp1, qp0, v_flat_thresh);
+  const __m128i v_mask =
+      _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat3_mask), 0);
+
+  if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
+    __m128i oqp1_f6;
+    __m128i oqp0_f6;
+
+    Filter6(qp2, qp1, qp0, &oqp1_f6, &oqp0_f6);
+
+    oqp1 = _mm_blendv_epi8(oqp1, oqp1_f6, v_mask);
+    oqp0 = _mm_blendv_epi8(oqp0, oqp0_f6, v_mask);
+  }
+
+  p1 = oqp1;
+  p0 = oqp0;
+  q0 = _mm_srli_si128(oqp0, 4);
+  q1 = _mm_srli_si128(oqp1, 4);
+
+  Transpose4x4(p1, p0, q0, q1, &x0, &x1, &x2, &x3);
+
+  Store4(dst - 2 + 0 * stride, x0);
+  Store4(dst - 2 + 1 * stride, x1);
+  Store4(dst - 2 + 2 * stride, x2);
+  Store4(dst - 2 + 3 * stride, x3);
+}
+
+//------------------------------------------------------------------------------
+// 7-tap filters
+
+inline __m128i NeedsFilter8(const __m128i& q1q0, const __m128i& p1p0,
+                            const __m128i& qp3, const __m128i& qp2,
+                            const __m128i& qp1, const __m128i& qp0,
+                            const __m128i& outer_thresh,
+                            const __m128i& inner_thresh) {
+  const __m128i outer_mask = CheckOuterThreshF4(q1q0, p1p0, outer_thresh);
+  const __m128i abs_qp2mqp1 = AbsDiff(qp2, qp1);
+  const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+  const __m128i max_pq_a = _mm_max_epu8(abs_qp2mqp1, abs_qp1mqp0);
+  const __m128i abs_pq3mpq2 = AbsDiff(qp3, qp2);
+  const __m128i max_pq = _mm_max_epu8(max_pq_a, abs_pq3mpq2);
+  const __m128i inner_mask = _mm_subs_epu8(
+      _mm_max_epu8(max_pq, _mm_srli_si128(max_pq, 4)), inner_thresh);
+  // ~mask
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i a = _mm_or_si128(outer_mask, inner_mask);
+  const __m128i b = _mm_cmpeq_epi8(a, zero);
+  return b;
+}
+
+inline __m128i IsFlat4(const __m128i& qp3, const __m128i& qp2,
+                       const __m128i& qp1, const __m128i& qp0,
+                       const __m128i& flat_thresh) {
+  const __m128i abs_pq2mpq0 = AbsDiff(qp2, qp0);
+  const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+  const __m128i max_pq_a = _mm_max_epu8(abs_pq2mpq0, abs_qp1mqp0);
+  const __m128i abs_pq3mpq0 = AbsDiff(qp3, qp0);
+  const __m128i max_pq = _mm_max_epu8(max_pq_a, abs_pq3mpq0);
+  const __m128i flat_mask = _mm_subs_epu8(
+      _mm_max_epu8(max_pq, _mm_srli_si128(max_pq, 4)), flat_thresh);
+  // ~mask
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i a = _mm_cmpeq_epi8(flat_mask, zero);
+  return a;
+}
+
+inline void Filter8(const __m128i& qp3, const __m128i& qp2, const __m128i& qp1,
+                    const __m128i& qp0, __m128i* oqp2, __m128i* oqp1,
+                    __m128i* oqp0) {
+  const __m128i four = _mm_set1_epi16(4);
+  const __m128i qp3_lo = _mm_cvtepu8_epi16(qp3);
+  const __m128i qp2_lo = _mm_cvtepu8_epi16(qp2);
+  const __m128i qp1_lo = _mm_cvtepu8_epi16(qp1);
+  const __m128i qp0_lo = _mm_cvtepu8_epi16(qp0);
+  const __m128i pq2_lo = _mm_shuffle_epi32(qp2_lo, 0x4e);
+  const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e);
+  const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e);
+
+  __m128i f8_lo =
+      _mm_add_epi16(_mm_add_epi16(qp3_lo, four), _mm_add_epi16(qp3_lo, qp3_lo));
+
+  f8_lo = _mm_add_epi16(_mm_add_epi16(f8_lo, qp2_lo), qp2_lo);
+
+  f8_lo = _mm_add_epi16(_mm_add_epi16(f8_lo, qp1_lo),
+                        _mm_add_epi16(qp0_lo, pq0_lo));
+
+  // p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0
+  // q3 + q3 + q3 + 2 * q2 + q1 + q0 + p0
+  *oqp2 = _mm_srli_epi16(f8_lo, 3);
+  *oqp2 = _mm_packus_epi16(*oqp2, *oqp2);
+
+  // p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1
+  // q3 + q3 + q2 + 2 * q1 + q0 + p0 + p1
+  f8_lo = FilterAdd2Sub2(f8_lo, qp1_lo, pq1_lo, qp3_lo, qp2_lo);
+  *oqp1 = _mm_srli_epi16(f8_lo, 3);
+  *oqp1 = _mm_packus_epi16(*oqp1, *oqp1);
+
+  // p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2
+  // q3 + q2 + q1 + 2 * q0 + p0 + p1 + p2
+  f8_lo = FilterAdd2Sub2(f8_lo, qp0_lo, pq2_lo, qp3_lo, qp1_lo);
+  *oqp0 = _mm_srli_epi16(f8_lo, 3);
+  *oqp0 = _mm_packus_epi16(*oqp0, *oqp0);
+}
+
+void Horizontal8(void* dest, ptrdiff_t stride, int outer_thresh,
+                 int inner_thresh, int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i v_flat_thresh = _mm_set1_epi8(1);
+  const __m128i v_outer_thresh =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+  const __m128i v_inner_thresh =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+  const __m128i v_hev_thresh0 =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+  const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+  const __m128i p3 = Load4(dst - 4 * stride);
+  const __m128i p2 = Load4(dst - 3 * stride);
+  const __m128i p1 = Load4(dst - 2 * stride);
+  const __m128i p0 = Load4(dst - 1 * stride);
+  const __m128i q0 = Load4(dst + 0 * stride);
+  const __m128i q1 = Load4(dst + 1 * stride);
+  const __m128i q2 = Load4(dst + 2 * stride);
+  const __m128i q3 = Load4(dst + 3 * stride);
+
+  const __m128i qp3 = _mm_unpacklo_epi32(p3, q3);
+  const __m128i qp2 = _mm_unpacklo_epi32(p2, q2);
+  const __m128i qp1 = _mm_unpacklo_epi32(p1, q1);
+  const __m128i qp0 = _mm_unpacklo_epi32(p0, q0);
+  const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1);
+  const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1);
+
+  const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+  const __m128i v_needs_mask = NeedsFilter8(q1q0, p1p0, qp3, qp2, qp1, qp0,
+                                            v_outer_thresh, v_inner_thresh);
+  __m128i oqp1;
+  __m128i oqp0;
+
+  Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+  const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+  const __m128i v_mask =
+      _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0);
+
+  if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
+    __m128i oqp2_f8;
+    __m128i oqp1_f8;
+    __m128i oqp0_f8;
+
+    Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+    oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+    oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+    oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+    Store4(dst - 3 * stride, oqp2_f8);
+    Store4(dst + 2 * stride, _mm_srli_si128(oqp2_f8, 4));
+  }
+
+  Store4(dst - 2 * stride, oqp1);
+  Store4(dst - 1 * stride, oqp0);
+  Store4(dst + 0 * stride, _mm_srli_si128(oqp0, 4));
+  Store4(dst + 1 * stride, _mm_srli_si128(oqp1, 4));
+}
+
+inline void Transpose8x8To8x4(const __m128i& x0, const __m128i& x1,
+                              const __m128i& x2, const __m128i& x3,
+                              const __m128i& x4, const __m128i& x5,
+                              const __m128i& x6, const __m128i& x7, __m128i* d0,
+                              __m128i* d1, __m128i* d2, __m128i* d3) {
+  // input
+  // x0 00 01 02 03 04 05 06 07
+  // x1 10 11 12 13 14 15 16 17
+  // x2 20 21 22 23 24 25 26 27
+  // x3 30 31 32 33 34 35 36 37
+  // x4 40 41 42 43 44 45 46 47
+  // x5 50 51 52 53 54 55 56 57
+  // x6 60 61 62 63 64 65 66 67
+  // x7 70 71 72 73 74 75 76 77
+  // output
+  // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx
+  // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
+  // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx
+  // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx
+
+  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+  const __m128i w0 = _mm_unpacklo_epi8(x0, x1);
+  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+  const __m128i w1 = _mm_unpacklo_epi8(x2, x3);
+  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+  const __m128i w2 = _mm_unpacklo_epi8(x4, x5);
+  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+  const __m128i w3 = _mm_unpacklo_epi8(x6, x7);
+
+  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  const __m128i w4 = _mm_unpacklo_epi16(w0, w1);
+  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+  const __m128i w5 = _mm_unpacklo_epi16(w2, w3);
+
+  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+  *d0 = _mm_unpacklo_epi32(w4, w5);
+  *d1 = _mm_srli_si128(*d0, 8);
+  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+  *d2 = _mm_unpackhi_epi32(w4, w5);
+  *d3 = _mm_srli_si128(*d2, 8);
+}
+
+void Vertical8(void* dest, ptrdiff_t stride, int outer_thresh,
+               int inner_thresh, int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i v_flat_thresh = _mm_set1_epi8(1);
+  const __m128i v_outer_thresh =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+  const __m128i v_inner_thresh =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+  const __m128i v_hev_thresh0 =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+  const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+  __m128i x0 = LoadLo8(dst - 4 + 0 * stride);
+  __m128i x1 = LoadLo8(dst - 4 + 1 * stride);
+  __m128i x2 = LoadLo8(dst - 4 + 2 * stride);
+  __m128i x3 = LoadLo8(dst - 4 + 3 * stride);
+
+  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+  Transpose8x4To4x8(x0, x1, x2, x3, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+
+  const __m128i qp3 = _mm_unpacklo_epi32(p3, q3);
+  const __m128i qp2 = _mm_unpacklo_epi32(p2, q2);
+  const __m128i qp1 = _mm_unpacklo_epi32(p1, q1);
+  const __m128i qp0 = _mm_unpacklo_epi32(p0, q0);
+  const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1);
+  const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1);
+
+  const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+  const __m128i v_needs_mask = NeedsFilter8(q1q0, p1p0, qp3, qp2, qp1, qp0,
+                                            v_outer_thresh, v_inner_thresh);
+  __m128i oqp1;
+  __m128i oqp0;
+
+  Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+  const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+  const __m128i v_mask =
+      _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0);
+
+  if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
+    __m128i oqp2_f8;
+    __m128i oqp1_f8;
+    __m128i oqp0_f8;
+
+    Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+    oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+    oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+    oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+
+    p2 = oqp2_f8;
+    q2 = _mm_srli_si128(oqp2_f8, 4);
+  }
+
+  p1 = oqp1;
+  p0 = oqp0;
+  q0 = _mm_srli_si128(oqp0, 4);
+  q1 = _mm_srli_si128(oqp1, 4);
+
+  Transpose8x8To8x4(p3, p2, p1, p0, q0, q1, q2, q3, &x0, &x1, &x2, &x3);
+
+  StoreLo8(dst - 4 + 0 * stride, x0);
+  StoreLo8(dst - 4 + 1 * stride, x1);
+  StoreLo8(dst - 4 + 2 * stride, x2);
+  StoreLo8(dst - 4 + 3 * stride, x3);
+}
+
+//------------------------------------------------------------------------------
+// 13-tap filters
+
+inline void Filter14(const __m128i& qp6, const __m128i& qp5, const __m128i& qp4,
+                     const __m128i& qp3, const __m128i& qp2, const __m128i& qp1,
+                     const __m128i& qp0, __m128i* oqp5, __m128i* oqp4,
+                     __m128i* oqp3, __m128i* oqp2, __m128i* oqp1,
+                     __m128i* oqp0) {
+  const __m128i eight = _mm_set1_epi16(8);
+  const __m128i qp6_lo = _mm_cvtepu8_epi16(qp6);
+  const __m128i qp5_lo = _mm_cvtepu8_epi16(qp5);
+  const __m128i qp4_lo = _mm_cvtepu8_epi16(qp4);
+  const __m128i qp3_lo = _mm_cvtepu8_epi16(qp3);
+  const __m128i qp2_lo = _mm_cvtepu8_epi16(qp2);
+  const __m128i qp1_lo = _mm_cvtepu8_epi16(qp1);
+  const __m128i qp0_lo = _mm_cvtepu8_epi16(qp0);
+  const __m128i pq5_lo = _mm_shuffle_epi32(qp5_lo, 0x4e);
+  const __m128i pq4_lo = _mm_shuffle_epi32(qp4_lo, 0x4e);
+  const __m128i pq3_lo = _mm_shuffle_epi32(qp3_lo, 0x4e);
+  const __m128i pq2_lo = _mm_shuffle_epi32(qp2_lo, 0x4e);
+  const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e);
+  const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e);
+
+  __m128i f14_lo =
+      _mm_add_epi16(eight, _mm_sub_epi16(_mm_slli_epi16(qp6_lo, 3), qp6_lo));
+
+  f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp5_lo),
+                         _mm_add_epi16(qp5_lo, qp4_lo));
+
+  f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp4_lo),
+                         _mm_add_epi16(qp3_lo, qp2_lo));
+
+  f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp1_lo),
+                         _mm_add_epi16(qp0_lo, pq0_lo));
+
+  // p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0
+  // q6 * 7 + q5 * 2 + q4 * 2 + q3 + q2 + q1 + q0 + p0
+  *oqp5 = _mm_srli_epi16(f14_lo, 4);
+  *oqp5 = _mm_packus_epi16(*oqp5, *oqp5);
+
+  // p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1
+  // q6 * 5 + q5 * 2 + q4 * 2 + q3 * 2 + q2 + q1 + q0 + p0 + p1
+  f14_lo = FilterAdd2Sub2(f14_lo, qp3_lo, pq1_lo, qp6_lo, qp6_lo);
+  *oqp4 = _mm_srli_epi16(f14_lo, 4);
+  *oqp4 = _mm_packus_epi16(*oqp4, *oqp4);
+
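+  // Each FilterAdd2Sub2 call slides the 14-tap window by one output sample:
+  // two taps enter (the first two arguments after the running total) and two
+  // taps leave (the last two). E.g. going from the oqp4 sum to the oqp3 sum
+  // adds p2 and the mirrored q2 while dropping one p6 and one p5, matching
+  // the tap lists in the comments.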
+  // p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2
+  // q6 * 4 + q5 + q4 * 2 + q3 * 2 + q2 * 2 + q1 + q0 + p0 + p1 + p2
+  f14_lo = FilterAdd2Sub2(f14_lo, qp2_lo, pq2_lo, qp6_lo, qp5_lo);
+  *oqp3 = _mm_srli_epi16(f14_lo, 4);
+  *oqp3 = _mm_packus_epi16(*oqp3, *oqp3);
+
+  // p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3
+  // q6 * 3 + q5 + q4 + q3 * 2 + q2 * 2 + q1 * 2 + q0 + p0 + p1 + p2 + p3
+  f14_lo = FilterAdd2Sub2(f14_lo, qp1_lo, pq3_lo, qp6_lo, qp4_lo);
+  *oqp2 = _mm_srli_epi16(f14_lo, 4);
+  *oqp2 = _mm_packus_epi16(*oqp2, *oqp2);
+
+  // p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 + q0 + q1 + q2 + q3 + q4
+  // q6 * 2 + q5 + q4 + q3 + q2 * 2 + q1 * 2 + q0 * 2 + p0 + p1 + p2 + p3 + p4
+  f14_lo = FilterAdd2Sub2(f14_lo, qp0_lo, pq4_lo, qp6_lo, qp3_lo);
+  *oqp1 = _mm_srli_epi16(f14_lo, 4);
+  *oqp1 = _mm_packus_epi16(*oqp1, *oqp1);
+
+  // p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1 + q2 + q3 + q4 + q5
+  // q6 + q5 + q4 + q3 + q2 + q1 * 2 + q0 * 2 + p0 * 2 + p1 + p2 + p3 + p4 + p5
+  f14_lo = FilterAdd2Sub2(f14_lo, pq0_lo, pq5_lo, qp6_lo, qp2_lo);
+  *oqp0 = _mm_srli_epi16(f14_lo, 4);
+  *oqp0 = _mm_packus_epi16(*oqp0, *oqp0);
+}
+
+void Horizontal14(void* dest, ptrdiff_t stride, int outer_thresh,
+                  int inner_thresh, int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i v_flat_thresh = _mm_set1_epi8(1);
+  const __m128i v_outer_thresh =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+  const __m128i v_inner_thresh =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+  const __m128i v_hev_thresh0 =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+  const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+  const __m128i p3 = Load4(dst - 4 * stride);
+  const __m128i p2 = Load4(dst - 3 * stride);
+  const __m128i p1 = Load4(dst - 2 * stride);
+  const __m128i p0 = Load4(dst - 1 * stride);
+  const __m128i q0 = Load4(dst + 0 * stride);
+  const __m128i q1 = Load4(dst + 1 * stride);
+  const __m128i q2 = Load4(dst + 2 * stride);
+  const __m128i q3 = Load4(dst + 3 * stride);
+
+  const __m128i qp3 = _mm_unpacklo_epi32(p3, q3);
+  const __m128i qp2 = _mm_unpacklo_epi32(p2, q2);
+  const __m128i qp1 = _mm_unpacklo_epi32(p1, q1);
+  const __m128i qp0 = _mm_unpacklo_epi32(p0, q0);
+  const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1);
+  const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1);
+
+  const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+  const __m128i v_needs_mask = NeedsFilter8(q1q0, p1p0, qp3, qp2, qp1, qp0,
+                                            v_outer_thresh, v_inner_thresh);
+
+  __m128i oqp1;
+  __m128i oqp0;
+
+  Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+  const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+  const __m128i v_mask =
+      _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0);
+
+  if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
+    const __m128i p6 = Load4(dst - 7 * stride);
+    const __m128i p5 = Load4(dst - 6 * stride);
+    const __m128i p4 = Load4(dst - 5 * stride);
+    const __m128i q4 = Load4(dst + 4 * stride);
+    const __m128i q5 = Load4(dst + 5 * stride);
+    const __m128i q6 = Load4(dst + 6 * stride);
+    const __m128i qp6 = _mm_unpacklo_epi32(p6, q6);
+    const __m128i qp5 = _mm_unpacklo_epi32(p5, q5);
+    const __m128i qp4 = _mm_unpacklo_epi32(p4, q4);
+
+    const __m128i v_isflatouter4_mask =
+        IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh);
+    const __m128i v_flat4_mask =
+        _mm_shuffle_epi32(_mm_and_si128(v_mask, v_isflatouter4_mask), 0);
+
+    __m128i oqp2_f8;
+    __m128i oqp1_f8;
+    __m128i oqp0_f8;
+
+    Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+    oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+    oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+    oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+
+    if (_mm_test_all_zeros(v_flat4_mask, v_flat4_mask) == 0) {
+      __m128i oqp5_f14;
+      __m128i oqp4_f14;
+      __m128i oqp3_f14;
+      __m128i oqp2_f14;
+      __m128i oqp1_f14;
+      __m128i oqp0_f14;
+
+      Filter14(qp6, qp5, qp4, qp3, qp2, qp1, qp0, &oqp5_f14, &oqp4_f14,
+               &oqp3_f14, &oqp2_f14, &oqp1_f14, &oqp0_f14);
+
+      oqp5_f14 = _mm_blendv_epi8(qp5, oqp5_f14, v_flat4_mask);
+      oqp4_f14 = _mm_blendv_epi8(qp4, oqp4_f14, v_flat4_mask);
+      oqp3_f14 = _mm_blendv_epi8(qp3, oqp3_f14, v_flat4_mask);
+      oqp2_f8 = _mm_blendv_epi8(oqp2_f8, oqp2_f14, v_flat4_mask);
+      oqp1 = _mm_blendv_epi8(oqp1, oqp1_f14, v_flat4_mask);
+      oqp0 = _mm_blendv_epi8(oqp0, oqp0_f14, v_flat4_mask);
+
+      Store4(dst - 6 * stride, oqp5_f14);
+      Store4(dst - 5 * stride, oqp4_f14);
+      Store4(dst - 4 * stride, oqp3_f14);
+      Store4(dst + 3 * stride, _mm_srli_si128(oqp3_f14, 4));
+      Store4(dst + 4 * stride, _mm_srli_si128(oqp4_f14, 4));
+      Store4(dst + 5 * stride, _mm_srli_si128(oqp5_f14, 4));
+    }
+
+    Store4(dst - 3 * stride, oqp2_f8);
+    Store4(dst + 2 * stride, _mm_srli_si128(oqp2_f8, 4));
+  }
+
+  Store4(dst - 2 * stride, oqp1);
+  Store4(dst - 1 * stride, oqp0);
+  Store4(dst + 0 * stride, _mm_srli_si128(oqp0, 4));
+  Store4(dst + 1 * stride, _mm_srli_si128(oqp1, 4));
+}
+
+// Each of the 8x4 blocks of input data (p7-p0 and q0-q7) is transposed to
+// 4x8, then unpacked to the correct qp register. (qp7 - qp0)
+//
+// p7 p6 p5 p4 p3 p2 p1 p0  q0 q1 q2 q3 q4 q5 q6 q7
+//
+// 00 01 02 03 04 05 06 07  08 09 0a 0b 0c 0d 0e 0f
+// 10 11 12 13 14 15 16 17  18 19 1a 1b 1c 1d 1e 1f
+// 20 21 22 23 24 25 26 27  28 29 2a 2b 2c 2d 2e 2f
+// 30 31 32 33 34 35 36 37  38 39 3a 3b 3c 3d 3e 3f
+
+inline void DualTranspose8x4To4x8(const __m128i& x0, const __m128i& x1,
+                                  const __m128i& x2, const __m128i& x3,
+                                  __m128i* q0p0, __m128i* q1p1, __m128i* q2p2,
+                                  __m128i* q3p3, __m128i* q4p4, __m128i* q5p5,
+                                  __m128i* q6p6, __m128i* q7p7) {
+  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+  const __m128i w0 = _mm_unpacklo_epi8(x0, x1);
+  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+  const __m128i w1 = _mm_unpacklo_epi8(x2, x3);
+  // 08 18 09 19 0a 1a 0b 1b 0c 1c 0d 1d 0e 1e 0f 1f
+  const __m128i w2 = _mm_unpackhi_epi8(x0, x1);
+  // 28 38 29 39 2a 3a 2b 3b 2c 3c 2d 3d 2e 3e 2f 3f
+  const __m128i w3 = _mm_unpackhi_epi8(x2, x3);
+  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  const __m128i ww0 = _mm_unpacklo_epi16(w0, w1);
+  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+  const __m128i ww1 = _mm_unpackhi_epi16(w0, w1);
+  // 08 18 28 38 09 19 29 39 0a 1a 2a 3a 0b 1b 2b 3b
+  const __m128i ww2 = _mm_unpacklo_epi16(w2, w3);
+  // 0c 1c 2c 3c 0d 1d 2d 3d 0e 1e 2e 3e 0f 1f 2f 3f
+  const __m128i ww3 = _mm_unpackhi_epi16(w2, w3);
+  // 00 10 20 30 0f 1f 2f 3f xx xx xx xx xx xx xx xx
+  *q7p7 = _mm_unpacklo_epi32(ww0, _mm_srli_si128(ww3, 12));
+  // 01 11 21 31 0e 1e 2e 3e xx xx xx xx xx xx xx xx
+  *q6p6 = _mm_unpackhi_epi32(_mm_slli_si128(ww0, 4), ww3);
+  // 02 12 22 32 0d 1d 2d 3d xx xx xx xx xx xx xx xx
+  *q5p5 = _mm_unpackhi_epi32(ww0, _mm_slli_si128(ww3, 4));
+  // 03 13 23 33 0c 1c 2c 3c xx xx xx xx xx xx xx xx
+  *q4p4 = _mm_unpacklo_epi32(_mm_srli_si128(ww0, 12), ww3);
+  // 04 14 24 34 0b 1b 2b 3b xx xx xx xx xx xx xx xx
+  *q3p3 = _mm_unpacklo_epi32(ww1, _mm_srli_si128(ww2, 12));
+  // 05 15 25 35 0a 1a 2a 3a xx xx xx xx xx xx xx xx
+  *q2p2 = _mm_unpackhi_epi32(_mm_slli_si128(ww1, 4), ww2);
+  // 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx
+  *q1p1 = _mm_unpackhi_epi32(ww1, _mm_slli_si128(ww2, 4));
+  // 07 17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx
+  *q0p0 = _mm_unpacklo_epi32(_mm_srli_si128(ww1, 12), ww2);
+}
+
+inline void DualTranspose4x8To8x4(const __m128i& qp7, const __m128i& qp6,
+                                  const __m128i& qp5, const __m128i& qp4,
+                                  const __m128i& qp3, const __m128i& qp2,
+                                  const __m128i& qp1, const __m128i& qp0,
+                                  __m128i* x0, __m128i* x1, __m128i* x2,
+                                  __m128i* x3) {
+  // qp7: 00 10 20 30 0f 1f 2f 3f xx xx xx xx xx xx xx xx
+  // qp6: 01 11 21 31 0e 1e 2e 3e xx xx xx xx xx xx xx xx
+  // qp5: 02 12 22 32 0d 1d 2d 3d xx xx xx xx xx xx xx xx
+  // qp4: 03 13 23 33 0c 1c 2c 3c xx xx xx xx xx xx xx xx
+  // qp3: 04 14 24 34 0b 1b 2b 3b xx xx xx xx xx xx xx xx
+  // qp2: 05 15 25 35 0a 1a 2a 3a xx xx xx xx xx xx xx xx
+  // qp1: 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx
+  // qp0: 07 17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx
+
+  // 00 01 10 11 20 21 30 31 0f 0e 1f 1e 2f 2e 3f 3e
+  const __m128i w0 = _mm_unpacklo_epi8(qp7, qp6);
+  // 02 03 12 13 22 23 32 33 xx xx xx xx xx xx xx xx
+  const __m128i w1 = _mm_unpacklo_epi8(qp5, qp4);
+  // 04 05 14 15 24 25 34 35 xx xx xx xx xx xx xx xx
+  const __m128i w2 = _mm_unpacklo_epi8(qp3, qp2);
+  // 06 07 16 17 26 27 36 37 xx xx xx xx xx xx xx xx
+  const __m128i w3 = _mm_unpacklo_epi8(qp1, qp0);
+  // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33
+  const __m128i w4 = _mm_unpacklo_epi16(w0, w1);
+  // 04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37
+  const __m128i w5 = _mm_unpacklo_epi16(w2, w3);
+  // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
+  const __m128i d0 = _mm_unpacklo_epi32(w4, w5);
+  // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37
+  const __m128i d2 = _mm_unpackhi_epi32(w4, w5);
+  // xx xx xx xx xx xx xx xx 08 09 18 19 28 29 38 39
+  const __m128i w10 = _mm_unpacklo_epi8(qp0, qp1);
+  // xx xx xx xx xx xx xx xx 0a 0b 1a 1b 2a 2b 3a 3b
+  const __m128i w11 = _mm_unpacklo_epi8(qp2, qp3);
+  // xx xx xx xx xx xx xx xx 0c 0d 1c 1d 2c 2d 3c 3d
+  const __m128i w12 = _mm_unpacklo_epi8(qp4, qp5);
+  // xx xx xx xx xx xx xx xx 0e 0f 1e 1f 2e 2f 3e 3f
+  const __m128i w13 = _mm_unpacklo_epi8(qp6, qp7);
+  // 08 09 0a 0b 18 19 1a 1b 28 29 2a 2b 38 39 3a 3b
+  const __m128i w14 = _mm_unpackhi_epi16(w10, w11);
+  // 0c 0d 0e 0f 1c 1d 1e 1f 2c 2d 2e 2f 3c 3d 3e 3f
+  const __m128i w15 = _mm_unpackhi_epi16(w12, w13);
+  // 08 09 0a 0b 0c 0d 0e 0f 18 19 1a 1b 1c 1d 1e 1f
+  const __m128i d1 = _mm_unpacklo_epi32(w14, w15);
+  // 28 29 2a 2b 2c 2d 2e 2f 38 39 3a 3b 3c 3d 3e 3f
+  const __m128i d3 = _mm_unpackhi_epi32(w14, w15);
+
+  // p7 p6 p5 p4 p3 p2 p1 p0  q0 q1 q2 q3 q4 q5 q6 q7
+  //
+  // 00 01 02 03 04 05 06 07  08 09 0a 0b 0c 0d 0e 0f
+  *x0 = _mm_unpacklo_epi64(d0, d1);
+  // 10 11 12 13 14 15 16 17  18 19 1a 1b 1c 1d 1e 1f
+  *x1 = _mm_unpackhi_epi64(d0, d1);
+  // 20 21 22 23 24 25 26 27  28 29 2a 2b 2c 2d 2e 2f
+  *x2 = _mm_unpacklo_epi64(d2, d3);
+  // 30 31 32 33 34 35 36 37  38 39 3a 3b 3c 3d 3e 3f
+  *x3 = _mm_unpackhi_epi64(d2, d3);
+}
+
+void Vertical14(void* dest, ptrdiff_t stride, int outer_thresh,
+                int inner_thresh, int hev_thresh) {
+  auto* const dst = static_cast<uint8_t*>(dest);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i v_flat_thresh = _mm_set1_epi8(1);
+  const __m128i v_outer_thresh =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero);
+  const __m128i v_inner_thresh =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero);
+  const __m128i v_hev_thresh0 =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero);
+  const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero);
+
+  __m128i x0 = LoadUnaligned16(dst - 8 + 0 * stride);
+  __m128i x1 = LoadUnaligned16(dst - 8 + 1 * stride);
+  __m128i x2 = LoadUnaligned16(dst - 8 + 2 * stride);
+  __m128i x3 = LoadUnaligned16(dst - 8 + 3 * stride);
+
+  __m128i qp7, qp6, qp5, qp4, qp3, qp2, qp1, qp0;
+
+  DualTranspose8x4To4x8(x0, x1, x2, x3, &qp0, &qp1, &qp2, &qp3, &qp4, &qp5,
+                        &qp6, &qp7);
+
+  const __m128i qp1qp0 = _mm_unpacklo_epi64(qp0, qp1);
+  const __m128i q1q0 = _mm_shuffle_epi32(qp1qp0, 0x0d);
+  const __m128i p1p0 = _mm_shuffle_epi32(qp1qp0, 0x08);
+
+  const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+  const __m128i v_needs_mask = NeedsFilter8(q1q0, p1p0, qp3, qp2, qp1, qp0,
+                                            v_outer_thresh, v_inner_thresh);
+
+  __m128i oqp1;
+  __m128i oqp0;
+
+  Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask);
+
+  const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+  const __m128i v_mask =
+      _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0);
+
+  if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
+    const __m128i v_isflatouter4_mask =
+        IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh);
+    const __m128i v_flat4_mask =
+        _mm_shuffle_epi32(_mm_and_si128(v_mask, v_isflatouter4_mask), 0);
+
+    __m128i oqp2_f8;
+    __m128i oqp1_f8;
+    __m128i oqp0_f8;
+
+    Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+    oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+    oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+    oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+
+    if (_mm_test_all_zeros(v_flat4_mask, v_flat4_mask) == 0) {
+      __m128i oqp5_f14;
+      __m128i oqp4_f14;
+      __m128i oqp3_f14;
+      __m128i oqp2_f14;
+      __m128i oqp1_f14;
+      __m128i oqp0_f14;
+
+      Filter14(qp6, qp5, qp4, qp3, qp2, qp1, qp0, &oqp5_f14, &oqp4_f14,
+               &oqp3_f14, &oqp2_f14, &oqp1_f14, &oqp0_f14);
+
+      oqp5_f14 = _mm_blendv_epi8(qp5, oqp5_f14, v_flat4_mask);
+      oqp4_f14 = _mm_blendv_epi8(qp4, oqp4_f14, v_flat4_mask);
+      oqp3_f14 = _mm_blendv_epi8(qp3, oqp3_f14, v_flat4_mask);
+      oqp2_f8 = _mm_blendv_epi8(oqp2_f8, oqp2_f14, v_flat4_mask);
+      oqp1 = _mm_blendv_epi8(oqp1, oqp1_f14, v_flat4_mask);
+      oqp0 = _mm_blendv_epi8(oqp0, oqp0_f14, v_flat4_mask);
+      qp3 = oqp3_f14;
+      qp4 = oqp4_f14;
+      qp5 = oqp5_f14;
+    }
+    qp2 = oqp2_f8;
+  }
+
+  DualTranspose4x8To8x4(qp7, qp6, qp5, qp4, qp3, qp2, oqp1, oqp0, &x0, &x1, &x2,
+                        &x3);
+
+  StoreUnaligned16(dst - 8 + 0 * stride, x0);
+  StoreUnaligned16(dst - 8 + 1 * stride, x1);
+  StoreUnaligned16(dst - 8 + 2 * stride, x2);
+  StoreUnaligned16(dst - 8 + 3 * stride, x3);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  static_cast<void>(dsp);
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize4_LoopFilterTypeHorizontal)
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] = Horizontal4;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize6_LoopFilterTypeHorizontal)
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] = Horizontal6;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize8_LoopFilterTypeHorizontal)
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] = Horizontal8;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize14_LoopFilterTypeHorizontal)
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+      Horizontal14;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize4_LoopFilterTypeVertical)
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = Vertical4;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize6_LoopFilterTypeVertical)
+
dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = Vertical6;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize8_LoopFilterTypeVertical)
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = Vertical8;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize14_LoopFilterTypeVertical)
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] = Vertical14;
+#endif
+}
+}  // namespace
+}  // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+namespace high_bitdepth {
+namespace {
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+
+template <int bitdepth>
+struct LoopFilterFuncs_SSE4_1 {
+  LoopFilterFuncs_SSE4_1() = delete;
+
+  static constexpr int kThreshShift = bitdepth - 8;
+
+  static void Vertical4(void* dest, ptrdiff_t stride, int outer_thresh,
+                        int inner_thresh, int hev_thresh);
+  static void Horizontal4(void* dest, ptrdiff_t stride, int outer_thresh,
+                          int inner_thresh, int hev_thresh);
+  static void Vertical6(void* dest, ptrdiff_t stride, int outer_thresh,
+                        int inner_thresh, int hev_thresh);
+  static void Horizontal6(void* dest, ptrdiff_t stride, int outer_thresh,
+                          int inner_thresh, int hev_thresh);
+  static void Vertical8(void* dest, ptrdiff_t stride, int outer_thresh,
+                        int inner_thresh, int hev_thresh);
+  static void Horizontal8(void* dest, ptrdiff_t stride, int outer_thresh,
+                          int inner_thresh, int hev_thresh);
+  static void Vertical14(void* dest, ptrdiff_t stride, int outer_thresh,
+                         int inner_thresh, int hev_thresh);
+  static void Horizontal14(void* dest, ptrdiff_t stride, int outer_thresh,
+                           int inner_thresh, int hev_thresh);
+};
+
+inline __m128i Clamp(const __m128i& min, const __m128i& max,
+                     const __m128i& val) {
+  const __m128i a = _mm_min_epi16(val, max);
+  const __m128i b = _mm_max_epi16(a, min);
+  return b;
+}
+
+inline __m128i AddShift3(const __m128i& a, const __m128i& b,
+                         const __m128i& vmin, const __m128i& vmax) {
+  const __m128i c = _mm_adds_epi16(a, b);
+  const __m128i d = Clamp(vmin, vmax, c);
+  const __m128i e = _mm_srai_epi16(d, 3); /* >> 3 */
+  return e;
+}
+
+inline __m128i AddShift1(const __m128i& a, const __m128i& b) {
+  const __m128i c = _mm_adds_epi16(a, b);
+  const __m128i e = _mm_srai_epi16(c, 1); /* >> 1 */
+  return e;
+}
+
+inline __m128i AbsDiff(const __m128i& a, const __m128i& b) {
+  return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
+}
+
+inline __m128i Hev(const __m128i& qp1, const __m128i& qp0,
+                   const __m128i& hev_thresh) {
+  const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+  const __m128i max_pq =
+      _mm_max_epu16(abs_qp1mqp0, _mm_srli_si128(abs_qp1mqp0, 8));
+  const __m128i hev_mask = _mm_cmpgt_epi16(max_pq, hev_thresh);
+  return hev_mask;
+}
+
+inline __m128i CheckOuterThreshF4(const __m128i& q1q0, const __m128i& p1p0,
+                                  const __m128i& outer_thresh) {
+  // abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh;
+  const __m128i abs_pmq = AbsDiff(p1p0, q1q0);
+  const __m128i a = _mm_adds_epu16(abs_pmq, abs_pmq);
+  const __m128i b = _mm_srli_epi16(abs_pmq, 1);
+  const __m128i c = _mm_adds_epu16(a, _mm_srli_si128(b, 8));
+  return _mm_subs_epu16(c, outer_thresh);
+}
+
+inline __m128i NeedsFilter4(const __m128i& q1q0, const __m128i& p1p0,
+                            const __m128i& qp1, const __m128i& qp0,
+                            const __m128i& outer_thresh,
+                            const __m128i& inner_thresh) {
+  const __m128i outer_mask = CheckOuterThreshF4(q1q0, p1p0, outer_thresh);
+  const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+  const __m128i max_abs_qp1mqp =
+      _mm_max_epu16(abs_qp1mqp0, _mm_srli_si128(abs_qp1mqp0, 8));
+  const __m128i inner_mask = _mm_subs_epu16(max_abs_qp1mqp, inner_thresh);
+  // ~mask
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i a = _mm_or_si128(outer_mask, inner_mask);
+  const __m128i b = _mm_cmpeq_epi16(a, zero);
+  return b;
+}
+
+inline void Filter4(const __m128i& qp1, const __m128i& qp0, __m128i* oqp1,
+                    __m128i* oqp0, const __m128i& mask, const __m128i& hev,
+                    int bitdepth) {
+  const __m128i t4 = _mm_set1_epi16(4);
+  const __m128i t3 = _mm_set1_epi16(3);
+  const __m128i t80 =
+      _mm_set1_epi16(static_cast<int16_t>(1 << (bitdepth - 1)));
+  const __m128i t1 = _mm_set1_epi16(0x1);
+  const __m128i vmin = _mm_subs_epi16(_mm_setzero_si128(), t80);
+  const __m128i vmax = _mm_subs_epi16(t80, t1);
+  const __m128i ps1 = _mm_subs_epi16(qp1, t80);
+  const __m128i ps0 = _mm_subs_epi16(qp0, t80);
+  const __m128i qs0 = _mm_srli_si128(ps0, 8);
+  const __m128i qs1 = _mm_srli_si128(ps1, 8);
+
+  __m128i a = _mm_subs_epi16(ps1, qs1);
+  a = _mm_and_si128(Clamp(vmin, vmax, a), hev);
+
+  const __m128i x = _mm_subs_epi16(qs0, ps0);
+  a = _mm_adds_epi16(a, x);
+  a = _mm_adds_epi16(a, x);
+  a = _mm_adds_epi16(a, x);
+  a = _mm_and_si128(Clamp(vmin, vmax, a), mask);
+
+  const __m128i a1 = AddShift3(a, t4, vmin, vmax);
+  const __m128i a2 = AddShift3(a, t3, vmin, vmax);
+  const __m128i a3 = _mm_andnot_si128(hev, AddShift1(a1, t1));
+
+  const __m128i ops1 = _mm_adds_epi16(ps1, a3);
+  const __m128i ops0 = _mm_adds_epi16(ps0, a2);
+  const __m128i oqs0 = _mm_subs_epi16(qs0, a1);
+  const __m128i oqs1 = _mm_subs_epi16(qs1, a3);
+
+  __m128i oqps1 = _mm_unpacklo_epi64(ops1, oqs1);
+  __m128i oqps0 = _mm_unpacklo_epi64(ops0, oqs0);
+
+  oqps1 = Clamp(vmin, vmax, oqps1);
+  oqps0 = Clamp(vmin, vmax, oqps0);
+
+  *oqp1 = _mm_adds_epi16(oqps1, t80);
+  *oqp0 = _mm_adds_epi16(oqps0, t80);
+}
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal4(void* dest,
+                                                   ptrdiff_t stride8,
+                                                   int outer_thresh,
+                                                   int inner_thresh,
+                                                   int hev_thresh) {
+  auto* const dst = static_cast<uint16_t*>(dest);
+  const ptrdiff_t stride = stride8 / 2;
+  const __m128i v_outer_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+  const __m128i v_inner_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+  const __m128i v_hev_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+  const __m128i p1 = LoadLo8(dst - 2 * stride);
+  const __m128i p0 = LoadLo8(dst - 1 * stride);
+  const __m128i qp0 = LoadHi8(p0, dst + 0 * stride);
+  const __m128i qp1 = LoadHi8(p1, dst + 1 * stride);
+  const __m128i q1q0 = _mm_unpackhi_epi64(qp0, qp1);
+  const __m128i p1p0 = _mm_unpacklo_epi64(qp0, qp1);
+  const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+  const __m128i v_needs_mask =
+      NeedsFilter4(q1q0, p1p0, qp1, qp0, v_outer_thresh, v_inner_thresh);
+
+  __m128i oqp1;
+  __m128i oqp0;
+  Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+  StoreLo8(dst - 2 * stride, oqp1);
+  StoreLo8(dst - 1 * stride, oqp0);
+  StoreHi8(dst + 0 * stride, oqp0);
+  StoreHi8(dst + 1 * stride, oqp1);
+}
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical4(void* dest, ptrdiff_t stride8,
+                                                 int outer_thresh,
+                                                 int inner_thresh,
+                                                 int hev_thresh) {
+  auto* const dst = static_cast<uint16_t*>(dest);
+  const ptrdiff_t stride = stride8 / 2;
+  const __m128i v_outer_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+  const __m128i v_inner_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+  const __m128i v_hev_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh <<
kThreshShift), 0); + const __m128i x0 = LoadLo8(dst - 2 + 0 * stride); + const __m128i x1 = LoadLo8(dst - 2 + 1 * stride); + const __m128i x2 = LoadLo8(dst - 2 + 2 * stride); + const __m128i x3 = LoadLo8(dst - 2 + 3 * stride); + // 00 10 01 11 02 12 03 13 + const __m128i w0 = _mm_unpacklo_epi16(x0, x1); + // 20 30 21 31 22 32 23 33 + const __m128i w1 = _mm_unpacklo_epi16(x2, x3); + // 00 10 20 30 01 11 21 31 p0p1 + const __m128i a = _mm_unpacklo_epi32(w0, w1); + const __m128i p1p0 = _mm_shuffle_epi32(a, 0x4e); + // 02 12 22 32 03 13 23 33 q1q0 + const __m128i q1q0 = _mm_unpackhi_epi32(w0, w1); + const __m128i qp1 = _mm_unpackhi_epi64(p1p0, q1q0); + const __m128i qp0 = _mm_unpacklo_epi64(p1p0, q1q0); + const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh); + const __m128i v_needs_mask = + NeedsFilter4(q1q0, p1p0, qp1, qp0, v_outer_thresh, v_inner_thresh); + + __m128i oqp1; + __m128i oqp0; + Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth); + + // 00 10 01 11 02 12 03 13 + const __m128i w2 = _mm_unpacklo_epi16(oqp1, oqp0); + // 20 30 21 31 22 32 23 33 + const __m128i w3 = _mm_unpackhi_epi16(oqp0, oqp1); + // 00 10 20 30 01 11 21 31 + const __m128i op0p1 = _mm_unpacklo_epi32(w2, w3); + // 02 12 22 32 03 13 23 33 + const __m128i oq1q0 = _mm_unpackhi_epi32(w2, w3); + + StoreLo8(dst - 2 + 0 * stride, op0p1); + StoreHi8(dst - 2 + 1 * stride, op0p1); + StoreLo8(dst - 2 + 2 * stride, oq1q0); + StoreHi8(dst - 2 + 3 * stride, oq1q0); +} + +//------------------------------------------------------------------------------ +// 5-tap (chroma) filters + +inline __m128i CheckOuterThreshF6(const __m128i& qp1, const __m128i& qp0, + const __m128i& outer_thresh) { + // abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh; + const __m128i q1q0 = _mm_unpackhi_epi64(qp0, qp1); + const __m128i p1p0 = _mm_unpacklo_epi64(qp0, qp1); + return CheckOuterThreshF4(q1q0, p1p0, outer_thresh); +} + +inline __m128i NeedsFilter6(const __m128i& qp2, const __m128i& qp1, + const __m128i& qp0, const __m128i& outer_thresh, + const __m128i& inner_thresh) { + const __m128i outer_mask = CheckOuterThreshF6(qp1, qp0, outer_thresh); + const __m128i abs_qp2mqp1 = AbsDiff(qp2, qp1); + const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0); + const __m128i max_pq = _mm_max_epu16(abs_qp2mqp1, abs_qp1mqp0); + const __m128i inner_mask = _mm_subs_epu16( + _mm_max_epu16(max_pq, _mm_srli_si128(max_pq, 8)), inner_thresh); + // ~mask + const __m128i zero = _mm_setzero_si128(); + const __m128i a = _mm_or_si128(outer_mask, inner_mask); + const __m128i b = _mm_cmpeq_epi16(a, zero); + return b; +} + +inline __m128i IsFlat3(const __m128i& qp2, const __m128i& qp1, + const __m128i& qp0, const __m128i& flat_thresh) { + const __m128i abs_pq2mpq0 = AbsDiff(qp2, qp0); + const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0); + const __m128i max_pq = _mm_max_epu16(abs_pq2mpq0, abs_qp1mqp0); + const __m128i flat_mask = _mm_subs_epu16( + _mm_max_epu16(max_pq, _mm_srli_si128(max_pq, 8)), flat_thresh); + // ~mask + const __m128i zero = _mm_setzero_si128(); + const __m128i a = _mm_cmpeq_epi16(flat_mask, zero); + return a; +} + +inline void Filter6(const __m128i& qp2, const __m128i& qp1, const __m128i& qp0, + __m128i* oqp1, __m128i* oqp0) { + const __m128i four = _mm_set1_epi16(4); + const __m128i qp2_lo = qp2; + const __m128i qp1_lo = qp1; + const __m128i qp0_lo = qp0; + const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e); + const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e); + + __m128i f6_lo; + f6_lo = + _mm_add_epi16(_mm_add_epi16(qp2_lo, four), 
_mm_add_epi16(qp2_lo, qp2_lo)); + + f6_lo = _mm_add_epi16(_mm_add_epi16(f6_lo, qp1_lo), qp1_lo); + + f6_lo = _mm_add_epi16(_mm_add_epi16(f6_lo, qp0_lo), + _mm_add_epi16(qp0_lo, pq0_lo)); + + // p2 * 3 + p1 * 2 + p0 * 2 + q0 + // q2 * 3 + q1 * 2 + q0 * 2 + p0 + *oqp1 = _mm_srli_epi16(f6_lo, 3); + + // p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1 + // q2 + q1 * 2 + q0 * 2 + p0 * 2 + p1 + f6_lo = FilterAdd2Sub2(f6_lo, pq0_lo, pq1_lo, qp2_lo, qp2_lo); + *oqp0 = _mm_srli_epi16(f6_lo, 3); +} + +template +void LoopFilterFuncs_SSE4_1::Horizontal6(void* dest, + ptrdiff_t stride8, + int outer_thresh, + int inner_thresh, + int hev_thresh) { + auto* const dst = static_cast(dest); + const ptrdiff_t stride = stride8 / 2; + const __m128i v_flat_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0); + const __m128i v_outer_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0); + const __m128i v_inner_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0); + const __m128i v_hev_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0); + + const __m128i p2 = LoadLo8(dst - 3 * stride); + const __m128i p1 = LoadLo8(dst - 2 * stride); + const __m128i p0 = LoadLo8(dst - 1 * stride); + const __m128i q0 = LoadLo8(dst + 0 * stride); + const __m128i q1 = LoadLo8(dst + 1 * stride); + const __m128i q2 = LoadLo8(dst + 2 * stride); + + const __m128i qp2 = _mm_unpacklo_epi64(p2, q2); + const __m128i qp1 = _mm_unpacklo_epi64(p1, q1); + const __m128i qp0 = _mm_unpacklo_epi64(p0, q0); + + const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh); + const __m128i v_needs_mask = + NeedsFilter6(qp2, qp1, qp0, v_outer_thresh, v_inner_thresh); + __m128i oqp1; + __m128i oqp0; + + Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth); + + const __m128i v_isflat3_mask = IsFlat3(qp2, qp1, qp0, v_flat_thresh); + const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat3_mask); + const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo); + + if (_mm_test_all_zeros(v_mask, v_mask) == 0) { + __m128i oqp1_f6; + __m128i oqp0_f6; + + Filter6(qp2, qp1, qp0, &oqp1_f6, &oqp0_f6); + + oqp1 = _mm_blendv_epi8(oqp1, oqp1_f6, v_mask); + oqp0 = _mm_blendv_epi8(oqp0, oqp0_f6, v_mask); + } + + StoreLo8(dst - 2 * stride, oqp1); + StoreLo8(dst - 1 * stride, oqp0); + StoreHi8(dst + 0 * stride, oqp0); + StoreHi8(dst + 1 * stride, oqp1); +} + +inline void Transpose8x4To4x8(const __m128i& x0, const __m128i& x1, + const __m128i& x2, const __m128i& x3, __m128i* d0, + __m128i* d1, __m128i* d2, __m128i* d3, + __m128i* d4, __m128i* d5, __m128i* d6, + __m128i* d7) { + // input + // x0 00 01 02 03 04 05 06 07 + // x1 10 11 12 13 14 15 16 17 + // x2 20 21 22 23 24 25 26 27 + // x3 30 31 32 33 34 35 36 37 + // output + // 00 10 20 30 xx xx xx xx + // 01 11 21 31 xx xx xx xx + // 02 12 22 32 xx xx xx xx + // 03 13 23 33 xx xx xx xx + // 04 14 24 34 xx xx xx xx + // 05 15 25 35 xx xx xx xx + // 06 16 26 36 xx xx xx xx + // 07 17 27 37 xx xx xx xx + + // 00 10 01 11 02 12 03 13 + const __m128i w0 = _mm_unpacklo_epi16(x0, x1); + // 20 30 21 31 22 32 23 33 + const __m128i w1 = _mm_unpacklo_epi16(x2, x3); + // 04 14 05 15 06 16 07 17 + const __m128i w2 = _mm_unpackhi_epi16(x0, x1); + // 24 34 25 35 26 36 27 37 + const __m128i w3 = _mm_unpackhi_epi16(x2, x3); + + // 00 10 20 30 01 11 21 31 + const __m128i ww0 = _mm_unpacklo_epi32(w0, w1); + // 04 14 24 34 05 15 25 35 + const __m128i ww1 = _mm_unpacklo_epi32(w2, w3); + // 02 12 22 32 03 13 23 33 + const __m128i 
ww2 = _mm_unpackhi_epi32(w0, w1); + // 06 16 26 36 07 17 27 37 + const __m128i ww3 = _mm_unpackhi_epi32(w2, w3); + + // 00 10 20 30 xx xx xx xx + *d0 = ww0; + // 01 11 21 31 xx xx xx xx + *d1 = _mm_srli_si128(ww0, 8); + // 02 12 22 32 xx xx xx xx + *d2 = ww2; + // 03 13 23 33 xx xx xx xx + *d3 = _mm_srli_si128(ww2, 8); + // 04 14 24 34 xx xx xx xx + *d4 = ww1; + // 05 15 25 35 xx xx xx xx + *d5 = _mm_srli_si128(ww1, 8); + // 06 16 26 36 xx xx xx xx + *d6 = ww3; + // 07 17 27 37 xx xx xx xx + *d7 = _mm_srli_si128(ww3, 8); +} + +template +void LoopFilterFuncs_SSE4_1::Vertical6(void* dest, ptrdiff_t stride8, + int outer_thresh, + int inner_thresh, + int hev_thresh) { + auto* const dst = static_cast(dest); + const ptrdiff_t stride = stride8 / 2; + const __m128i v_flat_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0); + const __m128i v_outer_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0); + const __m128i v_inner_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0); + const __m128i v_hev_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0); + + __m128i x0 = LoadUnaligned16(dst - 3 + 0 * stride); + __m128i x1 = LoadUnaligned16(dst - 3 + 1 * stride); + __m128i x2 = LoadUnaligned16(dst - 3 + 2 * stride); + __m128i x3 = LoadUnaligned16(dst - 3 + 3 * stride); + + __m128i p2, p1, p0, q0, q1, q2; + __m128i z0, z1; // not used + + Transpose8x4To4x8(x0, x1, x2, x3, &p2, &p1, &p0, &q0, &q1, &q2, &z0, &z1); + + const __m128i qp2 = _mm_unpacklo_epi64(p2, q2); + const __m128i qp1 = _mm_unpacklo_epi64(p1, q1); + const __m128i qp0 = _mm_unpacklo_epi64(p0, q0); + + const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh); + const __m128i v_needs_mask = + NeedsFilter6(qp2, qp1, qp0, v_outer_thresh, v_inner_thresh); + __m128i oqp1; + __m128i oqp0; + + Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth); + + const __m128i v_isflat3_mask = IsFlat3(qp2, qp1, qp0, v_flat_thresh); + const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat3_mask); + const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo); + + if (_mm_test_all_zeros(v_mask, v_mask) == 0) { + __m128i oqp1_f6; + __m128i oqp0_f6; + + Filter6(qp2, qp1, qp0, &oqp1_f6, &oqp0_f6); + + oqp1 = _mm_blendv_epi8(oqp1, oqp1_f6, v_mask); + oqp0 = _mm_blendv_epi8(oqp0, oqp0_f6, v_mask); + } + + // 00 10 01 11 02 12 03 13 + const __m128i w2 = _mm_unpacklo_epi16(oqp1, oqp0); + // 20 30 21 31 22 32 23 33 + const __m128i w3 = _mm_unpackhi_epi16(oqp0, oqp1); + // 00 10 20 30 01 11 21 31 + const __m128i op0p1 = _mm_unpacklo_epi32(w2, w3); + // 02 12 22 32 03 13 23 33 + const __m128i oq1q0 = _mm_unpackhi_epi32(w2, w3); + + StoreLo8(dst - 2 + 0 * stride, op0p1); + StoreHi8(dst - 2 + 1 * stride, op0p1); + StoreLo8(dst - 2 + 2 * stride, oq1q0); + StoreHi8(dst - 2 + 3 * stride, oq1q0); +} + +//------------------------------------------------------------------------------ +// 7-tap filters +inline __m128i NeedsFilter8(const __m128i& qp3, const __m128i& qp2, + const __m128i& qp1, const __m128i& qp0, + const __m128i& outer_thresh, + const __m128i& inner_thresh) { + const __m128i outer_mask = CheckOuterThreshF6(qp1, qp0, outer_thresh); + const __m128i abs_qp2mqp1 = AbsDiff(qp2, qp1); + const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0); + const __m128i max_pq_a = _mm_max_epu16(abs_qp2mqp1, abs_qp1mqp0); + const __m128i abs_pq3mpq2 = AbsDiff(qp3, qp2); + const __m128i max_pq = _mm_max_epu16(max_pq_a, abs_pq3mpq2); + const __m128i inner_mask = 
_mm_subs_epu16(
+      _mm_max_epu16(max_pq, _mm_srli_si128(max_pq, 8)), inner_thresh);
+  // ~mask
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i a = _mm_or_si128(outer_mask, inner_mask);
+  const __m128i b = _mm_cmpeq_epi16(a, zero);
+  return b;
+}
+
+inline __m128i IsFlat4(const __m128i& qp3, const __m128i& qp2,
+                       const __m128i& qp1, const __m128i& qp0,
+                       const __m128i& flat_thresh) {
+  const __m128i abs_pq2mpq0 = AbsDiff(qp2, qp0);
+  const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0);
+  const __m128i max_pq_a = _mm_max_epu16(abs_pq2mpq0, abs_qp1mqp0);
+  const __m128i abs_pq3mpq0 = AbsDiff(qp3, qp0);
+  const __m128i max_pq = _mm_max_epu16(max_pq_a, abs_pq3mpq0);
+  const __m128i flat_mask = _mm_subs_epu16(
+      _mm_max_epu16(max_pq, _mm_srli_si128(max_pq, 8)), flat_thresh);
+  // ~mask
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i a = _mm_cmpeq_epi16(flat_mask, zero);
+  return a;
+}
+
+inline void Filter8(const __m128i& qp3, const __m128i& qp2, const __m128i& qp1,
+                    const __m128i& qp0, __m128i* oqp2, __m128i* oqp1,
+                    __m128i* oqp0) {
+  const __m128i four = _mm_set1_epi16(4);
+  const __m128i qp3_lo = qp3;
+  const __m128i qp2_lo = qp2;
+  const __m128i qp1_lo = qp1;
+  const __m128i qp0_lo = qp0;
+  const __m128i pq2_lo = _mm_shuffle_epi32(qp2_lo, 0x4e);
+  const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e);
+  const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e);
+
+  __m128i f8_lo =
+      _mm_add_epi16(_mm_add_epi16(qp3_lo, four), _mm_add_epi16(qp3_lo, qp3_lo));
+
+  f8_lo = _mm_add_epi16(_mm_add_epi16(f8_lo, qp2_lo), qp2_lo);
+
+  f8_lo = _mm_add_epi16(_mm_add_epi16(f8_lo, qp1_lo),
+                        _mm_add_epi16(qp0_lo, pq0_lo));
+
+  // p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0
+  // q3 + q3 + q3 + 2 * q2 + q1 + q0 + p0
+  *oqp2 = _mm_srli_epi16(f8_lo, 3);
+
+  // p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1
+  // q3 + q3 + q2 + 2 * q1 + q0 + p0 + p1
+  f8_lo = FilterAdd2Sub2(f8_lo, qp1_lo, pq1_lo, qp3_lo, qp2_lo);
+  *oqp1 = _mm_srli_epi16(f8_lo, 3);
+
+  // p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2
+  // q3 + q2 + q1 + 2 * q0 + p0 + p1 + p2
+  f8_lo = FilterAdd2Sub2(f8_lo, qp0_lo, pq2_lo, qp3_lo, qp1_lo);
+  *oqp0 = _mm_srli_epi16(f8_lo, 3);
+}
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal8(void* dest,
+                                                   ptrdiff_t stride8,
+                                                   int outer_thresh,
+                                                   int inner_thresh,
+                                                   int hev_thresh) {
+  auto* const dst = static_cast<uint16_t*>(dest);
+  const ptrdiff_t stride = stride8 / 2;
+  const __m128i v_flat_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0);
+  const __m128i v_outer_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+  const __m128i v_inner_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+  const __m128i v_hev_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+
+  const __m128i p3 = LoadLo8(dst - 4 * stride);
+  const __m128i p2 = LoadLo8(dst - 3 * stride);
+  const __m128i p1 = LoadLo8(dst - 2 * stride);
+  const __m128i p0 = LoadLo8(dst - 1 * stride);
+  const __m128i q0 = LoadLo8(dst + 0 * stride);
+  const __m128i q1 = LoadLo8(dst + 1 * stride);
+  const __m128i q2 = LoadLo8(dst + 2 * stride);
+  const __m128i q3 = LoadLo8(dst + 3 * stride);
+  const __m128i qp3 = _mm_unpacklo_epi64(p3, q3);
+  const __m128i qp2 = _mm_unpacklo_epi64(p2, q2);
+  const __m128i qp1 = _mm_unpacklo_epi64(p1, q1);
+  const __m128i qp0 = _mm_unpacklo_epi64(p0, q0);
+
+  const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+  const __m128i v_needs_mask =
+      NeedsFilter8(qp3, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+  __m128i oqp1;
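// The thresholds built above are specified for 8-bit pixels; kThreshShift
// (== bitdepth - 8, so 2 for the 10bpp instantiation) rescales them to the
// wider pixel range. Illustratively, outer_thresh == 24 at 8 bpc becomes
// 24 << 2 == 96 in these 10-bit filters.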
+  __m128i oqp0;
+
+  Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+  const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+  const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask);
+  const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
+
+  if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
+    __m128i oqp2_f8;
+    __m128i oqp1_f8;
+    __m128i oqp0_f8;
+
+    Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+    oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+    oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+    oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+    StoreLo8(dst - 3 * stride, oqp2_f8);
+    StoreHi8(dst + 2 * stride, oqp2_f8);
+  }
+
+  StoreLo8(dst - 2 * stride, oqp1);
+  StoreLo8(dst - 1 * stride, oqp0);
+  StoreHi8(dst + 0 * stride, oqp0);
+  StoreHi8(dst + 1 * stride, oqp1);
+}
+
+inline void TransposeLower4x8To8x4(const __m128i& x0, const __m128i& x1,
+                                   const __m128i& x2, const __m128i& x3,
+                                   const __m128i& x4, const __m128i& x5,
+                                   const __m128i& x6, const __m128i& x7,
+                                   __m128i* d0, __m128i* d1, __m128i* d2,
+                                   __m128i* d3) {
+  // input
+  // x0 00 01 02 03 04 05 06 07
+  // x1 10 11 12 13 14 15 16 17
+  // x2 20 21 22 23 24 25 26 27
+  // x3 30 31 32 33 34 35 36 37
+  // x4 40 41 42 43 44 45 46 47
+  // x5 50 51 52 53 54 55 56 57
+  // x6 60 61 62 63 64 65 66 67
+  // x7 70 71 72 73 74 75 76 77
+  // output
+  // d0 00 10 20 30 40 50 60 70
+  // d1 01 11 21 31 41 51 61 71
+  // d2 02 12 22 32 42 52 62 72
+  // d3 03 13 23 33 43 53 63 73
+
+  // 00 10 01 11 02 12 03 13
+  const __m128i w0 = _mm_unpacklo_epi16(x0, x1);
+  // 20 30 21 31 22 32 23 33
+  const __m128i w1 = _mm_unpacklo_epi16(x2, x3);
+  // 40 50 41 51 42 52 43 53
+  const __m128i w2 = _mm_unpacklo_epi16(x4, x5);
+  // 60 70 61 71 62 72 63 73
+  const __m128i w3 = _mm_unpacklo_epi16(x6, x7);
+
+  // 00 10 20 30 01 11 21 31
+  const __m128i w4 = _mm_unpacklo_epi32(w0, w1);
+  // 40 50 60 70 41 51 61 71
+  const __m128i w5 = _mm_unpacklo_epi32(w2, w3);
+  // 02 12 22 32 03 13 23 33
+  const __m128i w6 = _mm_unpackhi_epi32(w0, w1);
+  // 42 52 62 72 43 53 63 73
+  const __m128i w7 = _mm_unpackhi_epi32(w2, w3);
+
+  // 00 10 20 30 40 50 60 70
+  *d0 = _mm_unpacklo_epi64(w4, w5);
+  // 01 11 21 31 41 51 61 71
+  *d1 = _mm_unpackhi_epi64(w4, w5);
+  // 02 12 22 32 42 52 62 72
+  *d2 = _mm_unpacklo_epi64(w6, w7);
+  // 03 13 23 33 43 53 63 73
+  *d3 = _mm_unpackhi_epi64(w6, w7);
+}
+
+template <int bitdepth>
+void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical8(void* dest, ptrdiff_t stride8,
+                                                 int outer_thresh,
+                                                 int inner_thresh,
+                                                 int hev_thresh) {
+  auto* const dst = static_cast<uint16_t*>(dest);
+  const ptrdiff_t stride = stride8 / 2;
+  const __m128i v_flat_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0);
+  const __m128i v_outer_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0);
+  const __m128i v_inner_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0);
+  const __m128i v_hev_thresh =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0);
+
+  __m128i x0 = LoadUnaligned16(dst - 4 + 0 * stride);
+  __m128i x1 = LoadUnaligned16(dst - 4 + 1 * stride);
+  __m128i x2 = LoadUnaligned16(dst - 4 + 2 * stride);
+  __m128i x3 = LoadUnaligned16(dst - 4 + 3 * stride);
+
+  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+  Transpose8x4To4x8(x0, x1, x2, x3, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+
+  const __m128i qp3 = _mm_unpacklo_epi64(p3, q3);
+  const __m128i qp2 = _mm_unpacklo_epi64(p2, q2);
+  const __m128i qp1 = _mm_unpacklo_epi64(p1, q1);
+  const
__m128i qp0 = _mm_unpacklo_epi64(p0, q0); + + const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh); + const __m128i v_needs_mask = + NeedsFilter8(qp3, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh); + __m128i oqp1; + __m128i oqp0; + + Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth); + + const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh); + const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask); + const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo); + + if (_mm_test_all_zeros(v_mask, v_mask) == 0) { + __m128i oqp2_f8; + __m128i oqp1_f8; + __m128i oqp0_f8; + + Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8); + + oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask); + oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask); + oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask); + + p2 = oqp2_f8; + q2 = _mm_srli_si128(oqp2_f8, 8); + } + + p1 = oqp1; + p0 = oqp0; + q0 = _mm_srli_si128(oqp0, 8); + q1 = _mm_srli_si128(oqp1, 8); + + TransposeLower4x8To8x4(p3, p2, p1, p0, q0, q1, q2, q3, &x0, &x1, &x2, &x3); + + StoreUnaligned16(dst - 4 + 0 * stride, x0); + StoreUnaligned16(dst - 4 + 1 * stride, x1); + StoreUnaligned16(dst - 4 + 2 * stride, x2); + StoreUnaligned16(dst - 4 + 3 * stride, x3); +} + +//------------------------------------------------------------------------------ +// 13-tap filters + +inline void Filter14(const __m128i& qp6, const __m128i& qp5, const __m128i& qp4, + const __m128i& qp3, const __m128i& qp2, const __m128i& qp1, + const __m128i& qp0, __m128i* oqp5, __m128i* oqp4, + __m128i* oqp3, __m128i* oqp2, __m128i* oqp1, + __m128i* oqp0) { + const __m128i eight = _mm_set1_epi16(8); + const __m128i qp6_lo = qp6; + const __m128i qp5_lo = qp5; + const __m128i qp4_lo = qp4; + const __m128i qp3_lo = qp3; + const __m128i qp2_lo = qp2; + const __m128i qp1_lo = qp1; + const __m128i qp0_lo = qp0; + const __m128i pq5_lo = _mm_shuffle_epi32(qp5_lo, 0x4e); + const __m128i pq4_lo = _mm_shuffle_epi32(qp4_lo, 0x4e); + const __m128i pq3_lo = _mm_shuffle_epi32(qp3_lo, 0x4e); + const __m128i pq2_lo = _mm_shuffle_epi32(qp2_lo, 0x4e); + const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e); + const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e); + + __m128i f14_lo = + _mm_add_epi16(eight, _mm_sub_epi16(_mm_slli_epi16(qp6_lo, 3), qp6_lo)); + + f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp5_lo), + _mm_add_epi16(qp5_lo, qp4_lo)); + + f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp4_lo), + _mm_add_epi16(qp3_lo, qp2_lo)); + + f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp1_lo), + _mm_add_epi16(qp0_lo, pq0_lo)); + + // p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + // q6 * 7 + q5 * 2 + q4 * 2 + q3 + q2 + q1 + q0 + p0 + *oqp5 = _mm_srli_epi16(f14_lo, 4); + + // p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + // q6 * 5 + q5 * 2 + q4 * 2 + q3 * 2 + q2 + q1 + q0 + p0 + p1 + f14_lo = FilterAdd2Sub2(f14_lo, qp3_lo, pq1_lo, qp6_lo, qp6_lo); + *oqp4 = _mm_srli_epi16(f14_lo, 4); + + // p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + // q6 * 4 + q5 + q4 * 2 + q3 * 2 + q2 * 2 + q1 + q0 + p0 + p1 + p2 + f14_lo = FilterAdd2Sub2(f14_lo, qp2_lo, pq2_lo, qp6_lo, qp5_lo); + *oqp3 = _mm_srli_epi16(f14_lo, 4); + + // p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3 + // q6 * 3 + q5 + q4 + q3 * 2 + q2 * 2 + q1 * 2 + q0 + p0 + p1 + p2 + p3 + f14_lo = FilterAdd2Sub2(f14_lo, qp1_lo, pq3_lo, qp6_lo, qp4_lo); + *oqp2 = _mm_srli_epi16(f14_lo, 4); + + // p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 + q0 + q1 + 
q2 + q3 + q4 + // q6 * 2 + q5 + q4 + q3 + q2 * 2 + q1 * 2 + q0 * 2 + p0 + p1 + p2 + p3 + p4 + f14_lo = FilterAdd2Sub2(f14_lo, qp0_lo, pq4_lo, qp6_lo, qp3_lo); + *oqp1 = _mm_srli_epi16(f14_lo, 4); + + // p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1 + q2 + q3 + q4 + q5 + // q6 + q5 + q4 + q3 + q2 + q1 * 2 + q0 * 2 + p0 * 2 + p1 + p2 + p3 + p4 + p5 + f14_lo = FilterAdd2Sub2(f14_lo, pq0_lo, pq5_lo, qp6_lo, qp2_lo); + *oqp0 = _mm_srli_epi16(f14_lo, 4); +} + +template +void LoopFilterFuncs_SSE4_1::Horizontal14(void* dest, + ptrdiff_t stride8, + int outer_thresh, + int inner_thresh, + int hev_thresh) { + auto* const dst = static_cast(dest); + const ptrdiff_t stride = stride8 / 2; + const __m128i v_flat_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0); + const __m128i v_outer_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0); + const __m128i v_inner_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0); + const __m128i v_hev_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0); + + const __m128i p3 = LoadLo8(dst - 4 * stride); + const __m128i p2 = LoadLo8(dst - 3 * stride); + const __m128i p1 = LoadLo8(dst - 2 * stride); + const __m128i p0 = LoadLo8(dst - 1 * stride); + const __m128i q0 = LoadLo8(dst + 0 * stride); + const __m128i q1 = LoadLo8(dst + 1 * stride); + const __m128i q2 = LoadLo8(dst + 2 * stride); + const __m128i q3 = LoadLo8(dst + 3 * stride); + const __m128i qp3 = _mm_unpacklo_epi64(p3, q3); + const __m128i qp2 = _mm_unpacklo_epi64(p2, q2); + const __m128i qp1 = _mm_unpacklo_epi64(p1, q1); + const __m128i qp0 = _mm_unpacklo_epi64(p0, q0); + + const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh); + const __m128i v_needs_mask = + NeedsFilter8(qp3, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh); + + __m128i oqp1; + __m128i oqp0; + + Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth); + + const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh); + const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask); + const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo); + + if (_mm_test_all_zeros(v_mask, v_mask) == 0) { + const __m128i p6 = LoadLo8(dst - 7 * stride); + const __m128i p5 = LoadLo8(dst - 6 * stride); + const __m128i p4 = LoadLo8(dst - 5 * stride); + const __m128i q4 = LoadLo8(dst + 4 * stride); + const __m128i q5 = LoadLo8(dst + 5 * stride); + const __m128i q6 = LoadLo8(dst + 6 * stride); + const __m128i qp6 = _mm_unpacklo_epi64(p6, q6); + const __m128i qp5 = _mm_unpacklo_epi64(p5, q5); + const __m128i qp4 = _mm_unpacklo_epi64(p4, q4); + + const __m128i v_isflatouter4_mask = + IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh); + const __m128i v_flat4_mask_lo = _mm_and_si128(v_mask, v_isflatouter4_mask); + const __m128i v_flat4_mask = + _mm_unpacklo_epi64(v_flat4_mask_lo, v_flat4_mask_lo); + + __m128i oqp2_f8; + __m128i oqp1_f8; + __m128i oqp0_f8; + + Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8); + + oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask); + oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask); + oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask); + + if (_mm_test_all_zeros(v_flat4_mask, v_flat4_mask) == 0) { + __m128i oqp5_f14; + __m128i oqp4_f14; + __m128i oqp3_f14; + __m128i oqp2_f14; + __m128i oqp1_f14; + __m128i oqp0_f14; + + Filter14(qp6, qp5, qp4, qp3, qp2, qp1, qp0, &oqp5_f14, &oqp4_f14, + &oqp3_f14, &oqp2_f14, &oqp1_f14, &oqp0_f14); + + oqp5_f14 = _mm_blendv_epi8(qp5, oqp5_f14, v_flat4_mask); + 
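// Filter14 above maintains one running sum per lane instead of re-summing
// all taps for every output: judging from the tap-weight comments it carries,
// FilterAdd2Sub2(sum, a0, a1, s0, s1) returns sum + a0 + a1 - s0 - s1, so
// each output row slides the window by adding two taps and dropping two,
// e.g. stepping from the *oqp4 sum to the *oqp3 sum adds qp2/pq2 and sheds
// one qp6 and one qp5 weight.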
oqp4_f14 = _mm_blendv_epi8(qp4, oqp4_f14, v_flat4_mask); + oqp3_f14 = _mm_blendv_epi8(qp3, oqp3_f14, v_flat4_mask); + oqp2_f8 = _mm_blendv_epi8(oqp2_f8, oqp2_f14, v_flat4_mask); + oqp1 = _mm_blendv_epi8(oqp1, oqp1_f14, v_flat4_mask); + oqp0 = _mm_blendv_epi8(oqp0, oqp0_f14, v_flat4_mask); + + StoreLo8(dst - 6 * stride, oqp5_f14); + StoreLo8(dst - 5 * stride, oqp4_f14); + StoreLo8(dst - 4 * stride, oqp3_f14); + + StoreHi8(dst + 3 * stride, oqp3_f14); + StoreHi8(dst + 4 * stride, oqp4_f14); + StoreHi8(dst + 5 * stride, oqp5_f14); + } + + StoreLo8(dst - 3 * stride, oqp2_f8); + StoreHi8(dst + 2 * stride, oqp2_f8); + } + + StoreLo8(dst - 2 * stride, oqp1); + StoreLo8(dst - 1 * stride, oqp0); + StoreHi8(dst + 0 * stride, oqp0); + StoreHi8(dst + 1 * stride, oqp1); +} + +inline void TransposeUpper4x8To8x4(const __m128i& x0, const __m128i& x1, + const __m128i& x2, const __m128i& x3, + const __m128i& x4, const __m128i& x5, + const __m128i& x6, const __m128i& x7, + __m128i* d0, __m128i* d1, __m128i* d2, + __m128i* d3) { + // input + // x0 00 01 02 03 xx xx xx xx + // x1 10 11 12 13 xx xx xx xx + // x2 20 21 22 23 xx xx xx xx + // x3 30 31 32 33 xx xx xx xx + // x4 40 41 42 43 xx xx xx xx + // x5 50 51 52 53 xx xx xx xx + // x6 60 61 62 63 xx xx xx xx + // x7 70 71 72 73 xx xx xx xx + // output + // d0 00 10 20 30 40 50 60 70 + // d1 01 11 21 31 41 51 61 71 + // d2 02 12 22 32 42 52 62 72 + // d3 03 13 23 33 43 53 63 73 + + // 00 10 01 11 02 12 03 13 + const __m128i w0 = _mm_unpackhi_epi16(x0, x1); + // 20 30 21 31 22 32 23 33 + const __m128i w1 = _mm_unpackhi_epi16(x2, x3); + // 40 50 41 51 42 52 43 53 + const __m128i w2 = _mm_unpackhi_epi16(x4, x5); + // 60 70 61 71 62 72 63 73 + const __m128i w3 = _mm_unpackhi_epi16(x6, x7); + + // 00 10 20 30 01 11 21 31 + const __m128i w4 = _mm_unpacklo_epi32(w0, w1); + // 40 50 60 70 41 51 61 71 + const __m128i w5 = _mm_unpacklo_epi32(w2, w3); + // 02 12 22 32 03 13 23 33 + const __m128i w6 = _mm_unpackhi_epi32(w0, w1); + // 42 52 62 72 43 53 63 73 + const __m128i w7 = _mm_unpackhi_epi32(w2, w3); + + // 00 10 20 30 40 50 60 70 + *d0 = _mm_unpacklo_epi64(w4, w5); + // 01 11 21 31 41 51 61 71 + *d1 = _mm_unpackhi_epi64(w4, w5); + // 02 12 22 32 42 52 62 72 + *d2 = _mm_unpacklo_epi64(w6, w7); + // 03 13 23 33 43 53 63 73 + *d3 = _mm_unpackhi_epi64(w6, w7); +} + +template +void LoopFilterFuncs_SSE4_1::Vertical14(void* dest, ptrdiff_t stride8, + int outer_thresh, + int inner_thresh, + int hev_thresh) { + auto* const dst = static_cast(dest); + const ptrdiff_t stride = stride8 / 2; + const __m128i v_flat_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0); + const __m128i v_outer_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0); + const __m128i v_inner_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0); + const __m128i v_hev_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0); + + // p7 p6 p5 p4 p3 p2 p1 p0 q0 q1 q2 q3 q4 q5 q6 q7 + // + // 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f + // 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f + // 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f + // 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f + + __m128i x0 = LoadUnaligned16(dst - 8 + 0 * stride); + __m128i x1 = LoadUnaligned16(dst - 8 + 1 * stride); + __m128i x2 = LoadUnaligned16(dst - 8 + 2 * stride); + __m128i x3 = LoadUnaligned16(dst - 8 + 3 * stride); + + __m128i p7, p6, p5, p4, p3, p2, p1, p0; + __m128i q7, q6, q5, q4, q3, q2, q1, q0; + + 
Transpose8x4To4x8(x0, x1, x2, x3, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0);
+
+  x0 = LoadUnaligned16(dst - 8 + 8 + 0 * stride);
+  x1 = LoadUnaligned16(dst - 8 + 8 + 1 * stride);
+  x2 = LoadUnaligned16(dst - 8 + 8 + 2 * stride);
+  x3 = LoadUnaligned16(dst - 8 + 8 + 3 * stride);
+
+  Transpose8x4To4x8(x0, x1, x2, x3, &q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
+
+  __m128i qp7 = _mm_unpacklo_epi64(p7, q7);
+  __m128i qp6 = _mm_unpacklo_epi64(p6, q6);
+  __m128i qp5 = _mm_unpacklo_epi64(p5, q5);
+  __m128i qp4 = _mm_unpacklo_epi64(p4, q4);
+  __m128i qp3 = _mm_unpacklo_epi64(p3, q3);
+  __m128i qp2 = _mm_unpacklo_epi64(p2, q2);
+  __m128i qp1 = _mm_unpacklo_epi64(p1, q1);
+  __m128i qp0 = _mm_unpacklo_epi64(p0, q0);
+
+  const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh);
+  const __m128i v_needs_mask =
+      NeedsFilter8(qp3, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh);
+
+  __m128i oqp1;
+  __m128i oqp0;
+
+  Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth);
+
+  const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh);
+  const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask);
+  const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
+
+  if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
+    const __m128i v_isflatouter4_mask =
+        IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh);
+    const __m128i v_flat4_mask_lo = _mm_and_si128(v_mask, v_isflatouter4_mask);
+    const __m128i v_flat4_mask =
+        _mm_unpacklo_epi64(v_flat4_mask_lo, v_flat4_mask_lo);
+
+    __m128i oqp2_f8;
+    __m128i oqp1_f8;
+    __m128i oqp0_f8;
+
+    Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8);
+
+    oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask);
+    oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
+    oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
+
+    if (_mm_test_all_zeros(v_flat4_mask, v_flat4_mask) == 0) {
+      __m128i oqp5_f14;
+      __m128i oqp4_f14;
+      __m128i oqp3_f14;
+      __m128i oqp2_f14;
+      __m128i oqp1_f14;
+      __m128i oqp0_f14;
+
+      Filter14(qp6, qp5, qp4, qp3, qp2, qp1, qp0, &oqp5_f14, &oqp4_f14,
+               &oqp3_f14, &oqp2_f14, &oqp1_f14, &oqp0_f14);
+
+      oqp5_f14 = _mm_blendv_epi8(qp5, oqp5_f14, v_flat4_mask);
+      oqp4_f14 = _mm_blendv_epi8(qp4, oqp4_f14, v_flat4_mask);
+      oqp3_f14 = _mm_blendv_epi8(qp3, oqp3_f14, v_flat4_mask);
+      oqp2_f8 = _mm_blendv_epi8(oqp2_f8, oqp2_f14, v_flat4_mask);
+      oqp1 = _mm_blendv_epi8(oqp1, oqp1_f14, v_flat4_mask);
+      oqp0 = _mm_blendv_epi8(oqp0, oqp0_f14, v_flat4_mask);
+      qp3 = oqp3_f14;
+      qp4 = oqp4_f14;
+      qp5 = oqp5_f14;
+    }
+    qp2 = oqp2_f8;
+  }
+
+  TransposeLower4x8To8x4(qp7, qp6, qp5, qp4, qp3, qp2, oqp1, oqp0, &x0, &x1,
+                         &x2, &x3);
+
+  StoreUnaligned16(dst - 8 + 0 * stride, x0);
+  StoreUnaligned16(dst - 8 + 1 * stride, x1);
+  StoreUnaligned16(dst - 8 + 2 * stride, x2);
+  StoreUnaligned16(dst - 8 + 3 * stride, x3);
+
+  TransposeUpper4x8To8x4(oqp0, oqp1, qp2, qp3, qp4, qp5, qp6, qp7, &x0, &x1,
+                         &x2, &x3);
+
+  StoreUnaligned16(dst - 8 + 8 + 0 * stride, x0);
+  StoreUnaligned16(dst - 8 + 8 + 1 * stride, x1);
+  StoreUnaligned16(dst - 8 + 8 + 2 * stride, x2);
+  StoreUnaligned16(dst - 8 + 8 + 3 * stride, x3);
+}
+
+using Defs10bpp = LoopFilterFuncs_SSE4_1<kBitdepth10>;
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  static_cast<void>(dsp);
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize4_LoopFilterTypeHorizontal)
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+      Defs10bpp::Horizontal4;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize6_LoopFilterTypeHorizontal)
dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] = + Defs10bpp::Horizontal6; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize8_LoopFilterTypeHorizontal) + dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] = + Defs10bpp::Horizontal8; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize14_LoopFilterTypeHorizontal) + dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] = + Defs10bpp::Horizontal14; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize4_LoopFilterTypeVertical) + dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = + Defs10bpp::Vertical4; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize6_LoopFilterTypeVertical) + dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = + Defs10bpp::Vertical6; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize8_LoopFilterTypeVertical) + dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = + Defs10bpp::Vertical8; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize14_LoopFilterTypeVertical) + dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] = + Defs10bpp::Vertical14; +#endif +} +#endif +} // namespace +} // namespace high_bitdepth + +void LoopFilterInit_SSE4_1() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_TARGETING_SSE4_1 +namespace libgav1 { +namespace dsp { + +void LoopFilterInit_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_SSE4_1 diff --git a/src/dsp/x86/loop_filter_sse4.h b/src/dsp/x86/loop_filter_sse4.h new file mode 100644 index 0000000..4795d8b --- /dev/null +++ b/src/dsp/x86/loop_filter_sse4.h @@ -0,0 +1,119 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_LOOP_FILTER_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_LOOP_FILTER_SSE4_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::loop_filters, see the defines below for specifics. This +// function is not thread-safe. +void LoopFilterInit_SSE4_1(); + +} // namespace dsp +} // namespace libgav1 + +// If sse4 is enabled and the baseline isn't set due to a higher level of +// optimization being enabled, signal the sse4 implementation should be used. 
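// As an illustrative reading of that convention, using the 8bpp size-4 entry
// defined below: when no higher optimization level has already claimed
// LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeHorizontal, this header
// defines it to LIBGAV1_CPU_SSE4_1, and Init8bpp() in loop_filter_sse4.cc
// then installs Horizontal4 into
// dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] under the
// matching DSP_ENABLED_8BPP_SSE4_1() guard.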
+#if LIBGAV1_TARGETING_SSE4_1 + +#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeHorizontal +#define LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeHorizontal +#define LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeHorizontal +#define LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeHorizontal +#define LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeVertical +#define LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeVertical +#define LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeVertical +#define LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeVertical +#define LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeHorizontal +#define LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeHorizontal +#define LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeHorizontal +#define LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeHorizontal +#define LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeVertical +#define LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeVertical +#define LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeVertical +#define LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeVertical +#define LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#endif // LIBGAV1_TARGETING_SSE4_1 + +#endif // LIBGAV1_SRC_DSP_X86_LOOP_FILTER_SSE4_H_ diff --git a/src/dsp/x86/loop_restoration_10bit_avx2.cc b/src/dsp/x86/loop_restoration_10bit_avx2.cc new file mode 100644 index 0000000..daf5c42 --- /dev/null +++ b/src/dsp/x86/loop_restoration_10bit_avx2.cc @@ -0,0 +1,3163 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2 && LIBGAV1_MAX_BITDEPTH >= 10
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_avx2.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+inline void WienerHorizontalClip(const __m256i s[2],
+                                 int16_t* const wiener_buffer) {
+  constexpr int offset =
+      1 << (10 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+  constexpr int limit = (offset << 2) - 1;
+  const __m256i offsets = _mm256_set1_epi16(-offset);
+  const __m256i limits = _mm256_set1_epi16(limit - offset);
+  const __m256i round = _mm256_set1_epi32(1 << (kInterRoundBitsHorizontal - 1));
+  const __m256i sum0 = _mm256_add_epi32(s[0], round);
+  const __m256i sum1 = _mm256_add_epi32(s[1], round);
+  const __m256i rounded_sum0 =
+      _mm256_srai_epi32(sum0, kInterRoundBitsHorizontal);
+  const __m256i rounded_sum1 =
+      _mm256_srai_epi32(sum1, kInterRoundBitsHorizontal);
+  const __m256i rounded_sum = _mm256_packs_epi32(rounded_sum0, rounded_sum1);
+  const __m256i d0 = _mm256_max_epi16(rounded_sum, offsets);
+  const __m256i d1 = _mm256_min_epi16(d0, limits);
+  StoreAligned32(wiener_buffer, d1);
+}
+
+inline void WienerHorizontalTap7Kernel(const __m256i s[7],
+                                       const __m256i filter[2],
+                                       int16_t* const wiener_buffer) {
+  const __m256i s06 = _mm256_add_epi16(s[0], s[6]);
+  const __m256i s15 = _mm256_add_epi16(s[1], s[5]);
+  const __m256i s24 = _mm256_add_epi16(s[2], s[4]);
+  const __m256i ss0 = _mm256_unpacklo_epi16(s06, s15);
+  const __m256i ss1 = _mm256_unpackhi_epi16(s06, s15);
+  const __m256i ss2 = _mm256_unpacklo_epi16(s24, s[3]);
+  const __m256i ss3 = _mm256_unpackhi_epi16(s24, s[3]);
+  __m256i madds[4];
+  madds[0] = _mm256_madd_epi16(ss0, filter[0]);
+  madds[1] = _mm256_madd_epi16(ss1, filter[0]);
+  madds[2] = _mm256_madd_epi16(ss2, filter[1]);
+  madds[3] = _mm256_madd_epi16(ss3, filter[1]);
+  madds[0] = _mm256_add_epi32(madds[0], madds[2]);
+  madds[1] = _mm256_add_epi32(madds[1], madds[3]);
+  WienerHorizontalClip(madds, wiener_buffer);
+}
+
+inline void WienerHorizontalTap5Kernel(const __m256i s[5], const __m256i filter,
+                                       int16_t* const wiener_buffer) {
+  const __m256i s04 = _mm256_add_epi16(s[0], s[4]);
+  const __m256i s13 = _mm256_add_epi16(s[1], s[3]);
+  const __m256i s2d = _mm256_add_epi16(s[2], s[2]);
+  const __m256i s0m = _mm256_sub_epi16(s04, s2d);
+  const __m256i s1m = _mm256_sub_epi16(s13, s2d);
+  const __m256i ss0 = _mm256_unpacklo_epi16(s0m, s1m);
+  const __m256i ss1 = _mm256_unpackhi_epi16(s0m, s1m);
+  __m256i madds[2];
+  madds[0] = _mm256_madd_epi16(ss0, filter);
+  madds[1] = _mm256_madd_epi16(ss1, filter);
+  const __m256i s2_lo = _mm256_unpacklo_epi16(s[2], _mm256_setzero_si256());
+  const __m256i s2_hi = _mm256_unpackhi_epi16(s[2], _mm256_setzero_si256());
+  const __m256i s2x128_lo = _mm256_slli_epi32(s2_lo, 7);
+  const __m256i s2x128_hi = _mm256_slli_epi32(s2_hi, 7);
+  madds[0] = _mm256_add_epi32(madds[0], s2x128_lo);
+  madds[1] = _mm256_add_epi32(madds[1], s2x128_hi);
+  WienerHorizontalClip(madds, wiener_buffer);
+}
+
+inline void WienerHorizontalTap3Kernel(const __m256i s[3], const __m256i filter,
+                                       int16_t* const wiener_buffer) {
+  const __m256i s02 = _mm256_add_epi16(s[0], s[2]);
+  const __m256i ss0 =
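// The Tap7 kernel above exploits the symmetry of the Wiener filter (taps
// f0 f1 f2 f3 f2 f1 f0): mirrored samples are summed first (s06, s15, s24),
// halving the multiply count, and each _mm256_madd_epi16 over the
// interleaved pairs then produces, per 32-bit lane (illustrative),
//
//   ss0/ss1: s06 * f0 + s15 * f1    and    ss2/ss3: s24 * f2 + s[3] * f3,
//
// after which the two partial sums are combined and clipped. Tap5 and Tap3
// follow the same pairing idea with fewer terms.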
_mm256_unpacklo_epi16(s02, s[1]); + const __m256i ss1 = _mm256_unpackhi_epi16(s02, s[1]); + __m256i madds[2]; + madds[0] = _mm256_madd_epi16(ss0, filter); + madds[1] = _mm256_madd_epi16(ss1, filter); + WienerHorizontalClip(madds, wiener_buffer); +} + +inline void WienerHorizontalTap7(const uint16_t* src, + const ptrdiff_t src_stride, + const ptrdiff_t width, const int height, + const __m256i* const coefficients, + int16_t** const wiener_buffer) { + __m256i filter[2]; + filter[0] = _mm256_shuffle_epi32(*coefficients, 0x0); + filter[1] = _mm256_shuffle_epi32(*coefficients, 0x55); + for (int y = height; y != 0; --y) { + ptrdiff_t x = 0; + do { + __m256i s[7]; + s[0] = LoadUnaligned32(src + x + 0); + s[1] = LoadUnaligned32(src + x + 1); + s[2] = LoadUnaligned32(src + x + 2); + s[3] = LoadUnaligned32(src + x + 3); + s[4] = LoadUnaligned32(src + x + 4); + s[5] = LoadUnaligned32(src + x + 5); + s[6] = LoadUnaligned32(src + x + 6); + WienerHorizontalTap7Kernel(s, filter, *wiener_buffer + x); + x += 16; + } while (x < width); + src += src_stride; + *wiener_buffer += width; + } +} + +inline void WienerHorizontalTap5(const uint16_t* src, + const ptrdiff_t src_stride, + const ptrdiff_t width, const int height, + const __m256i* const coefficients, + int16_t** const wiener_buffer) { + const __m256i filter = + _mm256_shuffle_epi8(*coefficients, _mm256_set1_epi32(0x05040302)); + for (int y = height; y != 0; --y) { + ptrdiff_t x = 0; + do { + __m256i s[5]; + s[0] = LoadUnaligned32(src + x + 0); + s[1] = LoadUnaligned32(src + x + 1); + s[2] = LoadUnaligned32(src + x + 2); + s[3] = LoadUnaligned32(src + x + 3); + s[4] = LoadUnaligned32(src + x + 4); + WienerHorizontalTap5Kernel(s, filter, *wiener_buffer + x); + x += 16; + } while (x < width); + src += src_stride; + *wiener_buffer += width; + } +} + +inline void WienerHorizontalTap3(const uint16_t* src, + const ptrdiff_t src_stride, + const ptrdiff_t width, const int height, + const __m256i* const coefficients, + int16_t** const wiener_buffer) { + const auto filter = _mm256_shuffle_epi32(*coefficients, 0x55); + for (int y = height; y != 0; --y) { + ptrdiff_t x = 0; + do { + __m256i s[3]; + s[0] = LoadUnaligned32(src + x + 0); + s[1] = LoadUnaligned32(src + x + 1); + s[2] = LoadUnaligned32(src + x + 2); + WienerHorizontalTap3Kernel(s, filter, *wiener_buffer + x); + x += 16; + } while (x < width); + src += src_stride; + *wiener_buffer += width; + } +} + +inline void WienerHorizontalTap1(const uint16_t* src, + const ptrdiff_t src_stride, + const ptrdiff_t width, const int height, + int16_t** const wiener_buffer) { + for (int y = height; y != 0; --y) { + ptrdiff_t x = 0; + do { + const __m256i s0 = LoadUnaligned32(src + x); + const __m256i d0 = _mm256_slli_epi16(s0, 4); + StoreAligned32(*wiener_buffer + x, d0); + x += 16; + } while (x < width); + src += src_stride; + *wiener_buffer += width; + } +} + +inline __m256i WienerVertical7(const __m256i a[4], const __m256i filter[4]) { + const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]); + const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]); + const __m256i madd2 = _mm256_madd_epi16(a[2], filter[2]); + const __m256i madd3 = _mm256_madd_epi16(a[3], filter[3]); + const __m256i madd01 = _mm256_add_epi32(madd0, madd1); + const __m256i madd23 = _mm256_add_epi32(madd2, madd3); + const __m256i sum = _mm256_add_epi32(madd01, madd23); + return _mm256_srai_epi32(sum, kInterRoundBitsVertical); +} + +inline __m256i WienerVertical5(const __m256i a[3], const __m256i filter[3]) { + const __m256i madd0 = 
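// In the WienerVerticalFilter7/5/3 helpers, the rounding term travels as an
// extra "sample": the final unpack pairs the last input row with `round`,
// and the matching filter word is assembled as
// (1 << 16) | static_cast<uint16_t>(coefficients[0]), so _mm256_madd_epi16
// computes last_row * c0 + round * 1 in the same multiply-add, leaving only
// the arithmetic shift by kInterRoundBitsVertical.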
+      _mm256_madd_epi16(a[0], filter[0]);
+  const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]);
+  const __m256i madd2 = _mm256_madd_epi16(a[2], filter[2]);
+  const __m256i madd01 = _mm256_add_epi32(madd0, madd1);
+  const __m256i sum = _mm256_add_epi32(madd01, madd2);
+  return _mm256_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVertical3(const __m256i a[2], const __m256i filter[2]) {
+  const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]);
+  const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]);
+  const __m256i sum = _mm256_add_epi32(madd0, madd1);
+  return _mm256_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVerticalClip(const __m256i s[2]) {
+  const __m256i d = _mm256_packus_epi32(s[0], s[1]);
+  return _mm256_min_epu16(d, _mm256_set1_epi16(1023));
+}
+
+inline __m256i WienerVerticalFilter7(const __m256i a[7],
+                                     const __m256i filter[2]) {
+  const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+  __m256i b[4], c[2];
+  b[0] = _mm256_unpacklo_epi16(a[0], a[1]);
+  b[1] = _mm256_unpacklo_epi16(a[2], a[3]);
+  b[2] = _mm256_unpacklo_epi16(a[4], a[5]);
+  b[3] = _mm256_unpacklo_epi16(a[6], round);
+  c[0] = WienerVertical7(b, filter);
+  b[0] = _mm256_unpackhi_epi16(a[0], a[1]);
+  b[1] = _mm256_unpackhi_epi16(a[2], a[3]);
+  b[2] = _mm256_unpackhi_epi16(a[4], a[5]);
+  b[3] = _mm256_unpackhi_epi16(a[6], round);
+  c[1] = WienerVertical7(b, filter);
+  return WienerVerticalClip(c);
+}
+
+inline __m256i WienerVerticalFilter5(const __m256i a[5],
+                                     const __m256i filter[3]) {
+  const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+  __m256i b[3], c[2];
+  b[0] = _mm256_unpacklo_epi16(a[0], a[1]);
+  b[1] = _mm256_unpacklo_epi16(a[2], a[3]);
+  b[2] = _mm256_unpacklo_epi16(a[4], round);
+  c[0] = WienerVertical5(b, filter);
+  b[0] = _mm256_unpackhi_epi16(a[0], a[1]);
+  b[1] = _mm256_unpackhi_epi16(a[2], a[3]);
+  b[2] = _mm256_unpackhi_epi16(a[4], round);
+  c[1] = WienerVertical5(b, filter);
+  return WienerVerticalClip(c);
+}
+
+inline __m256i WienerVerticalFilter3(const __m256i a[3],
+                                     const __m256i filter[2]) {
+  const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+  __m256i b[2], c[2];
+  b[0] = _mm256_unpacklo_epi16(a[0], a[1]);
+  b[1] = _mm256_unpacklo_epi16(a[2], round);
+  c[0] = WienerVertical3(b, filter);
+  b[0] = _mm256_unpackhi_epi16(a[0], a[1]);
+  b[1] = _mm256_unpackhi_epi16(a[2], round);
+  c[1] = WienerVertical3(b, filter);
+  return WienerVerticalClip(c);
+}
+
+inline __m256i WienerVerticalTap7Kernel(const int16_t* wiener_buffer,
+                                        const ptrdiff_t wiener_stride,
+                                        const __m256i filter[2], __m256i a[7]) {
+  a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+  a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+  a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+  a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+  a[4] = LoadAligned32(wiener_buffer + 4 * wiener_stride);
+  a[5] = LoadAligned32(wiener_buffer + 5 * wiener_stride);
+  a[6] = LoadAligned32(wiener_buffer + 6 * wiener_stride);
+  return WienerVerticalFilter7(a, filter);
+}
+
+inline __m256i WienerVerticalTap5Kernel(const int16_t* wiener_buffer,
+                                        const ptrdiff_t wiener_stride,
+                                        const __m256i filter[3], __m256i a[5]) {
+  a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+  a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+  a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+  a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+  a[4] = LoadAligned32(wiener_buffer + 4 *
+                          wiener_stride);
+  return WienerVerticalFilter5(a, filter);
+}
+
+inline __m256i WienerVerticalTap3Kernel(const int16_t* wiener_buffer,
+                                        const ptrdiff_t wiener_stride,
+                                        const __m256i filter[2], __m256i a[3]) {
+  a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+  a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+  a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+  return WienerVerticalFilter3(a, filter);
+}
+
+inline void WienerVerticalTap7Kernel2(const int16_t* wiener_buffer,
+                                      const ptrdiff_t wiener_stride,
+                                      const __m256i filter[2], __m256i d[2]) {
+  __m256i a[8];
+  d[0] = WienerVerticalTap7Kernel(wiener_buffer, wiener_stride, filter, a);
+  a[7] = LoadAligned32(wiener_buffer + 7 * wiener_stride);
+  d[1] = WienerVerticalFilter7(a + 1, filter);
+}
+
+inline void WienerVerticalTap5Kernel2(const int16_t* wiener_buffer,
+                                      const ptrdiff_t wiener_stride,
+                                      const __m256i filter[3], __m256i d[2]) {
+  __m256i a[6];
+  d[0] = WienerVerticalTap5Kernel(wiener_buffer, wiener_stride, filter, a);
+  a[5] = LoadAligned32(wiener_buffer + 5 * wiener_stride);
+  d[1] = WienerVerticalFilter5(a + 1, filter);
+}
+
+inline void WienerVerticalTap3Kernel2(const int16_t* wiener_buffer,
+                                      const ptrdiff_t wiener_stride,
+                                      const __m256i filter[2], __m256i d[2]) {
+  __m256i a[4];
+  d[0] = WienerVerticalTap3Kernel(wiener_buffer, wiener_stride, filter, a);
+  a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+  d[1] = WienerVerticalFilter3(a + 1, filter);
+}
+
+inline void WienerVerticalTap7(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               const int16_t coefficients[4], uint16_t* dst,
+                               const ptrdiff_t dst_stride) {
+  const __m256i c = _mm256_broadcastq_epi64(LoadLo8(coefficients));
+  __m256i filter[4];
+  filter[0] = _mm256_shuffle_epi32(c, 0x0);
+  filter[1] = _mm256_shuffle_epi32(c, 0x55);
+  filter[2] = _mm256_shuffle_epi8(c, _mm256_set1_epi32(0x03020504));
+  filter[3] =
+      _mm256_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+  for (int y = height >> 1; y > 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m256i d[2];
+      WienerVerticalTap7Kernel2(wiener_buffer + x, width, filter, d);
+      StoreUnaligned32(dst + x, d[0]);
+      StoreUnaligned32(dst + dst_stride + x, d[1]);
+      x += 16;
+    } while (x < width);
+    dst += 2 * dst_stride;
+    wiener_buffer += 2 * width;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = 0;
+    do {
+      __m256i a[7];
+      const __m256i d =
+          WienerVerticalTap7Kernel(wiener_buffer + x, width, filter, a);
+      StoreUnaligned32(dst + x, d);
+      x += 16;
+    } while (x < width);
+  }
+}
+
+inline void WienerVerticalTap5(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               const int16_t coefficients[3], uint16_t* dst,
+                               const ptrdiff_t dst_stride) {
+  const __m256i c = _mm256_broadcastq_epi64(LoadLo8(coefficients));
+  __m256i filter[3];
+  filter[0] = _mm256_shuffle_epi32(c, 0x0);
+  filter[1] = _mm256_shuffle_epi8(c, _mm256_set1_epi32(0x03020504));
+  filter[2] =
+      _mm256_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+  for (int y = height >> 1; y > 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m256i d[2];
+      WienerVerticalTap5Kernel2(wiener_buffer + x, width, filter, d);
+      StoreUnaligned32(dst + x, d[0]);
+      StoreUnaligned32(dst + dst_stride + x, d[1]);
+      x += 16;
+    } while (x < width);
+    dst += 2 * dst_stride;
+    wiener_buffer += 2 * width;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = 0;
+    do {
+      __m256i a[5];
+      const __m256i d =
+          WienerVerticalTap5Kernel(wiener_buffer + x, width, filter, a);
+      StoreUnaligned32(dst + x, d);
+      x += 16;
+    } while (x <
+             width);
+  }
+}
+
+inline void WienerVerticalTap3(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               const int16_t coefficients[2], uint16_t* dst,
+                               const ptrdiff_t dst_stride) {
+  __m256i filter[2];
+  filter[0] =
+      _mm256_set1_epi32(*reinterpret_cast<const int32_t*>(coefficients));
+  filter[1] =
+      _mm256_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+  for (int y = height >> 1; y > 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m256i d[2][2];
+      WienerVerticalTap3Kernel2(wiener_buffer + x, width, filter, d[0]);
+      StoreUnaligned32(dst + x, d[0][0]);
+      StoreUnaligned32(dst + dst_stride + x, d[0][1]);
+      x += 16;
+    } while (x < width);
+    dst += 2 * dst_stride;
+    wiener_buffer += 2 * width;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = 0;
+    do {
+      __m256i a[3];
+      const __m256i d =
+          WienerVerticalTap3Kernel(wiener_buffer + x, width, filter, a);
+      StoreUnaligned32(dst + x, d);
+      x += 16;
+    } while (x < width);
+  }
+}
+
+inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
+                                     uint16_t* const dst) {
+  const __m256i a = LoadAligned32(wiener_buffer);
+  const __m256i b = _mm256_add_epi16(a, _mm256_set1_epi16(8));
+  const __m256i c = _mm256_srai_epi16(b, 4);
+  const __m256i d = _mm256_max_epi16(c, _mm256_setzero_si256());
+  const __m256i e = _mm256_min_epi16(d, _mm256_set1_epi16(1023));
+  StoreUnaligned32(dst, e);
+}
+
+inline void WienerVerticalTap1(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               uint16_t* dst, const ptrdiff_t dst_stride) {
+  for (int y = height >> 1; y > 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+      WienerVerticalTap1Kernel(wiener_buffer + width + x,
+                               dst + dst_stride + x);
+      x += 16;
+    } while (x < width);
+    dst += 2 * dst_stride;
+    wiener_buffer += 2 * width;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = 0;
+    do {
+      WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+      x += 16;
+    } while (x < width);
+  }
+}
+
+void WienerFilter_AVX2(
+    const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+    const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_border,
+    const ptrdiff_t top_border_stride,
+    const void* LIBGAV1_RESTRICT const bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+    void* LIBGAV1_RESTRICT const dest) {
+  const int16_t* const number_leading_zero_coefficients =
+      restoration_info.wiener_info.number_leading_zero_coefficients;
+  const int number_rows_to_skip = std::max(
+      static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+      1);
+  const ptrdiff_t wiener_stride = Align(width, 16);
+  int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
+  // The values are saturated to 13 bits before storing.
+  int16_t* wiener_buffer_horizontal =
+      wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
+
+  // horizontal filtering.
+  // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
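+  // Worked example (added note): with a full 7-tap vertical filter,
+  // number_leading_zero_coefficients[WienerInfo::kVertical] is 0, so
+  // |number_rows_to_skip| is 1 and the lines below give
+  //   height_horizontal = height + 7 - 1 - 2 * 1 = height + 4,
+  //   height_extra = 2,
+  // i.e. two extra rows are filtered above and below the unit, consistent
+  // with the assert() that follows.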
+  const int height_horizontal =
+      height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+  const int height_extra = (height_horizontal - height) >> 1;
+  assert(height_extra <= 2);
+  const auto* const src = static_cast<const uint16_t*>(source);
+  const auto* const top = static_cast<const uint16_t*>(top_border);
+  const auto* const bottom = static_cast<const uint16_t*>(bottom_border);
+  const __m128i c =
+      LoadLo8(restoration_info.wiener_info.filter[WienerInfo::kHorizontal]);
+  const __m256i coefficients_horizontal = _mm256_broadcastq_epi64(c);
+  if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+    WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
+                         top_border_stride, wiener_stride, height_extra,
+                         &coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+                         &coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride,
+                         height_extra, &coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+    WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
+                         top_border_stride, wiener_stride, height_extra,
+                         &coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+                         &coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride,
+                         height_extra, &coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+    // The maximum over-reads happen here.
+    WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
+                         top_border_stride, wiener_stride, height_extra,
+                         &coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+                         &coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
+                         height_extra, &coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+  } else {
+    assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+    WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
+                         top_border_stride, wiener_stride, height_extra,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap1(src, stride, wiener_stride, height,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
+                         height_extra, &wiener_buffer_horizontal);
+  }
+
+  // vertical filtering.
+  // Over-writes up to 15 values.
+  const int16_t* const filter_vertical =
+      restoration_info.wiener_info.filter[WienerInfo::kVertical];
+  auto* dst = static_cast<uint16_t*>(dest);
+  if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+    // Because the top row of |source| is a duplicate of the second row, and
+    // the bottom row of |source| is a duplicate of the row above it, we can
+    // duplicate the top and bottom row of |wiener_buffer| accordingly.
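+    // (Added clarification: the first memcpy below replicates the last
+    // horizontal-filter row one row forward, and the second replicates the
+    // second buffer row back into row 0, giving the vertical 7-tap pass a
+    // full set of rows without extra border logic.)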
+ memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride, + sizeof(*wiener_buffer_horizontal) * wiener_stride); + memcpy(restoration_buffer->wiener_buffer, + restoration_buffer->wiener_buffer + wiener_stride, + sizeof(*restoration_buffer->wiener_buffer) * wiener_stride); + WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height, + filter_vertical, dst, stride); + } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) { + WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride, + height, filter_vertical + 1, dst, stride); + } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) { + WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride, + wiener_stride, height, filter_vertical + 2, dst, stride); + } else { + assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3); + WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride, + wiener_stride, height, dst, stride); + } +} + +//------------------------------------------------------------------------------ +// SGR + +constexpr int kSumOffset = 24; + +// SIMD overreads the number of pixels in SIMD registers - (width % 8) - 2 * +// padding pixels, where padding is 3 for Pass 1 and 2 for Pass 2. The number of +// bytes in SIMD registers is 16 for SSE4.1 and 32 for AVX2. +constexpr int kOverreadInBytesPass1_128 = 4; +constexpr int kOverreadInBytesPass2_128 = 8; +constexpr int kOverreadInBytesPass1_256 = kOverreadInBytesPass1_128 + 16; +constexpr int kOverreadInBytesPass2_256 = kOverreadInBytesPass2_128 + 16; + +inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x, + __m128i dst[2]) { + dst[0] = LoadAligned16(src[0] + x); + dst[1] = LoadAligned16(src[1] + x); +} + +inline void LoadAligned32x2U16(const uint16_t* const src[2], const ptrdiff_t x, + __m256i dst[2]) { + dst[0] = LoadAligned32(src[0] + x); + dst[1] = LoadAligned32(src[1] + x); +} + +inline void LoadAligned32x2U16Msan(const uint16_t* const src[2], + const ptrdiff_t x, const ptrdiff_t border, + __m256i dst[2]) { + dst[0] = LoadAligned32Msan(src[0] + x, sizeof(**src) * (x + 16 - border)); + dst[1] = LoadAligned32Msan(src[1] + x, sizeof(**src) * (x + 16 - border)); +} + +inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x, + __m128i dst[3]) { + dst[0] = LoadAligned16(src[0] + x); + dst[1] = LoadAligned16(src[1] + x); + dst[2] = LoadAligned16(src[2] + x); +} + +inline void LoadAligned32x3U16(const uint16_t* const src[3], const ptrdiff_t x, + __m256i dst[3]) { + dst[0] = LoadAligned32(src[0] + x); + dst[1] = LoadAligned32(src[1] + x); + dst[2] = LoadAligned32(src[2] + x); +} + +inline void LoadAligned32x3U16Msan(const uint16_t* const src[3], + const ptrdiff_t x, const ptrdiff_t border, + __m256i dst[3]) { + dst[0] = LoadAligned32Msan(src[0] + x, sizeof(**src) * (x + 16 - border)); + dst[1] = LoadAligned32Msan(src[1] + x, sizeof(**src) * (x + 16 - border)); + dst[2] = LoadAligned32Msan(src[2] + x, sizeof(**src) * (x + 16 - border)); +} + +inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) { + dst[0] = LoadAligned16(src + 0); + dst[1] = LoadAligned16(src + 4); +} + +inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x, + __m128i dst[2][2]) { + LoadAligned32U32(src[0] + x, dst[0]); + LoadAligned32U32(src[1] + x, dst[1]); +} + +inline void LoadAligned64x2U32(const uint32_t* const src[2], const ptrdiff_t x, + __m256i dst[2][2]) { + LoadAligned64(src[0] + x, dst[0]); + LoadAligned64(src[1] + x, 
dst[1]); +} + +inline void LoadAligned64x2U32Msan(const uint32_t* const src[2], + const ptrdiff_t x, const ptrdiff_t border, + __m256i dst[2][2]) { + LoadAligned64Msan(src[0] + x, sizeof(**src) * (x + 16 - border), dst[0]); + LoadAligned64Msan(src[1] + x, sizeof(**src) * (x + 16 - border), dst[1]); +} + +inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x, + __m128i dst[3][2]) { + LoadAligned32U32(src[0] + x, dst[0]); + LoadAligned32U32(src[1] + x, dst[1]); + LoadAligned32U32(src[2] + x, dst[2]); +} + +inline void LoadAligned64x3U32(const uint32_t* const src[3], const ptrdiff_t x, + __m256i dst[3][2]) { + LoadAligned64(src[0] + x, dst[0]); + LoadAligned64(src[1] + x, dst[1]); + LoadAligned64(src[2] + x, dst[2]); +} + +inline void LoadAligned64x3U32Msan(const uint32_t* const src[3], + const ptrdiff_t x, const ptrdiff_t border, + __m256i dst[3][2]) { + LoadAligned64Msan(src[0] + x, sizeof(**src) * (x + 16 - border), dst[0]); + LoadAligned64Msan(src[1] + x, sizeof(**src) * (x + 16 - border), dst[1]); + LoadAligned64Msan(src[2] + x, sizeof(**src) * (x + 16 - border), dst[2]); +} + +inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) { + StoreAligned16(dst + 0, src[0]); + StoreAligned16(dst + 4, src[1]); +} + +// The AVX2 ymm register holds ma[0], ma[1], ..., ma[7], and ma[16], ma[17], +// ..., ma[23]. +// There is an 8 pixel gap between the first half and the second half. +constexpr int kMaStoreOffset = 8; + +inline void StoreAligned32_ma(uint16_t* src, const __m256i v) { + StoreAligned16(src + 0 * 8, _mm256_extracti128_si256(v, 0)); + StoreAligned16(src + 2 * 8, _mm256_extracti128_si256(v, 1)); +} + +inline void StoreAligned64_ma(uint16_t* src, const __m256i v[2]) { + // The next 4 lines are much faster than: + // StoreAligned32(src + 0, _mm256_permute2x128_si256(v[0], v[1], 0x20)); + // StoreAligned32(src + 16, _mm256_permute2x128_si256(v[0], v[1], 0x31)); + StoreAligned16(src + 0 * 8, _mm256_extracti128_si256(v[0], 0)); + StoreAligned16(src + 1 * 8, _mm256_extracti128_si256(v[1], 0)); + StoreAligned16(src + 2 * 8, _mm256_extracti128_si256(v[0], 1)); + StoreAligned16(src + 3 * 8, _mm256_extracti128_si256(v[1], 1)); +} + +// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following +// functions. Some compilers may generate super inefficient code and the whole +// decoder could be 15% slower. 
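+//
+// Added note (not in the upstream file): the Vaddl/Vaddw/Vmull helpers below
+// follow NEON naming. "Lo"/"Hi" select the low/high half of each 128-bit
+// lane; "l" widens both operands before adding, "w" widens only the second.
+// Widening is done by unpacking against zero, avoiding the cvt instructions
+// warned about above. Per element the semantics are simply:
+//   VaddlLo8 : uint16_t{a} + uint16_t{b}   // both uint8_t inputs widened
+//   VaddwLo8 : a + uint16_t{b}             // |a| is already 16-bit
+//   VmullLo16: 32-bit product of 16-bit elements (the inputs used here stay
+//              far below 1 << 15, so the madd-based form cannot overflow).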
+ +inline __m256i VaddlLo8(const __m256i src0, const __m256i src1) { + const __m256i s0 = _mm256_unpacklo_epi8(src0, _mm256_setzero_si256()); + const __m256i s1 = _mm256_unpacklo_epi8(src1, _mm256_setzero_si256()); + return _mm256_add_epi16(s0, s1); +} + +inline __m256i VaddlHi8(const __m256i src0, const __m256i src1) { + const __m256i s0 = _mm256_unpackhi_epi8(src0, _mm256_setzero_si256()); + const __m256i s1 = _mm256_unpackhi_epi8(src1, _mm256_setzero_si256()); + return _mm256_add_epi16(s0, s1); +} + +inline __m256i VaddwLo8(const __m256i src0, const __m256i src1) { + const __m256i s1 = _mm256_unpacklo_epi8(src1, _mm256_setzero_si256()); + return _mm256_add_epi16(src0, s1); +} + +inline __m256i VaddwHi8(const __m256i src0, const __m256i src1) { + const __m256i s1 = _mm256_unpackhi_epi8(src1, _mm256_setzero_si256()); + return _mm256_add_epi16(src0, s1); +} + +inline __m256i VmullNLo8(const __m256i src0, const int src1) { + const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256()); + return _mm256_madd_epi16(s0, _mm256_set1_epi32(src1)); +} + +inline __m256i VmullNHi8(const __m256i src0, const int src1) { + const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256()); + return _mm256_madd_epi16(s0, _mm256_set1_epi32(src1)); +} + +inline __m128i VmullLo16(const __m128i src0, const __m128i src1) { + const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128()); + const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128()); + return _mm_madd_epi16(s0, s1); +} + +inline __m256i VmullLo16(const __m256i src0, const __m256i src1) { + const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256()); + const __m256i s1 = _mm256_unpacklo_epi16(src1, _mm256_setzero_si256()); + return _mm256_madd_epi16(s0, s1); +} + +inline __m128i VmullHi16(const __m128i src0, const __m128i src1) { + const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128()); + const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128()); + return _mm_madd_epi16(s0, s1); +} + +inline __m256i VmullHi16(const __m256i src0, const __m256i src1) { + const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256()); + const __m256i s1 = _mm256_unpackhi_epi16(src1, _mm256_setzero_si256()); + return _mm256_madd_epi16(s0, s1); +} + +inline __m128i VrshrU16(const __m128i src0, const int src1) { + const __m128i sum = _mm_add_epi16(src0, _mm_set1_epi16(1 << (src1 - 1))); + return _mm_srli_epi16(sum, src1); +} + +inline __m256i VrshrU16(const __m256i src0, const int src1) { + const __m256i sum = + _mm256_add_epi16(src0, _mm256_set1_epi16(1 << (src1 - 1))); + return _mm256_srli_epi16(sum, src1); +} + +inline __m256i VrshrS32(const __m256i src0, const int src1) { + const __m256i sum = + _mm256_add_epi32(src0, _mm256_set1_epi32(1 << (src1 - 1))); + return _mm256_srai_epi32(sum, src1); +} + +inline __m128i VrshrU32(const __m128i src0, const int src1) { + const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1))); + return _mm_srli_epi32(sum, src1); +} + +inline __m256i VrshrU32(const __m256i src0, const int src1) { + const __m256i sum = + _mm256_add_epi32(src0, _mm256_set1_epi32(1 << (src1 - 1))); + return _mm256_srli_epi32(sum, src1); +} + +inline void Square(const __m128i src, __m128i dst[2]) { + const __m128i s0 = _mm_unpacklo_epi16(src, _mm_setzero_si128()); + const __m128i s1 = _mm_unpackhi_epi16(src, _mm_setzero_si128()); + dst[0] = _mm_madd_epi16(s0, s0); + dst[1] = _mm_madd_epi16(s1, s1); +} + +inline void Square(const __m256i src, __m256i dst[2]) { + const __m256i s0 = 
_mm256_unpacklo_epi16(src, _mm256_setzero_si256()); + const __m256i s1 = _mm256_unpackhi_epi16(src, _mm256_setzero_si256()); + dst[0] = _mm256_madd_epi16(s0, s0); + dst[1] = _mm256_madd_epi16(s1, s1); +} + +inline void Prepare3_8(const __m256i src[2], __m256i dst[3]) { + dst[0] = _mm256_alignr_epi8(src[1], src[0], 0); + dst[1] = _mm256_alignr_epi8(src[1], src[0], 1); + dst[2] = _mm256_alignr_epi8(src[1], src[0], 2); +} + +inline void Prepare3_16(const __m128i src[2], __m128i dst[3]) { + dst[0] = src[0]; + dst[1] = _mm_alignr_epi8(src[1], src[0], 2); + dst[2] = _mm_alignr_epi8(src[1], src[0], 4); +} + +inline void Prepare3_32(const __m128i src[2], __m128i dst[3]) { + dst[0] = src[0]; + dst[1] = _mm_alignr_epi8(src[1], src[0], 4); + dst[2] = _mm_alignr_epi8(src[1], src[0], 8); +} + +inline void Prepare3_32(const __m256i src[2], __m256i dst[3]) { + dst[0] = src[0]; + dst[1] = _mm256_alignr_epi8(src[1], src[0], 4); + dst[2] = _mm256_alignr_epi8(src[1], src[0], 8); +} + +inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) { + Prepare3_16(src, dst); + dst[3] = _mm_alignr_epi8(src[1], src[0], 6); + dst[4] = _mm_alignr_epi8(src[1], src[0], 8); +} + +inline void Prepare5_32(const __m128i src[2], __m128i dst[5]) { + Prepare3_32(src, dst); + dst[3] = _mm_alignr_epi8(src[1], src[0], 12); + dst[4] = src[1]; +} + +inline void Prepare5_32(const __m256i src[2], __m256i dst[5]) { + Prepare3_32(src, dst); + dst[3] = _mm256_alignr_epi8(src[1], src[0], 12); + dst[4] = src[1]; +} + +inline __m128i Sum3_16(const __m128i src0, const __m128i src1, + const __m128i src2) { + const __m128i sum = _mm_add_epi16(src0, src1); + return _mm_add_epi16(sum, src2); +} + +inline __m256i Sum3_16(const __m256i src0, const __m256i src1, + const __m256i src2) { + const __m256i sum = _mm256_add_epi16(src0, src1); + return _mm256_add_epi16(sum, src2); +} + +inline __m128i Sum3_16(const __m128i src[3]) { + return Sum3_16(src[0], src[1], src[2]); +} + +inline __m256i Sum3_16(const __m256i src[3]) { + return Sum3_16(src[0], src[1], src[2]); +} + +inline __m128i Sum3_32(const __m128i src0, const __m128i src1, + const __m128i src2) { + const __m128i sum = _mm_add_epi32(src0, src1); + return _mm_add_epi32(sum, src2); +} + +inline __m256i Sum3_32(const __m256i src0, const __m256i src1, + const __m256i src2) { + const __m256i sum = _mm256_add_epi32(src0, src1); + return _mm256_add_epi32(sum, src2); +} + +inline __m128i Sum3_32(const __m128i src[3]) { + return Sum3_32(src[0], src[1], src[2]); +} + +inline __m256i Sum3_32(const __m256i src[3]) { + return Sum3_32(src[0], src[1], src[2]); +} + +inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) { + dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]); + dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]); +} + +inline void Sum3_32(const __m256i src[3][2], __m256i dst[2]) { + dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]); + dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]); +} + +inline __m256i Sum3WLo16(const __m256i src[3]) { + const __m256i sum = VaddlLo8(src[0], src[1]); + return VaddwLo8(sum, src[2]); +} + +inline __m256i Sum3WHi16(const __m256i src[3]) { + const __m256i sum = VaddlHi8(src[0], src[1]); + return VaddwHi8(sum, src[2]); +} + +inline __m128i Sum5_16(const __m128i src[5]) { + const __m128i sum01 = _mm_add_epi16(src[0], src[1]); + const __m128i sum23 = _mm_add_epi16(src[2], src[3]); + const __m128i sum = _mm_add_epi16(sum01, sum23); + return _mm_add_epi16(sum, src[4]); +} + +inline __m256i Sum5_16(const __m256i src[5]) { + const __m256i sum01 = 
_mm256_add_epi16(src[0], src[1]); + const __m256i sum23 = _mm256_add_epi16(src[2], src[3]); + const __m256i sum = _mm256_add_epi16(sum01, sum23); + return _mm256_add_epi16(sum, src[4]); +} + +inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1, + const __m128i* const src2, const __m128i* const src3, + const __m128i* const src4) { + const __m128i sum01 = _mm_add_epi32(*src0, *src1); + const __m128i sum23 = _mm_add_epi32(*src2, *src3); + const __m128i sum = _mm_add_epi32(sum01, sum23); + return _mm_add_epi32(sum, *src4); +} + +inline __m256i Sum5_32(const __m256i* const src0, const __m256i* const src1, + const __m256i* const src2, const __m256i* const src3, + const __m256i* const src4) { + const __m256i sum01 = _mm256_add_epi32(*src0, *src1); + const __m256i sum23 = _mm256_add_epi32(*src2, *src3); + const __m256i sum = _mm256_add_epi32(sum01, sum23); + return _mm256_add_epi32(sum, *src4); +} + +inline __m128i Sum5_32(const __m128i src[5]) { + return Sum5_32(&src[0], &src[1], &src[2], &src[3], &src[4]); +} + +inline __m256i Sum5_32(const __m256i src[5]) { + return Sum5_32(&src[0], &src[1], &src[2], &src[3], &src[4]); +} + +inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) { + dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]); + dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]); +} + +inline void Sum5_32(const __m256i src[5][2], __m256i dst[2]) { + dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]); + dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]); +} + +inline __m128i Sum3Horizontal16(const __m128i src[2]) { + __m128i s[3]; + Prepare3_16(src, s); + return Sum3_16(s); +} + +inline __m256i Sum3Horizontal16(const uint16_t* const src, + const ptrdiff_t over_read_in_bytes) { + __m256i s[3]; + s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0); + s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 2); + s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 4); + return Sum3_16(s); +} + +inline __m128i Sum5Horizontal16(const __m128i src[2]) { + __m128i s[5]; + Prepare5_16(src, s); + return Sum5_16(s); +} + +inline __m256i Sum5Horizontal16(const uint16_t* const src, + const ptrdiff_t over_read_in_bytes) { + __m256i s[5]; + s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0); + s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 2); + s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 4); + s[3] = LoadUnaligned32Msan(src + 3, over_read_in_bytes + 6); + s[4] = LoadUnaligned32Msan(src + 4, over_read_in_bytes + 8); + return Sum5_16(s); +} + +inline void SumHorizontal16(const uint16_t* const src, + const ptrdiff_t over_read_in_bytes, + __m256i* const row3, __m256i* const row5) { + __m256i s[5]; + s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0); + s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 2); + s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 4); + s[3] = LoadUnaligned32Msan(src + 3, over_read_in_bytes + 6); + s[4] = LoadUnaligned32Msan(src + 4, over_read_in_bytes + 8); + const __m256i sum04 = _mm256_add_epi16(s[0], s[4]); + *row3 = Sum3_16(s + 1); + *row5 = _mm256_add_epi16(sum04, *row3); +} + +inline void SumHorizontal16(const uint16_t* const src, + const ptrdiff_t over_read_in_bytes, + __m256i* const row3_0, __m256i* const row3_1, + __m256i* const row5_0, __m256i* const row5_1) { + SumHorizontal16(src + 0, over_read_in_bytes + 0, row3_0, row5_0); + SumHorizontal16(src + 16, 
over_read_in_bytes + 32, row3_1, row5_1); +} + +inline void SumHorizontal32(const __m128i src[5], __m128i* const row_sq3, + __m128i* const row_sq5) { + const __m128i sum04 = _mm_add_epi32(src[0], src[4]); + *row_sq3 = Sum3_32(src + 1); + *row_sq5 = _mm_add_epi32(sum04, *row_sq3); +} + +inline void SumHorizontal32(const __m256i src[5], __m256i* const row_sq3, + __m256i* const row_sq5) { + const __m256i sum04 = _mm256_add_epi32(src[0], src[4]); + *row_sq3 = Sum3_32(src + 1); + *row_sq5 = _mm256_add_epi32(sum04, *row_sq3); +} + +inline void SumHorizontal32(const __m128i src[3], __m128i* const row_sq3_0, + __m128i* const row_sq3_1, __m128i* const row_sq5_0, + __m128i* const row_sq5_1) { + __m128i s[5]; + Prepare5_32(src + 0, s); + SumHorizontal32(s, row_sq3_0, row_sq5_0); + Prepare5_32(src + 1, s); + SumHorizontal32(s, row_sq3_1, row_sq5_1); +} + +inline void SumHorizontal32(const __m256i src[3], __m256i* const row_sq3_0, + __m256i* const row_sq3_1, __m256i* const row_sq5_0, + __m256i* const row_sq5_1) { + __m256i s[5]; + Prepare5_32(src + 0, s); + SumHorizontal32(s, row_sq3_0, row_sq5_0); + Prepare5_32(src + 1, s); + SumHorizontal32(s, row_sq3_1, row_sq5_1); +} + +inline void Sum3Horizontal32(const __m128i src[3], __m128i dst[2]) { + __m128i s[3]; + Prepare3_32(src + 0, s); + dst[0] = Sum3_32(s); + Prepare3_32(src + 1, s); + dst[1] = Sum3_32(s); +} + +inline void Sum3Horizontal32(const __m256i src[3], __m256i dst[2]) { + __m256i s[3]; + Prepare3_32(src + 0, s); + dst[0] = Sum3_32(s); + Prepare3_32(src + 1, s); + dst[1] = Sum3_32(s); +} + +inline void Sum5Horizontal32(const __m128i src[3], __m128i dst[2]) { + __m128i s[5]; + Prepare5_32(src + 0, s); + dst[0] = Sum5_32(s); + Prepare5_32(src + 1, s); + dst[1] = Sum5_32(s); +} + +inline void Sum5Horizontal32(const __m256i src[3], __m256i dst[2]) { + __m256i s[5]; + Prepare5_32(src + 0, s); + dst[0] = Sum5_32(s); + Prepare5_32(src + 1, s); + dst[1] = Sum5_32(s); +} + +void SumHorizontal16(const __m128i src[2], __m128i* const row3, + __m128i* const row5) { + __m128i s[5]; + Prepare5_16(src, s); + const __m128i sum04 = _mm_add_epi16(s[0], s[4]); + *row3 = Sum3_16(s + 1); + *row5 = _mm_add_epi16(sum04, *row3); +} + +inline __m256i Sum343Lo(const __m256i ma3[3]) { + const __m256i sum = Sum3WLo16(ma3); + const __m256i sum3 = Sum3_16(sum, sum, sum); + return VaddwLo8(sum3, ma3[1]); +} + +inline __m256i Sum343Hi(const __m256i ma3[3]) { + const __m256i sum = Sum3WHi16(ma3); + const __m256i sum3 = Sum3_16(sum, sum, sum); + return VaddwHi8(sum3, ma3[1]); +} + +inline __m256i Sum343(const __m256i src[3]) { + const __m256i sum = Sum3_32(src); + const __m256i sum3 = Sum3_32(sum, sum, sum); + return _mm256_add_epi32(sum3, src[1]); +} + +inline void Sum343(const __m256i src[3], __m256i dst[2]) { + __m256i s[3]; + Prepare3_32(src + 0, s); + dst[0] = Sum343(s); + Prepare3_32(src + 1, s); + dst[1] = Sum343(s); +} + +inline __m256i Sum565Lo(const __m256i src[3]) { + const __m256i sum = Sum3WLo16(src); + const __m256i sum4 = _mm256_slli_epi16(sum, 2); + const __m256i sum5 = _mm256_add_epi16(sum4, sum); + return VaddwLo8(sum5, src[1]); +} + +inline __m256i Sum565Hi(const __m256i src[3]) { + const __m256i sum = Sum3WHi16(src); + const __m256i sum4 = _mm256_slli_epi16(sum, 2); + const __m256i sum5 = _mm256_add_epi16(sum4, sum); + return VaddwHi8(sum5, src[1]); +} + +inline __m256i Sum565(const __m256i src[3]) { + const __m256i sum = Sum3_32(src); + const __m256i sum4 = _mm256_slli_epi32(sum, 2); + const __m256i sum5 = _mm256_add_epi32(sum4, sum); + return 
+      _mm256_add_epi32(sum5, src[1]);
+}
+
+inline void Sum565(const __m256i src[3], __m256i dst[2]) {
+  __m256i s[3];
+  Prepare3_32(src + 0, s);
+  dst[0] = Sum565(s);
+  Prepare3_32(src + 1, s);
+  dst[1] = Sum565(s);
+}
+
+inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride,
+                   const ptrdiff_t width, const ptrdiff_t sum_stride,
+                   const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
+                   uint32_t* square_sum3, uint32_t* square_sum5) {
+  const ptrdiff_t overread_in_bytes_128 =
+      kOverreadInBytesPass1_128 - sizeof(*src) * width;
+  const ptrdiff_t overread_in_bytes_256 =
+      kOverreadInBytesPass1_256 - sizeof(*src) * width;
+  int y = 2;
+  do {
+    __m128i s0[2], sq_128[4], s3, s5, sq3[2], sq5[2];
+    __m256i sq[8];
+    s0[0] = LoadUnaligned16Msan(src + 0, overread_in_bytes_128 + 0);
+    s0[1] = LoadUnaligned16Msan(src + 8, overread_in_bytes_128 + 16);
+    Square(s0[0], sq_128 + 0);
+    Square(s0[1], sq_128 + 2);
+    SumHorizontal16(s0, &s3, &s5);
+    StoreAligned16(sum3, s3);
+    StoreAligned16(sum5, s5);
+    SumHorizontal32(sq_128, &sq3[0], &sq3[1], &sq5[0], &sq5[1]);
+    StoreAligned32U32(square_sum3, sq3);
+    StoreAligned32U32(square_sum5, sq5);
+    src += 8;
+    sum3 += 8;
+    sum5 += 8;
+    square_sum3 += 8;
+    square_sum5 += 8;
+    sq[0] = SetrM128i(sq_128[2], sq_128[2]);
+    sq[1] = SetrM128i(sq_128[3], sq_128[3]);
+    ptrdiff_t x = sum_width;
+    do {
+      __m256i s[2], row3[2], row5[2], row_sq3[2], row_sq5[2];
+      s[0] = LoadUnaligned32Msan(
+          src + 8, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8));
+      s[1] = LoadUnaligned32Msan(
+          src + 24,
+          overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 24));
+      Square(s[0], sq + 2);
+      Square(s[1], sq + 6);
+      sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+      sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21);
+      sq[4] = _mm256_permute2x128_si256(sq[2], sq[6], 0x21);
+      sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21);
+      SumHorizontal16(
+          src, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8),
+          &row3[0], &row3[1], &row5[0], &row5[1]);
+      StoreAligned64(sum3, row3);
+      StoreAligned64(sum5, row5);
+      SumHorizontal32(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0],
+                      &row_sq5[1]);
+      StoreAligned64(square_sum3 + 0, row_sq3);
+      StoreAligned64(square_sum5 + 0, row_sq5);
+      SumHorizontal32(sq + 4, &row_sq3[0], &row_sq3[1], &row_sq5[0],
+                      &row_sq5[1]);
+      StoreAligned64(square_sum3 + 16, row_sq3);
+      StoreAligned64(square_sum5 + 16, row_sq5);
+      sq[0] = sq[6];
+      sq[1] = sq[7];
+      src += 32;
+      sum3 += 32;
+      sum5 += 32;
+      square_sum3 += 32;
+      square_sum5 += 32;
+      x -= 32;
+    } while (x != 0);
+    src += src_stride - sum_width - 8;
+    sum3 += sum_stride - sum_width - 8;
+    sum5 += sum_stride - sum_width - 8;
+    square_sum3 += sum_stride - sum_width - 8;
+    square_sum5 += sum_stride - sum_width - 8;
+  } while (--y != 0);
+}
+
+template <int size>
+inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride,
+                   const ptrdiff_t width, const ptrdiff_t sum_stride,
+                   const ptrdiff_t sum_width, uint16_t* sums,
+                   uint32_t* square_sums) {
+  static_assert(size == 3 || size == 5, "");
+  int overread_in_bytes_128, overread_in_bytes_256;
+  if (size == 3) {
+    overread_in_bytes_128 = kOverreadInBytesPass2_128;
+    overread_in_bytes_256 = kOverreadInBytesPass2_256;
+  } else {
+    overread_in_bytes_128 = kOverreadInBytesPass1_128;
+    overread_in_bytes_256 = kOverreadInBytesPass1_256;
+  }
+  overread_in_bytes_128 -= sizeof(*src) * width;
+  overread_in_bytes_256 -= sizeof(*src) * width;
+  int y = 2;
+  do {
+    __m128i s_128[2], ss, sq_128[4], sqs[2];
+    __m256i sq[8];
+    s_128[0] =
+        LoadUnaligned16Msan(src + 0, overread_in_bytes_128);
+    s_128[1] = LoadUnaligned16Msan(src + 8, overread_in_bytes_128 + 16);
+    Square(s_128[0], sq_128 + 0);
+    Square(s_128[1], sq_128 + 2);
+    if (size == 3) {
+      ss = Sum3Horizontal16(s_128);
+      Sum3Horizontal32(sq_128, sqs);
+    } else {
+      ss = Sum5Horizontal16(s_128);
+      Sum5Horizontal32(sq_128, sqs);
+    }
+    StoreAligned16(sums, ss);
+    StoreAligned32U32(square_sums, sqs);
+    src += 8;
+    sums += 8;
+    square_sums += 8;
+    sq[0] = SetrM128i(sq_128[2], sq_128[2]);
+    sq[1] = SetrM128i(sq_128[3], sq_128[3]);
+    ptrdiff_t x = sum_width;
+    do {
+      __m256i s[2], row[2], row_sq[4];
+      s[0] = LoadUnaligned32Msan(
+          src + 8, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8));
+      s[1] = LoadUnaligned32Msan(
+          src + 24,
+          overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 24));
+      Square(s[0], sq + 2);
+      Square(s[1], sq + 6);
+      sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+      sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21);
+      sq[4] = _mm256_permute2x128_si256(sq[2], sq[6], 0x21);
+      sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21);
+      if (size == 3) {
+        row[0] = Sum3Horizontal16(
+            src, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8));
+        row[1] =
+            Sum3Horizontal16(src + 16, overread_in_bytes_256 +
+                                           sizeof(*src) * (sum_width - x + 24));
+        Sum3Horizontal32(sq + 0, row_sq + 0);
+        Sum3Horizontal32(sq + 4, row_sq + 2);
+      } else {
+        row[0] = Sum5Horizontal16(
+            src, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8));
+        row[1] =
+            Sum5Horizontal16(src + 16, overread_in_bytes_256 +
+                                           sizeof(*src) * (sum_width - x + 24));
+        Sum5Horizontal32(sq + 0, row_sq + 0);
+        Sum5Horizontal32(sq + 4, row_sq + 2);
+      }
+      StoreAligned64(sums, row);
+      StoreAligned64(square_sums + 0, row_sq + 0);
+      StoreAligned64(square_sums + 16, row_sq + 2);
+      sq[0] = sq[6];
+      sq[1] = sq[7];
+      src += 32;
+      sums += 32;
+      square_sums += 32;
+      x -= 32;
+    } while (x != 0);
+    src += src_stride - sum_width - 8;
+    sums += sum_stride - sum_width - 8;
+    square_sums += sum_stride - sum_width - 8;
+  } while (--y != 0);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq,
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
+  // a = |sum_sq|
+  // d = |sum|
+  // p = (a * n < d * d) ? 0 : a * n - d * d;
+  const __m128i dxd = _mm_madd_epi16(sum, sum);
+  // _mm_mullo_epi32() has high latency. Using shifts and additions instead.
+  // Some compilers could do this for us but we make this explicit.
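+  // Added note: the strength reduction below uses the identities
+  //   9 * x = (x << 3) + x  and  25 * x = (x << 3) + x + (x << 4),
+  // e.g. for x = 2: 16 + 2 = 18 = 9 * 2, and 18 + 32 = 50 = 25 * 2.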
+  // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n));
+  __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3));
+  if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4));
+  const __m128i sub = _mm_sub_epi32(axn, dxd);
+  const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128());
+  const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale));
+  return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2],
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
+  const __m128i b = VrshrU16(sum, 2);
+  const __m128i sum_lo = _mm_unpacklo_epi16(b, _mm_setzero_si128());
+  const __m128i sum_hi = _mm_unpackhi_epi16(b, _mm_setzero_si128());
+  const __m128i z0 = CalculateMa<n>(sum_lo, VrshrU32(sum_sq[0], 4), scale);
+  const __m128i z1 = CalculateMa<n>(sum_hi, VrshrU32(sum_sq[1], 4), scale);
+  return _mm_packus_epi32(z0, z1);
+}
+
+template <int n>
+inline __m256i CalculateMa(const __m256i sum, const __m256i sum_sq,
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
+  // a = |sum_sq|
+  // d = |sum|
+  // p = (a * n < d * d) ? 0 : a * n - d * d;
+  const __m256i dxd = _mm256_madd_epi16(sum, sum);
+  // _mm256_mullo_epi32() has high latency. Using shifts and additions instead.
+  // Some compilers could do this for us but we make this explicit.
+  // return _mm256_mullo_epi32(sum_sq, _mm256_set1_epi32(n));
+  __m256i axn = _mm256_add_epi32(sum_sq, _mm256_slli_epi32(sum_sq, 3));
+  if (n == 25) axn = _mm256_add_epi32(axn, _mm256_slli_epi32(sum_sq, 4));
+  const __m256i sub = _mm256_sub_epi32(axn, dxd);
+  const __m256i p = _mm256_max_epi32(sub, _mm256_setzero_si256());
+  const __m256i pxs = _mm256_mullo_epi32(p, _mm256_set1_epi32(scale));
+  return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m256i CalculateMa(const __m256i sum, const __m256i sum_sq[2],
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
+  const __m256i b = VrshrU16(sum, 2);
+  const __m256i sum_lo = _mm256_unpacklo_epi16(b, _mm256_setzero_si256());
+  const __m256i sum_hi = _mm256_unpackhi_epi16(b, _mm256_setzero_si256());
+  const __m256i z0 = CalculateMa<n>(sum_lo, VrshrU32(sum_sq[0], 4), scale);
+  const __m256i z1 = CalculateMa<n>(sum_hi, VrshrU32(sum_sq[1], 4), scale);
+  return _mm256_packus_epi32(z0, z1);
+}
+
+inline void CalculateB5(const __m128i sum, const __m128i ma, __m128i b[2]) {
+  // one_over_n == 164.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+  // one_over_n_quarter == 41.
+  constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+  static_assert(one_over_n == one_over_n_quarter << 2, "");
+  // |ma| is in range [0, 255].
+  const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter));
+  const __m128i m0 = VmullLo16(m, sum);
+  const __m128i m1 = VmullHi16(m, sum);
+  b[0] = VrshrU32(m0, kSgrProjReciprocalBits - 2);
+  b[1] = VrshrU32(m1, kSgrProjReciprocalBits - 2);
+}
+
+inline void CalculateB5(const __m256i sum, const __m256i ma, __m256i b[2]) {
+  // one_over_n == 164.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+  // one_over_n_quarter == 41.
+  constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+  static_assert(one_over_n == one_over_n_quarter << 2, "");
+  // |ma| is in range [0, 255].
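+  // Added note: _mm256_set1_epi16(41) yields byte pairs (41, 0), so each
+  // 16-bit maddubs result below is at most 255 * 41 = 10455 and cannot
+  // overflow; the two bits dropped from one_over_n are compensated by
+  // shifting right by (kSgrProjReciprocalBits - 2).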
+  const __m256i m =
+      _mm256_maddubs_epi16(ma, _mm256_set1_epi16(one_over_n_quarter));
+  const __m256i m0 = VmullLo16(m, sum);
+  const __m256i m1 = VmullHi16(m, sum);
+  b[0] = VrshrU32(m0, kSgrProjReciprocalBits - 2);
+  b[1] = VrshrU32(m1, kSgrProjReciprocalBits - 2);
+}
+
+inline void CalculateB3(const __m128i sum, const __m128i ma, __m128i b[2]) {
+  // one_over_n == 455.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
+  const __m128i m0 = VmullLo16(ma, sum);
+  const __m128i m1 = VmullHi16(ma, sum);
+  const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
+  const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n));
+  b[0] = VrshrU32(m2, kSgrProjReciprocalBits);
+  b[1] = VrshrU32(m3, kSgrProjReciprocalBits);
+}
+
+inline void CalculateB3(const __m256i sum, const __m256i ma, __m256i b[2]) {
+  // one_over_n == 455.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
+  const __m256i m0 = VmullLo16(ma, sum);
+  const __m256i m1 = VmullHi16(ma, sum);
+  const __m256i m2 = _mm256_mullo_epi32(m0, _mm256_set1_epi32(one_over_n));
+  const __m256i m3 = _mm256_mullo_epi32(m1, _mm256_set1_epi32(one_over_n));
+  b[0] = VrshrU32(m2, kSgrProjReciprocalBits);
+  b[1] = VrshrU32(m3, kSgrProjReciprocalBits);
+}
+
+inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2],
+                                  const uint32_t scale, __m128i* const sum,
+                                  __m128i* const index) {
+  __m128i sum_sq[2];
+  *sum = Sum5_16(s5);
+  Sum5_32(sq5, sum_sq);
+  *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex5(const __m256i s5[5], const __m256i sq5[5][2],
+                                  const uint32_t scale, __m256i* const sum,
+                                  __m256i* const index) {
+  __m256i sum_sq[2];
+  *sum = Sum5_16(s5);
+  Sum5_32(sq5, sum_sq);
+  *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2],
+                                  const uint32_t scale, __m128i* const sum,
+                                  __m128i* const index) {
+  __m128i sum_sq[2];
+  *sum = Sum3_16(s3);
+  Sum3_32(sq3, sum_sq);
+  *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m256i s3[3], const __m256i sq3[3][2],
+                                  const uint32_t scale, __m256i* const sum,
+                                  __m256i* const index) {
+  __m256i sum_sq[2];
+  *sum = Sum3_16(s3);
+  Sum3_32(sq3, sum_sq);
+  *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+template <int n>
+inline void LookupIntermediate(const __m128i sum, const __m128i index,
+                               __m128i* const ma, __m128i b[2]) {
+  static_assert(n == 9 || n == 25, "");
+  const __m128i idx = _mm_packus_epi16(index, index);
+  // Actually it's not stored and loaded. The compiler will use a 64-bit
+  // general-purpose register to process. Faster than using _mm_extract_epi8().
+  uint8_t temp[8];
+  StoreLo8(temp, idx);
+  *ma = _mm_cvtsi32_si128(kSgrMaLookup[temp[0]]);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], 1);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], 2);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], 3);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], 4);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], 5);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], 6);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], 7);
+  // b = ma * b * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+  const __m128i maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+  if (n == 9) {
+    CalculateB3(sum, maq, b);
+  } else {
+    CalculateB5(sum, maq, b);
+  }
+}
+
+// Repeat the first 48 elements in kSgrMaLookup with a period of 16.
+alignas(32) constexpr uint8_t kSgrMaLookupAvx2[96] = {
+    255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16,
+    255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16,
+    15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8,
+    15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8,
+    8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5,
+    8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5};
+
+// Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b
+// to get value 0 as the shuffle result. The most significant bit 1 comes
+// either from the comparison instruction, or from the sign bit of the index.
+inline __m128i ShuffleIndex(const __m128i table, const __m128i index) {
+  __m128i mask;
+  mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(15));
+  mask = _mm_or_si128(mask, index);
+  return _mm_shuffle_epi8(table, mask);
+}
+
+inline __m256i ShuffleIndex(const __m256i table, const __m256i index) {
+  __m256i mask;
+  mask = _mm256_cmpgt_epi8(index, _mm256_set1_epi8(15));
+  mask = _mm256_or_si256(mask, index);
+  return _mm256_shuffle_epi8(table, mask);
+}
+
+inline __m128i AdjustValue(const __m128i value, const __m128i index,
+                           const int threshold) {
+  const __m128i thresholds = _mm_set1_epi8(threshold - 128);
+  const __m128i offset = _mm_cmpgt_epi8(index, thresholds);
+  return _mm_add_epi8(value, offset);
+}
+
+inline __m256i AdjustValue(const __m256i value, const __m256i index,
+                           const int threshold) {
+  const __m256i thresholds = _mm256_set1_epi8(threshold - 128);
+  const __m256i offset = _mm256_cmpgt_epi8(index, thresholds);
+  return _mm256_add_epi8(value, offset);
+}
+
+inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
+                                  __m128i* const ma, __m128i b0[2],
+                                  __m128i b1[2]) {
+  // Use table lookup to read elements whose indices are less than 48.
+  const __m128i c0 = LoadAligned16(kSgrMaLookup + 0 * 16);
+  const __m128i c1 = LoadAligned16(kSgrMaLookup + 1 * 16);
+  const __m128i c2 = LoadAligned16(kSgrMaLookup + 2 * 16);
+  const __m128i indices = _mm_packus_epi16(index[0], index[1]);
+  __m128i idx;
+  // Clip idx to 127 to apply signed comparison instructions.
+  idx = _mm_min_epu8(indices, _mm_set1_epi8(127));
+  // All elements whose indices are less than 48 are set to 0.
+  // Get shuffle results for indices in range [0, 15].
+  *ma = ShuffleIndex(c0, idx);
+  // Get shuffle results for indices in range [16, 31].
+  // Subtract 16 to utilize the sign bit of the index.
+  idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+  const __m128i res1 = ShuffleIndex(c1, idx);
+  // Use OR instruction to combine shuffle results together.
+  *ma = _mm_or_si128(*ma, res1);
+  // Get shuffle results for indices in range [32, 47].
+  // Subtract 16 to utilize the sign bit of the index.
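+  // (Added clarification: after this second subtraction, original indices
+  // 32..47 fall in [0, 15], while smaller indices are negative and larger
+  // ones exceed 15, so ShuffleIndex() yields zero for both and the OR below
+  // preserves the earlier results.)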
+  idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+  const __m128i res2 = ShuffleIndex(c2, idx);
+  *ma = _mm_or_si128(*ma, res2);
+
+  // For elements whose indices are larger than 47, since they seldom change
+  // values with the increase of the index, we use comparison and arithmetic
+  // operations to calculate their values.
+  // Add -128 to apply signed comparison instructions.
+  idx = _mm_add_epi8(indices, _mm_set1_epi8(-128));
+  // Elements whose indices are larger than 47 (with value 0) are set to 5.
+  *ma = _mm_max_epu8(*ma, _mm_set1_epi8(5));
+  *ma = AdjustValue(*ma, idx, 55);   // 55 is the last index which value is 5.
+  *ma = AdjustValue(*ma, idx, 72);   // 72 is the last index which value is 4.
+  *ma = AdjustValue(*ma, idx, 101);  // 101 is the last index which value is 3.
+  *ma = AdjustValue(*ma, idx, 169);  // 169 is the last index which value is 2.
+  *ma = AdjustValue(*ma, idx, 254);  // 254 is the last index which value is 1.
+
+  // b = ma * b * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+  const __m128i maq0 = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+  CalculateB3(sum[0], maq0, b0);
+  const __m128i maq1 = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
+  CalculateB3(sum[1], maq1, b1);
+}
+
+template <int n>
+inline void CalculateIntermediate(const __m256i sum[2], const __m256i index[2],
+                                  __m256i ma[3], __m256i b0[2], __m256i b1[2]) {
+  static_assert(n == 9 || n == 25, "");
+  // Use table lookup to read elements whose indices are less than 48.
+  const __m256i c0 = LoadAligned32(kSgrMaLookupAvx2 + 0 * 32);
+  const __m256i c1 = LoadAligned32(kSgrMaLookupAvx2 + 1 * 32);
+  const __m256i c2 = LoadAligned32(kSgrMaLookupAvx2 + 2 * 32);
+  const __m256i indices = _mm256_packus_epi16(index[0], index[1]);  // 0 2 1 3
+  __m256i idx, mas;
+  // Clip idx to 127 to apply signed comparison instructions.
+  idx = _mm256_min_epu8(indices, _mm256_set1_epi8(127));
+  // All elements whose indices are less than 48 are set to 0.
+  // Get shuffle results for indices in range [0, 15].
+  mas = ShuffleIndex(c0, idx);
+  // Get shuffle results for indices in range [16, 31].
+  // Subtract 16 to utilize the sign bit of the index.
+  idx = _mm256_sub_epi8(idx, _mm256_set1_epi8(16));
+  const __m256i res1 = ShuffleIndex(c1, idx);
+  // Use OR instruction to combine shuffle results together.
+  mas = _mm256_or_si256(mas, res1);
+  // Get shuffle results for indices in range [32, 47].
+  // Subtract 16 to utilize the sign bit of the index.
+  idx = _mm256_sub_epi8(idx, _mm256_set1_epi8(16));
+  const __m256i res2 = ShuffleIndex(c2, idx);
+  mas = _mm256_or_si256(mas, res2);
+
+  // For elements whose indices are larger than 47, since they seldom change
+  // values with the increase of the index, we use comparison and arithmetic
+  // operations to calculate their values.
+  // Add -128 to apply signed comparison instructions.
+  idx = _mm256_add_epi8(indices, _mm256_set1_epi8(-128));
+  // Elements whose indices are larger than 47 (with value 0) are set to 5.
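+  // Worked example (added): AdjustValue() adds the all-ones compare result,
+  // i.e. subtracts 1, whenever the biased index exceeds a threshold. For
+  // index 100 the 55 and 72 thresholds are exceeded but 101 is not, giving
+  // ma = 5 - 2 = 3, which matches kSgrMaLookup[100].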
+ mas = _mm256_max_epu8(mas, _mm256_set1_epi8(5)); + mas = AdjustValue(mas, idx, 55); // 55 is the last index which value is 5. + mas = AdjustValue(mas, idx, 72); // 72 is the last index which value is 4. + mas = AdjustValue(mas, idx, 101); // 101 is the last index which value is 3. + mas = AdjustValue(mas, idx, 169); // 169 is the last index which value is 2. + mas = AdjustValue(mas, idx, 254); // 254 is the last index which value is 1. + + ma[2] = _mm256_permute4x64_epi64(mas, 0x63); // 32-39 8-15 16-23 24-31 + ma[0] = _mm256_blend_epi32(ma[0], ma[2], 0xfc); // 0-7 8-15 16-23 24-31 + ma[1] = _mm256_permute2x128_si256(ma[0], ma[2], 0x21); + + // b = ma * b * one_over_n + // |ma| = [0, 255] + // |sum| is a box sum with radius 1 or 2. + // For the first pass radius is 2. Maximum value is 5x5x255 = 6375. + // For the second pass radius is 1. Maximum value is 3x3x255 = 2295. + // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n + // When radius is 2 |n| is 25. |one_over_n| is 164. + // When radius is 1 |n| is 9. |one_over_n| is 455. + // |kSgrProjReciprocalBits| is 12. + // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits). + // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits). + const __m256i maq0 = _mm256_unpackhi_epi8(ma[0], _mm256_setzero_si256()); + const __m256i maq1 = _mm256_unpacklo_epi8(ma[1], _mm256_setzero_si256()); + __m256i sums[2]; + sums[0] = _mm256_permute2x128_si256(sum[0], sum[1], 0x20); + sums[1] = _mm256_permute2x128_si256(sum[0], sum[1], 0x31); + if (n == 9) { + CalculateB3(sums[0], maq0, b0); + CalculateB3(sums[1], maq1, b1); + } else { + CalculateB5(sums[0], maq0, b0); + CalculateB5(sums[1], maq1, b1); + } +} + +inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2], + const uint32_t scale, __m128i* const ma, + __m128i b[2]) { + __m128i sum, index; + CalculateSumAndIndex5(s5, sq5, scale, &sum, &index); + LookupIntermediate<25>(sum, index, ma, b); +} + +inline void CalculateIntermediate3(const __m128i s3[3], const __m128i sq3[3][2], + const uint32_t scale, __m128i* const ma, + __m128i b[2]) { + __m128i sum, index; + CalculateSumAndIndex3(s3, sq3, scale, &sum, &index); + LookupIntermediate<9>(sum, index, ma, b); +} + +inline void Store343_444(const __m256i b3[3], const ptrdiff_t x, + __m256i sum_b343[2], __m256i sum_b444[2], + uint32_t* const b343, uint32_t* const b444) { + __m256i b[3], sum_b111[2]; + Prepare3_32(b3 + 0, b); + sum_b111[0] = Sum3_32(b); + sum_b444[0] = _mm256_slli_epi32(sum_b111[0], 2); + sum_b343[0] = _mm256_sub_epi32(sum_b444[0], sum_b111[0]); + sum_b343[0] = _mm256_add_epi32(sum_b343[0], b[1]); + Prepare3_32(b3 + 1, b); + sum_b111[1] = Sum3_32(b); + sum_b444[1] = _mm256_slli_epi32(sum_b111[1], 2); + sum_b343[1] = _mm256_sub_epi32(sum_b444[1], sum_b111[1]); + sum_b343[1] = _mm256_add_epi32(sum_b343[1], b[1]); + StoreAligned64(b444 + x, sum_b444); + StoreAligned64(b343 + x, sum_b343); +} + +inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2], + const ptrdiff_t x, __m256i* const sum_ma343, + __m256i* const sum_ma444, __m256i sum_b343[2], + __m256i sum_b444[2], uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + const __m256i sum_ma111 = Sum3WLo16(ma3); + *sum_ma444 = _mm256_slli_epi16(sum_ma111, 2); + StoreAligned32_ma(ma444 + x, *sum_ma444); + const __m256i sum333 = _mm256_sub_epi16(*sum_ma444, sum_ma111); + *sum_ma343 = VaddwLo8(sum333, ma3[1]); + StoreAligned32_ma(ma343 + x, *sum_ma343); + Store343_444(b3, x, sum_b343, sum_b444, b343, b444); +} + 
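+// Added derivation note for the Store343_444* helpers: with three |ma| rows
+// a, b, c, sum_ma444 = 4 * (a + b + c), and
+// sum_ma343 = sum_ma444 - (a + b + c) + b = 3 * a + 4 * b + 3 * c,
+// hence the "343" and "444" buffer names. The |b| sums follow the same
+// weighting in Store343_444() above.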
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2], + const ptrdiff_t x, __m256i* const sum_ma343, + __m256i* const sum_ma444, __m256i sum_b343[2], + __m256i sum_b444[2], uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + const __m256i sum_ma111 = Sum3WHi16(ma3); + *sum_ma444 = _mm256_slli_epi16(sum_ma111, 2); + StoreAligned32_ma(ma444 + x, *sum_ma444); + const __m256i sum333 = _mm256_sub_epi16(*sum_ma444, sum_ma111); + *sum_ma343 = VaddwHi8(sum333, ma3[1]); + StoreAligned32_ma(ma343 + x, *sum_ma343); + Store343_444(b3, x + kMaStoreOffset, sum_b343, sum_b444, b343, b444); +} + +inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2], + const ptrdiff_t x, __m256i* const sum_ma343, + __m256i sum_b343[2], uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m256i sum_ma444, sum_b444[2]; + Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343, + ma444, b343, b444); +} + +inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2], + const ptrdiff_t x, __m256i* const sum_ma343, + __m256i sum_b343[2], uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m256i sum_ma444, sum_b444[2]; + Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343, + ma444, b343, b444); +} + +inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2], + const ptrdiff_t x, uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m256i sum_ma343, sum_b343[2]; + Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444); +} + +inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2], + const ptrdiff_t x, uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m256i sum_ma343, sum_b343[2]; + Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444); +} + +// Don't combine the following 2 functions, which would be slower. 
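+// (Added descriptive note: each of the two combined functions below produces
+// 32 outputs per call, computing the Lo and Hi halves separately and
+// recombining them with cross-lane permutes before the stores.)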
+inline void Store343_444(const __m256i ma3[3], const __m256i b3[6], + const ptrdiff_t x, __m256i* const sum_ma343_lo, + __m256i* const sum_ma343_hi, + __m256i* const sum_ma444_lo, + __m256i* const sum_ma444_hi, __m256i sum_b343_lo[2], + __m256i sum_b343_hi[2], __m256i sum_b444_lo[2], + __m256i sum_b444_hi[2], uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m256i sum_mat343[2], sum_mat444[2]; + const __m256i sum_ma111_lo = Sum3WLo16(ma3); + sum_mat444[0] = _mm256_slli_epi16(sum_ma111_lo, 2); + const __m256i sum333_lo = _mm256_sub_epi16(sum_mat444[0], sum_ma111_lo); + sum_mat343[0] = VaddwLo8(sum333_lo, ma3[1]); + Store343_444(b3, x, sum_b343_lo, sum_b444_lo, b343, b444); + const __m256i sum_ma111_hi = Sum3WHi16(ma3); + sum_mat444[1] = _mm256_slli_epi16(sum_ma111_hi, 2); + *sum_ma444_lo = _mm256_permute2x128_si256(sum_mat444[0], sum_mat444[1], 0x20); + *sum_ma444_hi = _mm256_permute2x128_si256(sum_mat444[0], sum_mat444[1], 0x31); + StoreAligned32(ma444 + x + 0, *sum_ma444_lo); + StoreAligned32(ma444 + x + 16, *sum_ma444_hi); + const __m256i sum333_hi = _mm256_sub_epi16(sum_mat444[1], sum_ma111_hi); + sum_mat343[1] = VaddwHi8(sum333_hi, ma3[1]); + *sum_ma343_lo = _mm256_permute2x128_si256(sum_mat343[0], sum_mat343[1], 0x20); + *sum_ma343_hi = _mm256_permute2x128_si256(sum_mat343[0], sum_mat343[1], 0x31); + StoreAligned32(ma343 + x + 0, *sum_ma343_lo); + StoreAligned32(ma343 + x + 16, *sum_ma343_hi); + Store343_444(b3 + 3, x + 16, sum_b343_hi, sum_b444_hi, b343, b444); +} + +inline void Store343_444(const __m256i ma3[3], const __m256i b3[6], + const ptrdiff_t x, __m256i* const sum_ma343_lo, + __m256i* const sum_ma343_hi, __m256i sum_b343_lo[2], + __m256i sum_b343_hi[2], uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m256i sum_ma444[2], sum_b444[2], sum_mat343[2]; + const __m256i sum_ma111_lo = Sum3WLo16(ma3); + sum_ma444[0] = _mm256_slli_epi16(sum_ma111_lo, 2); + const __m256i sum333_lo = _mm256_sub_epi16(sum_ma444[0], sum_ma111_lo); + sum_mat343[0] = VaddwLo8(sum333_lo, ma3[1]); + Store343_444(b3, x, sum_b343_lo, sum_b444, b343, b444); + const __m256i sum_ma111_hi = Sum3WHi16(ma3); + sum_ma444[1] = _mm256_slli_epi16(sum_ma111_hi, 2); + StoreAligned64_ma(ma444 + x, sum_ma444); + const __m256i sum333_hi = _mm256_sub_epi16(sum_ma444[1], sum_ma111_hi); + sum_mat343[1] = VaddwHi8(sum333_hi, ma3[1]); + *sum_ma343_lo = _mm256_permute2x128_si256(sum_mat343[0], sum_mat343[1], 0x20); + *sum_ma343_hi = _mm256_permute2x128_si256(sum_mat343[0], sum_mat343[1], 0x31); + StoreAligned32(ma343 + x + 0, *sum_ma343_lo); + StoreAligned32(ma343 + x + 16, *sum_ma343_hi); + Store343_444(b3 + 3, x + 16, sum_b343_hi, sum_b444, b343, b444); +} + +inline void PermuteB(const __m256i t[4], __m256i b[7]) { + // Input: + // 0 1 2 3 // b[0] + // 4 5 6 7 // b[1] + // 8 9 10 11 24 25 26 27 // t[0] + // 12 13 14 15 28 29 30 31 // t[1] + // 16 17 18 19 32 33 34 35 // t[2] + // 20 21 22 23 36 37 38 39 // t[3] + + // Output: + // 0 1 2 3 8 9 10 11 // b[0] + // 4 5 6 7 12 13 14 15 // b[1] + // 8 9 10 11 16 17 18 19 // b[2] + // 16 17 18 19 24 25 26 27 // b[3] + // 20 21 22 23 28 29 30 31 // b[4] + // 24 25 26 27 32 33 34 35 // b[5] + // 20 21 22 23 36 37 38 39 // b[6] + b[0] = _mm256_permute2x128_si256(b[0], t[0], 0x21); + b[1] = _mm256_permute2x128_si256(b[1], t[1], 0x21); + b[2] = _mm256_permute2x128_si256(t[0], t[2], 0x20); + b[3] = _mm256_permute2x128_si256(t[2], t[0], 0x30); + b[4] = _mm256_permute2x128_si256(t[3], t[1], 
0x30); + b[5] = _mm256_permute2x128_si256(t[0], t[2], 0x31); + b[6] = t[3]; +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo( + const __m128i s[2][2], const uint32_t scale, uint16_t* const sum5[5], + uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i* const ma, + __m128i b[2]) { + __m128i s5[2][5], sq5[5][2]; + Square(s[0][1], sq[0] + 2); + Square(s[1][1], sq[1] + 2); + s5[0][3] = Sum5Horizontal16(s[0]); + StoreAligned16(sum5[3], s5[0][3]); + s5[0][4] = Sum5Horizontal16(s[1]); + StoreAligned16(sum5[4], s5[0][4]); + Sum5Horizontal32(sq[0], sq5[3]); + StoreAligned32U32(square_sum5[3], sq5[3]); + Sum5Horizontal32(sq[1], sq5[4]); + StoreAligned32U32(square_sum5[4], sq5[4]); + LoadAligned16x3U16(sum5, 0, s5[0]); + LoadAligned32x3U32(square_sum5, 0, sq5); + CalculateIntermediate5(s5[0], sq5, scale, ma, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5( + const uint16_t* const src0, const uint16_t* const src1, + const ptrdiff_t over_read_in_bytes, const ptrdiff_t sum_width, + const ptrdiff_t x, const uint32_t scale, uint16_t* const sum5[5], + uint32_t* const square_sum5[5], __m256i sq[2][8], __m256i ma[3], + __m256i b[3]) { + __m256i s[2], s5[2][5], sq5[5][2], sum[2], index[2], t[4]; + s[0] = LoadUnaligned32Msan(src0 + 8, over_read_in_bytes + 16); + s[1] = LoadUnaligned32Msan(src1 + 8, over_read_in_bytes + 16); + Square(s[0], sq[0] + 2); + Square(s[1], sq[1] + 2); + sq[0][0] = _mm256_permute2x128_si256(sq[0][0], sq[0][2], 0x21); + sq[0][1] = _mm256_permute2x128_si256(sq[0][1], sq[0][3], 0x21); + sq[1][0] = _mm256_permute2x128_si256(sq[1][0], sq[1][2], 0x21); + sq[1][1] = _mm256_permute2x128_si256(sq[1][1], sq[1][3], 0x21); + s5[0][3] = Sum5Horizontal16(src0 + 0, over_read_in_bytes + 0); + s5[1][3] = Sum5Horizontal16(src0 + 16, over_read_in_bytes + 32); + s5[0][4] = Sum5Horizontal16(src1 + 0, over_read_in_bytes + 0); + s5[1][4] = Sum5Horizontal16(src1 + 16, over_read_in_bytes + 32); + StoreAligned32(sum5[3] + x + 0, s5[0][3]); + StoreAligned32(sum5[3] + x + 16, s5[1][3]); + StoreAligned32(sum5[4] + x + 0, s5[0][4]); + StoreAligned32(sum5[4] + x + 16, s5[1][4]); + Sum5Horizontal32(sq[0], sq5[3]); + StoreAligned64(square_sum5[3] + x, sq5[3]); + Sum5Horizontal32(sq[1], sq5[4]); + StoreAligned64(square_sum5[4] + x, sq5[4]); + LoadAligned32x3U16(sum5, x, s5[0]); + LoadAligned64x3U32(square_sum5, x, sq5); + CalculateSumAndIndex5(s5[0], sq5, scale, &sum[0], &index[0]); + + s[0] = LoadUnaligned32Msan(src0 + 24, over_read_in_bytes + 48); + s[1] = LoadUnaligned32Msan(src1 + 24, over_read_in_bytes + 48); + Square(s[0], sq[0] + 6); + Square(s[1], sq[1] + 6); + sq[0][4] = _mm256_permute2x128_si256(sq[0][2], sq[0][6], 0x21); + sq[0][5] = _mm256_permute2x128_si256(sq[0][3], sq[0][7], 0x21); + sq[1][4] = _mm256_permute2x128_si256(sq[1][2], sq[1][6], 0x21); + sq[1][5] = _mm256_permute2x128_si256(sq[1][3], sq[1][7], 0x21); + Sum5Horizontal32(sq[0] + 4, sq5[3]); + StoreAligned64(square_sum5[3] + x + 16, sq5[3]); + Sum5Horizontal32(sq[1] + 4, sq5[4]); + StoreAligned64(square_sum5[4] + x + 16, sq5[4]); + LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]); + LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5); + CalculateSumAndIndex5(s5[1], sq5, scale, &sum[1], &index[1]); + CalculateIntermediate<25>(sum, index, ma, t, t + 2); + PermuteB(t, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo( + const __m128i s[2], const uint32_t scale, const uint16_t* const sum5[5], + const uint32_t* const square_sum5[5], __m128i sq[4], __m128i* const ma, + __m128i b[2]) { + __m128i 
s5[5], sq5[5][2]; + Square(s[1], sq + 2); + s5[3] = s5[4] = Sum5Horizontal16(s); + Sum5Horizontal32(sq, sq5[3]); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + LoadAligned16x3U16(sum5, 0, s5); + LoadAligned32x3U32(square_sum5, 0, sq5); + CalculateIntermediate5(s5, sq5, scale, ma, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow( + const uint16_t* const src, const ptrdiff_t over_read_in_bytes, + const ptrdiff_t sum_width, const ptrdiff_t x, const uint32_t scale, + const uint16_t* const sum5[5], const uint32_t* const square_sum5[5], + __m256i sq[3], __m256i ma[3], __m256i b[3]) { + const __m256i s0 = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 16); + __m256i s5[2][5], sq5[5][2], sum[2], index[2], t[4]; + Square(s0, sq + 2); + sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21); + sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21); + s5[0][3] = Sum5Horizontal16(src + 0, over_read_in_bytes + 0); + s5[1][3] = Sum5Horizontal16(src + 16, over_read_in_bytes + 32); + s5[0][4] = s5[0][3]; + s5[1][4] = s5[1][3]; + Sum5Horizontal32(sq, sq5[3]); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + LoadAligned32x3U16(sum5, x, s5[0]); + LoadAligned64x3U32(square_sum5, x, sq5); + CalculateSumAndIndex5(s5[0], sq5, scale, &sum[0], &index[0]); + + const __m256i s1 = LoadUnaligned32Msan(src + 24, over_read_in_bytes + 48); + Square(s1, sq + 6); + sq[4] = _mm256_permute2x128_si256(sq[2], sq[6], 0x21); + sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21); + Sum5Horizontal32(sq + 4, sq5[3]); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]); + LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5); + CalculateSumAndIndex5(s5[1], sq5, scale, &sum[1], &index[1]); + CalculateIntermediate<25>(sum, index, ma, t, t + 2); + PermuteB(t, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo( + const __m128i s[2], const uint32_t scale, uint16_t* const sum3[3], + uint32_t* const square_sum3[3], __m128i sq[4], __m128i* const ma, + __m128i b[2]) { + __m128i s3[3], sq3[3][2]; + Square(s[1], sq + 2); + s3[2] = Sum3Horizontal16(s); + StoreAligned16(sum3[2], s3[2]); + Sum3Horizontal32(sq, sq3[2]); + StoreAligned32U32(square_sum3[2], sq3[2]); + LoadAligned16x2U16(sum3, 0, s3); + LoadAligned32x2U32(square_sum3, 0, sq3); + CalculateIntermediate3(s3, sq3, scale, ma, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3( + const uint16_t* const src, const ptrdiff_t over_read_in_bytes, + const ptrdiff_t x, const ptrdiff_t sum_width, const uint32_t scale, + uint16_t* const sum3[3], uint32_t* const square_sum3[3], __m256i sq[8], + __m256i ma[3], __m256i b[7]) { + __m256i s[2], s3[4], sq3[3][2], sum[2], index[2], t[4]; + s[0] = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 16); + s[1] = LoadUnaligned32Msan(src + 24, over_read_in_bytes + 48); + Square(s[0], sq + 2); + sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21); + sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21); + s3[2] = Sum3Horizontal16(src, over_read_in_bytes); + s3[3] = Sum3Horizontal16(src + 16, over_read_in_bytes + 32); + StoreAligned64(sum3[2] + x, s3 + 2); + Sum3Horizontal32(sq + 0, sq3[2]); + StoreAligned64(square_sum3[2] + x, sq3[2]); + LoadAligned32x2U16(sum3, x, s3); + LoadAligned64x2U32(square_sum3, x, sq3); + CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]); + + Square(s[1], sq + 6); + sq[4] = _mm256_permute2x128_si256(sq[2], sq[6], 0x21); + sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21); + Sum3Horizontal32(sq + 4, sq3[2]); + 
StoreAligned64(square_sum3[2] + x + 16, sq3[2]); + LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3 + 1); + LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3); + CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]); + CalculateIntermediate<9>(sum, index, ma, t, t + 2); + PermuteB(t, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo( + const __m128i s[2][4], const uint16_t scales[2], uint16_t* const sum3[4], + uint16_t* const sum5[5], uint32_t* const square_sum3[4], + uint32_t* const square_sum5[5], __m128i sq[2][8], __m128i ma3[2][3], + __m128i b3[2][10], __m128i* const ma5, __m128i b5[2]) { + __m128i s3[4], s5[5], sq3[4][2], sq5[5][2], sum[2], index[2]; + Square(s[0][1], sq[0] + 2); + Square(s[1][1], sq[1] + 2); + SumHorizontal16(s[0], &s3[2], &s5[3]); + SumHorizontal16(s[1], &s3[3], &s5[4]); + StoreAligned16(sum3[2], s3[2]); + StoreAligned16(sum3[3], s3[3]); + StoreAligned16(sum5[3], s5[3]); + StoreAligned16(sum5[4], s5[4]); + SumHorizontal32(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + StoreAligned32U32(square_sum3[2], sq3[2]); + StoreAligned32U32(square_sum5[3], sq5[3]); + SumHorizontal32(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]); + StoreAligned32U32(square_sum3[3], sq3[3]); + StoreAligned32U32(square_sum5[4], sq5[4]); + LoadAligned16x2U16(sum3, 0, s3); + LoadAligned32x2U32(square_sum3, 0, sq3); + LoadAligned16x3U16(sum5, 0, s5); + LoadAligned32x3U32(square_sum5, 0, sq5); + CalculateSumAndIndex3(s3 + 0, sq3 + 0, scales[1], &sum[0], &index[0]); + CalculateSumAndIndex3(s3 + 1, sq3 + 1, scales[1], &sum[1], &index[1]); + CalculateIntermediate(sum, index, &ma3[0][0], b3[0], b3[1]); + ma3[1][0] = _mm_srli_si128(ma3[0][0], 8); + CalculateIntermediate5(s5, sq5, scales[0], ma5, b5); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess( + const uint16_t* const src0, const uint16_t* const src1, + const ptrdiff_t over_read_in_bytes, const ptrdiff_t x, + const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + const ptrdiff_t sum_width, __m256i sq[2][8], __m256i ma3[2][3], + __m256i b3[2][7], __m256i ma5[3], __m256i b5[5]) { + __m256i s[2], s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sum_3[2][2], + index_3[2][2], sum_5[2], index_5[2], t[4]; + s[0] = LoadUnaligned32Msan(src0 + 8, over_read_in_bytes + 16); + s[1] = LoadUnaligned32Msan(src1 + 8, over_read_in_bytes + 16); + Square(s[0], sq[0] + 2); + Square(s[1], sq[1] + 2); + sq[0][0] = _mm256_permute2x128_si256(sq[0][0], sq[0][2], 0x21); + sq[0][1] = _mm256_permute2x128_si256(sq[0][1], sq[0][3], 0x21); + sq[1][0] = _mm256_permute2x128_si256(sq[1][0], sq[1][2], 0x21); + sq[1][1] = _mm256_permute2x128_si256(sq[1][1], sq[1][3], 0x21); + SumHorizontal16(src0, over_read_in_bytes, &s3[0][2], &s3[1][2], &s5[0][3], + &s5[1][3]); + SumHorizontal16(src1, over_read_in_bytes, &s3[0][3], &s3[1][3], &s5[0][4], + &s5[1][4]); + StoreAligned32(sum3[2] + x + 0, s3[0][2]); + StoreAligned32(sum3[2] + x + 16, s3[1][2]); + StoreAligned32(sum3[3] + x + 0, s3[0][3]); + StoreAligned32(sum3[3] + x + 16, s3[1][3]); + StoreAligned32(sum5[3] + x + 0, s5[0][3]); + StoreAligned32(sum5[3] + x + 16, s5[1][3]); + StoreAligned32(sum5[4] + x + 0, s5[0][4]); + StoreAligned32(sum5[4] + x + 16, s5[1][4]); + SumHorizontal32(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + SumHorizontal32(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]); + StoreAligned64(square_sum3[2] + x, sq3[2]); + StoreAligned64(square_sum5[3] + x, sq5[3]); + 
StoreAligned64(square_sum3[3] + x, sq3[3]); + StoreAligned64(square_sum5[4] + x, sq5[4]); + LoadAligned32x2U16(sum3, x, s3[0]); + LoadAligned64x2U32(square_sum3, x, sq3); + CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum_3[0][0], &index_3[0][0]); + CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum_3[1][0], + &index_3[1][0]); + LoadAligned32x3U16(sum5, x, s5[0]); + LoadAligned64x3U32(square_sum5, x, sq5); + CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]); + + s[0] = LoadUnaligned32Msan(src0 + 24, over_read_in_bytes + 48); + s[1] = LoadUnaligned32Msan(src1 + 24, over_read_in_bytes + 48); + Square(s[0], sq[0] + 6); + Square(s[1], sq[1] + 6); + sq[0][4] = _mm256_permute2x128_si256(sq[0][2], sq[0][6], 0x21); + sq[0][5] = _mm256_permute2x128_si256(sq[0][3], sq[0][7], 0x21); + sq[1][4] = _mm256_permute2x128_si256(sq[1][2], sq[1][6], 0x21); + sq[1][5] = _mm256_permute2x128_si256(sq[1][3], sq[1][7], 0x21); + SumHorizontal32(sq[0] + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + SumHorizontal32(sq[1] + 4, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]); + StoreAligned64(square_sum3[2] + x + 16, sq3[2]); + StoreAligned64(square_sum5[3] + x + 16, sq5[3]); + StoreAligned64(square_sum3[3] + x + 16, sq3[3]); + StoreAligned64(square_sum5[4] + x + 16, sq5[4]); + LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3[1]); + LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3); + CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum_3[0][1], &index_3[0][1]); + CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum_3[1][1], + &index_3[1][1]); + CalculateIntermediate<9>(sum_3[0], index_3[0], ma3[0], t, t + 2); + PermuteB(t, b3[0]); + CalculateIntermediate<9>(sum_3[1], index_3[1], ma3[1], t, t + 2); + PermuteB(t, b3[1]); + LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]); + LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5); + CalculateSumAndIndex5(s5[1], sq5, scales[0], &sum_5[1], &index_5[1]); + CalculateIntermediate<25>(sum_5, index_5, ma5, t, t + 2); + PermuteB(t, b5); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo( + const __m128i s[2], const uint16_t scales[2], const uint16_t* const sum3[4], + const uint16_t* const sum5[5], const uint32_t* const square_sum3[4], + const uint32_t* const square_sum5[5], __m128i sq[4], __m128i* const ma3, + __m128i* const ma5, __m128i b3[2], __m128i b5[2]) { + __m128i s3[3], s5[5], sq3[3][2], sq5[5][2]; + Square(s[1], sq + 2); + SumHorizontal16(s, &s3[2], &s5[3]); + SumHorizontal32(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + LoadAligned16x3U16(sum5, 0, s5); + s5[4] = s5[3]; + LoadAligned32x3U32(square_sum5, 0, sq5); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + CalculateIntermediate5(s5, sq5, scales[0], ma5, b5); + LoadAligned16x2U16(sum3, 0, s3); + LoadAligned32x2U32(square_sum3, 0, sq3); + CalculateIntermediate3(s3, sq3, scales[1], ma3, b3); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow( + const uint16_t* const src, const ptrdiff_t over_read_in_bytes, + const ptrdiff_t sum_width, const ptrdiff_t x, const uint16_t scales[2], + const uint16_t* const sum3[4], const uint16_t* const sum5[5], + const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5], + __m256i sq[6], __m256i ma3[2], __m256i ma5[2], __m256i b3[5], + __m256i b5[5]) { + const __m256i s0 = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 16); + __m256i s3[2][3], s5[2][5], sq3[4][2], sq5[5][2], sum_3[2], index_3[2], + sum_5[2], index_5[2], t[4]; + Square(s0, sq + 2); + sq[0] = 
_mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+  sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21);
+  SumHorizontal16(src, over_read_in_bytes, &s3[0][2], &s3[1][2], &s5[0][3],
+                  &s5[1][3]);
+  SumHorizontal32(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned32x2U16(sum3, x, s3[0]);
+  LoadAligned64x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum_3[0], &index_3[0]);
+  LoadAligned32x3U16(sum5, x, s5[0]);
+  s5[0][4] = s5[0][3];
+  LoadAligned64x3U32(square_sum5, x, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]);
+
+  const __m256i s1 = LoadUnaligned32Msan(src + 24, over_read_in_bytes + 48);
+  Square(s1, sq + 6);
+  sq[4] = _mm256_permute2x128_si256(sq[2], sq[6], 0x21);
+  sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21);
+  SumHorizontal32(sq + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3[1]);
+  LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3);
+  CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum_3[1], &index_3[1]);
+  CalculateIntermediate<9>(sum_3, index_3, ma3, t, t + 2);
+  PermuteB(t, b3);
+  LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+  s5[1][4] = s5[1][3];
+  LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateSumAndIndex5(s5[1], sq5, scales[0], &sum_5[1], &index_5[1]);
+  CalculateIntermediate<25>(sum_5, index_5, ma5, t, t + 2);
+  PermuteB(t, b5);
+}
+
+inline void BoxSumFilterPreProcess5(const uint16_t* const src0,
+                                    const uint16_t* const src1, const int width,
+                                    const uint32_t scale,
+                                    uint16_t* const sum5[5],
+                                    uint32_t* const square_sum5[5],
+                                    const ptrdiff_t sum_width, uint16_t* ma565,
+                                    uint32_t* b565) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1_128 - sizeof(*src0) * width;
+  __m128i s[2][2], ma0, sq_128[2][4], b0[2];
+  __m256i mas[3], sq[2][8], bs[10];
+  s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+  s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+  s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+  s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+  Square(s[0][0], sq_128[0]);
+  Square(s[1][0], sq_128[1]);
+  BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq_128, &ma0, b0);
+  sq[0][0] = SetrM128i(sq_128[0][2], sq_128[0][2]);
+  sq[0][1] = SetrM128i(sq_128[0][3], sq_128[0][3]);
+  sq[1][0] = SetrM128i(sq_128[1][2], sq_128[1][2]);
+  sq[1][1] = SetrM128i(sq_128[1][3], sq_128[1][3]);
+  mas[0] = SetrM128i(ma0, ma0);
+  bs[0] = SetrM128i(b0[0], b0[0]);
+  bs[1] = SetrM128i(b0[1], b0[1]);
+
+  int x = 0;
+  do {
+    __m256i ma5[3], ma[2], b[4];
+    BoxFilterPreProcess5(
+        src0 + x + 8, src1 + x + 8,
+        kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), sum_width,
+        x + 8, scale, sum5, square_sum5, sq, mas, bs);
+    Prepare3_8(mas, ma5);
+    ma[0] = Sum565Lo(ma5);
+    ma[1] = Sum565Hi(ma5);
+    StoreAligned64_ma(ma565, ma);
+    Sum565(bs + 0, b + 0);
+    Sum565(bs + 3, b + 2);
+    StoreAligned64(b565, b + 0);
+    StoreAligned64(b565 + 16, b + 2);
+    sq[0][0] = sq[0][6];
+    sq[0][1] = sq[0][7];
+    sq[1][0] = sq[1][6];
+    sq[1][1] = sq[1][7];
+    mas[0] = mas[2];
+    bs[0] = bs[5];
+    bs[1] = bs[6];
+    ma565 += 32;
+    b565 += 32;
+    x += 32;
+  } while (x < width);
+}
+
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+    const uint16_t* const src, const int width, const uint32_t scale,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+ const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343, + uint32_t* b444) { + const ptrdiff_t overread_in_bytes_128 = + kOverreadInBytesPass2_128 - sizeof(*src) * width; + __m128i s[2], ma0, sq_128[4], b0[2]; + __m256i mas[3], sq[8], bs[7]; + s[0] = LoadUnaligned16Msan(src + 0, overread_in_bytes_128 + 0); + s[1] = LoadUnaligned16Msan(src + 8, overread_in_bytes_128 + 16); + Square(s[0], sq_128); + BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq_128, &ma0, b0); + sq[0] = SetrM128i(sq_128[2], sq_128[2]); + sq[1] = SetrM128i(sq_128[3], sq_128[3]); + mas[0] = SetrM128i(ma0, ma0); + bs[0] = SetrM128i(b0[0], b0[0]); + bs[1] = SetrM128i(b0[1], b0[1]); + + int x = 0; + do { + __m256i ma3[3]; + BoxFilterPreProcess3( + src + x + 8, kOverreadInBytesPass2_256 + sizeof(*src) * (x + 8 - width), + x + 8, sum_width, scale, sum3, square_sum3, sq, mas, bs); + Prepare3_8(mas, ma3); + if (calculate444) { // NOLINT(readability-simplify-boolean-expr) + Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444); + Store343_444Hi(ma3, bs + 3, kMaStoreOffset, ma343, ma444, b343, b444); + ma444 += 32; + b444 += 32; + } else { + __m256i ma[2], b[4]; + ma[0] = Sum343Lo(ma3); + ma[1] = Sum343Hi(ma3); + StoreAligned64_ma(ma343, ma); + Sum343(bs + 0, b + 0); + Sum343(bs + 3, b + 2); + StoreAligned64(b343 + 0, b + 0); + StoreAligned64(b343 + 16, b + 2); + } + sq[0] = sq[6]; + sq[1] = sq[7]; + mas[0] = mas[2]; + bs[0] = bs[5]; + bs[1] = bs[6]; + ma343 += 32; + b343 += 32; + x += 32; + } while (x < width); +} + +inline void BoxSumFilterPreProcess( + const uint16_t* const src0, const uint16_t* const src1, const int width, + const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444, + uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444, + uint32_t* b565) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass1_128 - sizeof(*src0) * width; + __m128i s[2][4], ma3_128[2][3], ma5_128[3], sq_128[2][8], b3_128[2][10], + b5_128[10]; + __m256i ma3[2][3], ma5[3], sq[2][8], b3[2][7], b5[7]; + s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0); + s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16); + s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0); + s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16); + Square(s[0][0], sq_128[0]); + Square(s[1][0], sq_128[1]); + BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq_128, + ma3_128, b3_128, &ma5_128[0], b5_128); + sq[0][0] = SetrM128i(sq_128[0][2], sq_128[0][2]); + sq[0][1] = SetrM128i(sq_128[0][3], sq_128[0][3]); + sq[1][0] = SetrM128i(sq_128[1][2], sq_128[1][2]); + sq[1][1] = SetrM128i(sq_128[1][3], sq_128[1][3]); + ma3[0][0] = SetrM128i(ma3_128[0][0], ma3_128[0][0]); + ma3[1][0] = SetrM128i(ma3_128[1][0], ma3_128[1][0]); + ma5[0] = SetrM128i(ma5_128[0], ma5_128[0]); + b3[0][0] = SetrM128i(b3_128[0][0], b3_128[0][0]); + b3[0][1] = SetrM128i(b3_128[0][1], b3_128[0][1]); + b3[1][0] = SetrM128i(b3_128[1][0], b3_128[1][0]); + b3[1][1] = SetrM128i(b3_128[1][1], b3_128[1][1]); + b5[0] = SetrM128i(b5_128[0], b5_128[0]); + b5[1] = SetrM128i(b5_128[1], b5_128[1]); + + int x = 0; + do { + __m256i ma[2], b[4], ma3x[3], ma5x[3]; + BoxFilterPreProcess( + src0 + x + 8, src1 + x + 8, + kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), x + 8, + scales, sum3, sum5, square_sum3, square_sum5, sum_width, sq, ma3, b3, + 
ma5, b5);
+    Prepare3_8(ma3[0], ma3x);
+    ma[0] = Sum343Lo(ma3x);
+    ma[1] = Sum343Hi(ma3x);
+    StoreAligned64_ma(ma343[0] + x, ma);
+    Sum343(b3[0], b);
+    Sum343(b3[0] + 3, b + 2);
+    StoreAligned64(b343[0] + x, b);
+    StoreAligned64(b343[0] + x + 16, b + 2);
+    Prepare3_8(ma3[1], ma3x);
+    Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
+    Store343_444Hi(ma3x, b3[1] + 3, x + kMaStoreOffset, ma343[1], ma444,
+                   b343[1], b444);
+    Prepare3_8(ma5, ma5x);
+    ma[0] = Sum565Lo(ma5x);
+    ma[1] = Sum565Hi(ma5x);
+    StoreAligned64_ma(ma565, ma);
+    Sum565(b5, b);
+    StoreAligned64(b565, b);
+    Sum565(b5 + 3, b);
+    StoreAligned64(b565 + 16, b);
+    sq[0][0] = sq[0][6];
+    sq[0][1] = sq[0][7];
+    sq[1][0] = sq[1][6];
+    sq[1][1] = sq[1][7];
+    ma3[0][0] = ma3[0][2];
+    ma3[1][0] = ma3[1][2];
+    ma5[0] = ma5[2];
+    b3[0][0] = b3[0][5];
+    b3[0][1] = b3[0][6];
+    b3[1][0] = b3[1][5];
+    b3[1][1] = b3[1][6];
+    b5[0] = b5[5];
+    b5[1] = b5[6];
+    ma565 += 32;
+    b565 += 32;
+    x += 32;
+  } while (x < width);
+}
+
+template <int shift>
+inline __m256i FilterOutput(const __m256i ma_x_src, const __m256i b) {
+  // ma: 255 * 32 = 8160 (13 bits)
+  // b: 65088 * 32 = 2082816 (21 bits)
+  // v: b - ma * 255 (22 bits)
+  const __m256i v = _mm256_sub_epi32(b, ma_x_src);
+  // kSgrProjSgrBits = 8
+  // kSgrProjRestoreBits = 4
+  // shift = 4 or 5
+  // v >> 8 or 9 (13 bits)
+  return VrshrS32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <int shift>
+inline __m256i CalculateFilteredOutput(const __m256i src, const __m256i ma,
+                                       const __m256i b[2]) {
+  const __m256i ma_x_src_lo = VmullLo16(ma, src);
+  const __m256i ma_x_src_hi = VmullHi16(ma, src);
+  const __m256i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
+  const __m256i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
+  return _mm256_packs_epi32(dst_lo, dst_hi);  // 13 bits
+}
+
+inline __m256i CalculateFilteredOutputPass1(const __m256i src,
+                                            const __m256i ma[2],
+                                            const __m256i b[2][2]) {
+  const __m256i ma_sum = _mm256_add_epi16(ma[0], ma[1]);
+  __m256i b_sum[2];
+  b_sum[0] = _mm256_add_epi32(b[0][0], b[1][0]);
+  b_sum[1] = _mm256_add_epi32(b[0][1], b[1][1]);
+  return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m256i CalculateFilteredOutputPass2(const __m256i src,
+                                            const __m256i ma[3],
+                                            const __m256i b[3][2]) {
+  const __m256i ma_sum = Sum3_16(ma);
+  __m256i b_sum[2];
+  Sum3_32(b, b_sum);
+  return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m256i SelfGuidedFinal(const __m256i src, const __m256i v[2]) {
+  const __m256i v_lo =
+      VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  const __m256i v_hi =
+      VrshrS32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  const __m256i vv = _mm256_packs_epi32(v_lo, v_hi);
+  return _mm256_add_epi16(src, vv);
+}
+
+inline __m256i SelfGuidedDoubleMultiplier(const __m256i src,
+                                          const __m256i filter[2], const int w0,
+                                          const int w2) {
+  __m256i v[2];
+  const __m256i w0_w2 =
+      _mm256_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0));
+  const __m256i f_lo = _mm256_unpacklo_epi16(filter[0], filter[1]);
+  const __m256i f_hi = _mm256_unpackhi_epi16(filter[0], filter[1]);
+  v[0] = _mm256_madd_epi16(w0_w2, f_lo);
+  v[1] = _mm256_madd_epi16(w0_w2, f_hi);
+  return SelfGuidedFinal(src, v);
+}
+
+inline __m256i SelfGuidedSingleMultiplier(const __m256i src,
+                                          const __m256i filter, const int w0) {
+  // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+  __m256i v[2];
+  v[0] = VmullNLo8(filter, w0);
+  v[1] = VmullNHi8(filter, w0);
+  return SelfGuidedFinal(src, v);
+}
+
+inline void ClipAndStore(uint16_t* const dst, const __m256i val) {
+  const
__m256i val0 = _mm256_max_epi16(val, _mm256_setzero_si256()); + const __m256i val1 = _mm256_min_epi16(val0, _mm256_set1_epi16(1023)); + StoreUnaligned32(dst, val1); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPass1( + const uint16_t* const src, const uint16_t* const src0, + const uint16_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5], + uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width, + const uint32_t scale, const int16_t w0, uint16_t* const ma565[2], + uint32_t* const b565[2], uint16_t* const dst) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass1_128 - sizeof(*src0) * width; + __m128i s[2][2], ma0, sq_128[2][4], b0[2]; + __m256i mas[3], sq[2][8], bs[7]; + s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0); + s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16); + s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0); + s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16); + Square(s[0][0], sq_128[0]); + Square(s[1][0], sq_128[1]); + BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq_128, &ma0, b0); + sq[0][0] = SetrM128i(sq_128[0][2], sq_128[0][2]); + sq[0][1] = SetrM128i(sq_128[0][3], sq_128[0][3]); + sq[1][0] = SetrM128i(sq_128[1][2], sq_128[1][2]); + sq[1][1] = SetrM128i(sq_128[1][3], sq_128[1][3]); + mas[0] = SetrM128i(ma0, ma0); + bs[0] = SetrM128i(b0[0], b0[0]); + bs[1] = SetrM128i(b0[1], b0[1]); + + int x = 0; + do { + __m256i ma5[3], ma[4], b[4][2]; + BoxFilterPreProcess5( + src0 + x + 8, src1 + x + 8, + kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), sum_width, + x + 8, scale, sum5, square_sum5, sq, mas, bs); + Prepare3_8(mas, ma5); + ma[2] = Sum565Lo(ma5); + ma[3] = Sum565Hi(ma5); + ma[1] = _mm256_permute2x128_si256(ma[2], ma[3], 0x20); + ma[3] = _mm256_permute2x128_si256(ma[2], ma[3], 0x31); + StoreAligned32(ma565[1] + x + 0, ma[1]); + StoreAligned32(ma565[1] + x + 16, ma[3]); + Sum565(bs + 0, b[1]); + Sum565(bs + 3, b[3]); + StoreAligned64(b565[1] + x, b[1]); + StoreAligned64(b565[1] + x + 16, b[3]); + const __m256i sr0_lo = LoadUnaligned32(src + x + 0); + ma[0] = LoadAligned32(ma565[0] + x); + LoadAligned64(b565[0] + x, b[0]); + const __m256i p0 = CalculateFilteredOutputPass1(sr0_lo, ma, b); + const __m256i d0 = SelfGuidedSingleMultiplier(sr0_lo, p0, w0); + ClipAndStore(dst + x + 0, d0); + const __m256i sr0_hi = LoadUnaligned32(src + x + 16); + ma[2] = LoadAligned32(ma565[0] + x + 16); + LoadAligned64(b565[0] + x + 16, b[2]); + const __m256i p1 = CalculateFilteredOutputPass1(sr0_hi, ma + 2, b + 2); + const __m256i d1 = SelfGuidedSingleMultiplier(sr0_hi, p1, w0); + ClipAndStore(dst + x + 16, d1); + const __m256i sr1_lo = LoadUnaligned32(src + stride + x + 0); + const __m256i p10 = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[1]); + const __m256i d10 = SelfGuidedSingleMultiplier(sr1_lo, p10, w0); + ClipAndStore(dst + stride + x + 0, d10); + const __m256i sr1_hi = LoadUnaligned32(src + stride + x + 16); + const __m256i p11 = CalculateFilteredOutput<4>(sr1_hi, ma[3], b[3]); + const __m256i d11 = SelfGuidedSingleMultiplier(sr1_hi, p11, w0); + ClipAndStore(dst + stride + x + 16, d11); + sq[0][0] = sq[0][6]; + sq[0][1] = sq[0][7]; + sq[1][0] = sq[1][6]; + sq[1][1] = sq[1][7]; + mas[0] = mas[2]; + bs[0] = bs[5]; + bs[1] = bs[6]; + x += 32; + } while (x < width); +} + +inline void BoxFilterPass1LastRow( + const uint16_t* const src, const uint16_t* const src0, const int width, + const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0, + uint16_t* const sum5[5], 
uint32_t* const square_sum5[5], uint16_t* ma565, + uint32_t* b565, uint16_t* const dst) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass1_128 - sizeof(*src0) * width; + __m128i s[2], ma0[2], sq_128[8], b0[6]; + __m256i mas[3], sq[8], bs[7]; + s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0); + s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16); + Square(s[0], sq_128); + BoxFilterPreProcess5LastRowLo(s, scale, sum5, square_sum5, sq_128, &ma0[0], + b0); + sq[0] = SetrM128i(sq_128[2], sq_128[2]); + sq[1] = SetrM128i(sq_128[3], sq_128[3]); + mas[0] = SetrM128i(ma0[0], ma0[0]); + bs[0] = SetrM128i(b0[0], b0[0]); + bs[1] = SetrM128i(b0[1], b0[1]); + + int x = 0; + do { + __m256i ma5[3], ma[4], b[4][2]; + BoxFilterPreProcess5LastRow( + src0 + x + 8, + kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), sum_width, + x + 8, scale, sum5, square_sum5, sq, mas, bs); + Prepare3_8(mas, ma5); + ma[2] = Sum565Lo(ma5); + ma[3] = Sum565Hi(ma5); + Sum565(bs + 0, b[1]); + Sum565(bs + 3, b[3]); + const __m256i sr0_lo = LoadUnaligned32(src + x + 0); + ma[0] = LoadAligned32(ma565 + x); + ma[1] = _mm256_permute2x128_si256(ma[2], ma[3], 0x20); + LoadAligned64(b565 + x, b[0]); + const __m256i p0 = CalculateFilteredOutputPass1(sr0_lo, ma, b); + const __m256i d0 = SelfGuidedSingleMultiplier(sr0_lo, p0, w0); + ClipAndStore(dst + x + 0, d0); + const __m256i sr0_hi = LoadUnaligned32(src + x + 16); + ma[0] = LoadAligned32(ma565 + x + 16); + ma[1] = _mm256_permute2x128_si256(ma[2], ma[3], 0x31); + LoadAligned64(b565 + x + 16, b[2]); + const __m256i p1 = CalculateFilteredOutputPass1(sr0_hi, ma, b + 2); + const __m256i d1 = SelfGuidedSingleMultiplier(sr0_hi, p1, w0); + ClipAndStore(dst + x + 16, d1); + sq[0] = sq[6]; + sq[1] = sq[7]; + mas[0] = mas[2]; + bs[0] = bs[5]; + bs[1] = bs[6]; + x += 32; + } while (x < width); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPass2( + const uint16_t* const src, const uint16_t* const src0, const int width, + const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0, + uint16_t* const sum3[3], uint32_t* const square_sum3[3], + uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3], + uint32_t* const b444[2], uint16_t* const dst) { + const ptrdiff_t overread_in_bytes_128 = + kOverreadInBytesPass2_128 - sizeof(*src0) * width; + __m128i s0[2], ma0, sq_128[4], b0[2]; + __m256i mas[3], sq[8], bs[7]; + s0[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes_128 + 0); + s0[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes_128 + 16); + Square(s0[0], sq_128); + BoxFilterPreProcess3Lo(s0, scale, sum3, square_sum3, sq_128, &ma0, b0); + sq[0] = SetrM128i(sq_128[2], sq_128[2]); + sq[1] = SetrM128i(sq_128[3], sq_128[3]); + mas[0] = SetrM128i(ma0, ma0); + bs[0] = SetrM128i(b0[0], b0[0]); + bs[1] = SetrM128i(b0[1], b0[1]); + + int x = 0; + do { + __m256i ma[4], b[4][2], ma3[3]; + BoxFilterPreProcess3( + src0 + x + 8, + kOverreadInBytesPass2_256 + sizeof(*src0) * (x + 8 - width), x + 8, + sum_width, scale, sum3, square_sum3, sq, mas, bs); + Prepare3_8(mas, ma3); + Store343_444(ma3, bs, x, &ma[2], &ma[3], b[2], b[3], ma343[2], ma444[1], + b343[2], b444[1]); + const __m256i sr_lo = LoadUnaligned32(src + x + 0); + const __m256i sr_hi = LoadUnaligned32(src + x + 16); + ma[0] = LoadAligned32(ma343[0] + x); + ma[1] = LoadAligned32(ma444[0] + x); + LoadAligned64(b343[0] + x, b[0]); + LoadAligned64(b444[0] + x, b[1]); + const __m256i p0 = CalculateFilteredOutputPass2(sr_lo, ma, b); + ma[1] = LoadAligned32(ma343[0] + x + 16); + ma[2] = 
LoadAligned32(ma444[0] + x + 16); + LoadAligned64(b343[0] + x + 16, b[1]); + LoadAligned64(b444[0] + x + 16, b[2]); + const __m256i p1 = CalculateFilteredOutputPass2(sr_hi, ma + 1, b + 1); + const __m256i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0); + const __m256i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0); + ClipAndStore(dst + x + 0, d0); + ClipAndStore(dst + x + 16, d1); + sq[0] = sq[6]; + sq[1] = sq[7]; + mas[0] = mas[2]; + bs[0] = bs[5]; + bs[1] = bs[6]; + x += 32; + } while (x < width); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilter( + const uint16_t* const src, const uint16_t* const src0, + const uint16_t* const src1, const ptrdiff_t stride, const int width, + const uint16_t scales[2], const int16_t w0, const int16_t w2, + uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + const ptrdiff_t sum_width, uint16_t* const ma343[4], + uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4], + uint32_t* const b444[3], uint32_t* const b565[2], uint16_t* const dst) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass1_128 - sizeof(*src0) * width; + __m128i s[2][4], ma3_128[2][3], ma5_0, sq_128[2][8], b3_128[2][10], b5_128[2]; + __m256i ma3[2][3], ma5[3], sq[2][8], b3[2][7], b5[7]; + s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0); + s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16); + s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0); + s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16); + Square(s[0][0], sq_128[0]); + Square(s[1][0], sq_128[1]); + BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq_128, + ma3_128, b3_128, &ma5_0, b5_128); + sq[0][0] = SetrM128i(sq_128[0][2], sq_128[0][2]); + sq[0][1] = SetrM128i(sq_128[0][3], sq_128[0][3]); + sq[1][0] = SetrM128i(sq_128[1][2], sq_128[1][2]); + sq[1][1] = SetrM128i(sq_128[1][3], sq_128[1][3]); + ma3[0][0] = SetrM128i(ma3_128[0][0], ma3_128[0][0]); + ma3[1][0] = SetrM128i(ma3_128[1][0], ma3_128[1][0]); + ma5[0] = SetrM128i(ma5_0, ma5_0); + b3[0][0] = SetrM128i(b3_128[0][0], b3_128[0][0]); + b3[0][1] = SetrM128i(b3_128[0][1], b3_128[0][1]); + b3[1][0] = SetrM128i(b3_128[1][0], b3_128[1][0]); + b3[1][1] = SetrM128i(b3_128[1][1], b3_128[1][1]); + b5[0] = SetrM128i(b5_128[0], b5_128[0]); + b5[1] = SetrM128i(b5_128[1], b5_128[1]); + + int x = 0; + do { + __m256i ma[3][4], mat[3][3], b[3][3][2], bt[3][3][2], p[2][2], ma3x[2][3], + ma5x[3]; + BoxFilterPreProcess( + src0 + x + 8, src1 + x + 8, + kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), x + 8, + scales, sum3, sum5, square_sum3, square_sum5, sum_width, sq, ma3, b3, + ma5, b5); + Prepare3_8(ma3[0], ma3x[0]); + Prepare3_8(ma3[1], ma3x[1]); + Prepare3_8(ma5, ma5x); + Store343_444(ma3x[0], b3[0], x, &ma[1][2], &mat[1][2], &ma[2][1], + &mat[2][1], b[1][2], bt[1][2], b[2][1], bt[2][1], ma343[2], + ma444[1], b343[2], b444[1]); + Store343_444(ma3x[1], b3[1], x, &ma[2][2], &mat[2][2], b[2][2], bt[2][2], + ma343[3], ma444[2], b343[3], b444[2]); + + ma[0][2] = Sum565Lo(ma5x); + ma[0][3] = Sum565Hi(ma5x); + ma[0][1] = _mm256_permute2x128_si256(ma[0][2], ma[0][3], 0x20); + ma[0][3] = _mm256_permute2x128_si256(ma[0][2], ma[0][3], 0x31); + StoreAligned32(ma565[1] + x + 0, ma[0][1]); + StoreAligned32(ma565[1] + x + 16, ma[0][3]); + Sum565(b5, b[0][1]); + StoreAligned64(b565[1] + x, b[0][1]); + const __m256i sr0_lo = LoadUnaligned32(src + x); + const __m256i sr1_lo = LoadUnaligned32(src + stride + x); + ma[0][0] = 
LoadAligned32(ma565[0] + x);
+    LoadAligned64(b565[0] + x, b[0][0]);
+    p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]);
+    p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]);
+    ma[1][0] = LoadAligned32(ma343[0] + x);
+    ma[1][1] = LoadAligned32(ma444[0] + x);
+    // Keeping the following 4 redundant lines is faster: there are not enough
+    // registers available, so without them these values would have to be
+    // spilled to memory and reloaded, which is even slower.
+    ma[1][2] = LoadAligned32(ma343[2] + x);  // Redundant line 1.
+    LoadAligned64(b343[0] + x, b[1][0]);
+    LoadAligned64(b444[0] + x, b[1][1]);
+    p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]);
+    ma[2][0] = LoadAligned32(ma343[1] + x);
+    ma[2][1] = LoadAligned32(ma444[1] + x);  // Redundant line 2.
+    LoadAligned64(b343[1] + x, b[2][0]);
+    p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]);
+    const __m256i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2);
+    ClipAndStore(dst + x, d00);
+    const __m256i d10x = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2);
+    ClipAndStore(dst + stride + x, d10x);
+
+    Sum565(b5 + 3, bt[0][1]);
+    StoreAligned64(b565[1] + x + 16, bt[0][1]);
+    const __m256i sr0_hi = LoadUnaligned32(src + x + 16);
+    const __m256i sr1_hi = LoadUnaligned32(src + stride + x + 16);
+    ma[0][2] = LoadAligned32(ma565[0] + x + 16);
+    LoadAligned64(b565[0] + x + 16, bt[0][0]);
+    p[0][0] = CalculateFilteredOutputPass1(sr0_hi, ma[0] + 2, bt[0]);
+    p[1][0] = CalculateFilteredOutput<4>(sr1_hi, ma[0][3], bt[0][1]);
+    mat[1][0] = LoadAligned32(ma343[0] + x + 16);
+    mat[1][1] = LoadAligned32(ma444[0] + x + 16);
+    mat[1][2] = LoadAligned32(ma343[2] + x + 16);  // Redundant line 3.
+    LoadAligned64(b343[0] + x + 16, bt[1][0]);
+    LoadAligned64(b444[0] + x + 16, bt[1][1]);
+    p[0][1] = CalculateFilteredOutputPass2(sr0_hi, mat[1], bt[1]);
+    mat[2][0] = LoadAligned32(ma343[1] + x + 16);
+    mat[2][1] = LoadAligned32(ma444[1] + x + 16);  // Redundant line 4.
+ LoadAligned64(b343[1] + x + 16, bt[2][0]); + p[1][1] = CalculateFilteredOutputPass2(sr1_hi, mat[2], bt[2]); + const __m256i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2); + ClipAndStore(dst + x + 16, d01); + const __m256i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2); + ClipAndStore(dst + stride + x + 16, d11); + + sq[0][0] = sq[0][6]; + sq[0][1] = sq[0][7]; + sq[1][0] = sq[1][6]; + sq[1][1] = sq[1][7]; + ma3[0][0] = ma3[0][2]; + ma3[1][0] = ma3[1][2]; + ma5[0] = ma5[2]; + b3[0][0] = b3[0][5]; + b3[0][1] = b3[0][6]; + b3[1][0] = b3[1][5]; + b3[1][1] = b3[1][6]; + b5[0] = b5[5]; + b5[1] = b5[6]; + x += 32; + } while (x < width); +} + +inline void BoxFilterLastRow( + const uint16_t* const src, const uint16_t* const src0, const int width, + const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0, + const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565, + uint32_t* const b343, uint32_t* const b444, uint32_t* const b565, + uint16_t* const dst) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass1_128 - sizeof(*src0) * width; + __m128i s[2], ma3_0, ma5_0, sq_128[4], b3_128[2], b5_128[2]; + __m256i ma3[3], ma5[3], sq[8], b3[7], b5[7]; + s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0); + s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16); + Square(s[0], sq_128); + BoxFilterPreProcessLastRowLo(s, scales, sum3, sum5, square_sum3, square_sum5, + sq_128, &ma3_0, &ma5_0, b3_128, b5_128); + sq[0] = SetrM128i(sq_128[2], sq_128[2]); + sq[1] = SetrM128i(sq_128[3], sq_128[3]); + ma3[0] = SetrM128i(ma3_0, ma3_0); + ma5[0] = SetrM128i(ma5_0, ma5_0); + b3[0] = SetrM128i(b3_128[0], b3_128[0]); + b3[1] = SetrM128i(b3_128[1], b3_128[1]); + b5[0] = SetrM128i(b5_128[0], b5_128[0]); + b5[1] = SetrM128i(b5_128[1], b5_128[1]); + + int x = 0; + do { + __m256i ma[4], mat[4], b[3][2], bt[3][2], ma3x[3], ma5x[3], p[2]; + BoxFilterPreProcessLastRow( + src0 + x + 8, + kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), sum_width, + x + 8, scales, sum3, sum5, square_sum3, square_sum5, sq, ma3, ma5, b3, + b5); + Prepare3_8(ma3, ma3x); + Prepare3_8(ma5, ma5x); + ma[2] = Sum565Lo(ma5x); + Sum565(b5, b[1]); + mat[1] = Sum565Hi(ma5x); + Sum565(b5 + 3, bt[1]); + ma[3] = Sum343Lo(ma3x); + Sum343(b3, b[2]); + mat[2] = Sum343Hi(ma3x); + Sum343(b3 + 3, bt[2]); + + const __m256i sr_lo = LoadUnaligned32(src + x); + ma[0] = LoadAligned32(ma565 + x); + ma[1] = _mm256_permute2x128_si256(ma[2], mat[1], 0x20); + mat[1] = _mm256_permute2x128_si256(ma[2], mat[1], 0x31); + LoadAligned64(b565 + x, b[0]); + p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b); + ma[0] = LoadAligned32(ma343 + x); + ma[1] = LoadAligned32(ma444 + x); + ma[2] = _mm256_permute2x128_si256(ma[3], mat[2], 0x20); + LoadAligned64(b343 + x, b[0]); + LoadAligned64(b444 + x, b[1]); + p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b); + const __m256i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2); + + const __m256i sr_hi = LoadUnaligned32(src + x + 16); + mat[0] = LoadAligned32(ma565 + x + 16); + LoadAligned64(b565 + x + 16, bt[0]); + p[0] = CalculateFilteredOutputPass1(sr_hi, mat, bt); + mat[0] = LoadAligned32(ma343 + x + 16); + mat[1] = LoadAligned32(ma444 + x + 16); + mat[2] = _mm256_permute2x128_si256(ma[3], mat[2], 0x31); + LoadAligned64(b343 + x + 16, bt[0]); + LoadAligned64(b444 + x + 16, bt[1]); + p[1] = CalculateFilteredOutputPass2(sr_hi, mat, 
bt);
+    const __m256i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2);
+    ClipAndStore(dst + x + 0, d0);
+    ClipAndStore(dst + x + 16, d1);
+
+    sq[0] = sq[6];
+    sq[1] = sq[7];
+    ma3[0] = ma3[2];
+    ma5[0] = ma5[2];
+    b3[0] = b3[5];
+    b3[1] = b3[6];
+    b5[0] = b5[5];
+    b5[1] = b5[6];
+    x += 32;
+  } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
+    const RestorationUnitInfo& restoration_info, const uint16_t* src,
+    const ptrdiff_t stride, const uint16_t* const top_border,
+    const ptrdiff_t top_border_stride, const uint16_t* bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    SgrBuffer* const sgr_buffer, uint16_t* dst) {
+  const auto temp_stride = Align<ptrdiff_t>(width, 32);
+  const auto sum_width = temp_stride + 8;
+  const auto sum_stride = temp_stride + 32;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index];  // < 2^12.
+  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+  const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+  uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+  uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
+  sum3[0] = sgr_buffer->sum3 + kSumOffset;
+  square_sum3[0] = sgr_buffer->square_sum3 + kSumOffset;
+  ma343[0] = sgr_buffer->ma343;
+  b343[0] = sgr_buffer->b343;
+  for (int i = 1; i <= 3; ++i) {
+    sum3[i] = sum3[i - 1] + sum_stride;
+    square_sum3[i] = square_sum3[i - 1] + sum_stride;
+    ma343[i] = ma343[i - 1] + temp_stride;
+    b343[i] = b343[i - 1] + temp_stride;
+  }
+  sum5[0] = sgr_buffer->sum5 + kSumOffset;
+  square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+  for (int i = 1; i <= 4; ++i) {
+    sum5[i] = sum5[i - 1] + sum_stride;
+    square_sum5[i] = square_sum5[i - 1] + sum_stride;
+  }
+  ma444[0] = sgr_buffer->ma444;
+  b444[0] = sgr_buffer->b444;
+  for (int i = 1; i <= 2; ++i) {
+    ma444[i] = ma444[i - 1] + temp_stride;
+    b444[i] = b444[i - 1] + temp_stride;
+  }
+  ma565[0] = sgr_buffer->ma565;
+  ma565[1] = ma565[0] + temp_stride;
+  b565[0] = sgr_buffer->b565;
+  b565[1] = b565[0] + temp_stride;
+  assert(scales[0] != 0);
+  assert(scales[1] != 0);
+  BoxSum(top_border, top_border_stride, width, sum_stride, temp_stride, sum3[0],
+         sum5[1], square_sum3[0], square_sum5[1]);
+  sum5[0] = sum5[1];
+  square_sum5[0] = square_sum5[1];
+  const uint16_t* const s = (height > 1) ? src + stride : bottom_border;
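+  // When |height| is 1 there is no second source row below |src|, so the
+  // first row of |bottom_border| stands in for it.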
+  BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
+                         square_sum5, sum_width, ma343, ma444[0], ma565[0],
+                         b343, b444[0], b565[0]);
+  sum5[0] = sgr_buffer->sum5 + kSumOffset;
+  square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+
+  for (int y = (height >> 1) - 1; y > 0; --y) {
+    Circulate4PointersBy2(sum3);
+    Circulate4PointersBy2(square_sum3);
+    Circulate5PointersBy2(sum5);
+    Circulate5PointersBy2(square_sum5);
+    BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
+              scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width,
+              ma343, ma444, ma565, b343, b444, b565, dst);
+    src += 2 * stride;
+    dst += 2 * stride;
+    Circulate4PointersBy2(ma343);
+    Circulate4PointersBy2(b343);
+    std::swap(ma444[0], ma444[2]);
+    std::swap(b444[0], b444[2]);
+    std::swap(ma565[0], ma565[1]);
+    std::swap(b565[0], b565[1]);
+  }
+
+  Circulate4PointersBy2(sum3);
+  Circulate4PointersBy2(square_sum3);
+  Circulate5PointersBy2(sum5);
+  Circulate5PointersBy2(square_sum5);
+  if ((height & 1) == 0 || height > 1) {
+    const uint16_t* sr[2];
+    if ((height & 1) == 0) {
+      sr[0] = bottom_border;
+      sr[1] = bottom_border + bottom_border_stride;
+    } else {
+      sr[0] = src + 2 * stride;
+      sr[1] = bottom_border;
+    }
+    BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
+              square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343,
+              b444, b565, dst);
+  }
+  if ((height & 1) != 0) {
+    if (height > 1) {
+      src += 2 * stride;
+      dst += 2 * stride;
+      Circulate4PointersBy2(sum3);
+      Circulate4PointersBy2(square_sum3);
+      Circulate5PointersBy2(sum5);
+      Circulate5PointersBy2(square_sum5);
+      Circulate4PointersBy2(ma343);
+      Circulate4PointersBy2(b343);
+      std::swap(ma444[0], ma444[2]);
+      std::swap(b444[0], b444[2]);
+      std::swap(ma565[0], ma565[1]);
+      std::swap(b565[0], b565[1]);
+    }
+    BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width,
+                     sum_width, scales, w0, w2, sum3, sum5, square_sum3,
+                     square_sum5, ma343[0], ma444[0], ma565[0], b343[0],
+                     b444[0], b565[0], dst);
+  }
+}
+
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+                                  const uint16_t* src, const ptrdiff_t stride,
+                                  const uint16_t* const top_border,
+                                  const ptrdiff_t top_border_stride,
+                                  const uint16_t* bottom_border,
+                                  const ptrdiff_t bottom_border_stride,
+                                  const int width, const int height,
+                                  SgrBuffer* const sgr_buffer, uint16_t* dst) {
+  const auto temp_stride = Align<ptrdiff_t>(width, 32);
+  const auto sum_width = temp_stride + 8;
+  const auto sum_stride = temp_stride + 32;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0];  // < 2^12.
+  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+  uint16_t *sum5[5], *ma565[2];
+  uint32_t *square_sum5[5], *b565[2];
+  sum5[0] = sgr_buffer->sum5 + kSumOffset;
+  square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+  for (int i = 1; i <= 4; ++i) {
+    sum5[i] = sum5[i - 1] + sum_stride;
+    square_sum5[i] = square_sum5[i - 1] + sum_stride;
+  }
+  ma565[0] = sgr_buffer->ma565;
+  ma565[1] = ma565[0] + temp_stride;
+  b565[0] = sgr_buffer->b565;
+  b565[1] = b565[0] + temp_stride;
+  assert(scale != 0);
+  BoxSum<5>(top_border, top_border_stride, width, sum_stride, temp_stride,
+            sum5[1], square_sum5[1]);
+  sum5[0] = sum5[1];
+  square_sum5[0] = square_sum5[1];
+  const uint16_t* const s = (height > 1) ? src + stride : bottom_border;
+  BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width,
+                          ma565[0], b565[0]);
+  sum5[0] = sgr_buffer->sum5 + kSumOffset;
+  square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+
+  for (int y = (height >> 1) - 1; y > 0; --y) {
+    Circulate5PointersBy2(sum5);
+    Circulate5PointersBy2(square_sum5);
+    BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
+                   square_sum5, width, sum_width, scale, w0, ma565, b565, dst);
+    src += 2 * stride;
+    dst += 2 * stride;
+    std::swap(ma565[0], ma565[1]);
+    std::swap(b565[0], b565[1]);
+  }
+
+  Circulate5PointersBy2(sum5);
+  Circulate5PointersBy2(square_sum5);
+  if ((height & 1) == 0 || height > 1) {
+    const uint16_t* sr[2];
+    if ((height & 1) == 0) {
+      sr[0] = bottom_border;
+      sr[1] = bottom_border + bottom_border_stride;
+    } else {
+      sr[0] = src + 2 * stride;
+      sr[1] = bottom_border;
+    }
+    BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
+                   sum_width, scale, w0, ma565, b565, dst);
+  }
+  if ((height & 1) != 0) {
+    src += 3;
+    if (height > 1) {
+      src += 2 * stride;
+      dst += 2 * stride;
+      std::swap(ma565[0], ma565[1]);
+      std::swap(b565[0], b565[1]);
+      Circulate5PointersBy2(sum5);
+      Circulate5PointersBy2(square_sum5);
+    }
+    BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width,
+                          sum_width, scale, w0, sum5, square_sum5, ma565[0],
+                          b565[0], dst);
+  }
+}
+
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+                                  const uint16_t* src, const ptrdiff_t stride,
+                                  const uint16_t* const top_border,
+                                  const ptrdiff_t top_border_stride,
+                                  const uint16_t* bottom_border,
+                                  const ptrdiff_t bottom_border_stride,
+                                  const int width, const int height,
+                                  SgrBuffer* const sgr_buffer, uint16_t* dst) {
+  assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+  const auto temp_stride = Align<ptrdiff_t>(width, 32);
+  const auto sum_width = temp_stride + 8;
+  const auto sum_stride = temp_stride + 32;
+  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+  const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1];  // < 2^12.
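+  // The row pointer arrays set up below act as small ring buffers over the
+  // SgrBuffer scratch rows: Circulate3PointersBy1() and std::swap() rotate
+  // them so each new row reuses the storage of the oldest one.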
+  uint16_t *sum3[3], *ma343[3], *ma444[2];
+  uint32_t *square_sum3[3], *b343[3], *b444[2];
+  sum3[0] = sgr_buffer->sum3 + kSumOffset;
+  square_sum3[0] = sgr_buffer->square_sum3 + kSumOffset;
+  ma343[0] = sgr_buffer->ma343;
+  b343[0] = sgr_buffer->b343;
+  for (int i = 1; i <= 2; ++i) {
+    sum3[i] = sum3[i - 1] + sum_stride;
+    square_sum3[i] = square_sum3[i - 1] + sum_stride;
+    ma343[i] = ma343[i - 1] + temp_stride;
+    b343[i] = b343[i - 1] + temp_stride;
+  }
+  ma444[0] = sgr_buffer->ma444;
+  ma444[1] = ma444[0] + temp_stride;
+  b444[0] = sgr_buffer->b444;
+  b444[1] = b444[0] + temp_stride;
+  assert(scale != 0);
+  BoxSum<3>(top_border, top_border_stride, width, sum_stride, temp_stride,
+            sum3[0], square_sum3[0]);
+  BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
+                                 sum_width, ma343[0], nullptr, b343[0],
+                                 nullptr);
+  Circulate3PointersBy1(sum3);
+  Circulate3PointersBy1(square_sum3);
+  const uint16_t* s;
+  if (height > 1) {
+    s = src + stride;
+  } else {
+    s = bottom_border;
+    bottom_border += bottom_border_stride;
+  }
+  BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
+                                ma343[1], ma444[0], b343[1], b444[0]);
+
+  for (int y = height - 2; y > 0; --y) {
+    Circulate3PointersBy1(sum3);
+    Circulate3PointersBy1(square_sum3);
+    BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3,
+                   square_sum3, ma343, ma444, b343, b444, dst);
+    src += stride;
+    dst += stride;
+    Circulate3PointersBy1(ma343);
+    Circulate3PointersBy1(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  }
+
+  int y = std::min(height, 2);
+  src += 2;
+  do {
+    Circulate3PointersBy1(sum3);
+    Circulate3PointersBy1(square_sum3);
+    BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3,
+                   square_sum3, ma343, ma444, b343, b444, dst);
+    src += stride;
+    dst += stride;
+    bottom_border += bottom_border_stride;
+    Circulate3PointersBy1(ma343);
+    Circulate3PointersBy1(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  } while (--y != 0);
+}
+
+// If |width| is not a multiple of 32, up to 31 more pixels are written to
+// |dest| at the end of each row. It is safe to overwrite the output as it will
+// not be part of the visible frame.
+void SelfGuidedFilter_AVX2(
+    const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+    const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_border,
+    const ptrdiff_t top_border_stride,
+    const void* LIBGAV1_RESTRICT const bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+    void* LIBGAV1_RESTRICT const dest) {
+  const int index = restoration_info.sgr_proj_info.index;
+  const int radius_pass_0 = kSgrProjParams[index][0];  // 2 or 0
+  const int radius_pass_1 = kSgrProjParams[index][2];  // 1 or 0
+  const auto* const src = static_cast<const uint16_t*>(source);
+  const auto* const top = static_cast<const uint16_t*>(top_border);
+  const auto* const bottom = static_cast<const uint16_t*>(bottom_border);
+  auto* const dst = static_cast<uint16_t*>(dest);
+  SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+  if (radius_pass_1 == 0) {
+    // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+    // following assertion.
+ assert(radius_pass_0 != 0); + BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3, + top_border_stride, bottom - 3, bottom_border_stride, + width, height, sgr_buffer, dst); + } else if (radius_pass_0 == 0) { + BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2, + top_border_stride, bottom - 2, bottom_border_stride, + width, height, sgr_buffer, dst); + } else { + BoxFilterProcess(restoration_info, src - 3, stride, top - 3, + top_border_stride, bottom - 3, bottom_border_stride, width, + height, sgr_buffer, dst); + } +} + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); +#if DSP_ENABLED_10BPP_AVX2(WienerFilter) + dsp->loop_restorations[0] = WienerFilter_AVX2; +#endif +#if DSP_ENABLED_10BPP_AVX2(SelfGuidedFilter) + dsp->loop_restorations[1] = SelfGuidedFilter_AVX2; +#endif +} + +} // namespace + +void LoopRestorationInit10bpp_AVX2() { Init10bpp(); } + +} // namespace dsp +} // namespace libgav1 + +#else // !(LIBGAV1_TARGETING_AVX2 && LIBGAV1_MAX_BITDEPTH >= 10) +namespace libgav1 { +namespace dsp { + +void LoopRestorationInit10bpp_AVX2() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_AVX2 && LIBGAV1_MAX_BITDEPTH >= 10 diff --git a/src/dsp/x86/loop_restoration_10bit_sse4.cc b/src/dsp/x86/loop_restoration_10bit_sse4.cc new file mode 100644 index 0000000..6625d51 --- /dev/null +++ b/src/dsp/x86/loop_restoration_10bit_sse4.cc @@ -0,0 +1,2536 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
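+// (This file provides the SSE4.1 build of the same 10-bit loop restoration
+// filters; it largely mirrors the AVX2 implementation above using 128-bit
+// vectors.)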
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+inline void WienerHorizontalClip(const __m128i s[2],
+                                 int16_t* const wiener_buffer) {
+  constexpr int offset =
+      1 << (10 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+  constexpr int limit = (offset << 2) - 1;
+  const __m128i offsets = _mm_set1_epi16(-offset);
+  const __m128i limits = _mm_set1_epi16(limit - offset);
+  const __m128i round = _mm_set1_epi32(1 << (kInterRoundBitsHorizontal - 1));
+  const __m128i sum0 = _mm_add_epi32(s[0], round);
+  const __m128i sum1 = _mm_add_epi32(s[1], round);
+  const __m128i rounded_sum0 = _mm_srai_epi32(sum0, kInterRoundBitsHorizontal);
+  const __m128i rounded_sum1 = _mm_srai_epi32(sum1, kInterRoundBitsHorizontal);
+  const __m128i rounded_sum = _mm_packs_epi32(rounded_sum0, rounded_sum1);
+  const __m128i d0 = _mm_max_epi16(rounded_sum, offsets);
+  const __m128i d1 = _mm_min_epi16(d0, limits);
+  StoreAligned16(wiener_buffer, d1);
+}
+
+inline void WienerHorizontalTap7(const uint16_t* src,
+                                 const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 const __m128i coefficients,
+                                 int16_t** const wiener_buffer) {
+  __m128i filter[2];
+  filter[0] = _mm_shuffle_epi32(coefficients, 0x0);
+  filter[1] = _mm_shuffle_epi32(coefficients, 0x55);
+  for (int y = height; y != 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m128i s[7], madds[4];
+      s[0] = LoadUnaligned16(src + x + 0);
+      s[1] = LoadUnaligned16(src + x + 1);
+      s[2] = LoadUnaligned16(src + x + 2);
+      s[3] = LoadUnaligned16(src + x + 3);
+      s[4] = LoadUnaligned16(src + x + 4);
+      s[5] = LoadUnaligned16(src + x + 5);
+      s[6] = LoadUnaligned16(src + x + 6);
+      const __m128i s06 = _mm_add_epi16(s[0], s[6]);
+      const __m128i s15 = _mm_add_epi16(s[1], s[5]);
+      const __m128i s24 = _mm_add_epi16(s[2], s[4]);
+      const __m128i ss0 = _mm_unpacklo_epi16(s06, s15);
+      const __m128i ss1 = _mm_unpackhi_epi16(s06, s15);
+      const __m128i ss2 = _mm_unpacklo_epi16(s24, s[3]);
+      const __m128i ss3 = _mm_unpackhi_epi16(s24, s[3]);
+      madds[0] = _mm_madd_epi16(ss0, filter[0]);
+      madds[1] = _mm_madd_epi16(ss1, filter[0]);
+      madds[2] = _mm_madd_epi16(ss2, filter[1]);
+      madds[3] = _mm_madd_epi16(ss3, filter[1]);
+      madds[0] = _mm_add_epi32(madds[0], madds[2]);
+      madds[1] = _mm_add_epi32(madds[1], madds[3]);
+      WienerHorizontalClip(madds, *wiener_buffer + x);
+      x += 8;
+    } while (x < width);
+    src += src_stride;
+    *wiener_buffer += width;
+  }
+}
+
+inline void WienerHorizontalTap5(const uint16_t* src,
+                                 const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 const __m128i coefficients,
+                                 int16_t** const wiener_buffer) {
+  const __m128i filter =
+      _mm_shuffle_epi8(coefficients, _mm_set1_epi32(0x05040302));
+  for (int y = height; y != 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m128i s[5], madds[2];
+      s[0] = LoadUnaligned16(src + x + 0);
+      s[1] = LoadUnaligned16(src + x + 1);
+      s[2] = LoadUnaligned16(src + x + 2);
+      s[3] = LoadUnaligned16(src + x + 3);
+      s[4] = LoadUnaligned16(src + x + 4);
+      const __m128i s04 = _mm_add_epi16(s[0], s[4]);
+      const __m128i s13 = _mm_add_epi16(s[1], s[3]);
+      const __m128i s2d = _mm_add_epi16(s[2], s[2]);
+      const __m128i s0m = _mm_sub_epi16(s04, s2d);
+      const
__m128i s1m = _mm_sub_epi16(s13, s2d); + const __m128i ss0 = _mm_unpacklo_epi16(s0m, s1m); + const __m128i ss1 = _mm_unpackhi_epi16(s0m, s1m); + madds[0] = _mm_madd_epi16(ss0, filter); + madds[1] = _mm_madd_epi16(ss1, filter); + const __m128i s2_lo = _mm_unpacklo_epi16(s[2], _mm_setzero_si128()); + const __m128i s2_hi = _mm_unpackhi_epi16(s[2], _mm_setzero_si128()); + const __m128i s2x128_lo = _mm_slli_epi32(s2_lo, 7); + const __m128i s2x128_hi = _mm_slli_epi32(s2_hi, 7); + madds[0] = _mm_add_epi32(madds[0], s2x128_lo); + madds[1] = _mm_add_epi32(madds[1], s2x128_hi); + WienerHorizontalClip(madds, *wiener_buffer + x); + x += 8; + } while (x < width); + src += src_stride; + *wiener_buffer += width; + } +} + +inline void WienerHorizontalTap3(const uint16_t* src, + const ptrdiff_t src_stride, + const ptrdiff_t width, const int height, + const __m128i coefficients, + int16_t** const wiener_buffer) { + const auto filter = _mm_shuffle_epi32(coefficients, 0x55); + for (int y = height; y != 0; --y) { + ptrdiff_t x = 0; + do { + __m128i s[3], madds[2]; + s[0] = LoadUnaligned16(src + x + 0); + s[1] = LoadUnaligned16(src + x + 1); + s[2] = LoadUnaligned16(src + x + 2); + const __m128i s02 = _mm_add_epi16(s[0], s[2]); + const __m128i ss0 = _mm_unpacklo_epi16(s02, s[1]); + const __m128i ss1 = _mm_unpackhi_epi16(s02, s[1]); + madds[0] = _mm_madd_epi16(ss0, filter); + madds[1] = _mm_madd_epi16(ss1, filter); + WienerHorizontalClip(madds, *wiener_buffer + x); + x += 8; + } while (x < width); + src += src_stride; + *wiener_buffer += width; + } +} + +inline void WienerHorizontalTap1(const uint16_t* src, + const ptrdiff_t src_stride, + const ptrdiff_t width, const int height, + int16_t** const wiener_buffer) { + for (int y = height; y != 0; --y) { + ptrdiff_t x = 0; + do { + const __m128i s = LoadUnaligned16(src + x); + const __m128i d = _mm_slli_epi16(s, 4); + StoreAligned16(*wiener_buffer + x, d); + x += 8; + } while (x < width); + src += src_stride; + *wiener_buffer += width; + } +} + +inline __m128i WienerVertical7(const __m128i a[4], const __m128i filter[4]) { + const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]); + const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]); + const __m128i madd2 = _mm_madd_epi16(a[2], filter[2]); + const __m128i madd3 = _mm_madd_epi16(a[3], filter[3]); + const __m128i madd01 = _mm_add_epi32(madd0, madd1); + const __m128i madd23 = _mm_add_epi32(madd2, madd3); + const __m128i sum = _mm_add_epi32(madd01, madd23); + return _mm_srai_epi32(sum, kInterRoundBitsVertical); +} + +inline __m128i WienerVertical5(const __m128i a[3], const __m128i filter[3]) { + const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]); + const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]); + const __m128i madd2 = _mm_madd_epi16(a[2], filter[2]); + const __m128i madd01 = _mm_add_epi32(madd0, madd1); + const __m128i sum = _mm_add_epi32(madd01, madd2); + return _mm_srai_epi32(sum, kInterRoundBitsVertical); +} + +inline __m128i WienerVertical3(const __m128i a[2], const __m128i filter[2]) { + const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]); + const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]); + const __m128i sum = _mm_add_epi32(madd0, madd1); + return _mm_srai_epi32(sum, kInterRoundBitsVertical); +} + +inline __m128i WienerVerticalClip(const __m128i s[2]) { + const __m128i d = _mm_packus_epi32(s[0], s[1]); + return _mm_min_epu16(d, _mm_set1_epi16(1023)); +} + +inline __m128i WienerVerticalFilter7(const __m128i a[7], + const __m128i filter[2]) { + const __m128i round = _mm_set1_epi16(1 << 
(kInterRoundBitsVertical - 1));
+  __m128i b[4], c[2];
+  b[0] = _mm_unpacklo_epi16(a[0], a[1]);
+  b[1] = _mm_unpacklo_epi16(a[2], a[3]);
+  b[2] = _mm_unpacklo_epi16(a[4], a[5]);
+  b[3] = _mm_unpacklo_epi16(a[6], round);
+  c[0] = WienerVertical7(b, filter);
+  b[0] = _mm_unpackhi_epi16(a[0], a[1]);
+  b[1] = _mm_unpackhi_epi16(a[2], a[3]);
+  b[2] = _mm_unpackhi_epi16(a[4], a[5]);
+  b[3] = _mm_unpackhi_epi16(a[6], round);
+  c[1] = WienerVertical7(b, filter);
+  return WienerVerticalClip(c);
+}
+
+inline __m128i WienerVerticalFilter5(const __m128i a[5],
+                                     const __m128i filter[3]) {
+  const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+  __m128i b[3], c[2];
+  b[0] = _mm_unpacklo_epi16(a[0], a[1]);
+  b[1] = _mm_unpacklo_epi16(a[2], a[3]);
+  b[2] = _mm_unpacklo_epi16(a[4], round);
+  c[0] = WienerVertical5(b, filter);
+  b[0] = _mm_unpackhi_epi16(a[0], a[1]);
+  b[1] = _mm_unpackhi_epi16(a[2], a[3]);
+  b[2] = _mm_unpackhi_epi16(a[4], round);
+  c[1] = WienerVertical5(b, filter);
+  return WienerVerticalClip(c);
+}
+
+inline __m128i WienerVerticalFilter3(const __m128i a[3],
+                                     const __m128i filter[2]) {
+  const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+  __m128i b[2], c[2];
+  b[0] = _mm_unpacklo_epi16(a[0], a[1]);
+  b[1] = _mm_unpacklo_epi16(a[2], round);
+  c[0] = WienerVertical3(b, filter);
+  b[0] = _mm_unpackhi_epi16(a[0], a[1]);
+  b[1] = _mm_unpackhi_epi16(a[2], round);
+  c[1] = WienerVertical3(b, filter);
+  return WienerVerticalClip(c);
+}
+
+inline __m128i WienerVerticalTap7Kernel(const int16_t* wiener_buffer,
+                                        const ptrdiff_t wiener_stride,
+                                        const __m128i filter[2],
+                                        __m128i a[7]) {
+  a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+  a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+  a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+  a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
+  a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride);
+  a[5] = LoadAligned16(wiener_buffer + 5 * wiener_stride);
+  a[6] = LoadAligned16(wiener_buffer + 6 * wiener_stride);
+  return WienerVerticalFilter7(a, filter);
+}
+
+inline __m128i WienerVerticalTap5Kernel(const int16_t* wiener_buffer,
+                                        const ptrdiff_t wiener_stride,
+                                        const __m128i filter[3],
+                                        __m128i a[5]) {
+  a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+  a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+  a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+  a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
+  a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride);
+  return WienerVerticalFilter5(a, filter);
+}
+
+inline __m128i WienerVerticalTap3Kernel(const int16_t* wiener_buffer,
+                                        const ptrdiff_t wiener_stride,
+                                        const __m128i filter[2],
+                                        __m128i a[3]) {
+  a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+  a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+  a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+  return WienerVerticalFilter3(a, filter);
+}
+
+inline void WienerVerticalTap7(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               const int16_t coefficients[4], uint16_t* dst,
+                               const ptrdiff_t dst_stride) {
+  const __m128i c = LoadLo8(coefficients);
+  __m128i filter[4];
+  filter[0] = _mm_shuffle_epi32(c, 0x0);
+  filter[1] = _mm_shuffle_epi32(c, 0x55);
+  filter[2] = _mm_shuffle_epi8(c, _mm_set1_epi32(0x03020504));
+  filter[3] =
+      _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+  for (int y = height >> 1; y > 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m128i a[8], d[2];
+      d[0] = WienerVerticalTap7Kernel(wiener_buffer + x, width, filter, a);
+      a[7] = LoadAligned16(wiener_buffer + x + 7 * width);
+      d[1] = WienerVerticalFilter7(a + 1, filter);
+      StoreAligned16(dst + x, d[0]);
+      StoreAligned16(dst + dst_stride + x, d[1]);
+      x += 8;
+    } while (x < width);
+    dst += 2 * dst_stride;
+    wiener_buffer += 2 * width;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = 0;
+    do {
+      __m128i a[7];
+      const __m128i d =
+          WienerVerticalTap7Kernel(wiener_buffer + x, width, filter, a);
+      StoreAligned16(dst + x, d);
+      x += 8;
+    } while (x < width);
+  }
+}
+
+inline void WienerVerticalTap5(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               const int16_t coefficients[3], uint16_t* dst,
+                               const ptrdiff_t dst_stride) {
+  const __m128i c = LoadLo8(coefficients);
+  __m128i filter[3];
+  filter[0] = _mm_shuffle_epi32(c, 0x0);
+  filter[1] = _mm_shuffle_epi8(c, _mm_set1_epi32(0x03020504));
+  filter[2] =
+      _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+  for (int y = height >> 1; y > 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m128i a[6], d[2];
+      d[0] = WienerVerticalTap5Kernel(wiener_buffer + x, width, filter, a);
+      a[5] = LoadAligned16(wiener_buffer + x + 5 * width);
+      d[1] = WienerVerticalFilter5(a + 1, filter);
+      StoreAligned16(dst + x, d[0]);
+      StoreAligned16(dst + dst_stride + x, d[1]);
+      x += 8;
+    } while (x < width);
+    dst += 2 * dst_stride;
+    wiener_buffer += 2 * width;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = 0;
+    do {
+      __m128i a[5];
+      const __m128i d =
+          WienerVerticalTap5Kernel(wiener_buffer + x, width, filter, a);
+      StoreAligned16(dst + x, d);
+      x += 8;
+    } while (x < width);
+  }
+}
+
+inline void WienerVerticalTap3(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               const int16_t coefficients[2], uint16_t* dst,
+                               const ptrdiff_t dst_stride) {
+  __m128i filter[2];
+  filter[0] =
+      _mm_set1_epi32(*reinterpret_cast<const int32_t*>(coefficients));
+  filter[1] =
+      _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+  for (int y = height >> 1; y > 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m128i a[4], d[2];
+      d[0] = WienerVerticalTap3Kernel(wiener_buffer + x, width, filter, a);
+      a[3] = LoadAligned16(wiener_buffer + x + 3 * width);
+      d[1] = WienerVerticalFilter3(a + 1, filter);
+      StoreAligned16(dst + x, d[0]);
+      StoreAligned16(dst + dst_stride + x, d[1]);
+      x += 8;
+    } while (x < width);
+    dst += 2 * dst_stride;
+    wiener_buffer += 2 * width;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = 0;
+    do {
+      __m128i a[3];
+      const __m128i d =
+          WienerVerticalTap3Kernel(wiener_buffer + x, width, filter, a);
+      StoreAligned16(dst + x, d);
+      x += 8;
+    } while (x < width);
+  }
+}
+
+inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
+                                     uint16_t* const dst) {
+  const __m128i a = LoadAligned16(wiener_buffer);
+  const __m128i b = _mm_add_epi16(a, _mm_set1_epi16(8));
+  const __m128i c = _mm_srai_epi16(b, 4);
+  const __m128i d = _mm_max_epi16(c, _mm_setzero_si128());
+  const __m128i e = _mm_min_epi16(d, _mm_set1_epi16(1023));
+  StoreAligned16(dst, e);
+}
+
+inline void WienerVerticalTap1(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               uint16_t* dst, const ptrdiff_t dst_stride) {
+  for (int y = height >> 1; y > 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+      WienerVerticalTap1Kernel(wiener_buffer + width + x,
+                               dst + dst_stride + x);
+      x += 8;
+    } while (x < width);
+    dst += 2 * dst_stride;
+    wiener_buffer += 2 * width;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = 0;
+    do {
+      WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+      x += 8;
+    } while (x < width);
+  }
+}
+
+void WienerFilter_SSE4_1(
+    const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+    const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_border,
+    const ptrdiff_t top_border_stride,
+    const void* LIBGAV1_RESTRICT const bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+    void* LIBGAV1_RESTRICT const dest) {
+  const int16_t* const number_leading_zero_coefficients =
+      restoration_info.wiener_info.number_leading_zero_coefficients;
+  const int number_rows_to_skip = std::max(
+      static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+      1);
+  const ptrdiff_t wiener_stride = Align(width, 16);
+  int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
+  // The values are saturated to 13 bits before storing.
+  int16_t* wiener_buffer_horizontal =
+      wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
+
+  // horizontal filtering.
+  // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
+  const int height_horizontal =
+      height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+  const int height_extra = (height_horizontal - height) >> 1;
+  assert(height_extra <= 2);
+  const auto* const src = static_cast<const uint16_t*>(source);
+  const auto* const top = static_cast<const uint16_t*>(top_border);
+  const auto* const bottom = static_cast<const uint16_t*>(bottom_border);
+  const __m128i coefficients_horizontal =
+      LoadLo8(restoration_info.wiener_info.filter[WienerInfo::kHorizontal]);
+  if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+    WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
+                         top_border_stride, wiener_stride, height_extra,
+                         coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+                         coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride,
+                         height_extra, coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+    WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
+                         top_border_stride, wiener_stride, height_extra,
+                         coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+                         coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride,
+                         height_extra, coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+    // The maximum over-reads happen here.
+    WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
+                         top_border_stride, wiener_stride, height_extra,
+                         coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+                         coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
+                         height_extra, coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+  } else {
+    assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+    WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
+                         top_border_stride, wiener_stride, height_extra,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap1(src, stride, wiener_stride, height,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
+                         height_extra, &wiener_buffer_horizontal);
+  }
+
+  // vertical filtering.
+  // Over-writes up to 15 values.
+  const int16_t* const filter_vertical =
+      restoration_info.wiener_info.filter[WienerInfo::kVertical];
+  auto* dst = static_cast<uint16_t*>(dest);
+  if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+    // Because the top row of |source| is a duplicate of the second row, and
+    // the bottom row of |source| is a duplicate of the row above it, we can
+    // duplicate the top and bottom row of |wiener_buffer| accordingly.
+    memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
+           sizeof(*wiener_buffer_horizontal) * wiener_stride);
+    memcpy(restoration_buffer->wiener_buffer,
+           restoration_buffer->wiener_buffer + wiener_stride,
+           sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
+    WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
+                       filter_vertical, dst, stride);
+  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+    WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
+                       height, filter_vertical + 1, dst, stride);
+  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+    WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
+                       wiener_stride, height, filter_vertical + 2, dst,
+                       stride);
+  } else {
+    assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+    WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
+                       wiener_stride, height, dst, stride);
+  }
+}
+
+//------------------------------------------------------------------------------
+// SGR
+
+// SIMD overreads 8 - (width % 8) - 2 * padding pixels, where padding is 3 for
+// Pass 1 and 2 for Pass 2.
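+// With 16-bit pixels the worst case (width a multiple of 8) is therefore
+// (8 - 2 * 3) * 2 = 4 bytes for Pass 1 and (8 - 2 * 2) * 2 = 8 bytes for
+// Pass 2, which is what the constants below encode.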
+constexpr int kOverreadInBytesPass1 = 4; +constexpr int kOverreadInBytesPass2 = 8; + +inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x, + __m128i dst[2]) { + dst[0] = LoadAligned16(src[0] + x); + dst[1] = LoadAligned16(src[1] + x); +} + +inline void LoadAligned16x2U16Msan(const uint16_t* const src[2], + const ptrdiff_t x, const ptrdiff_t border, + __m128i dst[2]) { + dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border)); + dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border)); +} + +inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x, + __m128i dst[3]) { + dst[0] = LoadAligned16(src[0] + x); + dst[1] = LoadAligned16(src[1] + x); + dst[2] = LoadAligned16(src[2] + x); +} + +inline void LoadAligned16x3U16Msan(const uint16_t* const src[3], + const ptrdiff_t x, const ptrdiff_t border, + __m128i dst[3]) { + dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border)); + dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border)); + dst[2] = LoadAligned16Msan(src[2] + x, sizeof(**src) * (x + 8 - border)); +} + +inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) { + dst[0] = LoadAligned16(src + 0); + dst[1] = LoadAligned16(src + 4); +} + +inline void LoadAligned32U32Msan(const uint32_t* const src, const ptrdiff_t x, + const ptrdiff_t border, __m128i dst[2]) { + dst[0] = LoadAligned16Msan(src + x + 0, sizeof(*src) * (x + 4 - border)); + dst[1] = LoadAligned16Msan(src + x + 4, sizeof(*src) * (x + 8 - border)); +} + +inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x, + __m128i dst[2][2]) { + LoadAligned32U32(src[0] + x, dst[0]); + LoadAligned32U32(src[1] + x, dst[1]); +} + +inline void LoadAligned32x2U32Msan(const uint32_t* const src[2], + const ptrdiff_t x, const ptrdiff_t border, + __m128i dst[2][2]) { + LoadAligned32U32Msan(src[0], x, border, dst[0]); + LoadAligned32U32Msan(src[1], x, border, dst[1]); +} + +inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x, + __m128i dst[3][2]) { + LoadAligned32U32(src[0] + x, dst[0]); + LoadAligned32U32(src[1] + x, dst[1]); + LoadAligned32U32(src[2] + x, dst[2]); +} + +inline void LoadAligned32x3U32Msan(const uint32_t* const src[3], + const ptrdiff_t x, const ptrdiff_t border, + __m128i dst[3][2]) { + LoadAligned32U32Msan(src[0], x, border, dst[0]); + LoadAligned32U32Msan(src[1], x, border, dst[1]); + LoadAligned32U32Msan(src[2], x, border, dst[2]); +} + +inline void StoreAligned32U16(uint16_t* const dst, const __m128i src[2]) { + StoreAligned16(dst + 0, src[0]); + StoreAligned16(dst + 8, src[1]); +} + +inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) { + StoreAligned16(dst + 0, src[0]); + StoreAligned16(dst + 4, src[1]); +} + +inline void StoreAligned64U32(uint32_t* const dst, const __m128i src[4]) { + StoreAligned32U32(dst + 0, src + 0); + StoreAligned32U32(dst + 8, src + 2); +} + +// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following +// functions. Some compilers may generate super inefficient code and the whole +// decoder could be 15% slower. 
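+// The VaddlLo8/VaddwLo8/VmullLo16 style helpers below follow the NEON
+// vaddl/vaddw/vmull naming: each widens with _mm_unpacklo/hi against a zero
+// register and then adds or multiply-adds, rather than using the extension
+// intrinsics mentioned above.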
+
+inline __m128i VaddlLo8(const __m128i src0, const __m128i src1) {
+  const __m128i s0 = _mm_unpacklo_epi8(src0, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
+  return _mm_add_epi16(s0, s1);
+}
+
+inline __m128i VaddlHi8(const __m128i src0, const __m128i src1) {
+  const __m128i s0 = _mm_unpackhi_epi8(src0, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128());
+  return _mm_add_epi16(s0, s1);
+}
+
+inline __m128i VaddwLo8(const __m128i src0, const __m128i src1) {
+  const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
+  return _mm_add_epi16(src0, s1);
+}
+
+inline __m128i VaddwHi8(const __m128i src0, const __m128i src1) {
+  const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128());
+  return _mm_add_epi16(src0, s1);
+}
+
+inline __m128i VmullNLo8(const __m128i src0, const int src1) {
+  const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+  return _mm_madd_epi16(s0, _mm_set1_epi32(src1));
+}
+
+inline __m128i VmullNHi8(const __m128i src0, const int src1) {
+  const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+  return _mm_madd_epi16(s0, _mm_set1_epi32(src1));
+}
+
+inline __m128i VmullLo16(const __m128i src0, const __m128i src1) {
+  const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+  return _mm_madd_epi16(s0, s1);
+}
+
+inline __m128i VmullHi16(const __m128i src0, const __m128i src1) {
+  const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+  return _mm_madd_epi16(s0, s1);
+}
+
+inline __m128i VrshrU16(const __m128i src0, const int src1) {
+  const __m128i sum = _mm_add_epi16(src0, _mm_set1_epi16(1 << (src1 - 1)));
+  return _mm_srli_epi16(sum, src1);
+}
+
+inline __m128i VrshrS32(const __m128i src0, const int src1) {
+  const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
+  return _mm_srai_epi32(sum, src1);
+}
+
+inline __m128i VrshrU32(const __m128i src0, const int src1) {
+  const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
+  return _mm_srli_epi32(sum, src1);
+}
+
+inline void Square(const __m128i src, __m128i dst[2]) {
+  const __m128i s0 = _mm_unpacklo_epi16(src, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpackhi_epi16(src, _mm_setzero_si128());
+  dst[0] = _mm_madd_epi16(s0, s0);
+  dst[1] = _mm_madd_epi16(s1, s1);
+}
+
+template <int offset>
+inline void Prepare3_8(const __m128i src[2], __m128i dst[3]) {
+  dst[0] = _mm_alignr_epi8(src[1], src[0], offset + 0);
+  dst[1] = _mm_alignr_epi8(src[1], src[0], offset + 1);
+  dst[2] = _mm_alignr_epi8(src[1], src[0], offset + 2);
+}
+
+inline void Prepare3_16(const __m128i src[2], __m128i dst[3]) {
+  dst[0] = src[0];
+  dst[1] = _mm_alignr_epi8(src[1], src[0], 2);
+  dst[2] = _mm_alignr_epi8(src[1], src[0], 4);
+}
+
+inline void Prepare3_32(const __m128i src[2], __m128i dst[3]) {
+  dst[0] = src[0];
+  dst[1] = _mm_alignr_epi8(src[1], src[0], 4);
+  dst[2] = _mm_alignr_epi8(src[1], src[0], 8);
+}
+
+inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) {
+  Prepare3_16(src, dst);
+  dst[3] = _mm_alignr_epi8(src[1], src[0], 6);
+  dst[4] = _mm_alignr_epi8(src[1], src[0], 8);
+}
+
+inline void Prepare5_32(const __m128i src[2], __m128i dst[5]) {
+  Prepare3_32(src, dst);
+  dst[3] = _mm_alignr_epi8(src[1], src[0], 12);
+  dst[4] = src[1];
+}
+
+inline __m128i Sum3_16(const __m128i src0, const __m128i src1,
+                       const __m128i src2) {
+  const
__m128i sum = _mm_add_epi16(src0, src1); + return _mm_add_epi16(sum, src2); +} + +inline __m128i Sum3_16(const __m128i src[3]) { + return Sum3_16(src[0], src[1], src[2]); +} + +inline __m128i Sum3_32(const __m128i src0, const __m128i src1, + const __m128i src2) { + const __m128i sum = _mm_add_epi32(src0, src1); + return _mm_add_epi32(sum, src2); +} + +inline __m128i Sum3_32(const __m128i src[3]) { + return Sum3_32(src[0], src[1], src[2]); +} + +inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) { + dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]); + dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]); +} + +inline __m128i Sum3WLo16(const __m128i src[3]) { + const __m128i sum = VaddlLo8(src[0], src[1]); + return VaddwLo8(sum, src[2]); +} + +inline __m128i Sum3WHi16(const __m128i src[3]) { + const __m128i sum = VaddlHi8(src[0], src[1]); + return VaddwHi8(sum, src[2]); +} + +inline __m128i Sum5_16(const __m128i src[5]) { + const __m128i sum01 = _mm_add_epi16(src[0], src[1]); + const __m128i sum23 = _mm_add_epi16(src[2], src[3]); + const __m128i sum = _mm_add_epi16(sum01, sum23); + return _mm_add_epi16(sum, src[4]); +} + +inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1, + const __m128i* const src2, const __m128i* const src3, + const __m128i* const src4) { + const __m128i sum01 = _mm_add_epi32(*src0, *src1); + const __m128i sum23 = _mm_add_epi32(*src2, *src3); + const __m128i sum = _mm_add_epi32(sum01, sum23); + return _mm_add_epi32(sum, *src4); +} + +inline __m128i Sum5_32(const __m128i src[5]) { + return Sum5_32(&src[0], &src[1], &src[2], &src[3], &src[4]); +} + +inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) { + dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]); + dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]); +} + +inline __m128i Sum3Horizontal16(const __m128i src[2]) { + __m128i s[3]; + Prepare3_16(src, s); + return Sum3_16(s); +} + +inline void Sum3Horizontal32(const __m128i src[3], __m128i dst[2]) { + __m128i s[3]; + Prepare3_32(src + 0, s); + dst[0] = Sum3_32(s); + Prepare3_32(src + 1, s); + dst[1] = Sum3_32(s); +} + +inline __m128i Sum5Horizontal16(const __m128i src[2]) { + __m128i s[5]; + Prepare5_16(src, s); + return Sum5_16(s); +} + +inline void Sum5Horizontal32(const __m128i src[3], __m128i dst[2]) { + __m128i s[5]; + Prepare5_32(src + 0, s); + dst[0] = Sum5_32(s); + Prepare5_32(src + 1, s); + dst[1] = Sum5_32(s); +} + +void SumHorizontal16(const __m128i src[2], __m128i* const row3, + __m128i* const row5) { + __m128i s[5]; + Prepare5_16(src, s); + const __m128i sum04 = _mm_add_epi16(s[0], s[4]); + *row3 = Sum3_16(s + 1); + *row5 = _mm_add_epi16(sum04, *row3); +} + +inline void SumHorizontal16(const __m128i src[3], __m128i* const row3_0, + __m128i* const row3_1, __m128i* const row5_0, + __m128i* const row5_1) { + SumHorizontal16(src + 0, row3_0, row5_0); + SumHorizontal16(src + 1, row3_1, row5_1); +} + +void SumHorizontal32(const __m128i src[5], __m128i* const row_sq3, + __m128i* const row_sq5) { + const __m128i sum04 = _mm_add_epi32(src[0], src[4]); + *row_sq3 = Sum3_32(src + 1); + *row_sq5 = _mm_add_epi32(sum04, *row_sq3); +} + +inline void SumHorizontal32(const __m128i src[3], __m128i* const row_sq3_0, + __m128i* const row_sq3_1, __m128i* const row_sq5_0, + __m128i* const row_sq5_1) { + __m128i s[5]; + Prepare5_32(src + 0, s); + SumHorizontal32(s, row_sq3_0, row_sq5_0); + Prepare5_32(src + 1, s); + SumHorizontal32(s, row_sq3_1, row_sq5_1); +} + +inline __m128i 
Sum343Lo(const __m128i ma3[3]) {
+  const __m128i sum = Sum3WLo16(ma3);
+  const __m128i sum3 = Sum3_16(sum, sum, sum);
+  return VaddwLo8(sum3, ma3[1]);
+}
+
+inline __m128i Sum343Hi(const __m128i ma3[3]) {
+  const __m128i sum = Sum3WHi16(ma3);
+  const __m128i sum3 = Sum3_16(sum, sum, sum);
+  return VaddwHi8(sum3, ma3[1]);
+}
+
+inline __m128i Sum343(const __m128i src[3]) {
+  const __m128i sum = Sum3_32(src);
+  const __m128i sum3 = Sum3_32(sum, sum, sum);
+  return _mm_add_epi32(sum3, src[1]);
+}
+
+inline void Sum343(const __m128i src[3], __m128i dst[2]) {
+  __m128i s[3];
+  Prepare3_32(src + 0, s);
+  dst[0] = Sum343(s);
+  Prepare3_32(src + 1, s);
+  dst[1] = Sum343(s);
+}
+
+inline __m128i Sum565Lo(const __m128i src[3]) {
+  const __m128i sum = Sum3WLo16(src);
+  const __m128i sum4 = _mm_slli_epi16(sum, 2);
+  const __m128i sum5 = _mm_add_epi16(sum4, sum);
+  return VaddwLo8(sum5, src[1]);
+}
+
+inline __m128i Sum565Hi(const __m128i src[3]) {
+  const __m128i sum = Sum3WHi16(src);
+  const __m128i sum4 = _mm_slli_epi16(sum, 2);
+  const __m128i sum5 = _mm_add_epi16(sum4, sum);
+  return VaddwHi8(sum5, src[1]);
+}
+
+inline __m128i Sum565(const __m128i src[3]) {
+  const __m128i sum = Sum3_32(src);
+  const __m128i sum4 = _mm_slli_epi32(sum, 2);
+  const __m128i sum5 = _mm_add_epi32(sum4, sum);
+  return _mm_add_epi32(sum5, src[1]);
+}
+
+inline void Sum565(const __m128i src[3], __m128i dst[2]) {
+  __m128i s[3];
+  Prepare3_32(src + 0, s);
+  dst[0] = Sum565(s);
+  Prepare3_32(src + 1, s);
+  dst[1] = Sum565(s);
+}
+
+inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride,
+                   const ptrdiff_t width, const ptrdiff_t sum_stride,
+                   const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
+                   uint32_t* square_sum3, uint32_t* square_sum5) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1 - sizeof(*src) * width;
+  int y = 2;
+  do {
+    __m128i s[3], sq[6];
+    s[0] = LoadUnaligned16Msan(src, overread_in_bytes);
+    Square(s[0], sq);
+    ptrdiff_t x = sum_width;
+    do {
+      __m128i row3[2], row5[2], row_sq3[2], row_sq5[2];
+      s[1] = LoadUnaligned16Msan(
+          src + 8, overread_in_bytes + sizeof(*src) * (sum_width - x + 8));
+      x -= 16;
+      src += 16;
+      s[2] = LoadUnaligned16Msan(
+          src, overread_in_bytes + sizeof(*src) * (sum_width - x));
+      Square(s[1], sq + 2);
+      Square(s[2], sq + 4);
+      SumHorizontal16(s, &row3[0], &row3[1], &row5[0], &row5[1]);
+      StoreAligned32U16(sum3, row3);
+      StoreAligned32U16(sum5, row5);
+      SumHorizontal32(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0],
+                      &row_sq5[1]);
+      StoreAligned32U32(square_sum3 + 0, row_sq3);
+      StoreAligned32U32(square_sum5 + 0, row_sq5);
+      SumHorizontal32(sq + 2, &row_sq3[0], &row_sq3[1], &row_sq5[0],
+                      &row_sq5[1]);
+      StoreAligned32U32(square_sum3 + 8, row_sq3);
+      StoreAligned32U32(square_sum5 + 8, row_sq5);
+      s[0] = s[2];
+      sq[0] = sq[4];
+      sq[1] = sq[5];
+      sum3 += 16;
+      sum5 += 16;
+      square_sum3 += 16;
+      square_sum5 += 16;
+    } while (x != 0);
+    src += src_stride - sum_width;
+    sum3 += sum_stride - sum_width;
+    sum5 += sum_stride - sum_width;
+    square_sum3 += sum_stride - sum_width;
+    square_sum5 += sum_stride - sum_width;
+  } while (--y != 0);
+}
+
+template <int size>
+inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride,
+                   const ptrdiff_t width, const ptrdiff_t sum_stride,
+                   const ptrdiff_t sum_width, uint16_t* sums,
+                   uint32_t* square_sums) {
+  static_assert(size == 3 || size == 5, "");
+  const ptrdiff_t overread_in_bytes =
+      ((size == 5) ? kOverreadInBytesPass1 : kOverreadInBytesPass2) -
+      sizeof(*src) * width;
+  int y = 2;
+  do {
+    __m128i s[3], sq[6];
+    s[0] = LoadUnaligned16Msan(src, overread_in_bytes);
+    Square(s[0], sq);
+    ptrdiff_t x = sum_width;
+    do {
+      __m128i row[2], row_sq[4];
+      s[1] = LoadUnaligned16Msan(
+          src + 8, overread_in_bytes + sizeof(*src) * (sum_width - x + 8));
+      x -= 16;
+      src += 16;
+      s[2] = LoadUnaligned16Msan(
+          src, overread_in_bytes + sizeof(*src) * (sum_width - x));
+      Square(s[1], sq + 2);
+      Square(s[2], sq + 4);
+      if (size == 3) {
+        row[0] = Sum3Horizontal16(s + 0);
+        row[1] = Sum3Horizontal16(s + 1);
+        Sum3Horizontal32(sq + 0, row_sq + 0);
+        Sum3Horizontal32(sq + 2, row_sq + 2);
+      } else {
+        row[0] = Sum5Horizontal16(s + 0);
+        row[1] = Sum5Horizontal16(s + 1);
+        Sum5Horizontal32(sq + 0, row_sq + 0);
+        Sum5Horizontal32(sq + 2, row_sq + 2);
+      }
+      StoreAligned32U16(sums, row);
+      StoreAligned64U32(square_sums, row_sq);
+      s[0] = s[2];
+      sq[0] = sq[4];
+      sq[1] = sq[5];
+      sums += 16;
+      square_sums += 16;
+    } while (x != 0);
+    src += src_stride - sum_width;
+    sums += sum_stride - sum_width;
+    square_sums += sum_stride - sum_width;
+  } while (--y != 0);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq,
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
+  // a = |sum_sq|
+  // d = |sum|
+  // p = (a * n < d * d) ? 0 : a * n - d * d;
+  const __m128i dxd = _mm_madd_epi16(sum, sum);
+  // _mm_mullo_epi32() has high latency. Using shifts and additions instead.
+  // Some compilers could do this for us but we make this explicit.
+  // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n));
+  __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3));
+  if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4));
+  const __m128i sub = _mm_sub_epi32(axn, dxd);
+  const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128());
+  const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale));
+  return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2],
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
+  const __m128i b = VrshrU16(sum, 2);
+  const __m128i sum_lo = _mm_unpacklo_epi16(b, _mm_setzero_si128());
+  const __m128i sum_hi = _mm_unpackhi_epi16(b, _mm_setzero_si128());
+  const __m128i z0 = CalculateMa<n>(sum_lo, VrshrU32(sum_sq[0], 4), scale);
+  const __m128i z1 = CalculateMa<n>(sum_hi, VrshrU32(sum_sq[1], 4), scale);
+  return _mm_packus_epi32(z0, z1);
+}
+
+inline void CalculateB5(const __m128i sum, const __m128i ma, __m128i b[2]) {
+  // one_over_n == 164.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+  // one_over_n_quarter == 41.
+  constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+  static_assert(one_over_n == one_over_n_quarter << 2, "");
+  // |ma| is in range [0, 255].
+  const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter));
+  const __m128i m0 = VmullLo16(m, sum);
+  const __m128i m1 = VmullHi16(m, sum);
+  b[0] = VrshrU32(m0, kSgrProjReciprocalBits - 2);
+  b[1] = VrshrU32(m1, kSgrProjReciprocalBits - 2);
+}
+
+inline void CalculateB3(const __m128i sum, const __m128i ma, __m128i b[2]) {
+  // one_over_n == 455.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
+  const __m128i m0 = VmullLo16(ma, sum);
+  const __m128i m1 = VmullHi16(ma, sum);
+  const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
+  const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n));
+  b[0] = VrshrU32(m2, kSgrProjReciprocalBits);
+  b[1] = VrshrU32(m3, kSgrProjReciprocalBits);
+}
+
+inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2],
+                                  const uint32_t scale, __m128i* const sum,
+                                  __m128i* const index) {
+  __m128i sum_sq[2];
+  *sum = Sum5_16(s5);
+  Sum5_32(sq5, sum_sq);
+  *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2],
+                                  const uint32_t scale, __m128i* const sum,
+                                  __m128i* const index) {
+  __m128i sum_sq[2];
+  *sum = Sum3_16(s3);
+  Sum3_32(sq3, sum_sq);
+  *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+template <int n, int offset>
+inline void LookupIntermediate(const __m128i sum, const __m128i index,
+                               __m128i* const ma, __m128i b[2]) {
+  static_assert(n == 9 || n == 25, "");
+  static_assert(offset == 0 || offset == 8, "");
+  const __m128i idx = _mm_packus_epi16(index, index);
+  // Actually it's not stored and loaded. The compiler will use a 64-bit
+  // general-purpose register to process it. Faster than using
+  // _mm_extract_epi8().
+  uint8_t temp[8];
+  StoreLo8(temp, idx);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[0]], offset + 0);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], offset + 1);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], offset + 2);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], offset + 3);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], offset + 4);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], offset + 5);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], offset + 6);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], offset + 7);
+  // b = ma * b * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+  __m128i maq;
+  if (offset == 0) {
+    maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+  } else {
+    maq = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
+  }
+  if (n == 9) {
+    CalculateB3(sum, maq, b);
+  } else {
+    CalculateB5(sum, maq, b);
+  }
+}
+
+// Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b
+// to get value 0 as the shuffle result. The most significant bit 1 comes
+// either from the comparison instruction, or from the sign bit of the index.
+inline __m128i ShuffleIndex(const __m128i table, const __m128i index) {
+  __m128i mask;
+  mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(15));
+  mask = _mm_or_si128(mask, index);
+  return _mm_shuffle_epi8(table, mask);
+}
+
+inline __m128i AdjustValue(const __m128i value, const __m128i index,
+                           const int threshold) {
+  const __m128i thresholds = _mm_set1_epi8(threshold - 128);
+  const __m128i offset = _mm_cmpgt_epi8(index, thresholds);
+  return _mm_add_epi8(value, offset);
+}
+
+inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
+                                  __m128i* const ma, __m128i b0[2],
+                                  __m128i b1[2]) {
+  // Use table lookup to read elements whose indices are less than 48.
+  const __m128i c0 = LoadAligned16(kSgrMaLookup + 0 * 16);
+  const __m128i c1 = LoadAligned16(kSgrMaLookup + 1 * 16);
+  const __m128i c2 = LoadAligned16(kSgrMaLookup + 2 * 16);
+  const __m128i indices = _mm_packus_epi16(index[0], index[1]);
+  __m128i idx;
+  // Clip idx to 127 to apply signed comparison instructions.
+  idx = _mm_min_epu8(indices, _mm_set1_epi8(127));
+  // All elements whose indices are less than 48 are set to 0.
+  // Get shuffle results for indices in range [0, 15].
+  *ma = ShuffleIndex(c0, idx);
+  // Get shuffle results for indices in range [16, 31].
+  // Subtract 16 to utilize the sign bit of the index.
+  idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+  const __m128i res1 = ShuffleIndex(c1, idx);
+  // Use OR instruction to combine shuffle results together.
+  *ma = _mm_or_si128(*ma, res1);
+  // Get shuffle results for indices in range [32, 47].
+  // Subtract 16 to utilize the sign bit of the index.
+  idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+  const __m128i res2 = ShuffleIndex(c2, idx);
+  *ma = _mm_or_si128(*ma, res2);
+
+  // For elements whose indices are larger than 47, since they seldom change
+  // values with the increase of the index, we use comparison and arithmetic
+  // operations to calculate their values.
+  // Add -128 to apply signed comparison instructions.
+  idx = _mm_add_epi8(indices, _mm_set1_epi8(-128));
+  // Elements whose indices are larger than 47 (with value 0) are set to 5.
+  *ma = _mm_max_epu8(*ma, _mm_set1_epi8(5));
+  *ma = AdjustValue(*ma, idx, 55);   // 55 is the last index whose value is 5.
+  *ma = AdjustValue(*ma, idx, 72);   // 72 is the last index whose value is 4.
+  *ma = AdjustValue(*ma, idx, 101);  // 101 is the last index whose value is 3.
+  *ma = AdjustValue(*ma, idx, 169);  // 169 is the last index whose value is 2.
+  *ma = AdjustValue(*ma, idx, 254);  // 254 is the last index whose value is 1.
+
+  // b = ma * b * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+  const __m128i maq0 = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+  CalculateB3(sum[0], maq0, b0);
+  const __m128i maq1 = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
+  CalculateB3(sum[1], maq1, b1);
+}
+
+inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
+                                  __m128i ma[2], __m128i b[4]) {
+  __m128i mas;
+  CalculateIntermediate(sum, index, &mas, b + 0, b + 2);
+  ma[0] = _mm_unpacklo_epi64(ma[0], mas);
+  ma[1] = _mm_srli_si128(mas, 8);
+}
+
+// Note: Calling CalculateIntermediate() to replace the slow
+// LookupIntermediate() when calculating 16 intermediate data points has been
+// tried. However, the compiler generates even slower code.
+template <int offset>
+inline void CalculateIntermediate5(const __m128i s5[5],
+                                   const __m128i sq5[5][2],
+                                   const uint32_t scale, __m128i* const ma,
+                                   __m128i b[2]) {
+  static_assert(offset == 0 || offset == 8, "");
+  __m128i sum, index;
+  CalculateSumAndIndex5(s5, sq5, scale, &sum, &index);
+  LookupIntermediate<25, offset>(sum, index, ma, b);
+}
+
+inline void CalculateIntermediate3(const __m128i s3[3],
+                                   const __m128i sq3[3][2],
+                                   const uint32_t scale, __m128i* const ma,
+                                   __m128i b[2]) {
+  __m128i sum, index;
+  CalculateSumAndIndex3(s3, sq3, scale, &sum, &index);
+  LookupIntermediate<9, 0>(sum, index, ma, b);
+}
+
+inline void Store343_444(const __m128i b3[3], const ptrdiff_t x,
+                         __m128i sum_b343[2], __m128i sum_b444[2],
+                         uint32_t* const b343, uint32_t* const b444) {
+  __m128i b[3], sum_b111[2];
+  Prepare3_32(b3 + 0, b);
+  sum_b111[0] = Sum3_32(b);
+  sum_b444[0] = _mm_slli_epi32(sum_b111[0], 2);
+  sum_b343[0] = _mm_sub_epi32(sum_b444[0], sum_b111[0]);
+  sum_b343[0] = _mm_add_epi32(sum_b343[0], b[1]);
+  Prepare3_32(b3 + 1, b);
+  sum_b111[1] = Sum3_32(b);
+  sum_b444[1] = _mm_slli_epi32(sum_b111[1], 2);
+  sum_b343[1] = _mm_sub_epi32(sum_b444[1], sum_b111[1]);
+  sum_b343[1] = _mm_add_epi32(sum_b343[1], b[1]);
+  StoreAligned32U32(b444 + x, sum_b444);
+  StoreAligned32U32(b343 + x, sum_b343);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[3],
+                           const ptrdiff_t x, __m128i* const sum_ma343,
+                           __m128i* const sum_ma444, __m128i sum_b343[2],
+                           __m128i sum_b444[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  const __m128i sum_ma111 = Sum3WLo16(ma3);
+  *sum_ma444 = _mm_slli_epi16(sum_ma111, 2);
+  StoreAligned16(ma444 + x, *sum_ma444);
+  const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111);
+  *sum_ma343 = VaddwLo8(sum333, ma3[1]);
+  StoreAligned16(ma343 + x, *sum_ma343);
+  Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[3],
+                           const ptrdiff_t x, __m128i* const sum_ma343,
+                           __m128i* const sum_ma444, __m128i sum_b343[2],
+                           __m128i sum_b444[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  const __m128i sum_ma111 = Sum3WHi16(ma3);
+  *sum_ma444 = _mm_slli_epi16(sum_ma111, 2);
+  StoreAligned16(ma444 + x, *sum_ma444);
+  const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111);
+  *sum_ma343 = VaddwHi8(sum333, ma3[1]);
+  StoreAligned16(ma343 + x, *sum_ma343);
+  Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
+                           const ptrdiff_t x, __m128i* const sum_ma343,
+                           __m128i sum_b343[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  __m128i sum_ma444, sum_b444[2];
+  Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343,
sum_b444, ma343, + ma444, b343, b444); +} + +inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2], + const ptrdiff_t x, __m128i* const sum_ma343, + __m128i sum_b343[2], uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m128i sum_ma444, sum_b444[2]; + Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343, + ma444, b343, b444); +} + +inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2], + const ptrdiff_t x, uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m128i sum_ma343, sum_b343[2]; + Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444); +} + +inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2], + const ptrdiff_t x, uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m128i sum_ma343, sum_b343[2]; + Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo( + const __m128i s[2][4], const uint32_t scale, uint16_t* const sum5[5], + uint32_t* const square_sum5[5], __m128i sq[2][8], __m128i* const ma, + __m128i b[2]) { + __m128i s5[2][5], sq5[5][2]; + Square(s[0][1], sq[0] + 2); + Square(s[1][1], sq[1] + 2); + s5[0][3] = Sum5Horizontal16(s[0]); + StoreAligned16(sum5[3], s5[0][3]); + s5[0][4] = Sum5Horizontal16(s[1]); + StoreAligned16(sum5[4], s5[0][4]); + Sum5Horizontal32(sq[0], sq5[3]); + StoreAligned32U32(square_sum5[3], sq5[3]); + Sum5Horizontal32(sq[1], sq5[4]); + StoreAligned32U32(square_sum5[4], sq5[4]); + LoadAligned16x3U16(sum5, 0, s5[0]); + LoadAligned32x3U32(square_sum5, 0, sq5); + CalculateIntermediate5<0>(s5[0], sq5, scale, ma, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5( + const __m128i s[2][4], const ptrdiff_t sum_width, const ptrdiff_t x, + const uint32_t scale, uint16_t* const sum5[5], + uint32_t* const square_sum5[5], __m128i sq[2][8], __m128i ma[2], + __m128i b[6]) { + __m128i s5[2][5], sq5[5][2]; + Square(s[0][2], sq[0] + 4); + Square(s[1][2], sq[1] + 4); + s5[0][3] = Sum5Horizontal16(s[0] + 1); + s5[1][3] = Sum5Horizontal16(s[0] + 2); + StoreAligned16(sum5[3] + x + 0, s5[0][3]); + StoreAligned16(sum5[3] + x + 8, s5[1][3]); + s5[0][4] = Sum5Horizontal16(s[1] + 1); + s5[1][4] = Sum5Horizontal16(s[1] + 2); + StoreAligned16(sum5[4] + x + 0, s5[0][4]); + StoreAligned16(sum5[4] + x + 8, s5[1][4]); + Sum5Horizontal32(sq[0] + 2, sq5[3]); + StoreAligned32U32(square_sum5[3] + x, sq5[3]); + Sum5Horizontal32(sq[1] + 2, sq5[4]); + StoreAligned32U32(square_sum5[4] + x, sq5[4]); + LoadAligned16x3U16(sum5, x, s5[0]); + LoadAligned32x3U32(square_sum5, x, sq5); + CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], b + 2); + + Square(s[0][3], sq[0] + 6); + Square(s[1][3], sq[1] + 6); + Sum5Horizontal32(sq[0] + 4, sq5[3]); + StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]); + Sum5Horizontal32(sq[1] + 4, sq5[4]); + StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]); + LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]); + LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5); + CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], b + 4); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo( + const __m128i s[2], const uint32_t scale, const uint16_t* const sum5[5], + const uint32_t* const square_sum5[5], __m128i sq[4], __m128i* const ma, + __m128i b[2]) { + __m128i s5[5], sq5[5][2]; + Square(s[1], sq + 2); + s5[3] = s5[4] = Sum5Horizontal16(s); + 
Sum5Horizontal32(sq, sq5[3]); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + LoadAligned16x3U16(sum5, 0, s5); + LoadAligned32x3U32(square_sum5, 0, sq5); + CalculateIntermediate5<0>(s5, sq5, scale, ma, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow( + const __m128i s[4], const ptrdiff_t sum_width, const ptrdiff_t x, + const uint32_t scale, const uint16_t* const sum5[5], + const uint32_t* const square_sum5[5], __m128i sq[8], __m128i ma[2], + __m128i b[6]) { + __m128i s5[2][5], sq5[5][2]; + Square(s[2], sq + 4); + s5[0][3] = Sum5Horizontal16(s + 1); + s5[1][3] = Sum5Horizontal16(s + 2); + s5[0][4] = s5[0][3]; + s5[1][4] = s5[1][3]; + Sum5Horizontal32(sq + 2, sq5[3]); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + LoadAligned16x3U16(sum5, x, s5[0]); + LoadAligned32x3U32(square_sum5, x, sq5); + CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], b + 2); + + Square(s[3], sq + 6); + Sum5Horizontal32(sq + 4, sq5[3]); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]); + LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5); + CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], b + 4); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo( + const __m128i s[2], const uint32_t scale, uint16_t* const sum3[3], + uint32_t* const square_sum3[3], __m128i sq[4], __m128i* const ma, + __m128i b[2]) { + __m128i s3[3], sq3[3][2]; + Square(s[1], sq + 2); + s3[2] = Sum3Horizontal16(s); + StoreAligned16(sum3[2], s3[2]); + Sum3Horizontal32(sq, sq3[2]); + StoreAligned32U32(square_sum3[2], sq3[2]); + LoadAligned16x2U16(sum3, 0, s3); + LoadAligned32x2U32(square_sum3, 0, sq3); + CalculateIntermediate3(s3, sq3, scale, ma, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3( + const __m128i s[4], const ptrdiff_t x, const ptrdiff_t sum_width, + const uint32_t scale, uint16_t* const sum3[3], + uint32_t* const square_sum3[3], __m128i sq[8], __m128i ma[2], + __m128i b[6]) { + __m128i s3[4], sq3[3][2], sum[2], index[2]; + Square(s[2], sq + 4); + s3[2] = Sum3Horizontal16(s + 1); + s3[3] = Sum3Horizontal16(s + 2); + StoreAligned32U16(sum3[2] + x, s3 + 2); + Sum3Horizontal32(sq + 2, sq3[2]); + StoreAligned32U32(square_sum3[2] + x + 0, sq3[2]); + LoadAligned16x2U16(sum3, x, s3); + LoadAligned32x2U32(square_sum3, x, sq3); + CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]); + + Square(s[3], sq + 6); + Sum3Horizontal32(sq + 4, sq3[2]); + StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]); + LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3 + 1); + LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3); + CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]); + CalculateIntermediate(sum, index, ma, b + 2); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo( + const __m128i s[2][4], const uint16_t scales[2], uint16_t* const sum3[4], + uint16_t* const sum5[5], uint32_t* const square_sum3[4], + uint32_t* const square_sum5[5], __m128i sq[2][8], __m128i ma3[2][2], + __m128i b3[2][6], __m128i* const ma5, __m128i b5[2]) { + __m128i s3[4], s5[5], sq3[4][2], sq5[5][2], sum[2], index[2]; + Square(s[0][1], sq[0] + 2); + Square(s[1][1], sq[1] + 2); + SumHorizontal16(s[0], &s3[2], &s5[3]); + SumHorizontal16(s[1], &s3[3], &s5[4]); + StoreAligned16(sum3[2], s3[2]); + StoreAligned16(sum3[3], s3[3]); + StoreAligned16(sum5[3], s5[3]); + StoreAligned16(sum5[4], s5[4]); + SumHorizontal32(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + StoreAligned32U32(square_sum3[2], sq3[2]); + StoreAligned32U32(square_sum5[3], sq5[3]); 
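+  // A single SumHorizontal16/32 scan of each input row yields both the 3x3
+  // row sums (used by Pass 2) and the 5x5 row sums (used by Pass 1).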
+ SumHorizontal32(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]); + StoreAligned32U32(square_sum3[3], sq3[3]); + StoreAligned32U32(square_sum5[4], sq5[4]); + LoadAligned16x2U16(sum3, 0, s3); + LoadAligned32x2U32(square_sum3, 0, sq3); + LoadAligned16x3U16(sum5, 0, s5); + LoadAligned32x3U32(square_sum5, 0, sq5); + CalculateSumAndIndex3(s3 + 0, sq3 + 0, scales[1], &sum[0], &index[0]); + CalculateSumAndIndex3(s3 + 1, sq3 + 1, scales[1], &sum[1], &index[1]); + CalculateIntermediate(sum, index, &ma3[0][0], b3[0], b3[1]); + ma3[1][0] = _mm_srli_si128(ma3[0][0], 8); + CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess( + const __m128i s[2][4], const ptrdiff_t x, const uint16_t scales[2], + uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + const ptrdiff_t sum_width, __m128i sq[2][8], __m128i ma3[2][2], + __m128i b3[2][6], __m128i ma5[2], __m128i b5[6]) { + __m128i s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sum[2][2], index[2][2]; + SumHorizontal16(s[0] + 1, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]); + StoreAligned16(sum3[2] + x + 0, s3[0][2]); + StoreAligned16(sum3[2] + x + 8, s3[1][2]); + StoreAligned16(sum5[3] + x + 0, s5[0][3]); + StoreAligned16(sum5[3] + x + 8, s5[1][3]); + SumHorizontal16(s[1] + 1, &s3[0][3], &s3[1][3], &s5[0][4], &s5[1][4]); + StoreAligned16(sum3[3] + x + 0, s3[0][3]); + StoreAligned16(sum3[3] + x + 8, s3[1][3]); + StoreAligned16(sum5[4] + x + 0, s5[0][4]); + StoreAligned16(sum5[4] + x + 8, s5[1][4]); + Square(s[0][2], sq[0] + 4); + Square(s[1][2], sq[1] + 4); + SumHorizontal32(sq[0] + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + StoreAligned32U32(square_sum3[2] + x, sq3[2]); + StoreAligned32U32(square_sum5[3] + x, sq5[3]); + SumHorizontal32(sq[1] + 2, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]); + StoreAligned32U32(square_sum3[3] + x, sq3[3]); + StoreAligned32U32(square_sum5[4] + x, sq5[4]); + LoadAligned16x2U16(sum3, x, s3[0]); + LoadAligned32x2U32(square_sum3, x, sq3); + CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0][0], &index[0][0]); + CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum[1][0], + &index[1][0]); + LoadAligned16x3U16(sum5, x, s5[0]); + LoadAligned32x3U32(square_sum5, x, sq5); + CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], b5 + 2); + + Square(s[0][3], sq[0] + 6); + Square(s[1][3], sq[1] + 6); + SumHorizontal32(sq[0] + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]); + StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]); + SumHorizontal32(sq[1] + 4, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]); + StoreAligned32U32(square_sum3[3] + x + 8, sq3[3]); + StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]); + LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]); + LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3); + CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[0][1], &index[0][1]); + CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum[1][1], + &index[1][1]); + CalculateIntermediate(sum[0], index[0], ma3[0], b3[0] + 2); + CalculateIntermediate(sum[1], index[1], ma3[1], b3[1] + 2); + LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]); + LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5); + CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], b5 + 4); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo( + const __m128i s[2], const uint16_t scales[2], const uint16_t* const sum3[4], + const uint16_t* 
const sum5[5], const uint32_t* const square_sum3[4], + const uint32_t* const square_sum5[5], __m128i sq[4], __m128i* const ma3, + __m128i* const ma5, __m128i b3[2], __m128i b5[2]) { + __m128i s3[3], s5[5], sq3[3][2], sq5[5][2]; + Square(s[1], sq + 2); + SumHorizontal16(s, &s3[2], &s5[3]); + SumHorizontal32(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + LoadAligned16x3U16(sum5, 0, s5); + s5[4] = s5[3]; + LoadAligned32x3U32(square_sum5, 0, sq5); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5); + LoadAligned16x2U16(sum3, 0, s3); + LoadAligned32x2U32(square_sum3, 0, sq3); + CalculateIntermediate3(s3, sq3, scales[1], ma3, b3); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow( + const __m128i s[4], const ptrdiff_t sum_width, const ptrdiff_t x, + const uint16_t scales[2], const uint16_t* const sum3[4], + const uint16_t* const sum5[5], const uint32_t* const square_sum3[4], + const uint32_t* const square_sum5[5], __m128i sq[8], __m128i ma3[2], + __m128i ma5[2], __m128i b3[6], __m128i b5[6]) { + __m128i s3[2][3], s5[2][5], sq3[3][2], sq5[5][2], sum[2], index[2]; + Square(s[2], sq + 4); + SumHorizontal16(s + 1, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]); + SumHorizontal32(sq + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + LoadAligned16x3U16(sum5, x, s5[0]); + s5[0][4] = s5[0][3]; + LoadAligned32x3U32(square_sum5, x, sq5); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + CalculateIntermediate5<8>(s5[0], sq5, scales[0], ma5, b5 + 2); + LoadAligned16x2U16(sum3, x, s3[0]); + LoadAligned32x2U32(square_sum3, x, sq3); + CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0], &index[0]); + + Square(s[3], sq + 6); + SumHorizontal32(sq + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]); + s5[1][4] = s5[1][3]; + LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + CalculateIntermediate5<0>(s5[1], sq5, scales[0], ma5 + 1, b5 + 4); + LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]); + LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3); + CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[1], &index[1]); + CalculateIntermediate(sum, index, ma3, b3 + 2); +} + +inline void BoxSumFilterPreProcess5(const uint16_t* const src0, + const uint16_t* const src1, const int width, + const uint32_t scale, + uint16_t* const sum5[5], + uint32_t* const square_sum5[5], + const ptrdiff_t sum_width, uint16_t* ma565, + uint32_t* b565) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass1 - sizeof(*src0) * width; + __m128i s[2][4], mas[2], sq[2][8], bs[6]; + s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0); + s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16); + s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0); + s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16); + Square(s[0][0], sq[0]); + Square(s[1][0], sq[1]); + BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], bs); + + int x = 0; + do { + __m128i ma5[3], ma[2], b[4]; + s[0][2] = LoadUnaligned16Msan(src0 + x + 16, + overread_in_bytes + sizeof(*src0) * (x + 16)); + s[0][3] = LoadUnaligned16Msan(src0 + x + 24, + overread_in_bytes + sizeof(*src0) * (x + 24)); + s[1][2] = LoadUnaligned16Msan(src1 + x + 16, + overread_in_bytes + sizeof(*src1) * (x + 16)); + s[1][3] = LoadUnaligned16Msan(src1 + x + 24, + overread_in_bytes + sizeof(*src1) * (x + 24)); + BoxFilterPreProcess5(s, sum_width, x + 8, 
scale, sum5, square_sum5, sq, mas, + bs); + Prepare3_8<0>(mas, ma5); + ma[0] = Sum565Lo(ma5); + ma[1] = Sum565Hi(ma5); + StoreAligned32U16(ma565, ma); + Sum565(bs + 0, b + 0); + Sum565(bs + 2, b + 2); + StoreAligned64U32(b565, b); + s[0][0] = s[0][2]; + s[0][1] = s[0][3]; + s[1][0] = s[1][2]; + s[1][1] = s[1][3]; + sq[0][2] = sq[0][6]; + sq[0][3] = sq[0][7]; + sq[1][2] = sq[1][6]; + sq[1][3] = sq[1][7]; + mas[0] = mas[1]; + bs[0] = bs[4]; + bs[1] = bs[5]; + ma565 += 16; + b565 += 16; + x += 16; + } while (x < width); +} + +template +LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3( + const uint16_t* const src, const int width, const uint32_t scale, + uint16_t* const sum3[3], uint32_t* const square_sum3[3], + const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343, + uint32_t* b444) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass2 - sizeof(*src) * width; + __m128i s[4], mas[2], sq[8], bs[6]; + s[0] = LoadUnaligned16Msan(src + 0, overread_in_bytes + 0); + s[1] = LoadUnaligned16Msan(src + 8, overread_in_bytes + 16); + Square(s[0], sq); + BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq, &mas[0], bs); + + int x = 0; + do { + s[2] = LoadUnaligned16Msan(src + x + 16, + overread_in_bytes + sizeof(*src) * (x + 16)); + s[3] = LoadUnaligned16Msan(src + x + 24, + overread_in_bytes + sizeof(*src) * (x + 24)); + BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas, + bs); + __m128i ma3[3]; + Prepare3_8<0>(mas, ma3); + if (calculate444) { // NOLINT(readability-simplify-boolean-expr) + Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444); + Store343_444Hi(ma3, bs + 2, 8, ma343, ma444, b343, b444); + ma444 += 16; + b444 += 16; + } else { + __m128i ma[2], b[4]; + ma[0] = Sum343Lo(ma3); + ma[1] = Sum343Hi(ma3); + StoreAligned32U16(ma343, ma); + Sum343(bs + 0, b + 0); + Sum343(bs + 2, b + 2); + StoreAligned64U32(b343, b); + } + s[1] = s[3]; + sq[2] = sq[6]; + sq[3] = sq[7]; + mas[0] = mas[1]; + bs[0] = bs[4]; + bs[1] = bs[5]; + ma343 += 16; + b343 += 16; + x += 16; + } while (x < width); +} + +inline void BoxSumFilterPreProcess( + const uint16_t* const src0, const uint16_t* const src1, const int width, + const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444, + uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444, + uint32_t* b565) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass1 - sizeof(*src0) * width; + __m128i s[2][4], ma3[2][2], ma5[2], sq[2][8], b3[2][6], b5[6]; + s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0); + s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16); + s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0); + s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16); + Square(s[0][0], sq[0]); + Square(s[1][0], sq[1]); + BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq, + ma3, b3, &ma5[0], b5); + + int x = 0; + do { + __m128i ma[2], b[4], ma3x[3], ma5x[3]; + s[0][2] = LoadUnaligned16Msan(src0 + x + 16, + overread_in_bytes + sizeof(*src0) * (x + 16)); + s[0][3] = LoadUnaligned16Msan(src0 + x + 24, + overread_in_bytes + sizeof(*src0) * (x + 24)); + s[1][2] = LoadUnaligned16Msan(src1 + x + 16, + overread_in_bytes + sizeof(*src1) * (x + 16)); + s[1][3] = LoadUnaligned16Msan(src1 + x + 24, + overread_in_bytes + sizeof(*src1) * (x + 24)); + BoxFilterPreProcess(s, x + 8, 
scales, sum3, sum5, square_sum3, square_sum5,
+                        sum_width, sq, ma3, b3, ma5, b5);
+
+    Prepare3_8<0>(ma3[0], ma3x);
+    ma[0] = Sum343Lo(ma3x);
+    ma[1] = Sum343Hi(ma3x);
+    StoreAligned32U16(ma343[0] + x, ma);
+    Sum343(b3[0] + 0, b + 0);
+    Sum343(b3[0] + 2, b + 2);
+    StoreAligned64U32(b343[0] + x, b);
+    Sum565(b5 + 0, b + 0);
+    Sum565(b5 + 2, b + 2);
+    StoreAligned64U32(b565, b);
+    Prepare3_8<0>(ma3[1], ma3x);
+    Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
+    Store343_444Hi(ma3x, b3[1] + 2, x + 8, ma343[1], ma444, b343[1], b444);
+    Prepare3_8<0>(ma5, ma5x);
+    ma[0] = Sum565Lo(ma5x);
+    ma[1] = Sum565Hi(ma5x);
+    StoreAligned32U16(ma565, ma);
+    s[0][0] = s[0][2];
+    s[0][1] = s[0][3];
+    s[1][0] = s[1][2];
+    s[1][1] = s[1][3];
+    sq[0][2] = sq[0][6];
+    sq[0][3] = sq[0][7];
+    sq[1][2] = sq[1][6];
+    sq[1][3] = sq[1][7];
+    ma3[0][0] = ma3[0][1];
+    ma3[1][0] = ma3[1][1];
+    ma5[0] = ma5[1];
+    b3[0][0] = b3[0][4];
+    b3[0][1] = b3[0][5];
+    b3[1][0] = b3[1][4];
+    b3[1][1] = b3[1][5];
+    b5[0] = b5[4];
+    b5[1] = b5[5];
+    ma565 += 16;
+    b565 += 16;
+    x += 16;
+  } while (x < width);
+}
+
+template <int shift>
+inline __m128i FilterOutput(const __m128i ma_x_src, const __m128i b) {
+  // ma: 255 * 32 = 8160 (13 bits)
+  // b: 65088 * 32 = 2082816 (21 bits)
+  // v: b - ma * 255 (22 bits)
+  const __m128i v = _mm_sub_epi32(b, ma_x_src);
+  // kSgrProjSgrBits = 8
+  // kSgrProjRestoreBits = 4
+  // shift = 4 or 5
+  // v >> 8 or 9 (13 bits)
+  return VrshrS32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <int shift>
+inline __m128i CalculateFilteredOutput(const __m128i src, const __m128i ma,
+                                       const __m128i b[2]) {
+  const __m128i ma_x_src_lo = VmullLo16(ma, src);
+  const __m128i ma_x_src_hi = VmullHi16(ma, src);
+  const __m128i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
+  const __m128i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
+  return _mm_packs_epi32(dst_lo, dst_hi);  // 13 bits
+}
+
+inline __m128i CalculateFilteredOutputPass1(const __m128i src,
+                                            const __m128i ma[2],
+                                            const __m128i b[2][2]) {
+  const __m128i ma_sum = _mm_add_epi16(ma[0], ma[1]);
+  __m128i b_sum[2];
+  b_sum[0] = _mm_add_epi32(b[0][0], b[1][0]);
+  b_sum[1] = _mm_add_epi32(b[0][1], b[1][1]);
+  return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m128i CalculateFilteredOutputPass2(const __m128i src,
+                                            const __m128i ma[3],
+                                            const __m128i b[3][2]) {
+  const __m128i ma_sum = Sum3_16(ma);
+  __m128i b_sum[2];
+  Sum3_32(b, b_sum);
+  return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m128i SelfGuidedFinal(const __m128i src, const __m128i v[2]) {
+  const __m128i v_lo =
+      VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  const __m128i v_hi =
+      VrshrS32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  const __m128i vv = _mm_packs_epi32(v_lo, v_hi);
+  return _mm_add_epi16(src, vv);
+}
+
+inline __m128i SelfGuidedDoubleMultiplier(const __m128i src,
+                                          const __m128i filter[2], const int w0,
+                                          const int w2) {
+  __m128i v[2];
+  const __m128i w0_w2 = _mm_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0));
+  const __m128i f_lo = _mm_unpacklo_epi16(filter[0], filter[1]);
+  const __m128i f_hi = _mm_unpackhi_epi16(filter[0], filter[1]);
+  v[0] = _mm_madd_epi16(w0_w2, f_lo);
+  v[1] = _mm_madd_epi16(w0_w2, f_hi);
+  return SelfGuidedFinal(src, v);
+}
+
+inline __m128i SelfGuidedSingleMultiplier(const __m128i src,
+                                          const __m128i filter, const int w0) {
+  // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+  __m128i v[2];
+  v[0] = VmullNLo8(filter, w0);
+  v[1] = VmullNHi8(filter, w0);
+  return SelfGuidedFinal(src, v);
+}
+
+inline
void ClipAndStore(uint16_t* const dst, const __m128i val) { + const __m128i val0 = _mm_max_epi16(val, _mm_setzero_si128()); + const __m128i val1 = _mm_min_epi16(val0, _mm_set1_epi16(1023)); + StoreAligned16(dst, val1); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPass1( + const uint16_t* const src, const uint16_t* const src0, + const uint16_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5], + uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width, + const uint32_t scale, const int16_t w0, uint16_t* const ma565[2], + uint32_t* const b565[2], uint16_t* const dst) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass1 - sizeof(*src0) * width; + __m128i s[2][4], mas[2], sq[2][8], bs[6]; + s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0); + s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16); + s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0); + s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16); + Square(s[0][0], sq[0]); + Square(s[1][0], sq[1]); + BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], bs); + + int x = 0; + do { + __m128i ma[2], ma5[3], b[2][2], p[2]; + s[0][2] = LoadUnaligned16Msan(src0 + x + 16, + overread_in_bytes + sizeof(*src0) * (x + 16)); + s[0][3] = LoadUnaligned16Msan(src0 + x + 24, + overread_in_bytes + sizeof(*src0) * (x + 24)); + s[1][2] = LoadUnaligned16Msan(src1 + x + 16, + overread_in_bytes + sizeof(*src1) * (x + 16)); + s[1][3] = LoadUnaligned16Msan(src1 + x + 24, + overread_in_bytes + sizeof(*src1) * (x + 24)); + BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas, + bs); + Prepare3_8<0>(mas, ma5); + ma[1] = Sum565Lo(ma5); + StoreAligned16(ma565[1] + x, ma[1]); + Sum565(bs, b[1]); + StoreAligned32U32(b565[1] + x, b[1]); + const __m128i sr0_lo = LoadAligned16(src + x + 0); + const __m128i sr1_lo = LoadAligned16(src + stride + x + 0); + ma[0] = LoadAligned16(ma565[0] + x); + LoadAligned32U32(b565[0] + x, b[0]); + p[0] = CalculateFilteredOutputPass1(sr0_lo, ma, b); + p[1] = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[1]); + const __m128i d00 = SelfGuidedSingleMultiplier(sr0_lo, p[0], w0); + const __m128i d10 = SelfGuidedSingleMultiplier(sr1_lo, p[1], w0); + + ma[1] = Sum565Hi(ma5); + StoreAligned16(ma565[1] + x + 8, ma[1]); + Sum565(bs + 2, b[1]); + StoreAligned32U32(b565[1] + x + 8, b[1]); + const __m128i sr0_hi = LoadAligned16(src + x + 8); + const __m128i sr1_hi = LoadAligned16(src + stride + x + 8); + ma[0] = LoadAligned16(ma565[0] + x + 8); + LoadAligned32U32(b565[0] + x + 8, b[0]); + p[0] = CalculateFilteredOutputPass1(sr0_hi, ma, b); + p[1] = CalculateFilteredOutput<4>(sr1_hi, ma[1], b[1]); + const __m128i d01 = SelfGuidedSingleMultiplier(sr0_hi, p[0], w0); + ClipAndStore(dst + x + 0, d00); + ClipAndStore(dst + x + 8, d01); + const __m128i d11 = SelfGuidedSingleMultiplier(sr1_hi, p[1], w0); + ClipAndStore(dst + stride + x + 0, d10); + ClipAndStore(dst + stride + x + 8, d11); + s[0][0] = s[0][2]; + s[0][1] = s[0][3]; + s[1][0] = s[1][2]; + s[1][1] = s[1][3]; + sq[0][2] = sq[0][6]; + sq[0][3] = sq[0][7]; + sq[1][2] = sq[1][6]; + sq[1][3] = sq[1][7]; + mas[0] = mas[1]; + bs[0] = bs[4]; + bs[1] = bs[5]; + x += 16; + } while (x < width); +} + +inline void BoxFilterPass1LastRow( + const uint16_t* const src, const uint16_t* const src0, const int width, + const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0, + uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565, + uint32_t* b565, uint16_t* const dst) { + 
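+  // Only one new source row is consumed here; the bottom rows of the 5x5
+  // window appear to be filled by reusing the last |sum5|/|square_sum5| rows
+  // (see the *LastRow pre-process helpers above).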
const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass1 - sizeof(*src0) * width; + __m128i s[4], mas[2], sq[8], bs[6]; + s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0); + s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16); + Square(s[0], sq); + BoxFilterPreProcess5LastRowLo(s, scale, sum5, square_sum5, sq, &mas[0], bs); + + int x = 0; + do { + __m128i ma[2], ma5[3], b[2][2]; + s[2] = LoadUnaligned16Msan(src0 + x + 16, + overread_in_bytes + sizeof(*src0) * (x + 16)); + s[3] = LoadUnaligned16Msan(src0 + x + 24, + overread_in_bytes + sizeof(*src0) * (x + 24)); + BoxFilterPreProcess5LastRow(s, sum_width, x + 8, scale, sum5, square_sum5, + sq, mas, bs); + Prepare3_8<0>(mas, ma5); + ma[1] = Sum565Lo(ma5); + Sum565(bs, b[1]); + ma[0] = LoadAligned16(ma565); + LoadAligned32U32(b565, b[0]); + const __m128i sr_lo = LoadAligned16(src + x + 0); + __m128i p = CalculateFilteredOutputPass1(sr_lo, ma, b); + const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p, w0); + + ma[1] = Sum565Hi(ma5); + Sum565(bs + 2, b[1]); + ma[0] = LoadAligned16(ma565 + 8); + LoadAligned32U32(b565 + 8, b[0]); + const __m128i sr_hi = LoadAligned16(src + x + 8); + p = CalculateFilteredOutputPass1(sr_hi, ma, b); + const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p, w0); + ClipAndStore(dst + x + 0, d0); + ClipAndStore(dst + x + 8, d1); + s[1] = s[3]; + sq[2] = sq[6]; + sq[3] = sq[7]; + mas[0] = mas[1]; + bs[0] = bs[4]; + bs[1] = bs[5]; + ma565 += 16; + b565 += 16; + x += 16; + } while (x < width); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPass2( + const uint16_t* const src, const uint16_t* const src0, const int width, + const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0, + uint16_t* const sum3[3], uint32_t* const square_sum3[3], + uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3], + uint32_t* const b444[2], uint16_t* const dst) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass2 - sizeof(*src0) * width; + __m128i s[4], mas[2], sq[8], bs[6]; + s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0); + s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16); + Square(s[0], sq); + BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq, &mas[0], bs); + + int x = 0; + do { + s[2] = LoadUnaligned16Msan(src0 + x + 16, + overread_in_bytes + sizeof(*src0) * (x + 16)); + s[3] = LoadUnaligned16Msan(src0 + x + 24, + overread_in_bytes + sizeof(*src0) * (x + 24)); + BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas, + bs); + __m128i ma[3], b[3][2], ma3[3]; + Prepare3_8<0>(mas, ma3); + Store343_444Lo(ma3, bs + 0, x, &ma[2], b[2], ma343[2], ma444[1], b343[2], + b444[1]); + const __m128i sr_lo = LoadAligned16(src + x + 0); + ma[0] = LoadAligned16(ma343[0] + x); + ma[1] = LoadAligned16(ma444[0] + x); + LoadAligned32U32(b343[0] + x, b[0]); + LoadAligned32U32(b444[0] + x, b[1]); + const __m128i p0 = CalculateFilteredOutputPass2(sr_lo, ma, b); + + Store343_444Hi(ma3, bs + 2, x + 8, &ma[2], b[2], ma343[2], ma444[1], + b343[2], b444[1]); + const __m128i sr_hi = LoadAligned16(src + x + 8); + ma[0] = LoadAligned16(ma343[0] + x + 8); + ma[1] = LoadAligned16(ma444[0] + x + 8); + LoadAligned32U32(b343[0] + x + 8, b[0]); + LoadAligned32U32(b444[0] + x + 8, b[1]); + const __m128i p1 = CalculateFilteredOutputPass2(sr_hi, ma, b); + const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0); + const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0); + ClipAndStore(dst + x + 0, d0); + ClipAndStore(dst + x + 8, d1); + s[1] = s[3]; + sq[2] = 
sq[6]; + sq[3] = sq[7]; + mas[0] = mas[1]; + bs[0] = bs[4]; + bs[1] = bs[5]; + x += 16; + } while (x < width); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilter( + const uint16_t* const src, const uint16_t* const src0, + const uint16_t* const src1, const ptrdiff_t stride, const int width, + const uint16_t scales[2], const int16_t w0, const int16_t w2, + uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + const ptrdiff_t sum_width, uint16_t* const ma343[4], + uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4], + uint32_t* const b444[3], uint32_t* const b565[2], uint16_t* const dst) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass1 - sizeof(*src0) * width; + __m128i s[2][4], ma3[2][2], ma5[2], sq[2][8], b3[2][6], b5[6]; + s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0); + s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16); + s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0); + s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16); + Square(s[0][0], sq[0]); + Square(s[1][0], sq[1]); + BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq, + ma3, b3, &ma5[0], b5); + + int x = 0; + do { + __m128i ma[3][3], b[3][3][2], p[2][2], ma3x[2][3], ma5x[3]; + s[0][2] = LoadUnaligned16Msan(src0 + x + 16, + overread_in_bytes + sizeof(*src0) * (x + 16)); + s[0][3] = LoadUnaligned16Msan(src0 + x + 24, + overread_in_bytes + sizeof(*src0) * (x + 24)); + s[1][2] = LoadUnaligned16Msan(src1 + x + 16, + overread_in_bytes + sizeof(*src1) * (x + 16)); + s[1][3] = LoadUnaligned16Msan(src1 + x + 24, + overread_in_bytes + sizeof(*src1) * (x + 24)); + BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5, + sum_width, sq, ma3, b3, ma5, b5); + Prepare3_8<0>(ma3[0], ma3x[0]); + Prepare3_8<0>(ma3[1], ma3x[1]); + Prepare3_8<0>(ma5, ma5x); + Store343_444Lo(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], b[1][2], b[2][1], + ma343[2], ma444[1], b343[2], b444[1]); + Store343_444Lo(ma3x[1], b3[1], x, &ma[2][2], b[2][2], ma343[3], ma444[2], + b343[3], b444[2]); + ma[0][1] = Sum565Lo(ma5x); + StoreAligned16(ma565[1] + x, ma[0][1]); + Sum565(b5, b[0][1]); + StoreAligned32U32(b565[1] + x, b[0][1]); + const __m128i sr0_lo = LoadAligned16(src + x); + const __m128i sr1_lo = LoadAligned16(src + stride + x); + ma[0][0] = LoadAligned16(ma565[0] + x); + LoadAligned32U32(b565[0] + x, b[0][0]); + p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]); + p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]); + ma[1][0] = LoadAligned16(ma343[0] + x); + ma[1][1] = LoadAligned16(ma444[0] + x); + LoadAligned32U32(b343[0] + x, b[1][0]); + LoadAligned32U32(b444[0] + x, b[1][1]); + p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]); + const __m128i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2); + ma[2][0] = LoadAligned16(ma343[1] + x); + LoadAligned32U32(b343[1] + x, b[2][0]); + p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]); + const __m128i d10 = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2); + + Store343_444Hi(ma3x[0], b3[0] + 2, x + 8, &ma[1][2], &ma[2][1], b[1][2], + b[2][1], ma343[2], ma444[1], b343[2], b444[1]); + Store343_444Hi(ma3x[1], b3[1] + 2, x + 8, &ma[2][2], b[2][2], ma343[3], + ma444[2], b343[3], b444[2]); + ma[0][1] = Sum565Hi(ma5x); + StoreAligned16(ma565[1] + x + 8, ma[0][1]); + Sum565(b5 + 2, b[0][1]); + StoreAligned32U32(b565[1] + x + 8, b[0][1]); + const __m128i sr0_hi = LoadAligned16(src + x + 8); + 
const __m128i sr1_hi = LoadAligned16(src + stride + x + 8); + ma[0][0] = LoadAligned16(ma565[0] + x + 8); + LoadAligned32U32(b565[0] + x + 8, b[0][0]); + p[0][0] = CalculateFilteredOutputPass1(sr0_hi, ma[0], b[0]); + p[1][0] = CalculateFilteredOutput<4>(sr1_hi, ma[0][1], b[0][1]); + ma[1][0] = LoadAligned16(ma343[0] + x + 8); + ma[1][1] = LoadAligned16(ma444[0] + x + 8); + LoadAligned32U32(b343[0] + x + 8, b[1][0]); + LoadAligned32U32(b444[0] + x + 8, b[1][1]); + p[0][1] = CalculateFilteredOutputPass2(sr0_hi, ma[1], b[1]); + const __m128i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2); + ClipAndStore(dst + x + 0, d00); + ClipAndStore(dst + x + 8, d01); + ma[2][0] = LoadAligned16(ma343[1] + x + 8); + LoadAligned32U32(b343[1] + x + 8, b[2][0]); + p[1][1] = CalculateFilteredOutputPass2(sr1_hi, ma[2], b[2]); + const __m128i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2); + ClipAndStore(dst + stride + x + 0, d10); + ClipAndStore(dst + stride + x + 8, d11); + s[0][0] = s[0][2]; + s[0][1] = s[0][3]; + s[1][0] = s[1][2]; + s[1][1] = s[1][3]; + sq[0][2] = sq[0][6]; + sq[0][3] = sq[0][7]; + sq[1][2] = sq[1][6]; + sq[1][3] = sq[1][7]; + ma3[0][0] = ma3[0][1]; + ma3[1][0] = ma3[1][1]; + ma5[0] = ma5[1]; + b3[0][0] = b3[0][4]; + b3[0][1] = b3[0][5]; + b3[1][0] = b3[1][4]; + b3[1][1] = b3[1][5]; + b5[0] = b5[4]; + b5[1] = b5[5]; + x += 16; + } while (x < width); +} + +inline void BoxFilterLastRow( + const uint16_t* const src, const uint16_t* const src0, const int width, + const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0, + const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565, + uint32_t* const b343, uint32_t* const b444, uint32_t* const b565, + uint16_t* const dst) { + const ptrdiff_t overread_in_bytes = + kOverreadInBytesPass1 - sizeof(*src0) * width; + __m128i s[4], ma3[2], ma5[2], sq[8], b3[6], b5[6], ma[3], b[3][2]; + s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0); + s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16); + Square(s[0], sq); + BoxFilterPreProcessLastRowLo(s, scales, sum3, sum5, square_sum3, square_sum5, + sq, &ma3[0], &ma5[0], b3, b5); + + int x = 0; + do { + __m128i ma3x[3], ma5x[3], p[2]; + s[2] = LoadUnaligned16Msan(src0 + x + 16, + overread_in_bytes + sizeof(*src0) * (x + 16)); + s[3] = LoadUnaligned16Msan(src0 + x + 24, + overread_in_bytes + sizeof(*src0) * (x + 24)); + BoxFilterPreProcessLastRow(s, sum_width, x + 8, scales, sum3, sum5, + square_sum3, square_sum5, sq, ma3, ma5, b3, b5); + Prepare3_8<0>(ma3, ma3x); + Prepare3_8<0>(ma5, ma5x); + ma[1] = Sum565Lo(ma5x); + Sum565(b5, b[1]); + ma[2] = Sum343Lo(ma3x); + Sum343(b3, b[2]); + const __m128i sr_lo = LoadAligned16(src + x + 0); + ma[0] = LoadAligned16(ma565 + x); + LoadAligned32U32(b565 + x, b[0]); + p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b); + ma[0] = LoadAligned16(ma343 + x); + ma[1] = LoadAligned16(ma444 + x); + LoadAligned32U32(b343 + x, b[0]); + LoadAligned32U32(b444 + x, b[1]); + p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b); + const __m128i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2); + + ma[1] = Sum565Hi(ma5x); + Sum565(b5 + 2, b[1]); + ma[2] = Sum343Hi(ma3x); + Sum343(b3 + 2, b[2]); + const __m128i sr_hi = LoadAligned16(src + x + 8); + ma[0] = LoadAligned16(ma565 + x + 8); + LoadAligned32U32(b565 + x + 8, b[0]); + p[0] = CalculateFilteredOutputPass1(sr_hi, ma, b); + ma[0] = LoadAligned16(ma343 
+ x + 8); + ma[1] = LoadAligned16(ma444 + x + 8); + LoadAligned32U32(b343 + x + 8, b[0]); + LoadAligned32U32(b444 + x + 8, b[1]); + p[1] = CalculateFilteredOutputPass2(sr_hi, ma, b); + const __m128i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2); + ClipAndStore(dst + x + 0, d0); + ClipAndStore(dst + x + 8, d1); + s[1] = s[3]; + sq[2] = sq[6]; + sq[3] = sq[7]; + ma3[0] = ma3[1]; + ma5[0] = ma5[1]; + b3[0] = b3[4]; + b3[1] = b3[5]; + b5[0] = b5[4]; + b5[1] = b5[5]; + x += 16; + } while (x < width); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( + const RestorationUnitInfo& restoration_info, const uint16_t* src, + const ptrdiff_t stride, const uint16_t* const top_border, + const ptrdiff_t top_border_stride, const uint16_t* bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, + SgrBuffer* const sgr_buffer, uint16_t* dst) { + const auto temp_stride = Align(width, 16); + const auto sum_width = Align(width + 8, 16); + const auto sum_stride = temp_stride + 16; + const int sgr_proj_index = restoration_info.sgr_proj_info.index; + const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12. + const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0]; + const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1]; + const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1; + uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2]; + uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2]; + sum3[0] = sgr_buffer->sum3; + square_sum3[0] = sgr_buffer->square_sum3; + ma343[0] = sgr_buffer->ma343; + b343[0] = sgr_buffer->b343; + for (int i = 1; i <= 3; ++i) { + sum3[i] = sum3[i - 1] + sum_stride; + square_sum3[i] = square_sum3[i - 1] + sum_stride; + ma343[i] = ma343[i - 1] + temp_stride; + b343[i] = b343[i - 1] + temp_stride; + } + sum5[0] = sgr_buffer->sum5; + square_sum5[0] = sgr_buffer->square_sum5; + for (int i = 1; i <= 4; ++i) { + sum5[i] = sum5[i - 1] + sum_stride; + square_sum5[i] = square_sum5[i - 1] + sum_stride; + } + ma444[0] = sgr_buffer->ma444; + b444[0] = sgr_buffer->b444; + for (int i = 1; i <= 2; ++i) { + ma444[i] = ma444[i - 1] + temp_stride; + b444[i] = b444[i - 1] + temp_stride; + } + ma565[0] = sgr_buffer->ma565; + ma565[1] = ma565[0] + temp_stride; + b565[0] = sgr_buffer->b565; + b565[1] = b565[0] + temp_stride; + assert(scales[0] != 0); + assert(scales[1] != 0); + BoxSum(top_border, top_border_stride, width, sum_stride, sum_width, sum3[0], + sum5[1], square_sum3[0], square_sum5[1]); + sum5[0] = sum5[1]; + square_sum5[0] = square_sum5[1]; + const uint16_t* const s = (height > 1) ? 
src + stride : bottom_border; + BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3, + square_sum5, sum_width, ma343, ma444[0], ma565[0], + b343, b444[0], b565[0]); + sum5[0] = sgr_buffer->sum5; + square_sum5[0] = sgr_buffer->square_sum5; + + for (int y = (height >> 1) - 1; y > 0; --y) { + Circulate4PointersBy2(sum3); + Circulate4PointersBy2(square_sum3); + Circulate5PointersBy2(sum5); + Circulate5PointersBy2(square_sum5); + BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width, + scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width, + ma343, ma444, ma565, b343, b444, b565, dst); + src += 2 * stride; + dst += 2 * stride; + Circulate4PointersBy2(ma343); + Circulate4PointersBy2(b343); + std::swap(ma444[0], ma444[2]); + std::swap(b444[0], b444[2]); + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + } + + Circulate4PointersBy2(sum3); + Circulate4PointersBy2(square_sum3); + Circulate5PointersBy2(sum5); + Circulate5PointersBy2(square_sum5); + if ((height & 1) == 0 || height > 1) { + const uint16_t* sr[2]; + if ((height & 1) == 0) { + sr[0] = bottom_border; + sr[1] = bottom_border + bottom_border_stride; + } else { + sr[0] = src + 2 * stride; + sr[1] = bottom_border; + } + BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5, + square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343, + b444, b565, dst); + } + if ((height & 1) != 0) { + if (height > 1) { + src += 2 * stride; + dst += 2 * stride; + Circulate4PointersBy2(sum3); + Circulate4PointersBy2(square_sum3); + Circulate5PointersBy2(sum5); + Circulate5PointersBy2(square_sum5); + Circulate4PointersBy2(ma343); + Circulate4PointersBy2(b343); + std::swap(ma444[0], ma444[2]); + std::swap(b444[0], b444[2]); + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + } + BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width, + sum_width, scales, w0, w2, sum3, sum5, square_sum3, + square_sum5, ma343[0], ma444[0], ma565[0], b343[0], + b444[0], b565[0], dst); + } +} + +inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, + const uint16_t* src, const ptrdiff_t stride, + const uint16_t* const top_border, + const ptrdiff_t top_border_stride, + const uint16_t* bottom_border, + const ptrdiff_t bottom_border_stride, + const int width, const int height, + SgrBuffer* const sgr_buffer, uint16_t* dst) { + const auto temp_stride = Align(width, 16); + const auto sum_width = Align(width + 8, 16); + const auto sum_stride = temp_stride + 16; + const int sgr_proj_index = restoration_info.sgr_proj_info.index; + const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12. + const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0]; + uint16_t *sum5[5], *ma565[2]; + uint32_t *square_sum5[5], *b565[2]; + sum5[0] = sgr_buffer->sum5; + square_sum5[0] = sgr_buffer->square_sum5; + for (int i = 1; i <= 4; ++i) { + sum5[i] = sum5[i - 1] + sum_stride; + square_sum5[i] = square_sum5[i - 1] + sum_stride; + } + ma565[0] = sgr_buffer->ma565; + ma565[1] = ma565[0] + temp_stride; + b565[0] = sgr_buffer->b565; + b565[1] = b565[0] + temp_stride; + assert(scale != 0); + BoxSum<5>(top_border, top_border_stride, width, sum_stride, sum_width, + sum5[1], square_sum5[1]); + sum5[0] = sum5[1]; + square_sum5[0] = square_sum5[1]; + const uint16_t* const s = (height > 1) ? 
src + stride : bottom_border; + BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width, + ma565[0], b565[0]); + sum5[0] = sgr_buffer->sum5; + square_sum5[0] = sgr_buffer->square_sum5; + + for (int y = (height >> 1) - 1; y > 0; --y) { + Circulate5PointersBy2(sum5); + Circulate5PointersBy2(square_sum5); + BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5, + square_sum5, width, sum_width, scale, w0, ma565, b565, dst); + src += 2 * stride; + dst += 2 * stride; + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + } + + Circulate5PointersBy2(sum5); + Circulate5PointersBy2(square_sum5); + if ((height & 1) == 0 || height > 1) { + const uint16_t* sr[2]; + if ((height & 1) == 0) { + sr[0] = bottom_border; + sr[1] = bottom_border + bottom_border_stride; + } else { + sr[0] = src + 2 * stride; + sr[1] = bottom_border; + } + BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width, + sum_width, scale, w0, ma565, b565, dst); + } + if ((height & 1) != 0) { + src += 3; + if (height > 1) { + src += 2 * stride; + dst += 2 * stride; + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + Circulate5PointersBy2(sum5); + Circulate5PointersBy2(square_sum5); + } + BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width, + sum_width, scale, w0, sum5, square_sum5, ma565[0], + b565[0], dst); + } +} + +inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, + const uint16_t* src, const ptrdiff_t stride, + const uint16_t* const top_border, + const ptrdiff_t top_border_stride, + const uint16_t* bottom_border, + const ptrdiff_t bottom_border_stride, + const int width, const int height, + SgrBuffer* const sgr_buffer, uint16_t* dst) { + assert(restoration_info.sgr_proj_info.multiplier[0] == 0); + const auto temp_stride = Align(width, 16); + const auto sum_width = Align(width + 8, 16); + const auto sum_stride = temp_stride + 16; + const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1]; + const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1; + const int sgr_proj_index = restoration_info.sgr_proj_info.index; + const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1]; // < 2^12. 
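+  // kSgrProjPrecisionBits == 7, so |w0| and |w1| form a Q7 partition of
+  // unity; e.g. (illustrative values, not read from the stream) w1 == 47
+  // gives w0 == (1 << 7) - 47 == 81.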
+  uint16_t *sum3[3], *ma343[3], *ma444[2];
+  uint32_t *square_sum3[3], *b343[3], *b444[2];
+  sum3[0] = sgr_buffer->sum3;
+  square_sum3[0] = sgr_buffer->square_sum3;
+  ma343[0] = sgr_buffer->ma343;
+  b343[0] = sgr_buffer->b343;
+  for (int i = 1; i <= 2; ++i) {
+    sum3[i] = sum3[i - 1] + sum_stride;
+    square_sum3[i] = square_sum3[i - 1] + sum_stride;
+    ma343[i] = ma343[i - 1] + temp_stride;
+    b343[i] = b343[i - 1] + temp_stride;
+  }
+  ma444[0] = sgr_buffer->ma444;
+  ma444[1] = ma444[0] + temp_stride;
+  b444[0] = sgr_buffer->b444;
+  b444[1] = b444[0] + temp_stride;
+  assert(scale != 0);
+  BoxSum<3>(top_border, top_border_stride, width, sum_stride, sum_width,
+            sum3[0], square_sum3[0]);
+  BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
+                                 sum_width, ma343[0], nullptr, b343[0],
+                                 nullptr);
+  Circulate3PointersBy1<uint16_t>(sum3);
+  Circulate3PointersBy1<uint32_t>(square_sum3);
+  const uint16_t* s;
+  if (height > 1) {
+    s = src + stride;
+  } else {
+    s = bottom_border;
+    bottom_border += bottom_border_stride;
+  }
+  BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
+                                ma343[1], ma444[0], b343[1], b444[0]);
+
+  for (int y = height - 2; y > 0; --y) {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3,
+                   square_sum3, ma343, ma444, b343, b444, dst);
+    src += stride;
+    dst += stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  }
+
+  int y = std::min(height, 2);
+  src += 2;
+  do {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3,
+                   square_sum3, ma343, ma444, b343, b444, dst);
+    src += stride;
+    dst += stride;
+    bottom_border += bottom_border_stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  } while (--y != 0);
+}
+
+// If |width| is not a multiple of 16, up to 15 extra pixels are written to
+// |dest| at the end of each row. It is safe to overwrite the output as it
+// will not be part of the visible frame.
+void SelfGuidedFilter_SSE4_1(
+    const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+    const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_border,
+    const ptrdiff_t top_border_stride,
+    const void* LIBGAV1_RESTRICT const bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+    void* LIBGAV1_RESTRICT const dest) {
+  const int index = restoration_info.sgr_proj_info.index;
+  const int radius_pass_0 = kSgrProjParams[index][0];  // 2 or 0
+  const int radius_pass_1 = kSgrProjParams[index][2];  // 1 or 0
+  const auto* const src = static_cast<const uint16_t*>(source);
+  const auto* const top = static_cast<const uint16_t*>(top_border);
+  const auto* const bottom = static_cast<const uint16_t*>(bottom_border);
+  auto* const dst = static_cast<uint16_t*>(dest);
+  SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+  if (radius_pass_1 == 0) {
+    // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+    // following assertion.
+    assert(radius_pass_0 != 0);
+    BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3,
+                          top_border_stride, bottom - 3, bottom_border_stride,
+                          width, height, sgr_buffer, dst);
+  } else if (radius_pass_0 == 0) {
+    BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2,
+                          top_border_stride, bottom - 2, bottom_border_stride,
+                          width, height, sgr_buffer, dst);
+  } else {
+    BoxFilterProcess(restoration_info, src - 3, stride, top - 3,
+                     top_border_stride, bottom - 3, bottom_border_stride, width,
+                     height, sgr_buffer, dst);
+  }
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  static_cast<void>(dsp);
+#if DSP_ENABLED_10BPP_SSE4_1(WienerFilter)
+  dsp->loop_restorations[0] = WienerFilter_SSE4_1;
+#else
+  static_cast<void>(WienerFilter_SSE4_1);
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(SelfGuidedFilter)
+  dsp->loop_restorations[1] = SelfGuidedFilter_SSE4_1;
+#else
+  static_cast<void>(SelfGuidedFilter_SSE4_1);
+#endif
+}
+
+}  // namespace
+
+void LoopRestorationInit10bpp_SSE4_1() { Init10bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !(LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10)
+namespace libgav1 {
+namespace dsp {
+
+void LoopRestorationInit10bpp_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10
diff --git a/src/dsp/x86/loop_restoration_avx2.cc b/src/dsp/x86/loop_restoration_avx2.cc
new file mode 100644
index 0000000..30e8a22
--- /dev/null
+++ b/src/dsp/x86/loop_restoration_avx2.cc
@@ -0,0 +1,2947 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_avx2.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+inline void WienerHorizontalClip(const __m256i s[2], const __m256i s_3x128,
+                                 int16_t* const wiener_buffer) {
+  constexpr int offset =
+      1 << (8 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+  constexpr int limit =
+      (1 << (8 + 1 + kWienerFilterBits - kInterRoundBitsHorizontal)) - 1;
+  const __m256i offsets = _mm256_set1_epi16(-offset);
+  const __m256i limits = _mm256_set1_epi16(limit - offset);
+  const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsHorizontal - 1));
+  // The sum range here is [-128 * 255, 90 * 255].
+  const __m256i madd = _mm256_add_epi16(s[0], s[1]);
+  const __m256i sum = _mm256_add_epi16(madd, round);
+  const __m256i rounded_sum0 =
+      _mm256_srai_epi16(sum, kInterRoundBitsHorizontal);
+  // Add back scaled down offset correction.
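+  // |s_3x128| is the center-tap pixel times 128, already shifted right by
+  // kInterRoundBitsHorizontal; it restores the 128 that was subtracted from
+  // the center filter coefficient before _mm256_maddubs_epi16().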
+ const __m256i rounded_sum1 = _mm256_add_epi16(rounded_sum0, s_3x128); + const __m256i d0 = _mm256_max_epi16(rounded_sum1, offsets); + const __m256i d1 = _mm256_min_epi16(d0, limits); + StoreAligned32(wiener_buffer, d1); +} + +// Using _mm256_alignr_epi8() is about 8% faster than loading all and unpacking, +// because the compiler generates redundant code when loading all and unpacking. +inline void WienerHorizontalTap7Kernel(const __m256i s[2], + const __m256i filter[4], + int16_t* const wiener_buffer) { + const auto s01 = _mm256_alignr_epi8(s[1], s[0], 1); + const auto s23 = _mm256_alignr_epi8(s[1], s[0], 5); + const auto s45 = _mm256_alignr_epi8(s[1], s[0], 9); + const auto s67 = _mm256_alignr_epi8(s[1], s[0], 13); + __m256i madds[4]; + madds[0] = _mm256_maddubs_epi16(s01, filter[0]); + madds[1] = _mm256_maddubs_epi16(s23, filter[1]); + madds[2] = _mm256_maddubs_epi16(s45, filter[2]); + madds[3] = _mm256_maddubs_epi16(s67, filter[3]); + madds[0] = _mm256_add_epi16(madds[0], madds[2]); + madds[1] = _mm256_add_epi16(madds[1], madds[3]); + const __m256i s_3x128 = _mm256_slli_epi16(_mm256_srli_epi16(s23, 8), + 7 - kInterRoundBitsHorizontal); + WienerHorizontalClip(madds, s_3x128, wiener_buffer); +} + +inline void WienerHorizontalTap5Kernel(const __m256i s[2], + const __m256i filter[3], + int16_t* const wiener_buffer) { + const auto s01 = _mm256_alignr_epi8(s[1], s[0], 1); + const auto s23 = _mm256_alignr_epi8(s[1], s[0], 5); + const auto s45 = _mm256_alignr_epi8(s[1], s[0], 9); + __m256i madds[3]; + madds[0] = _mm256_maddubs_epi16(s01, filter[0]); + madds[1] = _mm256_maddubs_epi16(s23, filter[1]); + madds[2] = _mm256_maddubs_epi16(s45, filter[2]); + madds[0] = _mm256_add_epi16(madds[0], madds[2]); + const __m256i s_3x128 = _mm256_srli_epi16(_mm256_slli_epi16(s23, 8), + kInterRoundBitsHorizontal + 1); + WienerHorizontalClip(madds, s_3x128, wiener_buffer); +} + +inline void WienerHorizontalTap3Kernel(const __m256i s[2], + const __m256i filter[2], + int16_t* const wiener_buffer) { + const auto s01 = _mm256_alignr_epi8(s[1], s[0], 1); + const auto s23 = _mm256_alignr_epi8(s[1], s[0], 5); + __m256i madds[2]; + madds[0] = _mm256_maddubs_epi16(s01, filter[0]); + madds[1] = _mm256_maddubs_epi16(s23, filter[1]); + const __m256i s_3x128 = _mm256_slli_epi16(_mm256_srli_epi16(s01, 8), + 7 - kInterRoundBitsHorizontal); + WienerHorizontalClip(madds, s_3x128, wiener_buffer); +} + +inline void WienerHorizontalTap7(const uint8_t* src, const ptrdiff_t src_stride, + const ptrdiff_t width, const int height, + const __m256i coefficients, + int16_t** const wiener_buffer) { + __m256i filter[4]; + filter[0] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0100)); + filter[1] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0302)); + filter[2] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0102)); + filter[3] = _mm256_shuffle_epi8( + coefficients, _mm256_set1_epi16(static_cast(0x8000))); + for (int y = height; y != 0; --y) { + __m256i s = LoadUnaligned32(src); + __m256i ss[4]; + ss[0] = _mm256_unpacklo_epi8(s, s); + ptrdiff_t x = 0; + do { + ss[1] = _mm256_unpackhi_epi8(s, s); + s = LoadUnaligned32(src + x + 32); + ss[3] = _mm256_unpacklo_epi8(s, s); + ss[2] = _mm256_permute2x128_si256(ss[0], ss[3], 0x21); + WienerHorizontalTap7Kernel(ss + 0, filter, *wiener_buffer + x + 0); + WienerHorizontalTap7Kernel(ss + 1, filter, *wiener_buffer + x + 16); + ss[0] = ss[3]; + x += 32; + } while (x < width); + src += src_stride; + *wiener_buffer += width; + } +} + +inline void WienerHorizontalTap5(const 
uint8_t* src, const ptrdiff_t src_stride, + const ptrdiff_t width, const int height, + const __m256i coefficients, + int16_t** const wiener_buffer) { + __m256i filter[3]; + filter[0] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0201)); + filter[1] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0203)); + filter[2] = _mm256_shuffle_epi8( + coefficients, _mm256_set1_epi16(static_cast(0x8001))); + for (int y = height; y != 0; --y) { + __m256i s = LoadUnaligned32(src); + __m256i ss[4]; + ss[0] = _mm256_unpacklo_epi8(s, s); + ptrdiff_t x = 0; + do { + ss[1] = _mm256_unpackhi_epi8(s, s); + s = LoadUnaligned32(src + x + 32); + ss[3] = _mm256_unpacklo_epi8(s, s); + ss[2] = _mm256_permute2x128_si256(ss[0], ss[3], 0x21); + WienerHorizontalTap5Kernel(ss + 0, filter, *wiener_buffer + x + 0); + WienerHorizontalTap5Kernel(ss + 1, filter, *wiener_buffer + x + 16); + ss[0] = ss[3]; + x += 32; + } while (x < width); + src += src_stride; + *wiener_buffer += width; + } +} + +inline void WienerHorizontalTap3(const uint8_t* src, const ptrdiff_t src_stride, + const ptrdiff_t width, const int height, + const __m256i coefficients, + int16_t** const wiener_buffer) { + __m256i filter[2]; + filter[0] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0302)); + filter[1] = _mm256_shuffle_epi8( + coefficients, _mm256_set1_epi16(static_cast(0x8002))); + for (int y = height; y != 0; --y) { + __m256i s = LoadUnaligned32(src); + __m256i ss[4]; + ss[0] = _mm256_unpacklo_epi8(s, s); + ptrdiff_t x = 0; + do { + ss[1] = _mm256_unpackhi_epi8(s, s); + s = LoadUnaligned32(src + x + 32); + ss[3] = _mm256_unpacklo_epi8(s, s); + ss[2] = _mm256_permute2x128_si256(ss[0], ss[3], 0x21); + WienerHorizontalTap3Kernel(ss + 0, filter, *wiener_buffer + x + 0); + WienerHorizontalTap3Kernel(ss + 1, filter, *wiener_buffer + x + 16); + ss[0] = ss[3]; + x += 32; + } while (x < width); + src += src_stride; + *wiener_buffer += width; + } +} + +inline void WienerHorizontalTap1(const uint8_t* src, const ptrdiff_t src_stride, + const ptrdiff_t width, const int height, + int16_t** const wiener_buffer) { + for (int y = height; y != 0; --y) { + ptrdiff_t x = 0; + do { + const __m256i s = LoadUnaligned32(src + x); + const __m256i s0 = _mm256_unpacklo_epi8(s, _mm256_setzero_si256()); + const __m256i s1 = _mm256_unpackhi_epi8(s, _mm256_setzero_si256()); + __m256i d[2]; + d[0] = _mm256_slli_epi16(s0, 4); + d[1] = _mm256_slli_epi16(s1, 4); + StoreAligned64(*wiener_buffer + x, d); + x += 32; + } while (x < width); + src += src_stride; + *wiener_buffer += width; + } +} + +inline __m256i WienerVertical7(const __m256i a[2], const __m256i filter[2]) { + const __m256i round = _mm256_set1_epi32(1 << (kInterRoundBitsVertical - 1)); + const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]); + const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]); + const __m256i sum0 = _mm256_add_epi32(round, madd0); + const __m256i sum1 = _mm256_add_epi32(sum0, madd1); + return _mm256_srai_epi32(sum1, kInterRoundBitsVertical); +} + +inline __m256i WienerVertical5(const __m256i a[2], const __m256i filter[2]) { + const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]); + const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]); + const __m256i sum = _mm256_add_epi32(madd0, madd1); + return _mm256_srai_epi32(sum, kInterRoundBitsVertical); +} + +inline __m256i WienerVertical3(const __m256i a, const __m256i filter) { + const __m256i round = _mm256_set1_epi32(1 << (kInterRoundBitsVertical - 1)); + const __m256i madd = _mm256_madd_epi16(a, filter); + const 
__m256i sum = _mm256_add_epi32(round, madd); + return _mm256_srai_epi32(sum, kInterRoundBitsVertical); +} + +inline __m256i WienerVerticalFilter7(const __m256i a[7], + const __m256i filter[2]) { + __m256i b[2]; + const __m256i a06 = _mm256_add_epi16(a[0], a[6]); + const __m256i a15 = _mm256_add_epi16(a[1], a[5]); + const __m256i a24 = _mm256_add_epi16(a[2], a[4]); + b[0] = _mm256_unpacklo_epi16(a06, a15); + b[1] = _mm256_unpacklo_epi16(a24, a[3]); + const __m256i sum0 = WienerVertical7(b, filter); + b[0] = _mm256_unpackhi_epi16(a06, a15); + b[1] = _mm256_unpackhi_epi16(a24, a[3]); + const __m256i sum1 = WienerVertical7(b, filter); + return _mm256_packs_epi32(sum0, sum1); +} + +inline __m256i WienerVerticalFilter5(const __m256i a[5], + const __m256i filter[2]) { + const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsVertical - 1)); + __m256i b[2]; + const __m256i a04 = _mm256_add_epi16(a[0], a[4]); + const __m256i a13 = _mm256_add_epi16(a[1], a[3]); + b[0] = _mm256_unpacklo_epi16(a04, a13); + b[1] = _mm256_unpacklo_epi16(a[2], round); + const __m256i sum0 = WienerVertical5(b, filter); + b[0] = _mm256_unpackhi_epi16(a04, a13); + b[1] = _mm256_unpackhi_epi16(a[2], round); + const __m256i sum1 = WienerVertical5(b, filter); + return _mm256_packs_epi32(sum0, sum1); +} + +inline __m256i WienerVerticalFilter3(const __m256i a[3], const __m256i filter) { + __m256i b; + const __m256i a02 = _mm256_add_epi16(a[0], a[2]); + b = _mm256_unpacklo_epi16(a02, a[1]); + const __m256i sum0 = WienerVertical3(b, filter); + b = _mm256_unpackhi_epi16(a02, a[1]); + const __m256i sum1 = WienerVertical3(b, filter); + return _mm256_packs_epi32(sum0, sum1); +} + +inline __m256i WienerVerticalTap7Kernel(const int16_t* wiener_buffer, + const ptrdiff_t wiener_stride, + const __m256i filter[2], __m256i a[7]) { + a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride); + a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride); + a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride); + a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride); + a[4] = LoadAligned32(wiener_buffer + 4 * wiener_stride); + a[5] = LoadAligned32(wiener_buffer + 5 * wiener_stride); + a[6] = LoadAligned32(wiener_buffer + 6 * wiener_stride); + return WienerVerticalFilter7(a, filter); +} + +inline __m256i WienerVerticalTap5Kernel(const int16_t* wiener_buffer, + const ptrdiff_t wiener_stride, + const __m256i filter[2], __m256i a[5]) { + a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride); + a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride); + a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride); + a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride); + a[4] = LoadAligned32(wiener_buffer + 4 * wiener_stride); + return WienerVerticalFilter5(a, filter); +} + +inline __m256i WienerVerticalTap3Kernel(const int16_t* wiener_buffer, + const ptrdiff_t wiener_stride, + const __m256i filter, __m256i a[3]) { + a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride); + a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride); + a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride); + return WienerVerticalFilter3(a, filter); +} + +inline void WienerVerticalTap7Kernel2(const int16_t* wiener_buffer, + const ptrdiff_t wiener_stride, + const __m256i filter[2], __m256i d[2]) { + __m256i a[8]; + d[0] = WienerVerticalTap7Kernel(wiener_buffer, wiener_stride, filter, a); + a[7] = LoadAligned32(wiener_buffer + 7 * wiener_stride); + d[1] = WienerVerticalFilter7(a + 1, filter); +} + +inline void WienerVerticalTap5Kernel2(const int16_t* wiener_buffer, + 
const ptrdiff_t wiener_stride, + const __m256i filter[2], __m256i d[2]) { + __m256i a[6]; + d[0] = WienerVerticalTap5Kernel(wiener_buffer, wiener_stride, filter, a); + a[5] = LoadAligned32(wiener_buffer + 5 * wiener_stride); + d[1] = WienerVerticalFilter5(a + 1, filter); +} + +inline void WienerVerticalTap3Kernel2(const int16_t* wiener_buffer, + const ptrdiff_t wiener_stride, + const __m256i filter, __m256i d[2]) { + __m256i a[4]; + d[0] = WienerVerticalTap3Kernel(wiener_buffer, wiener_stride, filter, a); + a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride); + d[1] = WienerVerticalFilter3(a + 1, filter); +} + +inline void WienerVerticalTap7(const int16_t* wiener_buffer, + const ptrdiff_t width, const int height, + const int16_t coefficients[4], uint8_t* dst, + const ptrdiff_t dst_stride) { + const __m256i c = _mm256_broadcastq_epi64(LoadLo8(coefficients)); + __m256i filter[2]; + filter[0] = _mm256_shuffle_epi32(c, 0x0); + filter[1] = _mm256_shuffle_epi32(c, 0x55); + for (int y = height >> 1; y > 0; --y) { + ptrdiff_t x = 0; + do { + __m256i d[2][2]; + WienerVerticalTap7Kernel2(wiener_buffer + x + 0, width, filter, d[0]); + WienerVerticalTap7Kernel2(wiener_buffer + x + 16, width, filter, d[1]); + StoreUnaligned32(dst + x, _mm256_packus_epi16(d[0][0], d[1][0])); + StoreUnaligned32(dst + dst_stride + x, + _mm256_packus_epi16(d[0][1], d[1][1])); + x += 32; + } while (x < width); + dst += 2 * dst_stride; + wiener_buffer += 2 * width; + } + + if ((height & 1) != 0) { + ptrdiff_t x = 0; + do { + __m256i a[7]; + const __m256i d0 = + WienerVerticalTap7Kernel(wiener_buffer + x + 0, width, filter, a); + const __m256i d1 = + WienerVerticalTap7Kernel(wiener_buffer + x + 16, width, filter, a); + StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1)); + x += 32; + } while (x < width); + } +} + +inline void WienerVerticalTap5(const int16_t* wiener_buffer, + const ptrdiff_t width, const int height, + const int16_t coefficients[3], uint8_t* dst, + const ptrdiff_t dst_stride) { + const __m256i c = _mm256_broadcastd_epi32(Load4(coefficients)); + __m256i filter[2]; + filter[0] = _mm256_shuffle_epi32(c, 0); + filter[1] = + _mm256_set1_epi32((1 << 16) | static_cast(coefficients[2])); + for (int y = height >> 1; y > 0; --y) { + ptrdiff_t x = 0; + do { + __m256i d[2][2]; + WienerVerticalTap5Kernel2(wiener_buffer + x + 0, width, filter, d[0]); + WienerVerticalTap5Kernel2(wiener_buffer + x + 16, width, filter, d[1]); + StoreUnaligned32(dst + x, _mm256_packus_epi16(d[0][0], d[1][0])); + StoreUnaligned32(dst + dst_stride + x, + _mm256_packus_epi16(d[0][1], d[1][1])); + x += 32; + } while (x < width); + dst += 2 * dst_stride; + wiener_buffer += 2 * width; + } + + if ((height & 1) != 0) { + ptrdiff_t x = 0; + do { + __m256i a[5]; + const __m256i d0 = + WienerVerticalTap5Kernel(wiener_buffer + x + 0, width, filter, a); + const __m256i d1 = + WienerVerticalTap5Kernel(wiener_buffer + x + 16, width, filter, a); + StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1)); + x += 32; + } while (x < width); + } +} + +inline void WienerVerticalTap3(const int16_t* wiener_buffer, + const ptrdiff_t width, const int height, + const int16_t coefficients[2], uint8_t* dst, + const ptrdiff_t dst_stride) { + const __m256i filter = + _mm256_set1_epi32(*reinterpret_cast(coefficients)); + for (int y = height >> 1; y > 0; --y) { + ptrdiff_t x = 0; + do { + __m256i d[2][2]; + WienerVerticalTap3Kernel2(wiener_buffer + x + 0, width, filter, d[0]); + WienerVerticalTap3Kernel2(wiener_buffer + x + 16, width, filter, d[1]); + 
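+      // Each Kernel2 call yields two output rows for 16 pixels; d[.][0]
+      // holds the first row and d[.][1] the second. Pack to 8 bits and store
+      // 32 pixels per row below.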
StoreUnaligned32(dst + x, _mm256_packus_epi16(d[0][0], d[1][0])); + StoreUnaligned32(dst + dst_stride + x, + _mm256_packus_epi16(d[0][1], d[1][1])); + x += 32; + } while (x < width); + dst += 2 * dst_stride; + wiener_buffer += 2 * width; + } + + if ((height & 1) != 0) { + ptrdiff_t x = 0; + do { + __m256i a[3]; + const __m256i d0 = + WienerVerticalTap3Kernel(wiener_buffer + x + 0, width, filter, a); + const __m256i d1 = + WienerVerticalTap3Kernel(wiener_buffer + x + 16, width, filter, a); + StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1)); + x += 32; + } while (x < width); + } +} + +inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer, + uint8_t* const dst) { + const __m256i a0 = LoadAligned32(wiener_buffer + 0); + const __m256i a1 = LoadAligned32(wiener_buffer + 16); + const __m256i b0 = _mm256_add_epi16(a0, _mm256_set1_epi16(8)); + const __m256i b1 = _mm256_add_epi16(a1, _mm256_set1_epi16(8)); + const __m256i c0 = _mm256_srai_epi16(b0, 4); + const __m256i c1 = _mm256_srai_epi16(b1, 4); + const __m256i d = _mm256_packus_epi16(c0, c1); + StoreUnaligned32(dst, d); +} + +inline void WienerVerticalTap1(const int16_t* wiener_buffer, + const ptrdiff_t width, const int height, + uint8_t* dst, const ptrdiff_t dst_stride) { + for (int y = height >> 1; y > 0; --y) { + ptrdiff_t x = 0; + do { + WienerVerticalTap1Kernel(wiener_buffer + x, dst + x); + WienerVerticalTap1Kernel(wiener_buffer + width + x, dst + dst_stride + x); + x += 32; + } while (x < width); + dst += 2 * dst_stride; + wiener_buffer += 2 * width; + } + + if ((height & 1) != 0) { + ptrdiff_t x = 0; + do { + WienerVerticalTap1Kernel(wiener_buffer + x, dst + x); + x += 32; + } while (x < width); + } +} + +void WienerFilter_AVX2( + const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info, + const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_border, + const ptrdiff_t top_border_stride, + const void* LIBGAV1_RESTRICT const bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, + RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer, + void* LIBGAV1_RESTRICT const dest) { + const int16_t* const number_leading_zero_coefficients = + restoration_info.wiener_info.number_leading_zero_coefficients; + const int number_rows_to_skip = std::max( + static_cast(number_leading_zero_coefficients[WienerInfo::kVertical]), + 1); + const ptrdiff_t wiener_stride = Align(width, 32); + int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer; + // The values are saturated to 13 bits before storing. + int16_t* wiener_buffer_horizontal = + wiener_buffer_vertical + number_rows_to_skip * wiener_stride; + + // horizontal filtering. + // Over-reads up to 15 - |kRestorationHorizontalBorder| values. + const int height_horizontal = + height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip; + const int height_extra = (height_horizontal - height) >> 1; + assert(height_extra <= 2); + const auto* const src = static_cast(source); + const auto* const top = static_cast(top_border); + const auto* const bottom = static_cast(bottom_border); + const __m128i c = + LoadLo8(restoration_info.wiener_info.filter[WienerInfo::kHorizontal]); + // In order to keep the horizontal pass intermediate values within 16 bits we + // offset |filter[3]| by 128. The 128 offset will be added back in the loop. 
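+  // The offset also brings the center tap into the signed 8-bit operand
+  // range of _mm256_maddubs_epi16(); e.g. an identity-like center tap of 128
+  // maps to 0. (128 is an illustrative value, not one read from the stream.)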
+ __m128i c_horizontal = + _mm_sub_epi16(c, _mm_setr_epi16(0, 0, 0, 128, 0, 0, 0, 0)); + c_horizontal = _mm_packs_epi16(c_horizontal, c_horizontal); + const __m256i coefficients_horizontal = _mm256_broadcastd_epi32(c_horizontal); + if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) { + WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3, + top_border_stride, wiener_stride, height_extra, + coefficients_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap7(src - 3, stride, wiener_stride, height, + coefficients_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride, + height_extra, coefficients_horizontal, + &wiener_buffer_horizontal); + } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) { + WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2, + top_border_stride, wiener_stride, height_extra, + coefficients_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap5(src - 2, stride, wiener_stride, height, + coefficients_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride, + height_extra, coefficients_horizontal, + &wiener_buffer_horizontal); + } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) { + // The maximum over-reads happen here. + WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1, + top_border_stride, wiener_stride, height_extra, + coefficients_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap3(src - 1, stride, wiener_stride, height, + coefficients_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride, + height_extra, coefficients_horizontal, + &wiener_buffer_horizontal); + } else { + assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3); + WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride, + top_border_stride, wiener_stride, height_extra, + &wiener_buffer_horizontal); + WienerHorizontalTap1(src, stride, wiener_stride, height, + &wiener_buffer_horizontal); + WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride, + height_extra, &wiener_buffer_horizontal); + } + + // vertical filtering. + // Over-writes up to 15 values. + const int16_t* const filter_vertical = + restoration_info.wiener_info.filter[WienerInfo::kVertical]; + auto* dst = static_cast(dest); + if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) { + // Because the top row of |source| is a duplicate of the second row, and the + // bottom row of |source| is a duplicate of its above row, we can duplicate + // the top and bottom row of |wiener_buffer| accordingly. 
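+  // The 7-tap vertical filter needs height + 6 rows while the horizontal
+  // pass produced height + 4; the two memcpy() calls below synthesize the
+  // missing first and last rows by duplication.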
+ memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride, + sizeof(*wiener_buffer_horizontal) * wiener_stride); + memcpy(restoration_buffer->wiener_buffer, + restoration_buffer->wiener_buffer + wiener_stride, + sizeof(*restoration_buffer->wiener_buffer) * wiener_stride); + WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height, + filter_vertical, dst, stride); + } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) { + WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride, + height, filter_vertical + 1, dst, stride); + } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) { + WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride, + wiener_stride, height, filter_vertical + 2, dst, stride); + } else { + assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3); + WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride, + wiener_stride, height, dst, stride); + } +} + +//------------------------------------------------------------------------------ +// SGR + +constexpr int kSumOffset = 24; + +// SIMD overreads the number of bytes in SIMD registers - (width % 16) - 2 * +// padding pixels, where padding is 3 for Pass 1 and 2 for Pass 2. The number of +// bytes in SIMD registers is 16 for SSE4.1 and 32 for AVX2. +constexpr int kOverreadInBytesPass1_128 = 10; +constexpr int kOverreadInBytesPass2_128 = 12; +constexpr int kOverreadInBytesPass1_256 = kOverreadInBytesPass1_128 + 16; +constexpr int kOverreadInBytesPass2_256 = kOverreadInBytesPass2_128 + 16; + +inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x, + __m128i dst[2]) { + dst[0] = LoadAligned16(src[0] + x); + dst[1] = LoadAligned16(src[1] + x); +} + +inline void LoadAligned32x2U16(const uint16_t* const src[2], const ptrdiff_t x, + __m256i dst[2]) { + dst[0] = LoadAligned32(src[0] + x); + dst[1] = LoadAligned32(src[1] + x); +} + +inline void LoadAligned32x2U16Msan(const uint16_t* const src[2], + const ptrdiff_t x, const ptrdiff_t border, + __m256i dst[2]) { + dst[0] = LoadAligned32Msan(src[0] + x, sizeof(**src) * (x + 16 - border)); + dst[1] = LoadAligned32Msan(src[1] + x, sizeof(**src) * (x + 16 - border)); +} + +inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x, + __m128i dst[3]) { + dst[0] = LoadAligned16(src[0] + x); + dst[1] = LoadAligned16(src[1] + x); + dst[2] = LoadAligned16(src[2] + x); +} + +inline void LoadAligned32x3U16(const uint16_t* const src[3], const ptrdiff_t x, + __m256i dst[3]) { + dst[0] = LoadAligned32(src[0] + x); + dst[1] = LoadAligned32(src[1] + x); + dst[2] = LoadAligned32(src[2] + x); +} + +inline void LoadAligned32x3U16Msan(const uint16_t* const src[3], + const ptrdiff_t x, const ptrdiff_t border, + __m256i dst[3]) { + dst[0] = LoadAligned32Msan(src[0] + x, sizeof(**src) * (x + 16 - border)); + dst[1] = LoadAligned32Msan(src[1] + x, sizeof(**src) * (x + 16 - border)); + dst[2] = LoadAligned32Msan(src[2] + x, sizeof(**src) * (x + 16 - border)); +} + +inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) { + dst[0] = LoadAligned16(src + 0); + dst[1] = LoadAligned16(src + 4); +} + +inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x, + __m128i dst[2][2]) { + LoadAligned32U32(src[0] + x, dst[0]); + LoadAligned32U32(src[1] + x, dst[1]); +} + +inline void LoadAligned64x2U32(const uint32_t* const src[2], const ptrdiff_t x, + __m256i dst[2][2]) { + LoadAligned64(src[0] + x, dst[0]); + LoadAligned64(src[1] + x, 
dst[1]); +} + +inline void LoadAligned64x2U32Msan(const uint32_t* const src[2], + const ptrdiff_t x, const ptrdiff_t border, + __m256i dst[2][2]) { + LoadAligned64Msan(src[0] + x, sizeof(**src) * (x + 16 - border), dst[0]); + LoadAligned64Msan(src[1] + x, sizeof(**src) * (x + 16 - border), dst[1]); +} + +inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x, + __m128i dst[3][2]) { + LoadAligned32U32(src[0] + x, dst[0]); + LoadAligned32U32(src[1] + x, dst[1]); + LoadAligned32U32(src[2] + x, dst[2]); +} + +inline void LoadAligned64x3U32(const uint32_t* const src[3], const ptrdiff_t x, + __m256i dst[3][2]) { + LoadAligned64(src[0] + x, dst[0]); + LoadAligned64(src[1] + x, dst[1]); + LoadAligned64(src[2] + x, dst[2]); +} + +inline void LoadAligned64x3U32Msan(const uint32_t* const src[3], + const ptrdiff_t x, const ptrdiff_t border, + __m256i dst[3][2]) { + LoadAligned64Msan(src[0] + x, sizeof(**src) * (x + 16 - border), dst[0]); + LoadAligned64Msan(src[1] + x, sizeof(**src) * (x + 16 - border), dst[1]); + LoadAligned64Msan(src[2] + x, sizeof(**src) * (x + 16 - border), dst[2]); +} + +inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) { + StoreAligned16(dst + 0, src[0]); + StoreAligned16(dst + 4, src[1]); +} + +// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following +// functions. Some compilers may generate super inefficient code and the whole +// decoder could be 15% slower. + +inline __m128i VaddlLo8(const __m128i src0, const __m128i src1) { + const __m128i s0 = _mm_unpacklo_epi8(src0, _mm_setzero_si128()); + const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128()); + return _mm_add_epi16(s0, s1); +} + +inline __m256i VaddlLo8(const __m256i src0, const __m256i src1) { + const __m256i s0 = _mm256_unpacklo_epi8(src0, _mm256_setzero_si256()); + const __m256i s1 = _mm256_unpacklo_epi8(src1, _mm256_setzero_si256()); + return _mm256_add_epi16(s0, s1); +} + +inline __m256i VaddlHi8(const __m256i src0, const __m256i src1) { + const __m256i s0 = _mm256_unpackhi_epi8(src0, _mm256_setzero_si256()); + const __m256i s1 = _mm256_unpackhi_epi8(src1, _mm256_setzero_si256()); + return _mm256_add_epi16(s0, s1); +} + +inline __m128i VaddlLo16(const __m128i src0, const __m128i src1) { + const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128()); + const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128()); + return _mm_add_epi32(s0, s1); +} + +inline __m256i VaddlLo16(const __m256i src0, const __m256i src1) { + const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256()); + const __m256i s1 = _mm256_unpacklo_epi16(src1, _mm256_setzero_si256()); + return _mm256_add_epi32(s0, s1); +} + +inline __m128i VaddlHi16(const __m128i src0, const __m128i src1) { + const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128()); + const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128()); + return _mm_add_epi32(s0, s1); +} + +inline __m256i VaddlHi16(const __m256i src0, const __m256i src1) { + const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256()); + const __m256i s1 = _mm256_unpackhi_epi16(src1, _mm256_setzero_si256()); + return _mm256_add_epi32(s0, s1); +} + +inline __m128i VaddwLo8(const __m128i src0, const __m128i src1) { + const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128()); + return _mm_add_epi16(src0, s1); +} + +inline __m256i VaddwLo8(const __m256i src0, const __m256i src1) { + const __m256i s1 = _mm256_unpacklo_epi8(src1, _mm256_setzero_si256()); + return _mm256_add_epi16(src0, s1); +} + 
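+// Editorial sketch (not part of the upstream implementation; the helper names
+// below are illustrative only): the Vaddl/Vaddw helpers above and below
+// emulate the NEON widening adds these functions are named after, using
+// unpack-against-zero rather than _mm_cvtepu8_epi16(), per the comment above.
+// Per lane, VaddlLo8() computes VaddlU8Scalar() and VaddwLo8() computes
+// VaddwU8Scalar():
+inline uint16_t VaddlU8Scalar(const uint8_t src0, const uint8_t src1) {
+  // The sum is formed at 16-bit width, so the 8-bit inputs cannot wrap.
+  return static_cast<uint16_t>(static_cast<uint16_t>(src0) + src1);
+}
+inline uint16_t VaddwU8Scalar(const uint16_t src0, const uint8_t src1) {
+  // Widen the 8-bit lane and add it to the existing 16-bit lane.
+  return static_cast<uint16_t>(src0 + src1);
+}
+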
+inline __m256i VaddwHi8(const __m256i src0, const __m256i src1) { + const __m256i s1 = _mm256_unpackhi_epi8(src1, _mm256_setzero_si256()); + return _mm256_add_epi16(src0, s1); +} + +inline __m128i VaddwLo16(const __m128i src0, const __m128i src1) { + const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128()); + return _mm_add_epi32(src0, s1); +} + +inline __m256i VaddwLo16(const __m256i src0, const __m256i src1) { + const __m256i s1 = _mm256_unpacklo_epi16(src1, _mm256_setzero_si256()); + return _mm256_add_epi32(src0, s1); +} + +inline __m128i VaddwHi16(const __m128i src0, const __m128i src1) { + const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128()); + return _mm_add_epi32(src0, s1); +} + +inline __m256i VaddwHi16(const __m256i src0, const __m256i src1) { + const __m256i s1 = _mm256_unpackhi_epi16(src1, _mm256_setzero_si256()); + return _mm256_add_epi32(src0, s1); +} + +inline __m256i VmullNLo8(const __m256i src0, const int src1) { + const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256()); + return _mm256_madd_epi16(s0, _mm256_set1_epi32(src1)); +} + +inline __m256i VmullNHi8(const __m256i src0, const int src1) { + const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256()); + return _mm256_madd_epi16(s0, _mm256_set1_epi32(src1)); +} + +inline __m128i VmullLo16(const __m128i src0, const __m128i src1) { + const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128()); + const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128()); + return _mm_madd_epi16(s0, s1); +} + +inline __m256i VmullLo16(const __m256i src0, const __m256i src1) { + const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256()); + const __m256i s1 = _mm256_unpacklo_epi16(src1, _mm256_setzero_si256()); + return _mm256_madd_epi16(s0, s1); +} + +inline __m128i VmullHi16(const __m128i src0, const __m128i src1) { + const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128()); + const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128()); + return _mm_madd_epi16(s0, s1); +} + +inline __m256i VmullHi16(const __m256i src0, const __m256i src1) { + const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256()); + const __m256i s1 = _mm256_unpackhi_epi16(src1, _mm256_setzero_si256()); + return _mm256_madd_epi16(s0, s1); +} + +inline __m256i VrshrS32(const __m256i src0, const int src1) { + const __m256i sum = + _mm256_add_epi32(src0, _mm256_set1_epi32(1 << (src1 - 1))); + return _mm256_srai_epi32(sum, src1); +} + +inline __m128i VrshrU32(const __m128i src0, const int src1) { + const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1))); + return _mm_srli_epi32(sum, src1); +} + +inline __m256i VrshrU32(const __m256i src0, const int src1) { + const __m256i sum = + _mm256_add_epi32(src0, _mm256_set1_epi32(1 << (src1 - 1))); + return _mm256_srli_epi32(sum, src1); +} + +inline __m128i SquareLo8(const __m128i src) { + const __m128i s = _mm_unpacklo_epi8(src, _mm_setzero_si128()); + return _mm_mullo_epi16(s, s); +} + +inline __m256i SquareLo8(const __m256i src) { + const __m256i s = _mm256_unpacklo_epi8(src, _mm256_setzero_si256()); + return _mm256_mullo_epi16(s, s); +} + +inline __m128i SquareHi8(const __m128i src) { + const __m128i s = _mm_unpackhi_epi8(src, _mm_setzero_si128()); + return _mm_mullo_epi16(s, s); +} + +inline __m256i SquareHi8(const __m256i src) { + const __m256i s = _mm256_unpackhi_epi8(src, _mm256_setzero_si256()); + return _mm256_mullo_epi16(s, s); +} + +inline void Prepare3Lo8(const __m128i src, __m128i dst[3]) { + dst[0] = src; + 
dst[1] = _mm_srli_si128(src, 1); + dst[2] = _mm_srli_si128(src, 2); +} + +inline void Prepare3_8(const __m256i src[2], __m256i dst[3]) { + dst[0] = _mm256_alignr_epi8(src[1], src[0], 0); + dst[1] = _mm256_alignr_epi8(src[1], src[0], 1); + dst[2] = _mm256_alignr_epi8(src[1], src[0], 2); +} + +inline void Prepare3_16(const __m128i src[2], __m128i dst[3]) { + dst[0] = src[0]; + dst[1] = _mm_alignr_epi8(src[1], src[0], 2); + dst[2] = _mm_alignr_epi8(src[1], src[0], 4); +} + +inline void Prepare3_16(const __m256i src[2], __m256i dst[3]) { + dst[0] = src[0]; + dst[1] = _mm256_alignr_epi8(src[1], src[0], 2); + dst[2] = _mm256_alignr_epi8(src[1], src[0], 4); +} + +inline void Prepare5Lo8(const __m128i src, __m128i dst[5]) { + dst[0] = src; + dst[1] = _mm_srli_si128(src, 1); + dst[2] = _mm_srli_si128(src, 2); + dst[3] = _mm_srli_si128(src, 3); + dst[4] = _mm_srli_si128(src, 4); +} + +inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) { + Prepare3_16(src, dst); + dst[3] = _mm_alignr_epi8(src[1], src[0], 6); + dst[4] = _mm_alignr_epi8(src[1], src[0], 8); +} + +inline void Prepare5_16(const __m256i src[2], __m256i dst[5]) { + Prepare3_16(src, dst); + dst[3] = _mm256_alignr_epi8(src[1], src[0], 6); + dst[4] = _mm256_alignr_epi8(src[1], src[0], 8); +} + +inline __m128i Sum3_16(const __m128i src0, const __m128i src1, + const __m128i src2) { + const __m128i sum = _mm_add_epi16(src0, src1); + return _mm_add_epi16(sum, src2); +} + +inline __m256i Sum3_16(const __m256i src0, const __m256i src1, + const __m256i src2) { + const __m256i sum = _mm256_add_epi16(src0, src1); + return _mm256_add_epi16(sum, src2); +} + +inline __m128i Sum3_16(const __m128i src[3]) { + return Sum3_16(src[0], src[1], src[2]); +} + +inline __m256i Sum3_16(const __m256i src[3]) { + return Sum3_16(src[0], src[1], src[2]); +} + +inline __m128i Sum3_32(const __m128i src0, const __m128i src1, + const __m128i src2) { + const __m128i sum = _mm_add_epi32(src0, src1); + return _mm_add_epi32(sum, src2); +} + +inline __m256i Sum3_32(const __m256i src0, const __m256i src1, + const __m256i src2) { + const __m256i sum = _mm256_add_epi32(src0, src1); + return _mm256_add_epi32(sum, src2); +} + +inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) { + dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]); + dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]); +} + +inline void Sum3_32(const __m256i src[3][2], __m256i dst[2]) { + dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]); + dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]); +} + +inline __m128i Sum3WLo16(const __m128i src[3]) { + const __m128i sum = VaddlLo8(src[0], src[1]); + return VaddwLo8(sum, src[2]); +} + +inline __m256i Sum3WLo16(const __m256i src[3]) { + const __m256i sum = VaddlLo8(src[0], src[1]); + return VaddwLo8(sum, src[2]); +} + +inline __m256i Sum3WHi16(const __m256i src[3]) { + const __m256i sum = VaddlHi8(src[0], src[1]); + return VaddwHi8(sum, src[2]); +} + +inline __m128i Sum3WLo32(const __m128i src[3]) { + const __m128i sum = VaddlLo16(src[0], src[1]); + return VaddwLo16(sum, src[2]); +} + +inline __m256i Sum3WLo32(const __m256i src[3]) { + const __m256i sum = VaddlLo16(src[0], src[1]); + return VaddwLo16(sum, src[2]); +} + +inline __m128i Sum3WHi32(const __m128i src[3]) { + const __m128i sum = VaddlHi16(src[0], src[1]); + return VaddwHi16(sum, src[2]); +} + +inline __m256i Sum3WHi32(const __m256i src[3]) { + const __m256i sum = VaddlHi16(src[0], src[1]); + return VaddwHi16(sum, src[2]); +} + +inline __m128i Sum5_16(const __m128i src[5]) { + const __m128i sum01 
= _mm_add_epi16(src[0], src[1]); + const __m128i sum23 = _mm_add_epi16(src[2], src[3]); + const __m128i sum = _mm_add_epi16(sum01, sum23); + return _mm_add_epi16(sum, src[4]); +} + +inline __m256i Sum5_16(const __m256i src[5]) { + const __m256i sum01 = _mm256_add_epi16(src[0], src[1]); + const __m256i sum23 = _mm256_add_epi16(src[2], src[3]); + const __m256i sum = _mm256_add_epi16(sum01, sum23); + return _mm256_add_epi16(sum, src[4]); +} + +inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1, + const __m128i* const src2, const __m128i* const src3, + const __m128i* const src4) { + const __m128i sum01 = _mm_add_epi32(*src0, *src1); + const __m128i sum23 = _mm_add_epi32(*src2, *src3); + const __m128i sum = _mm_add_epi32(sum01, sum23); + return _mm_add_epi32(sum, *src4); +} + +inline __m256i Sum5_32(const __m256i* const src0, const __m256i* const src1, + const __m256i* const src2, const __m256i* const src3, + const __m256i* const src4) { + const __m256i sum01 = _mm256_add_epi32(*src0, *src1); + const __m256i sum23 = _mm256_add_epi32(*src2, *src3); + const __m256i sum = _mm256_add_epi32(sum01, sum23); + return _mm256_add_epi32(sum, *src4); +} + +inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) { + dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]); + dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]); +} + +inline void Sum5_32(const __m256i src[5][2], __m256i dst[2]) { + dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]); + dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]); +} + +inline __m128i Sum5WLo16(const __m128i src[5]) { + const __m128i sum01 = VaddlLo8(src[0], src[1]); + const __m128i sum23 = VaddlLo8(src[2], src[3]); + const __m128i sum = _mm_add_epi16(sum01, sum23); + return VaddwLo8(sum, src[4]); +} + +inline __m256i Sum5WLo16(const __m256i src[5]) { + const __m256i sum01 = VaddlLo8(src[0], src[1]); + const __m256i sum23 = VaddlLo8(src[2], src[3]); + const __m256i sum = _mm256_add_epi16(sum01, sum23); + return VaddwLo8(sum, src[4]); +} + +inline __m256i Sum5WHi16(const __m256i src[5]) { + const __m256i sum01 = VaddlHi8(src[0], src[1]); + const __m256i sum23 = VaddlHi8(src[2], src[3]); + const __m256i sum = _mm256_add_epi16(sum01, sum23); + return VaddwHi8(sum, src[4]); +} + +inline __m128i Sum3Horizontal(const __m128i src) { + __m128i s[3]; + Prepare3Lo8(src, s); + return Sum3WLo16(s); +} + +inline void Sum3Horizontal(const uint8_t* const src, + const ptrdiff_t over_read_in_bytes, __m256i dst[2]) { + __m256i s[3]; + s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0); + s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 1); + s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 2); + dst[0] = Sum3WLo16(s); + dst[1] = Sum3WHi16(s); +} + +inline void Sum3WHorizontal(const __m128i src[2], __m128i dst[2]) { + __m128i s[3]; + Prepare3_16(src, s); + dst[0] = Sum3WLo32(s); + dst[1] = Sum3WHi32(s); +} + +inline void Sum3WHorizontal(const __m256i src[2], __m256i dst[2]) { + __m256i s[3]; + Prepare3_16(src, s); + dst[0] = Sum3WLo32(s); + dst[1] = Sum3WHi32(s); +} + +inline __m128i Sum5Horizontal(const __m128i src) { + __m128i s[5]; + Prepare5Lo8(src, s); + return Sum5WLo16(s); +} + +inline void Sum5Horizontal(const uint8_t* const src, + const ptrdiff_t over_read_in_bytes, + __m256i* const dst0, __m256i* const dst1) { + __m256i s[5]; + s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0); + s[1] = LoadUnaligned32Msan(src + 1, 
over_read_in_bytes + 1); + s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 2); + s[3] = LoadUnaligned32Msan(src + 3, over_read_in_bytes + 3); + s[4] = LoadUnaligned32Msan(src + 4, over_read_in_bytes + 4); + *dst0 = Sum5WLo16(s); + *dst1 = Sum5WHi16(s); +} + +inline void Sum5WHorizontal(const __m128i src[2], __m128i dst[2]) { + __m128i s[5]; + Prepare5_16(src, s); + const __m128i sum01_lo = VaddlLo16(s[0], s[1]); + const __m128i sum23_lo = VaddlLo16(s[2], s[3]); + const __m128i sum0123_lo = _mm_add_epi32(sum01_lo, sum23_lo); + dst[0] = VaddwLo16(sum0123_lo, s[4]); + const __m128i sum01_hi = VaddlHi16(s[0], s[1]); + const __m128i sum23_hi = VaddlHi16(s[2], s[3]); + const __m128i sum0123_hi = _mm_add_epi32(sum01_hi, sum23_hi); + dst[1] = VaddwHi16(sum0123_hi, s[4]); +} + +inline void Sum5WHorizontal(const __m256i src[2], __m256i dst[2]) { + __m256i s[5]; + Prepare5_16(src, s); + const __m256i sum01_lo = VaddlLo16(s[0], s[1]); + const __m256i sum23_lo = VaddlLo16(s[2], s[3]); + const __m256i sum0123_lo = _mm256_add_epi32(sum01_lo, sum23_lo); + dst[0] = VaddwLo16(sum0123_lo, s[4]); + const __m256i sum01_hi = VaddlHi16(s[0], s[1]); + const __m256i sum23_hi = VaddlHi16(s[2], s[3]); + const __m256i sum0123_hi = _mm256_add_epi32(sum01_hi, sum23_hi); + dst[1] = VaddwHi16(sum0123_hi, s[4]); +} + +void SumHorizontalLo(const __m128i src[5], __m128i* const row_sq3, + __m128i* const row_sq5) { + const __m128i sum04 = VaddlLo16(src[0], src[4]); + *row_sq3 = Sum3WLo32(src + 1); + *row_sq5 = _mm_add_epi32(sum04, *row_sq3); +} + +void SumHorizontalLo(const __m256i src[5], __m256i* const row_sq3, + __m256i* const row_sq5) { + const __m256i sum04 = VaddlLo16(src[0], src[4]); + *row_sq3 = Sum3WLo32(src + 1); + *row_sq5 = _mm256_add_epi32(sum04, *row_sq3); +} + +void SumHorizontalHi(const __m128i src[5], __m128i* const row_sq3, + __m128i* const row_sq5) { + const __m128i sum04 = VaddlHi16(src[0], src[4]); + *row_sq3 = Sum3WHi32(src + 1); + *row_sq5 = _mm_add_epi32(sum04, *row_sq3); +} + +void SumHorizontalHi(const __m256i src[5], __m256i* const row_sq3, + __m256i* const row_sq5) { + const __m256i sum04 = VaddlHi16(src[0], src[4]); + *row_sq3 = Sum3WHi32(src + 1); + *row_sq5 = _mm256_add_epi32(sum04, *row_sq3); +} + +void SumHorizontalLo(const __m128i src, __m128i* const row3, + __m128i* const row5) { + __m128i s[5]; + Prepare5Lo8(src, s); + const __m128i sum04 = VaddlLo8(s[0], s[4]); + *row3 = Sum3WLo16(s + 1); + *row5 = _mm_add_epi16(sum04, *row3); +} + +inline void SumHorizontal(const uint8_t* const src, + const ptrdiff_t over_read_in_bytes, + __m256i* const row3_0, __m256i* const row3_1, + __m256i* const row5_0, __m256i* const row5_1) { + __m256i s[5]; + s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0); + s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 1); + s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 2); + s[3] = LoadUnaligned32Msan(src + 3, over_read_in_bytes + 3); + s[4] = LoadUnaligned32Msan(src + 4, over_read_in_bytes + 4); + const __m256i sum04_lo = VaddlLo8(s[0], s[4]); + const __m256i sum04_hi = VaddlHi8(s[0], s[4]); + *row3_0 = Sum3WLo16(s + 1); + *row3_1 = Sum3WHi16(s + 1); + *row5_0 = _mm256_add_epi16(sum04_lo, *row3_0); + *row5_1 = _mm256_add_epi16(sum04_hi, *row3_1); +} + +inline void SumHorizontal(const __m128i src[2], __m128i* const row_sq3_0, + __m128i* const row_sq3_1, __m128i* const row_sq5_0, + __m128i* const row_sq5_1) { + __m128i s[5]; + Prepare5_16(src, s); + SumHorizontalLo(s, row_sq3_0, row_sq5_0); + SumHorizontalHi(s, row_sq3_1, row_sq5_1); +} 
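+
+// Editorial sketch (not upstream code; the name below is illustrative only):
+// the SumHorizontal() overloads compute the 3-tap and 5-tap horizontal box
+// sums in a single pass, reusing the inner 3-tap sum for the 5-tap result.
+// For one output element with inputs s[0..4], the scalar equivalent is:
+inline int SumHorizontalScalar(const int s[5], int* const row3) {
+  *row3 = s[1] + s[2] + s[3];  // 3-tap box sum.
+  return s[0] + *row3 + s[4];  // 5-tap box sum reuses the 3-tap sum.
+}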
+ +inline void SumHorizontal(const __m256i src[2], __m256i* const row_sq3_0, + __m256i* const row_sq3_1, __m256i* const row_sq5_0, + __m256i* const row_sq5_1) { + __m256i s[5]; + Prepare5_16(src, s); + SumHorizontalLo(s, row_sq3_0, row_sq5_0); + SumHorizontalHi(s, row_sq3_1, row_sq5_1); +} + +inline __m256i Sum343Lo(const __m256i ma3[3]) { + const __m256i sum = Sum3WLo16(ma3); + const __m256i sum3 = Sum3_16(sum, sum, sum); + return VaddwLo8(sum3, ma3[1]); +} + +inline __m256i Sum343Hi(const __m256i ma3[3]) { + const __m256i sum = Sum3WHi16(ma3); + const __m256i sum3 = Sum3_16(sum, sum, sum); + return VaddwHi8(sum3, ma3[1]); +} + +inline __m256i Sum343WLo(const __m256i src[3]) { + const __m256i sum = Sum3WLo32(src); + const __m256i sum3 = Sum3_32(sum, sum, sum); + return VaddwLo16(sum3, src[1]); +} + +inline __m256i Sum343WHi(const __m256i src[3]) { + const __m256i sum = Sum3WHi32(src); + const __m256i sum3 = Sum3_32(sum, sum, sum); + return VaddwHi16(sum3, src[1]); +} + +inline void Sum343W(const __m256i src[2], __m256i dst[2]) { + __m256i s[3]; + Prepare3_16(src, s); + dst[0] = Sum343WLo(s); + dst[1] = Sum343WHi(s); +} + +inline __m256i Sum565Lo(const __m256i src[3]) { + const __m256i sum = Sum3WLo16(src); + const __m256i sum4 = _mm256_slli_epi16(sum, 2); + const __m256i sum5 = _mm256_add_epi16(sum4, sum); + return VaddwLo8(sum5, src[1]); +} + +inline __m256i Sum565Hi(const __m256i src[3]) { + const __m256i sum = Sum3WHi16(src); + const __m256i sum4 = _mm256_slli_epi16(sum, 2); + const __m256i sum5 = _mm256_add_epi16(sum4, sum); + return VaddwHi8(sum5, src[1]); +} + +inline __m256i Sum565WLo(const __m256i src[3]) { + const __m256i sum = Sum3WLo32(src); + const __m256i sum4 = _mm256_slli_epi32(sum, 2); + const __m256i sum5 = _mm256_add_epi32(sum4, sum); + return VaddwLo16(sum5, src[1]); +} + +inline __m256i Sum565WHi(const __m256i src[3]) { + const __m256i sum = Sum3WHi32(src); + const __m256i sum4 = _mm256_slli_epi32(sum, 2); + const __m256i sum5 = _mm256_add_epi32(sum4, sum); + return VaddwHi16(sum5, src[1]); +} + +inline void Sum565W(const __m256i src[2], __m256i dst[2]) { + __m256i s[3]; + Prepare3_16(src, s); + dst[0] = Sum565WLo(s); + dst[1] = Sum565WHi(s); +} + +inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride, + const ptrdiff_t width, const ptrdiff_t sum_stride, + const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5, + uint32_t* square_sum3, uint32_t* square_sum5) { + int y = 2; + do { + const __m128i s0 = + LoadUnaligned16Msan(src, kOverreadInBytesPass1_128 - width); + __m128i sq_128[2], s3, s5, sq3[2], sq5[2]; + __m256i sq[3]; + sq_128[0] = SquareLo8(s0); + sq_128[1] = SquareHi8(s0); + SumHorizontalLo(s0, &s3, &s5); + StoreAligned16(sum3, s3); + StoreAligned16(sum5, s5); + SumHorizontal(sq_128, &sq3[0], &sq3[1], &sq5[0], &sq5[1]); + StoreAligned32U32(square_sum3, sq3); + StoreAligned32U32(square_sum5, sq5); + src += 8; + sum3 += 8; + sum5 += 8; + square_sum3 += 8; + square_sum5 += 8; + sq[0] = SetrM128i(sq_128[1], sq_128[1]); + ptrdiff_t x = sum_width; + do { + __m256i row3[2], row5[2], row_sq3[2], row_sq5[2]; + const __m256i s = LoadUnaligned32Msan( + src + 8, sum_width - x + 16 + kOverreadInBytesPass1_256 - width); + sq[1] = SquareLo8(s); + sq[2] = SquareHi8(s); + sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21); + SumHorizontal(src, sum_width - x + 8 + kOverreadInBytesPass1_256 - width, + &row3[0], &row3[1], &row5[0], &row5[1]); + StoreAligned64(sum3, row3); + StoreAligned64(sum5, row5); + SumHorizontal(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0], 
                    &row_sq5[1]);
+      StoreAligned64(square_sum3 + 0, row_sq3);
+      StoreAligned64(square_sum5 + 0, row_sq5);
+      SumHorizontal(sq + 1, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]);
+      StoreAligned64(square_sum3 + 16, row_sq3);
+      StoreAligned64(square_sum5 + 16, row_sq5);
+      sq[0] = sq[2];
+      src += 32;
+      sum3 += 32;
+      sum5 += 32;
+      square_sum3 += 32;
+      square_sum5 += 32;
+      x -= 32;
+    } while (x != 0);
+    src += src_stride - sum_width - 8;
+    sum3 += sum_stride - sum_width - 8;
+    sum5 += sum_stride - sum_width - 8;
+    square_sum3 += sum_stride - sum_width - 8;
+    square_sum5 += sum_stride - sum_width - 8;
+  } while (--y != 0);
+}
+
+template <int size>
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+                   const ptrdiff_t width, const ptrdiff_t sum_stride,
+                   const ptrdiff_t sum_width, uint16_t* sums,
+                   uint32_t* square_sums) {
+  static_assert(size == 3 || size == 5, "");
+  int kOverreadInBytes_128, kOverreadInBytes_256;
+  if (size == 3) {
+    kOverreadInBytes_128 = kOverreadInBytesPass2_128;
+    kOverreadInBytes_256 = kOverreadInBytesPass2_256;
+  } else {
+    kOverreadInBytes_128 = kOverreadInBytesPass1_128;
+    kOverreadInBytes_256 = kOverreadInBytesPass1_256;
+  }
+  int y = 2;
+  do {
+    const __m128i s = LoadUnaligned16Msan(src, kOverreadInBytes_128 - width);
+    __m128i ss, sq_128[2], sqs[2];
+    __m256i sq[3];
+    sq_128[0] = SquareLo8(s);
+    sq_128[1] = SquareHi8(s);
+    if (size == 3) {
+      ss = Sum3Horizontal(s);
+      Sum3WHorizontal(sq_128, sqs);
+    } else {
+      ss = Sum5Horizontal(s);
+      Sum5WHorizontal(sq_128, sqs);
+    }
+    StoreAligned16(sums, ss);
+    StoreAligned32U32(square_sums, sqs);
+    src += 8;
+    sums += 8;
+    square_sums += 8;
+    sq[0] = SetrM128i(sq_128[1], sq_128[1]);
+    ptrdiff_t x = sum_width;
+    do {
+      __m256i row[2], row_sq[4];
+      const __m256i s = LoadUnaligned32Msan(
+          src + 8, sum_width - x + 16 + kOverreadInBytes_256 - width);
+      sq[1] = SquareLo8(s);
+      sq[2] = SquareHi8(s);
+      sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+      if (size == 3) {
+        Sum3Horizontal(src, sum_width - x + 8 + kOverreadInBytes_256 - width,
+                       row);
+        Sum3WHorizontal(sq + 0, row_sq + 0);
+        Sum3WHorizontal(sq + 1, row_sq + 2);
+      } else {
+        Sum5Horizontal(src, sum_width - x + 8 + kOverreadInBytes_256 - width,
+                       &row[0], &row[1]);
+        Sum5WHorizontal(sq + 0, row_sq + 0);
+        Sum5WHorizontal(sq + 1, row_sq + 2);
+      }
+      StoreAligned64(sums, row);
+      StoreAligned64(square_sums + 0, row_sq + 0);
+      StoreAligned64(square_sums + 16, row_sq + 2);
+      sq[0] = sq[2];
+      src += 32;
+      sums += 32;
+      square_sums += 32;
+      x -= 32;
+    } while (x != 0);
+    src += src_stride - sum_width - 8;
+    sums += sum_stride - sum_width - 8;
+    square_sums += sum_stride - sum_width - 8;
+  } while (--y != 0);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq,
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
+  // a = |sum_sq|
+  // d = |sum|
+  // p = (a * n < d * d) ? 0 : a * n - d * d;
+  const __m128i dxd = _mm_madd_epi16(sum, sum);
+  // _mm_mullo_epi32() has high latency. Using shifts and additions instead.
+  // Some compilers could do this for us but we make this explicit.
+  // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n));
+  __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3));
+  if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4));
+  const __m128i sub = _mm_sub_epi32(axn, dxd);
+  const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128());
+  const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale));
+  return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2],
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
+  const __m128i sum_lo = _mm_unpacklo_epi16(sum, _mm_setzero_si128());
+  const __m128i sum_hi = _mm_unpackhi_epi16(sum, _mm_setzero_si128());
+  const __m128i z0 = CalculateMa<n>(sum_lo, sum_sq[0], scale);
+  const __m128i z1 = CalculateMa<n>(sum_hi, sum_sq[1], scale);
+  return _mm_packus_epi32(z0, z1);
+}
+
+template <int n>
+inline __m256i CalculateMa(const __m256i sum, const __m256i sum_sq,
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
+  // a = |sum_sq|
+  // d = |sum|
+  // p = (a * n < d * d) ? 0 : a * n - d * d;
+  const __m256i dxd = _mm256_madd_epi16(sum, sum);
+  // _mm256_mullo_epi32() has high latency. Using shifts and additions instead.
+  // Some compilers could do this for us but we make this explicit.
+  // return _mm256_mullo_epi32(sum_sq, _mm256_set1_epi32(n));
+  __m256i axn = _mm256_add_epi32(sum_sq, _mm256_slli_epi32(sum_sq, 3));
+  if (n == 25) axn = _mm256_add_epi32(axn, _mm256_slli_epi32(sum_sq, 4));
+  const __m256i sub = _mm256_sub_epi32(axn, dxd);
+  const __m256i p = _mm256_max_epi32(sub, _mm256_setzero_si256());
+  const __m256i pxs = _mm256_mullo_epi32(p, _mm256_set1_epi32(scale));
+  return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m256i CalculateMa(const __m256i sum, const __m256i sum_sq[2],
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
+  const __m256i sum_lo = _mm256_unpacklo_epi16(sum, _mm256_setzero_si256());
+  const __m256i sum_hi = _mm256_unpackhi_epi16(sum, _mm256_setzero_si256());
+  const __m256i z0 = CalculateMa<n>(sum_lo, sum_sq[0], scale);
+  const __m256i z1 = CalculateMa<n>(sum_hi, sum_sq[1], scale);
+  return _mm256_packus_epi32(z0, z1);
+}
+
+inline __m128i CalculateB5(const __m128i sum, const __m128i ma) {
+  // one_over_n == 164.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+  // one_over_n_quarter == 41.
+  constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+  static_assert(one_over_n == one_over_n_quarter << 2, "");
+  // |ma| is in range [0, 255].
+  const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter));
+  const __m128i m0 = VmullLo16(m, sum);
+  const __m128i m1 = VmullHi16(m, sum);
+  const __m128i b_lo = VrshrU32(m0, kSgrProjReciprocalBits - 2);
+  const __m128i b_hi = VrshrU32(m1, kSgrProjReciprocalBits - 2);
+  return _mm_packus_epi32(b_lo, b_hi);
+}
+
+inline __m256i CalculateB5(const __m256i sum, const __m256i ma) {
+  // one_over_n == 164.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+  // one_over_n_quarter == 41.
+  constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+  static_assert(one_over_n == one_over_n_quarter << 2, "");
+  // |ma| is in range [0, 255].
+  const __m256i m =
+      _mm256_maddubs_epi16(ma, _mm256_set1_epi16(one_over_n_quarter));
+  const __m256i m0 = VmullLo16(m, sum);
+  const __m256i m1 = VmullHi16(m, sum);
+  const __m256i b_lo = VrshrU32(m0, kSgrProjReciprocalBits - 2);
+  const __m256i b_hi = VrshrU32(m1, kSgrProjReciprocalBits - 2);
+  return _mm256_packus_epi32(b_lo, b_hi);
+}
+
+inline __m128i CalculateB3(const __m128i sum, const __m128i ma) {
+  // one_over_n == 455.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
+  const __m128i m0 = VmullLo16(ma, sum);
+  const __m128i m1 = VmullHi16(ma, sum);
+  const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
+  const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n));
+  const __m128i b_lo = VrshrU32(m2, kSgrProjReciprocalBits);
+  const __m128i b_hi = VrshrU32(m3, kSgrProjReciprocalBits);
+  return _mm_packus_epi32(b_lo, b_hi);
+}
+
+inline __m256i CalculateB3(const __m256i sum, const __m256i ma) {
+  // one_over_n == 455.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
+  const __m256i m0 = VmullLo16(ma, sum);
+  const __m256i m1 = VmullHi16(ma, sum);
+  const __m256i m2 = _mm256_mullo_epi32(m0, _mm256_set1_epi32(one_over_n));
+  const __m256i m3 = _mm256_mullo_epi32(m1, _mm256_set1_epi32(one_over_n));
+  const __m256i b_lo = VrshrU32(m2, kSgrProjReciprocalBits);
+  const __m256i b_hi = VrshrU32(m3, kSgrProjReciprocalBits);
+  return _mm256_packus_epi32(b_lo, b_hi);
+}
+
+inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2],
+                                  const uint32_t scale, __m128i* const sum,
+                                  __m128i* const index) {
+  __m128i sum_sq[2];
+  *sum = Sum5_16(s5);
+  Sum5_32(sq5, sum_sq);
+  *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex5(const __m256i s5[5], const __m256i sq5[5][2],
+                                  const uint32_t scale, __m256i* const sum,
+                                  __m256i* const index) {
+  __m256i sum_sq[2];
+  *sum = Sum5_16(s5);
+  Sum5_32(sq5, sum_sq);
+  *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2],
+                                  const uint32_t scale, __m128i* const sum,
+                                  __m128i* const index) {
+  __m128i sum_sq[2];
+  *sum = Sum3_16(s3);
+  Sum3_32(sq3, sum_sq);
+  *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m256i s3[3], const __m256i sq3[3][2],
+                                  const uint32_t scale, __m256i* const sum,
+                                  __m256i* const index) {
+  __m256i sum_sq[2];
+  *sum = Sum3_16(s3);
+  Sum3_32(sq3, sum_sq);
+  *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+template <int n>
+inline void LookupIntermediate(const __m128i sum, const __m128i index,
+                               __m128i* const ma, __m128i* const b) {
+  static_assert(n == 9 || n == 25, "");
+  const __m128i idx = _mm_packus_epi16(index, index);
+  // Actually it's not stored and loaded. The compiler will use a 64-bit
+  // general-purpose register to process. Faster than using _mm_extract_epi8().
+  uint8_t temp[8];
+  StoreLo8(temp, idx);
+  *ma = _mm_cvtsi32_si128(kSgrMaLookup[temp[0]]);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], 1);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], 2);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], 3);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], 4);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], 5);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], 6);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], 7);
+  // b = ma * b * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+  const __m128i maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+  *b = (n == 9) ? CalculateB3(sum, maq) : CalculateB5(sum, maq);
+}
+
+// Repeat the first 48 elements in kSgrMaLookup with a period of 16.
+alignas(32) constexpr uint8_t kSgrMaLookupAvx2[96] = {
+    255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16,
+    255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16,
+    15,  14,  13, 13, 12, 12, 11, 11, 10, 10, 9,  9,  9,  9,  8,  8,
+    15,  14,  13, 13, 12, 12, 11, 11, 10, 10, 9,  9,  9,  9,  8,  8,
+    8,   8,   7,  7,  7,  7,  7,  6,  6,  6,  6,  6,  6,  6,  5,  5,
+    8,   8,   7,  7,  7,  7,  7,  6,  6,  6,  6,  6,  6,  6,  5,  5};
+
+// Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b
+// to get value 0 as the shuffle result. The most significant bit 1 comes
+// either from the comparison instruction or from the sign bit of the index.
+inline __m256i ShuffleIndex(const __m256i table, const __m256i index) {
+  __m256i mask;
+  mask = _mm256_cmpgt_epi8(index, _mm256_set1_epi8(15));
+  mask = _mm256_or_si256(mask, index);
+  return _mm256_shuffle_epi8(table, mask);
+}
+
+inline __m256i AdjustValue(const __m256i value, const __m256i index,
+                           const int threshold) {
+  const __m256i thresholds = _mm256_set1_epi8(threshold - 128);
+  const __m256i offset = _mm256_cmpgt_epi8(index, thresholds);
+  return _mm256_add_epi8(value, offset);
+}
+
+template <int n>
+inline void CalculateIntermediate(const __m256i sum[2], const __m256i index[2],
+                                  __m256i ma[3], __m256i b[2]) {
+  static_assert(n == 9 || n == 25, "");
+  // Use table lookup to read elements whose indices are less than 48.
+  const __m256i c0 = LoadAligned32(kSgrMaLookupAvx2 + 0 * 32);
+  const __m256i c1 = LoadAligned32(kSgrMaLookupAvx2 + 1 * 32);
+  const __m256i c2 = LoadAligned32(kSgrMaLookupAvx2 + 2 * 32);
+  const __m256i indices = _mm256_packus_epi16(index[0], index[1]);
+  __m256i idx, mas;
+  // Clip idx to 127 to apply signed comparison instructions.
+  idx = _mm256_min_epu8(indices, _mm256_set1_epi8(127));
+  // The three shuffles below leave all elements whose indices are at least
+  // 48 set to 0.
+  // Get shuffle results for indices in range [0, 15].
+  mas = ShuffleIndex(c0, idx);
+  // Get shuffle results for indices in range [16, 31].
+  // Subtract 16 to utilize the sign bit of the index.
+  idx = _mm256_sub_epi8(idx, _mm256_set1_epi8(16));
+  const __m256i res1 = ShuffleIndex(c1, idx);
+  // Use OR instruction to combine shuffle results together.
+  mas = _mm256_or_si256(mas, res1);
+  // Get shuffle results for indices in range [32, 47].
+  // Subtract 16 to utilize the sign bit of the index.
+  idx = _mm256_sub_epi8(idx, _mm256_set1_epi8(16));
+  const __m256i res2 = ShuffleIndex(c2, idx);
+  mas = _mm256_or_si256(mas, res2);
+
+  // For elements whose indices are larger than 47, the looked-up value
+  // changes only rarely as the index increases, so we use comparison and
+  // arithmetic operations to calculate their values.
+  // Add -128 to apply signed comparison instructions.
+  idx = _mm256_add_epi8(indices, _mm256_set1_epi8(-128));
+  // Elements whose indices are larger than 47 (with value 0) are set to 5.
+  mas = _mm256_max_epu8(mas, _mm256_set1_epi8(5));
+  mas = AdjustValue(mas, idx, 55);   // 55 is the last index whose value is 5.
+  mas = AdjustValue(mas, idx, 72);   // 72 is the last index whose value is 4.
+  mas = AdjustValue(mas, idx, 101);  // 101 is the last index whose value is 3.
+  mas = AdjustValue(mas, idx, 169);  // 169 is the last index whose value is 2.
+  mas = AdjustValue(mas, idx, 254);  // 254 is the last index whose value is 1.
+
+  ma[2] = _mm256_permute4x64_epi64(mas, 0x93);     // 32-39 8-15 16-23 24-31
+  ma[0] = _mm256_blend_epi32(ma[0], ma[2], 0xfc);  // 0-7 8-15 16-23 24-31
+  ma[1] = _mm256_permute2x128_si256(ma[0], ma[2], 0x21);
+
+  // b = ma * b * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+  const __m256i maq0 = _mm256_unpackhi_epi8(ma[0], _mm256_setzero_si256());
+  const __m256i maq1 = _mm256_unpacklo_epi8(ma[1], _mm256_setzero_si256());
+  if (n == 9) {
+    b[0] = CalculateB3(sum[0], maq0);
+    b[1] = CalculateB3(sum[1], maq1);
+  } else {
+    b[0] = CalculateB5(sum[0], maq0);
+    b[1] = CalculateB5(sum[1], maq1);
+  }
+}
+
+inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2],
+                                   const uint32_t scale, __m128i* const ma,
+                                   __m128i* const b) {
+  __m128i sum, index;
+  CalculateSumAndIndex5(s5, sq5, scale, &sum, &index);
+  LookupIntermediate<25>(sum, index, ma, b);
+}
+
+inline void CalculateIntermediate3(const __m128i s3[3], const __m128i sq3[3][2],
+                                   const uint32_t scale, __m128i* const ma,
+                                   __m128i* const b) {
+  __m128i sum, index;
+  CalculateSumAndIndex3(s3, sq3, scale, &sum, &index);
+  LookupIntermediate<9>(sum, index, ma, b);
+}
+
+inline void Store343_444(const __m256i b3[2], const ptrdiff_t x,
+                         __m256i sum_b343[2], __m256i sum_b444[2],
+                         uint32_t* const b343, uint32_t* const b444) {
+  __m256i b[3], sum_b111[2];
+  Prepare3_16(b3, b);
+  sum_b111[0] = Sum3WLo32(b);
+  sum_b111[1] = Sum3WHi32(b);
+  sum_b444[0] = _mm256_slli_epi32(sum_b111[0], 2);
+  sum_b444[1] = _mm256_slli_epi32(sum_b111[1], 2);
+  StoreAligned64(b444 + x, sum_b444);
+  sum_b343[0] = _mm256_sub_epi32(sum_b444[0], sum_b111[0]);
+  sum_b343[1] = _mm256_sub_epi32(sum_b444[1], sum_b111[1]);
+  sum_b343[0] = VaddwLo16(sum_b343[0], b[1]);
+  sum_b343[1] = VaddwHi16(sum_b343[1], b[1]);
+  StoreAligned64(b343 + x, sum_b343);
+}
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+                           const ptrdiff_t x, __m256i* const sum_ma343,
+                           __m256i* const sum_ma444, __m256i sum_b343[2],
+                           __m256i sum_b444[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  const __m256i sum_ma111 = Sum3WLo16(ma3);
+  *sum_ma444 = _mm256_slli_epi16(sum_ma111, 2);
+  StoreAligned32(ma444 + x, *sum_ma444);
+  const __m256i sum333 = _mm256_sub_epi16(*sum_ma444, sum_ma111);
+  *sum_ma343 = VaddwLo8(sum333, ma3[1]);
+  StoreAligned32(ma343 + x, *sum_ma343);
+  Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+                           const ptrdiff_t x, __m256i* const sum_ma343,
+                           __m256i* const sum_ma444, __m256i sum_b343[2],
+
__m256i sum_b444[2], uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + const __m256i sum_ma111 = Sum3WHi16(ma3); + *sum_ma444 = _mm256_slli_epi16(sum_ma111, 2); + StoreAligned32(ma444 + x, *sum_ma444); + const __m256i sum333 = _mm256_sub_epi16(*sum_ma444, sum_ma111); + *sum_ma343 = VaddwHi8(sum333, ma3[1]); + StoreAligned32(ma343 + x, *sum_ma343); + Store343_444(b3, x, sum_b343, sum_b444, b343, b444); +} + +inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2], + const ptrdiff_t x, __m256i* const sum_ma343, + __m256i sum_b343[2], uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m256i sum_ma444, sum_b444[2]; + Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343, + ma444, b343, b444); +} + +inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2], + const ptrdiff_t x, __m256i* const sum_ma343, + __m256i sum_b343[2], uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m256i sum_ma444, sum_b444[2]; + Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343, + ma444, b343, b444); +} + +inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2], + const ptrdiff_t x, uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m256i sum_ma343, sum_b343[2]; + Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444); +} + +inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2], + const ptrdiff_t x, uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m256i sum_ma343, sum_b343[2]; + Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo( + const __m128i s[2][3], const uint32_t scale, uint16_t* const sum5[5], + uint32_t* const square_sum5[5], __m128i sq[2][2], __m128i* const ma, + __m128i* const b) { + __m128i s5[2][5], sq5[5][2]; + sq[0][1] = SquareHi8(s[0][0]); + sq[1][1] = SquareHi8(s[1][0]); + s5[0][3] = Sum5Horizontal(s[0][0]); + StoreAligned16(sum5[3], s5[0][3]); + s5[0][4] = Sum5Horizontal(s[1][0]); + StoreAligned16(sum5[4], s5[0][4]); + Sum5WHorizontal(sq[0], sq5[3]); + StoreAligned32U32(square_sum5[3], sq5[3]); + Sum5WHorizontal(sq[1], sq5[4]); + StoreAligned32U32(square_sum5[4], sq5[4]); + LoadAligned16x3U16(sum5, 0, s5[0]); + LoadAligned32x3U32(square_sum5, 0, sq5); + CalculateIntermediate5(s5[0], sq5, scale, ma, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5( + const uint8_t* const src0, const uint8_t* const src1, + const ptrdiff_t over_read_in_bytes, const ptrdiff_t sum_width, + const ptrdiff_t x, const uint32_t scale, uint16_t* const sum5[5], + uint32_t* const square_sum5[5], __m256i sq[2][3], __m256i ma[3], + __m256i b[3]) { + const __m256i s0 = LoadUnaligned32Msan(src0 + 8, over_read_in_bytes + 8); + const __m256i s1 = LoadUnaligned32Msan(src1 + 8, over_read_in_bytes + 8); + __m256i s5[2][5], sq5[5][2], sum[2], index[2]; + sq[0][1] = SquareLo8(s0); + sq[0][2] = SquareHi8(s0); + sq[1][1] = SquareLo8(s1); + sq[1][2] = SquareHi8(s1); + sq[0][0] = _mm256_permute2x128_si256(sq[0][0], sq[0][2], 0x21); + sq[1][0] = _mm256_permute2x128_si256(sq[1][0], sq[1][2], 0x21); + Sum5Horizontal(src0, over_read_in_bytes, &s5[0][3], &s5[1][3]); + Sum5Horizontal(src1, over_read_in_bytes, &s5[0][4], &s5[1][4]); + StoreAligned32(sum5[3] + x + 0, s5[0][3]); + StoreAligned32(sum5[3] + 
x + 16, s5[1][3]); + StoreAligned32(sum5[4] + x + 0, s5[0][4]); + StoreAligned32(sum5[4] + x + 16, s5[1][4]); + Sum5WHorizontal(sq[0], sq5[3]); + StoreAligned64(square_sum5[3] + x, sq5[3]); + Sum5WHorizontal(sq[1], sq5[4]); + StoreAligned64(square_sum5[4] + x, sq5[4]); + LoadAligned32x3U16(sum5, x, s5[0]); + LoadAligned64x3U32(square_sum5, x, sq5); + CalculateSumAndIndex5(s5[0], sq5, scale, &sum[0], &index[0]); + + Sum5WHorizontal(sq[0] + 1, sq5[3]); + StoreAligned64(square_sum5[3] + x + 16, sq5[3]); + Sum5WHorizontal(sq[1] + 1, sq5[4]); + StoreAligned64(square_sum5[4] + x + 16, sq5[4]); + LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]); + LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5); + CalculateSumAndIndex5(s5[1], sq5, scale, &sum[1], &index[1]); + CalculateIntermediate<25>(sum, index, ma, b + 1); + b[0] = _mm256_permute2x128_si256(b[0], b[2], 0x21); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo( + const __m128i s, const uint32_t scale, const uint16_t* const sum5[5], + const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma, + __m128i* const b) { + __m128i s5[5], sq5[5][2]; + sq[1] = SquareHi8(s); + s5[3] = s5[4] = Sum5Horizontal(s); + Sum5WHorizontal(sq, sq5[3]); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + LoadAligned16x3U16(sum5, 0, s5); + LoadAligned32x3U32(square_sum5, 0, sq5); + CalculateIntermediate5(s5, sq5, scale, ma, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow( + const uint8_t* const src, const ptrdiff_t over_read_in_bytes, + const ptrdiff_t sum_width, const ptrdiff_t x, const uint32_t scale, + const uint16_t* const sum5[5], const uint32_t* const square_sum5[5], + __m256i sq[3], __m256i ma[3], __m256i b[3]) { + const __m256i s = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 8); + __m256i s5[2][5], sq5[5][2], sum[2], index[2]; + sq[1] = SquareLo8(s); + sq[2] = SquareHi8(s); + sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21); + Sum5Horizontal(src, over_read_in_bytes, &s5[0][3], &s5[1][3]); + s5[0][4] = s5[0][3]; + s5[1][4] = s5[1][3]; + Sum5WHorizontal(sq, sq5[3]); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + LoadAligned32x3U16(sum5, x, s5[0]); + LoadAligned64x3U32(square_sum5, x, sq5); + CalculateSumAndIndex5(s5[0], sq5, scale, &sum[0], &index[0]); + + Sum5WHorizontal(sq + 1, sq5[3]); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]); + LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5); + CalculateSumAndIndex5(s5[1], sq5, scale, &sum[1], &index[1]); + CalculateIntermediate<25>(sum, index, ma, b + 1); + b[0] = _mm256_permute2x128_si256(b[0], b[2], 0x21); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo( + const __m128i s, const uint32_t scale, uint16_t* const sum3[3], + uint32_t* const square_sum3[3], __m128i sq[2], __m128i* const ma, + __m128i* const b) { + __m128i s3[3], sq3[3][2]; + sq[1] = SquareHi8(s); + s3[2] = Sum3Horizontal(s); + StoreAligned16(sum3[2], s3[2]); + Sum3WHorizontal(sq, sq3[2]); + StoreAligned32U32(square_sum3[2], sq3[2]); + LoadAligned16x2U16(sum3, 0, s3); + LoadAligned32x2U32(square_sum3, 0, sq3); + CalculateIntermediate3(s3, sq3, scale, ma, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3( + const uint8_t* const src, const ptrdiff_t over_read_in_bytes, + const ptrdiff_t x, const ptrdiff_t sum_width, const uint32_t scale, + uint16_t* const sum3[3], uint32_t* const square_sum3[3], __m256i sq[3], + __m256i ma[3], __m256i b[3]) { + const __m256i s = LoadUnaligned32Msan(src + 8, 
over_read_in_bytes + 8); + __m256i s3[4], sq3[3][2], sum[2], index[2]; + sq[1] = SquareLo8(s); + sq[2] = SquareHi8(s); + sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21); + Sum3Horizontal(src, over_read_in_bytes, s3 + 2); + StoreAligned64(sum3[2] + x, s3 + 2); + Sum3WHorizontal(sq + 0, sq3[2]); + StoreAligned64(square_sum3[2] + x, sq3[2]); + LoadAligned32x2U16(sum3, x, s3); + LoadAligned64x2U32(square_sum3, x, sq3); + CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]); + + Sum3WHorizontal(sq + 1, sq3[2]); + StoreAligned64(square_sum3[2] + x + 16, sq3[2]); + LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3 + 1); + LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3); + CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]); + CalculateIntermediate<9>(sum, index, ma, b + 1); + b[0] = _mm256_permute2x128_si256(b[0], b[2], 0x21); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo( + const __m128i s[2], const uint16_t scales[2], uint16_t* const sum3[4], + uint16_t* const sum5[5], uint32_t* const square_sum3[4], + uint32_t* const square_sum5[5], __m128i sq[2][2], __m128i ma3[2], + __m128i b3[2], __m128i* const ma5, __m128i* const b5) { + __m128i s3[4], s5[5], sq3[4][2], sq5[5][2]; + sq[0][1] = SquareHi8(s[0]); + sq[1][1] = SquareHi8(s[1]); + SumHorizontalLo(s[0], &s3[2], &s5[3]); + SumHorizontalLo(s[1], &s3[3], &s5[4]); + StoreAligned16(sum3[2], s3[2]); + StoreAligned16(sum3[3], s3[3]); + StoreAligned16(sum5[3], s5[3]); + StoreAligned16(sum5[4], s5[4]); + SumHorizontal(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + StoreAligned32U32(square_sum3[2], sq3[2]); + StoreAligned32U32(square_sum5[3], sq5[3]); + SumHorizontal(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]); + StoreAligned32U32(square_sum3[3], sq3[3]); + StoreAligned32U32(square_sum5[4], sq5[4]); + LoadAligned16x2U16(sum3, 0, s3); + LoadAligned32x2U32(square_sum3, 0, sq3); + LoadAligned16x3U16(sum5, 0, s5); + LoadAligned32x3U32(square_sum5, 0, sq5); + // Note: in the SSE4_1 version, CalculateIntermediate() is called + // to replace the slow LookupIntermediate() when calculating 16 intermediate + // data points. However, the AVX2 compiler generates even slower code. So we + // keep using CalculateIntermediate3(). 
+ CalculateIntermediate3(s3 + 0, sq3 + 0, scales[1], &ma3[0], &b3[0]); + CalculateIntermediate3(s3 + 1, sq3 + 1, scales[1], &ma3[1], &b3[1]); + CalculateIntermediate5(s5, sq5, scales[0], ma5, b5); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess( + const uint8_t* const src0, const uint8_t* const src1, + const ptrdiff_t over_read_in_bytes, const ptrdiff_t x, + const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + const ptrdiff_t sum_width, __m256i sq[2][3], __m256i ma3[2][3], + __m256i b3[2][5], __m256i ma5[3], __m256i b5[5]) { + const __m256i s0 = LoadUnaligned32Msan(src0 + 8, over_read_in_bytes + 8); + const __m256i s1 = LoadUnaligned32Msan(src1 + 8, over_read_in_bytes + 8); + __m256i s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sum_3[2][2], index_3[2][2], + sum_5[2], index_5[2]; + sq[0][1] = SquareLo8(s0); + sq[0][2] = SquareHi8(s0); + sq[1][1] = SquareLo8(s1); + sq[1][2] = SquareHi8(s1); + sq[0][0] = _mm256_permute2x128_si256(sq[0][0], sq[0][2], 0x21); + sq[1][0] = _mm256_permute2x128_si256(sq[1][0], sq[1][2], 0x21); + SumHorizontal(src0, over_read_in_bytes, &s3[0][2], &s3[1][2], &s5[0][3], + &s5[1][3]); + SumHorizontal(src1, over_read_in_bytes, &s3[0][3], &s3[1][3], &s5[0][4], + &s5[1][4]); + StoreAligned32(sum3[2] + x + 0, s3[0][2]); + StoreAligned32(sum3[2] + x + 16, s3[1][2]); + StoreAligned32(sum3[3] + x + 0, s3[0][3]); + StoreAligned32(sum3[3] + x + 16, s3[1][3]); + StoreAligned32(sum5[3] + x + 0, s5[0][3]); + StoreAligned32(sum5[3] + x + 16, s5[1][3]); + StoreAligned32(sum5[4] + x + 0, s5[0][4]); + StoreAligned32(sum5[4] + x + 16, s5[1][4]); + SumHorizontal(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + SumHorizontal(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]); + StoreAligned64(square_sum3[2] + x, sq3[2]); + StoreAligned64(square_sum5[3] + x, sq5[3]); + StoreAligned64(square_sum3[3] + x, sq3[3]); + StoreAligned64(square_sum5[4] + x, sq5[4]); + LoadAligned32x2U16(sum3, x, s3[0]); + LoadAligned64x2U32(square_sum3, x, sq3); + CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum_3[0][0], &index_3[0][0]); + CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum_3[1][0], + &index_3[1][0]); + LoadAligned32x3U16(sum5, x, s5[0]); + LoadAligned64x3U32(square_sum5, x, sq5); + CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]); + + SumHorizontal(sq[0] + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + SumHorizontal(sq[1] + 1, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]); + StoreAligned64(square_sum3[2] + x + 16, sq3[2]); + StoreAligned64(square_sum5[3] + x + 16, sq5[3]); + StoreAligned64(square_sum3[3] + x + 16, sq3[3]); + StoreAligned64(square_sum5[4] + x + 16, sq5[4]); + LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3[1]); + LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3); + CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum_3[0][1], &index_3[0][1]); + CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum_3[1][1], + &index_3[1][1]); + CalculateIntermediate<9>(sum_3[0], index_3[0], ma3[0], b3[0] + 1); + CalculateIntermediate<9>(sum_3[1], index_3[1], ma3[1], b3[1] + 1); + LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]); + LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5); + CalculateSumAndIndex5(s5[1], sq5, scales[0], &sum_5[1], &index_5[1]); + CalculateIntermediate<25>(sum_5, index_5, ma5, b5 + 1); + b3[0][0] = _mm256_permute2x128_si256(b3[0][0], b3[0][2], 0x21); + b3[1][0] = _mm256_permute2x128_si256(b3[1][0], 
b3[1][2], 0x21); + b5[0] = _mm256_permute2x128_si256(b5[0], b5[2], 0x21); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo( + const __m128i s, const uint16_t scales[2], const uint16_t* const sum3[4], + const uint16_t* const sum5[5], const uint32_t* const square_sum3[4], + const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma3, + __m128i* const ma5, __m128i* const b3, __m128i* const b5) { + __m128i s3[3], s5[5], sq3[3][2], sq5[5][2]; + sq[1] = SquareHi8(s); + SumHorizontalLo(s, &s3[2], &s5[3]); + SumHorizontal(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + LoadAligned16x3U16(sum5, 0, s5); + s5[4] = s5[3]; + LoadAligned32x3U32(square_sum5, 0, sq5); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + CalculateIntermediate5(s5, sq5, scales[0], ma5, b5); + LoadAligned16x2U16(sum3, 0, s3); + LoadAligned32x2U32(square_sum3, 0, sq3); + CalculateIntermediate3(s3, sq3, scales[1], ma3, b3); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow( + const uint8_t* const src, const ptrdiff_t over_read_in_bytes, + const ptrdiff_t sum_width, const ptrdiff_t x, const uint16_t scales[2], + const uint16_t* const sum3[4], const uint16_t* const sum5[5], + const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5], + __m256i sq[6], __m256i ma3[2], __m256i ma5[2], __m256i b3[5], + __m256i b5[5]) { + const __m256i s0 = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 8); + __m256i s3[2][3], s5[2][5], sq3[4][2], sq5[5][2], sum_3[2], index_3[2], + sum_5[2], index_5[2]; + sq[1] = SquareLo8(s0); + sq[2] = SquareHi8(s0); + sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21); + SumHorizontal(src, over_read_in_bytes, &s3[0][2], &s3[1][2], &s5[0][3], + &s5[1][3]); + SumHorizontal(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + LoadAligned32x2U16(sum3, x, s3[0]); + LoadAligned64x2U32(square_sum3, x, sq3); + CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum_3[0], &index_3[0]); + LoadAligned32x3U16(sum5, x, s5[0]); + s5[0][4] = s5[0][3]; + LoadAligned64x3U32(square_sum5, x, sq5); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]); + + SumHorizontal(sq + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3[1]); + LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3); + CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum_3[1], &index_3[1]); + CalculateIntermediate<9>(sum_3, index_3, ma3, b3 + 1); + LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]); + s5[1][4] = s5[1][3]; + LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + CalculateSumAndIndex5(s5[1], sq5, scales[0], &sum_5[1], &index_5[1]); + CalculateIntermediate<25>(sum_5, index_5, ma5, b5 + 1); + b3[0] = _mm256_permute2x128_si256(b3[0], b3[2], 0x21); + b5[0] = _mm256_permute2x128_si256(b5[0], b5[2], 0x21); +} + +inline void BoxSumFilterPreProcess5(const uint8_t* const src0, + const uint8_t* const src1, const int width, + const uint32_t scale, + uint16_t* const sum5[5], + uint32_t* const square_sum5[5], + const ptrdiff_t sum_width, uint16_t* ma565, + uint32_t* b565) { + __m128i ma0, b0, s[2][3], sq_128[2][2]; + __m256i mas[3], sq[2][3], bs[3]; + s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width); + s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1_128 - width); + sq_128[0][0] = SquareLo8(s[0][0]); + sq_128[1][0] = SquareLo8(s[1][0]); + BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, 
sq_128, &ma0, &b0); + sq[0][0] = SetrM128i(sq_128[0][0], sq_128[0][1]); + sq[1][0] = SetrM128i(sq_128[1][0], sq_128[1][1]); + mas[0] = SetrM128i(ma0, ma0); + bs[0] = SetrM128i(b0, b0); + + int x = 0; + do { + __m256i ma5[3], ma[2], b[4]; + BoxFilterPreProcess5(src0 + x + 8, src1 + x + 8, + x + 8 + kOverreadInBytesPass1_256 - width, sum_width, + x + 8, scale, sum5, square_sum5, sq, mas, bs); + Prepare3_8(mas, ma5); + ma[0] = Sum565Lo(ma5); + ma[1] = Sum565Hi(ma5); + StoreAligned64(ma565, ma); + Sum565W(bs + 0, b + 0); + Sum565W(bs + 1, b + 2); + StoreAligned64(b565, b + 0); + StoreAligned64(b565 + 16, b + 2); + sq[0][0] = sq[0][2]; + sq[1][0] = sq[1][2]; + mas[0] = mas[2]; + bs[0] = bs[2]; + ma565 += 32; + b565 += 32; + x += 32; + } while (x < width); +} + +template +LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3( + const uint8_t* const src, const int width, const uint32_t scale, + uint16_t* const sum3[3], uint32_t* const square_sum3[3], + const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343, + uint32_t* b444) { + const __m128i s = LoadUnaligned16Msan(src, kOverreadInBytesPass2_128 - width); + __m128i ma0, sq_128[2], b0; + __m256i mas[3], sq[3], bs[3]; + sq_128[0] = SquareLo8(s); + BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq_128, &ma0, &b0); + sq[0] = SetrM128i(sq_128[0], sq_128[1]); + mas[0] = SetrM128i(ma0, ma0); + bs[0] = SetrM128i(b0, b0); + + int x = 0; + do { + __m256i ma3[3]; + BoxFilterPreProcess3(src + x + 8, x + 8 + kOverreadInBytesPass2_256 - width, + x + 8, sum_width, scale, sum3, square_sum3, sq, mas, + bs); + Prepare3_8(mas, ma3); + if (calculate444) { // NOLINT(readability-simplify-boolean-expr) + Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444); + Store343_444Hi(ma3, bs + 1, 16, ma343, ma444, b343, b444); + ma444 += 32; + b444 += 32; + } else { + __m256i ma[2], b[4]; + ma[0] = Sum343Lo(ma3); + ma[1] = Sum343Hi(ma3); + StoreAligned64(ma343, ma); + Sum343W(bs + 0, b + 0); + Sum343W(bs + 1, b + 2); + StoreAligned64(b343 + 0, b + 0); + StoreAligned64(b343 + 16, b + 2); + } + sq[0] = sq[2]; + mas[0] = mas[2]; + bs[0] = bs[2]; + ma343 += 32; + b343 += 32; + x += 32; + } while (x < width); +} + +inline void BoxSumFilterPreProcess( + const uint8_t* const src0, const uint8_t* const src1, const int width, + const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444, + uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444, + uint32_t* b565) { + __m128i s[2], ma3_128[2], ma5_0, sq_128[2][2], b3_128[2], b5_0; + __m256i ma3[2][3], ma5[3], sq[2][3], b3[2][5], b5[5]; + s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width); + s[1] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1_128 - width); + sq_128[0][0] = SquareLo8(s[0]); + sq_128[1][0] = SquareLo8(s[1]); + BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq_128, + ma3_128, b3_128, &ma5_0, &b5_0); + sq[0][0] = SetrM128i(sq_128[0][0], sq_128[0][1]); + sq[1][0] = SetrM128i(sq_128[1][0], sq_128[1][1]); + ma3[0][0] = SetrM128i(ma3_128[0], ma3_128[0]); + ma3[1][0] = SetrM128i(ma3_128[1], ma3_128[1]); + ma5[0] = SetrM128i(ma5_0, ma5_0); + b3[0][0] = SetrM128i(b3_128[0], b3_128[0]); + b3[1][0] = SetrM128i(b3_128[1], b3_128[1]); + b5[0] = SetrM128i(b5_0, b5_0); + + int x = 0; + do { + __m256i ma[2], b[4], ma3x[3], ma5x[3]; + BoxFilterPreProcess(src0 + x + 8, src1 + x + 8, + x + 8 + 
kOverreadInBytesPass1_256 - width, x + 8,
+                        scales, sum3, sum5, square_sum3, square_sum5, sum_width,
+                        sq, ma3, b3, ma5, b5);
+    Prepare3_8(ma3[0], ma3x);
+    ma[0] = Sum343Lo(ma3x);
+    ma[1] = Sum343Hi(ma3x);
+    StoreAligned64(ma343[0] + x, ma);
+    Sum343W(b3[0], b);
+    StoreAligned64(b343[0] + x, b);
+    Sum565W(b5, b);
+    StoreAligned64(b565, b);
+    Prepare3_8(ma3[1], ma3x);
+    Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
+    Store343_444Hi(ma3x, b3[1] + 1, x + 16, ma343[1], ma444, b343[1], b444);
+    Prepare3_8(ma5, ma5x);
+    ma[0] = Sum565Lo(ma5x);
+    ma[1] = Sum565Hi(ma5x);
+    StoreAligned64(ma565, ma);
+    Sum343W(b3[0] + 1, b);
+    StoreAligned64(b343[0] + x + 16, b);
+    Sum565W(b5 + 1, b);
+    StoreAligned64(b565 + 16, b);
+    sq[0][0] = sq[0][2];
+    sq[1][0] = sq[1][2];
+    ma3[0][0] = ma3[0][2];
+    ma3[1][0] = ma3[1][2];
+    ma5[0] = ma5[2];
+    b3[0][0] = b3[0][2];
+    b3[1][0] = b3[1][2];
+    b5[0] = b5[2];
+    ma565 += 32;
+    b565 += 32;
+    x += 32;
+  } while (x < width);
+}
+
+template <int shift>
+inline __m256i FilterOutput(const __m256i ma_x_src, const __m256i b) {
+  // ma: 255 * 32 = 8160 (13 bits)
+  // b: 65088 * 32 = 2082816 (21 bits)
+  // v: b - ma * 255 (22 bits)
+  const __m256i v = _mm256_sub_epi32(b, ma_x_src);
+  // kSgrProjSgrBits = 8
+  // kSgrProjRestoreBits = 4
+  // shift = 4 or 5
+  // v >> 8 or 9 (13 bits)
+  return VrshrS32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <int shift>
+inline __m256i CalculateFilteredOutput(const __m256i src, const __m256i ma,
+                                       const __m256i b[2]) {
+  const __m256i ma_x_src_lo = VmullLo16(ma, src);
+  const __m256i ma_x_src_hi = VmullHi16(ma, src);
+  const __m256i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
+  const __m256i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
+  return _mm256_packs_epi32(dst_lo, dst_hi);  // 13 bits
+}
+
+inline __m256i CalculateFilteredOutputPass1(const __m256i src,
+                                            const __m256i ma[2],
+                                            const __m256i b[2][2]) {
+  const __m256i ma_sum = _mm256_add_epi16(ma[0], ma[1]);
+  __m256i b_sum[2];
+  b_sum[0] = _mm256_add_epi32(b[0][0], b[1][0]);
+  b_sum[1] = _mm256_add_epi32(b[0][1], b[1][1]);
+  return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m256i CalculateFilteredOutputPass2(const __m256i src,
+                                            const __m256i ma[3],
+                                            const __m256i b[3][2]) {
+  const __m256i ma_sum = Sum3_16(ma);
+  __m256i b_sum[2];
+  Sum3_32(b, b_sum);
+  return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m256i SelfGuidedFinal(const __m256i src, const __m256i v[2]) {
+  const __m256i v_lo =
+      VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  const __m256i v_hi =
+      VrshrS32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  const __m256i vv = _mm256_packs_epi32(v_lo, v_hi);
+  return _mm256_add_epi16(src, vv);
+}
+
+inline __m256i SelfGuidedDoubleMultiplier(const __m256i src,
+                                          const __m256i filter[2], const int w0,
+                                          const int w2) {
+  __m256i v[2];
+  const __m256i w0_w2 =
+      _mm256_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0));
+  const __m256i f_lo = _mm256_unpacklo_epi16(filter[0], filter[1]);
+  const __m256i f_hi = _mm256_unpackhi_epi16(filter[0], filter[1]);
+  v[0] = _mm256_madd_epi16(w0_w2, f_lo);
+  v[1] = _mm256_madd_epi16(w0_w2, f_hi);
+  return SelfGuidedFinal(src, v);
+}
+
+inline __m256i SelfGuidedSingleMultiplier(const __m256i src,
+                                          const __m256i filter, const int w0) {
+  // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+  __m256i v[2];
+  v[0] = VmullNLo8(filter, w0);
+  v[1] = VmullNHi8(filter, w0);
+  return SelfGuidedFinal(src, v);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
+    const uint8_t* const src, const uint8_t* const
src0, + const uint8_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5], + uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width, + const uint32_t scale, const int16_t w0, uint16_t* const ma565[2], + uint32_t* const b565[2], uint8_t* const dst) { + __m128i ma0, b0, s[2][3], sq_128[2][2]; + __m256i mas[3], sq[2][3], bs[3]; + s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width); + s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1_128 - width); + sq_128[0][0] = SquareLo8(s[0][0]); + sq_128[1][0] = SquareLo8(s[1][0]); + BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq_128, &ma0, &b0); + sq[0][0] = SetrM128i(sq_128[0][0], sq_128[0][1]); + sq[1][0] = SetrM128i(sq_128[1][0], sq_128[1][1]); + mas[0] = SetrM128i(ma0, ma0); + bs[0] = SetrM128i(b0, b0); + + int x = 0; + do { + __m256i ma[3], ma5[3], b[2][2][2]; + BoxFilterPreProcess5(src0 + x + 8, src1 + x + 8, + x + 8 + kOverreadInBytesPass1_256 - width, sum_width, + x + 8, scale, sum5, square_sum5, sq, mas, bs); + Prepare3_8(mas, ma5); + ma[1] = Sum565Lo(ma5); + ma[2] = Sum565Hi(ma5); + StoreAligned64(ma565[1] + x, ma + 1); + Sum565W(bs + 0, b[0][1]); + Sum565W(bs + 1, b[1][1]); + StoreAligned64(b565[1] + x + 0, b[0][1]); + StoreAligned64(b565[1] + x + 16, b[1][1]); + const __m256i sr0 = LoadUnaligned32(src + x); + const __m256i sr1 = LoadUnaligned32(src + stride + x); + const __m256i sr0_lo = _mm256_unpacklo_epi8(sr0, _mm256_setzero_si256()); + const __m256i sr1_lo = _mm256_unpacklo_epi8(sr1, _mm256_setzero_si256()); + ma[0] = LoadAligned32(ma565[0] + x); + LoadAligned64(b565[0] + x, b[0][0]); + const __m256i p00 = CalculateFilteredOutputPass1(sr0_lo, ma, b[0]); + const __m256i p01 = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[0][1]); + const __m256i d00 = SelfGuidedSingleMultiplier(sr0_lo, p00, w0); + const __m256i d10 = SelfGuidedSingleMultiplier(sr1_lo, p01, w0); + const __m256i sr0_hi = _mm256_unpackhi_epi8(sr0, _mm256_setzero_si256()); + const __m256i sr1_hi = _mm256_unpackhi_epi8(sr1, _mm256_setzero_si256()); + ma[1] = LoadAligned32(ma565[0] + x + 16); + LoadAligned64(b565[0] + x + 16, b[1][0]); + const __m256i p10 = CalculateFilteredOutputPass1(sr0_hi, ma + 1, b[1]); + const __m256i p11 = CalculateFilteredOutput<4>(sr1_hi, ma[2], b[1][1]); + const __m256i d01 = SelfGuidedSingleMultiplier(sr0_hi, p10, w0); + const __m256i d11 = SelfGuidedSingleMultiplier(sr1_hi, p11, w0); + StoreUnaligned32(dst + x, _mm256_packus_epi16(d00, d01)); + StoreUnaligned32(dst + stride + x, _mm256_packus_epi16(d10, d11)); + sq[0][0] = sq[0][2]; + sq[1][0] = sq[1][2]; + mas[0] = mas[2]; + bs[0] = bs[2]; + x += 32; + } while (x < width); +} + +inline void BoxFilterPass1LastRow( + const uint8_t* const src, const uint8_t* const src0, const int width, + const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0, + uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565, + uint32_t* b565, uint8_t* const dst) { + const __m128i s0 = + LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width); + __m128i ma0, b0, sq_128[2]; + __m256i mas[3], sq[3], bs[3]; + sq_128[0] = SquareLo8(s0); + BoxFilterPreProcess5LastRowLo(s0, scale, sum5, square_sum5, sq_128, &ma0, + &b0); + sq[0] = SetrM128i(sq_128[0], sq_128[1]); + mas[0] = SetrM128i(ma0, ma0); + bs[0] = SetrM128i(b0, b0); + + int x = 0; + do { + __m256i ma[3], ma5[3], b[2][2]; + BoxFilterPreProcess5LastRow( + src0 + x + 8, x + 8 + kOverreadInBytesPass1_256 - width, sum_width, + x + 8, scale, sum5, square_sum5, sq, mas, bs); + 
Prepare3_8(mas, ma5); + ma[1] = Sum565Lo(ma5); + ma[2] = Sum565Hi(ma5); + Sum565W(bs + 0, b[1]); + const __m256i sr = LoadUnaligned32(src + x); + const __m256i sr_lo = _mm256_unpacklo_epi8(sr, _mm256_setzero_si256()); + const __m256i sr_hi = _mm256_unpackhi_epi8(sr, _mm256_setzero_si256()); + ma[0] = LoadAligned32(ma565); + LoadAligned64(b565 + 0, b[0]); + const __m256i p0 = CalculateFilteredOutputPass1(sr_lo, ma, b); + ma[1] = LoadAligned32(ma565 + 16); + LoadAligned64(b565 + 16, b[0]); + Sum565W(bs + 1, b[1]); + const __m256i p1 = CalculateFilteredOutputPass1(sr_hi, ma + 1, b); + const __m256i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0); + const __m256i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0); + StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1)); + sq[0] = sq[2]; + mas[0] = mas[2]; + bs[0] = bs[2]; + ma565 += 32; + b565 += 32; + x += 32; + } while (x < width); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPass2( + const uint8_t* const src, const uint8_t* const src0, const int width, + const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0, + uint16_t* const sum3[3], uint32_t* const square_sum3[3], + uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3], + uint32_t* const b444[2], uint8_t* const dst) { + const __m128i s0 = + LoadUnaligned16Msan(src0, kOverreadInBytesPass2_128 - width); + __m128i ma0, b0, sq_128[2]; + __m256i mas[3], sq[3], bs[3]; + sq_128[0] = SquareLo8(s0); + BoxFilterPreProcess3Lo(s0, scale, sum3, square_sum3, sq_128, &ma0, &b0); + sq[0] = SetrM128i(sq_128[0], sq_128[1]); + mas[0] = SetrM128i(ma0, ma0); + bs[0] = SetrM128i(b0, b0); + + int x = 0; + do { + __m256i ma[4], b[4][2], ma3[3]; + BoxFilterPreProcess3(src0 + x + 8, + x + 8 + kOverreadInBytesPass2_256 - width, x + 8, + sum_width, scale, sum3, square_sum3, sq, mas, bs); + Prepare3_8(mas, ma3); + Store343_444Lo(ma3, bs + 0, x + 0, &ma[2], b[2], ma343[2], ma444[1], + b343[2], b444[1]); + Store343_444Hi(ma3, bs + 1, x + 16, &ma[3], b[3], ma343[2], ma444[1], + b343[2], b444[1]); + const __m256i sr = LoadUnaligned32(src + x); + const __m256i sr_lo = _mm256_unpacklo_epi8(sr, _mm256_setzero_si256()); + const __m256i sr_hi = _mm256_unpackhi_epi8(sr, _mm256_setzero_si256()); + ma[0] = LoadAligned32(ma343[0] + x); + ma[1] = LoadAligned32(ma444[0] + x); + LoadAligned64(b343[0] + x, b[0]); + LoadAligned64(b444[0] + x, b[1]); + const __m256i p0 = CalculateFilteredOutputPass2(sr_lo, ma, b); + ma[1] = LoadAligned32(ma343[0] + x + 16); + ma[2] = LoadAligned32(ma444[0] + x + 16); + LoadAligned64(b343[0] + x + 16, b[1]); + LoadAligned64(b444[0] + x + 16, b[2]); + const __m256i p1 = CalculateFilteredOutputPass2(sr_hi, ma + 1, b + 1); + const __m256i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0); + const __m256i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0); + StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1)); + sq[0] = sq[2]; + mas[0] = mas[2]; + bs[0] = bs[2]; + x += 32; + } while (x < width); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilter( + const uint8_t* const src, const uint8_t* const src0, + const uint8_t* const src1, const ptrdiff_t stride, const int width, + const uint16_t scales[2], const int16_t w0, const int16_t w2, + uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + const ptrdiff_t sum_width, uint16_t* const ma343[4], + uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4], + uint32_t* const b444[3], uint32_t* const b565[2], uint8_t* const dst) { + __m128i s[2], ma3_128[2], ma5_0, 
sq_128[2][2], b3_128[2], b5_0; + __m256i ma3[2][3], ma5[3], sq[2][3], b3[2][5], b5[5]; + s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width); + s[1] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1_128 - width); + sq_128[0][0] = SquareLo8(s[0]); + sq_128[1][0] = SquareLo8(s[1]); + BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq_128, + ma3_128, b3_128, &ma5_0, &b5_0); + sq[0][0] = SetrM128i(sq_128[0][0], sq_128[0][1]); + sq[1][0] = SetrM128i(sq_128[1][0], sq_128[1][1]); + ma3[0][0] = SetrM128i(ma3_128[0], ma3_128[0]); + ma3[1][0] = SetrM128i(ma3_128[1], ma3_128[1]); + ma5[0] = SetrM128i(ma5_0, ma5_0); + b3[0][0] = SetrM128i(b3_128[0], b3_128[0]); + b3[1][0] = SetrM128i(b3_128[1], b3_128[1]); + b5[0] = SetrM128i(b5_0, b5_0); + + int x = 0; + do { + __m256i ma[3][3], mat[3][3], b[3][3][2], p[2][2], ma3x[2][3], ma5x[3]; + BoxFilterPreProcess(src0 + x + 8, src1 + x + 8, + x + 8 + kOverreadInBytesPass1_256 - width, x + 8, + scales, sum3, sum5, square_sum3, square_sum5, sum_width, + sq, ma3, b3, ma5, b5); + Prepare3_8(ma3[0], ma3x[0]); + Prepare3_8(ma3[1], ma3x[1]); + Prepare3_8(ma5, ma5x); + Store343_444Lo(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], b[1][2], b[2][1], + ma343[2], ma444[1], b343[2], b444[1]); + Store343_444Lo(ma3x[1], b3[1], x, &ma[2][2], b[2][2], ma343[3], ma444[2], + b343[3], b444[2]); + ma[0][1] = Sum565Lo(ma5x); + ma[0][2] = Sum565Hi(ma5x); + mat[0][1] = ma[0][2]; + StoreAligned64(ma565[1] + x, ma[0] + 1); + Sum565W(b5, b[0][1]); + StoreAligned64(b565[1] + x, b[0][1]); + const __m256i sr0 = LoadUnaligned32(src + x); + const __m256i sr1 = LoadUnaligned32(src + stride + x); + const __m256i sr0_lo = _mm256_unpacklo_epi8(sr0, _mm256_setzero_si256()); + const __m256i sr1_lo = _mm256_unpacklo_epi8(sr1, _mm256_setzero_si256()); + ma[0][0] = LoadAligned32(ma565[0] + x); + LoadAligned64(b565[0] + x, b[0][0]); + p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]); + p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]); + ma[1][0] = LoadAligned32(ma343[0] + x); + ma[1][1] = LoadAligned32(ma444[0] + x); + LoadAligned64(b343[0] + x, b[1][0]); + LoadAligned64(b444[0] + x, b[1][1]); + p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]); + const __m256i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2); + ma[2][0] = LoadAligned32(ma343[1] + x); + LoadAligned64(b343[1] + x, b[2][0]); + p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]); + const __m256i d10 = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2); + + Sum565W(b5 + 1, b[0][1]); + StoreAligned64(b565[1] + x + 16, b[0][1]); + Store343_444Hi(ma3x[0], b3[0] + 1, x + 16, &mat[1][2], &mat[2][1], b[1][2], + b[2][1], ma343[2], ma444[1], b343[2], b444[1]); + Store343_444Hi(ma3x[1], b3[1] + 1, x + 16, &mat[2][2], b[2][2], ma343[3], + ma444[2], b343[3], b444[2]); + const __m256i sr0_hi = _mm256_unpackhi_epi8(sr0, _mm256_setzero_si256()); + const __m256i sr1_hi = _mm256_unpackhi_epi8(sr1, _mm256_setzero_si256()); + mat[0][0] = LoadAligned32(ma565[0] + x + 16); + LoadAligned64(b565[0] + x + 16, b[0][0]); + p[0][0] = CalculateFilteredOutputPass1(sr0_hi, mat[0], b[0]); + p[1][0] = CalculateFilteredOutput<4>(sr1_hi, mat[0][1], b[0][1]); + mat[1][0] = LoadAligned32(ma343[0] + x + 16); + mat[1][1] = LoadAligned32(ma444[0] + x + 16); + LoadAligned64(b343[0] + x + 16, b[1][0]); + LoadAligned64(b444[0] + x + 16, b[1][1]); + p[0][1] = CalculateFilteredOutputPass2(sr0_hi, mat[1], b[1]); + const __m256i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2); + mat[2][0] = 
LoadAligned32(ma343[1] + x + 16); + LoadAligned64(b343[1] + x + 16, b[2][0]); + p[1][1] = CalculateFilteredOutputPass2(sr1_hi, mat[2], b[2]); + const __m256i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2); + StoreUnaligned32(dst + x, _mm256_packus_epi16(d00, d01)); + StoreUnaligned32(dst + stride + x, _mm256_packus_epi16(d10, d11)); + sq[0][0] = sq[0][2]; + sq[1][0] = sq[1][2]; + ma3[0][0] = ma3[0][2]; + ma3[1][0] = ma3[1][2]; + ma5[0] = ma5[2]; + b3[0][0] = b3[0][2]; + b3[1][0] = b3[1][2]; + b5[0] = b5[2]; + x += 32; + } while (x < width); +} + +inline void BoxFilterLastRow( + const uint8_t* const src, const uint8_t* const src0, const int width, + const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0, + const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565, + uint32_t* const b343, uint32_t* const b444, uint32_t* const b565, + uint8_t* const dst) { + const __m128i s0 = + LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width); + __m128i ma3_0, ma5_0, b3_0, b5_0, sq_128[2]; + __m256i ma3[3], ma5[3], sq[3], b3[3], b5[3]; + sq_128[0] = SquareLo8(s0); + BoxFilterPreProcessLastRowLo(s0, scales, sum3, sum5, square_sum3, square_sum5, + sq_128, &ma3_0, &ma5_0, &b3_0, &b5_0); + sq[0] = SetrM128i(sq_128[0], sq_128[1]); + ma3[0] = SetrM128i(ma3_0, ma3_0); + ma5[0] = SetrM128i(ma5_0, ma5_0); + b3[0] = SetrM128i(b3_0, b3_0); + b5[0] = SetrM128i(b5_0, b5_0); + + int x = 0; + do { + __m256i ma[3], mat[3], b[3][2], p[2], ma3x[3], ma5x[3]; + BoxFilterPreProcessLastRow(src0 + x + 8, + x + 8 + kOverreadInBytesPass1_256 - width, + sum_width, x + 8, scales, sum3, sum5, + square_sum3, square_sum5, sq, ma3, ma5, b3, b5); + Prepare3_8(ma3, ma3x); + Prepare3_8(ma5, ma5x); + ma[1] = Sum565Lo(ma5x); + Sum565W(b5, b[1]); + ma[2] = Sum343Lo(ma3x); + Sum343W(b3, b[2]); + const __m256i sr = LoadUnaligned32(src + x); + const __m256i sr_lo = _mm256_unpacklo_epi8(sr, _mm256_setzero_si256()); + ma[0] = LoadAligned32(ma565 + x); + LoadAligned64(b565 + x, b[0]); + p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b); + ma[0] = LoadAligned32(ma343 + x); + ma[1] = LoadAligned32(ma444 + x); + LoadAligned64(b343 + x, b[0]); + LoadAligned64(b444 + x, b[1]); + p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b); + const __m256i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2); + + mat[1] = Sum565Hi(ma5x); + Sum565W(b5 + 1, b[1]); + mat[2] = Sum343Hi(ma3x); + Sum343W(b3 + 1, b[2]); + const __m256i sr_hi = _mm256_unpackhi_epi8(sr, _mm256_setzero_si256()); + mat[0] = LoadAligned32(ma565 + x + 16); + LoadAligned64(b565 + x + 16, b[0]); + p[0] = CalculateFilteredOutputPass1(sr_hi, mat, b); + mat[0] = LoadAligned32(ma343 + x + 16); + mat[1] = LoadAligned32(ma444 + x + 16); + LoadAligned64(b343 + x + 16, b[0]); + LoadAligned64(b444 + x + 16, b[1]); + p[1] = CalculateFilteredOutputPass2(sr_hi, mat, b); + const __m256i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2); + StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1)); + sq[0] = sq[2]; + ma3[0] = ma3[2]; + ma5[0] = ma5[2]; + b3[0] = b3[2]; + b5[0] = b5[2]; + x += 32; + } while (x < width); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( + const RestorationUnitInfo& restoration_info, const uint8_t* src, + const ptrdiff_t stride, const uint8_t* const top_border, + const ptrdiff_t top_border_stride, const uint8_t* bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, + 
SgrBuffer* const sgr_buffer, uint8_t* dst) { + const auto temp_stride = Align(width, 32); + const auto sum_width = temp_stride + 8; + const auto sum_stride = temp_stride + 32; + const int sgr_proj_index = restoration_info.sgr_proj_info.index; + const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12. + const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0]; + const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1]; + const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1; + uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2]; + uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2]; + sum3[0] = sgr_buffer->sum3 + kSumOffset; + square_sum3[0] = sgr_buffer->square_sum3 + kSumOffset; + ma343[0] = sgr_buffer->ma343; + b343[0] = sgr_buffer->b343; + for (int i = 1; i <= 3; ++i) { + sum3[i] = sum3[i - 1] + sum_stride; + square_sum3[i] = square_sum3[i - 1] + sum_stride; + ma343[i] = ma343[i - 1] + temp_stride; + b343[i] = b343[i - 1] + temp_stride; + } + sum5[0] = sgr_buffer->sum5 + kSumOffset; + square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset; + for (int i = 1; i <= 4; ++i) { + sum5[i] = sum5[i - 1] + sum_stride; + square_sum5[i] = square_sum5[i - 1] + sum_stride; + } + ma444[0] = sgr_buffer->ma444; + b444[0] = sgr_buffer->b444; + for (int i = 1; i <= 2; ++i) { + ma444[i] = ma444[i - 1] + temp_stride; + b444[i] = b444[i - 1] + temp_stride; + } + ma565[0] = sgr_buffer->ma565; + ma565[1] = ma565[0] + temp_stride; + b565[0] = sgr_buffer->b565; + b565[1] = b565[0] + temp_stride; + assert(scales[0] != 0); + assert(scales[1] != 0); + BoxSum(top_border, top_border_stride, width, sum_stride, temp_stride, sum3[0], + sum5[1], square_sum3[0], square_sum5[1]); + sum5[0] = sum5[1]; + square_sum5[0] = square_sum5[1]; + const uint8_t* const s = (height > 1) ? 
src + stride : bottom_border; + BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3, + square_sum5, sum_width, ma343, ma444[0], ma565[0], + b343, b444[0], b565[0]); + sum5[0] = sgr_buffer->sum5 + kSumOffset; + square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset; + + for (int y = (height >> 1) - 1; y > 0; --y) { + Circulate4PointersBy2(sum3); + Circulate4PointersBy2(square_sum3); + Circulate5PointersBy2(sum5); + Circulate5PointersBy2(square_sum5); + BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width, + scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width, + ma343, ma444, ma565, b343, b444, b565, dst); + src += 2 * stride; + dst += 2 * stride; + Circulate4PointersBy2(ma343); + Circulate4PointersBy2(b343); + std::swap(ma444[0], ma444[2]); + std::swap(b444[0], b444[2]); + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + } + + Circulate4PointersBy2(sum3); + Circulate4PointersBy2(square_sum3); + Circulate5PointersBy2(sum5); + Circulate5PointersBy2(square_sum5); + if ((height & 1) == 0 || height > 1) { + const uint8_t* sr[2]; + if ((height & 1) == 0) { + sr[0] = bottom_border; + sr[1] = bottom_border + bottom_border_stride; + } else { + sr[0] = src + 2 * stride; + sr[1] = bottom_border; + } + BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5, + square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343, + b444, b565, dst); + } + if ((height & 1) != 0) { + if (height > 1) { + src += 2 * stride; + dst += 2 * stride; + Circulate4PointersBy2(sum3); + Circulate4PointersBy2(square_sum3); + Circulate5PointersBy2(sum5); + Circulate5PointersBy2(square_sum5); + Circulate4PointersBy2(ma343); + Circulate4PointersBy2(b343); + std::swap(ma444[0], ma444[2]); + std::swap(b444[0], b444[2]); + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + } + BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width, + sum_width, scales, w0, w2, sum3, sum5, square_sum3, + square_sum5, ma343[0], ma444[0], ma565[0], b343[0], + b444[0], b565[0], dst); + } +} + +inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, + const uint8_t* src, const ptrdiff_t stride, + const uint8_t* const top_border, + const ptrdiff_t top_border_stride, + const uint8_t* bottom_border, + const ptrdiff_t bottom_border_stride, + const int width, const int height, + SgrBuffer* const sgr_buffer, uint8_t* dst) { + const auto temp_stride = Align(width, 32); + const auto sum_width = temp_stride + 8; + const auto sum_stride = temp_stride + 32; + const int sgr_proj_index = restoration_info.sgr_proj_info.index; + const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12. + const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0]; + uint16_t *sum5[5], *ma565[2]; + uint32_t *square_sum5[5], *b565[2]; + sum5[0] = sgr_buffer->sum5 + kSumOffset; + square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset; + for (int i = 1; i <= 4; ++i) { + sum5[i] = sum5[i - 1] + sum_stride; + square_sum5[i] = square_sum5[i - 1] + sum_stride; + } + ma565[0] = sgr_buffer->ma565; + ma565[1] = ma565[0] + temp_stride; + b565[0] = sgr_buffer->b565; + b565[1] = b565[0] + temp_stride; + assert(scale != 0); + BoxSum<5>(top_border, top_border_stride, width, sum_stride, temp_stride, + sum5[1], square_sum5[1]); + sum5[0] = sum5[1]; + square_sum5[0] = square_sum5[1]; + const uint8_t* const s = (height > 1) ? 
src + stride : bottom_border; + BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width, + ma565[0], b565[0]); + sum5[0] = sgr_buffer->sum5 + kSumOffset; + square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset; + + for (int y = (height >> 1) - 1; y > 0; --y) { + Circulate5PointersBy2(sum5); + Circulate5PointersBy2(square_sum5); + BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5, + square_sum5, width, sum_width, scale, w0, ma565, b565, dst); + src += 2 * stride; + dst += 2 * stride; + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + } + + Circulate5PointersBy2(sum5); + Circulate5PointersBy2(square_sum5); + if ((height & 1) == 0 || height > 1) { + const uint8_t* sr[2]; + if ((height & 1) == 0) { + sr[0] = bottom_border; + sr[1] = bottom_border + bottom_border_stride; + } else { + sr[0] = src + 2 * stride; + sr[1] = bottom_border; + } + BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width, + sum_width, scale, w0, ma565, b565, dst); + } + if ((height & 1) != 0) { + src += 3; + if (height > 1) { + src += 2 * stride; + dst += 2 * stride; + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + Circulate5PointersBy2(sum5); + Circulate5PointersBy2(square_sum5); + } + BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width, + sum_width, scale, w0, sum5, square_sum5, ma565[0], + b565[0], dst); + } +} + +inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, + const uint8_t* src, const ptrdiff_t stride, + const uint8_t* const top_border, + const ptrdiff_t top_border_stride, + const uint8_t* bottom_border, + const ptrdiff_t bottom_border_stride, + const int width, const int height, + SgrBuffer* const sgr_buffer, uint8_t* dst) { + assert(restoration_info.sgr_proj_info.multiplier[0] == 0); + const auto temp_stride = Align(width, 32); + const auto sum_width = temp_stride + 8; + const auto sum_stride = temp_stride + 32; + const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1]; + const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1; + const int sgr_proj_index = restoration_info.sgr_proj_info.index; + const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1]; // < 2^12. 
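+  // Note: pass 2 runs alone here (multiplier[0] == 0 per the assertion above),
+  // so the two projection weights must sum to 1 << kSgrProjPrecisionBits
+  // (128): |w0| is derived as 128 - |w1| rather than read from the bitstream.
+  // For example, w1 = 40 yields w0 = 88, applied below via
+  // SelfGuidedSingleMultiplier() inside BoxFilterPass2().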
+  uint16_t *sum3[3], *ma343[3], *ma444[2];
+  uint32_t *square_sum3[3], *b343[3], *b444[2];
+  sum3[0] = sgr_buffer->sum3 + kSumOffset;
+  square_sum3[0] = sgr_buffer->square_sum3 + kSumOffset;
+  ma343[0] = sgr_buffer->ma343;
+  b343[0] = sgr_buffer->b343;
+  for (int i = 1; i <= 2; ++i) {
+    sum3[i] = sum3[i - 1] + sum_stride;
+    square_sum3[i] = square_sum3[i - 1] + sum_stride;
+    ma343[i] = ma343[i - 1] + temp_stride;
+    b343[i] = b343[i - 1] + temp_stride;
+  }
+  ma444[0] = sgr_buffer->ma444;
+  ma444[1] = ma444[0] + temp_stride;
+  b444[0] = sgr_buffer->b444;
+  b444[1] = b444[0] + temp_stride;
+  assert(scale != 0);
+  BoxSum<3>(top_border, top_border_stride, width, sum_stride, temp_stride,
+            sum3[0], square_sum3[0]);
+  BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
+                                 sum_width, ma343[0], nullptr, b343[0],
+                                 nullptr);
+  Circulate3PointersBy1<uint16_t>(sum3);
+  Circulate3PointersBy1<uint32_t>(square_sum3);
+  const uint8_t* s;
+  if (height > 1) {
+    s = src + stride;
+  } else {
+    s = bottom_border;
+    bottom_border += bottom_border_stride;
+  }
+  BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
+                                ma343[1], ma444[0], b343[1], b444[0]);
+
+  for (int y = height - 2; y > 0; --y) {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3,
+                   square_sum3, ma343, ma444, b343, b444, dst);
+    src += stride;
+    dst += stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  }
+
+  int y = std::min(height, 2);
+  src += 2;
+  do {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3,
+                   square_sum3, ma343, ma444, b343, b444, dst);
+    src += stride;
+    dst += stride;
+    bottom_border += bottom_border_stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  } while (--y != 0);
+}
+
+// If |width| is non-multiple of 32, up to 31 more pixels are written to |dest|
+// in the end of each row. It is safe to overwrite the output as it will not be
+// part of the visible frame.
+void SelfGuidedFilter_AVX2(
+    const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+    const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_border,
+    const ptrdiff_t top_border_stride,
+    const void* LIBGAV1_RESTRICT const bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+    void* LIBGAV1_RESTRICT const dest) {
+  const int index = restoration_info.sgr_proj_info.index;
+  const int radius_pass_0 = kSgrProjParams[index][0];  // 2 or 0
+  const int radius_pass_1 = kSgrProjParams[index][2];  // 1 or 0
+  const auto* const src = static_cast<const uint8_t*>(source);
+  const auto* top = static_cast<const uint8_t*>(top_border);
+  const auto* bottom = static_cast<const uint8_t*>(bottom_border);
+  auto* const dst = static_cast<uint8_t*>(dest);
+  SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+  if (radius_pass_1 == 0) {
+    // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+    // following assertion.
+ assert(radius_pass_0 != 0); + BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3, + top_border_stride, bottom - 3, bottom_border_stride, + width, height, sgr_buffer, dst); + } else if (radius_pass_0 == 0) { + BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2, + top_border_stride, bottom - 2, bottom_border_stride, + width, height, sgr_buffer, dst); + } else { + BoxFilterProcess(restoration_info, src - 3, stride, top - 3, + top_border_stride, bottom - 3, bottom_border_stride, width, + height, sgr_buffer, dst); + } +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); +#if DSP_ENABLED_8BPP_AVX2(WienerFilter) + dsp->loop_restorations[0] = WienerFilter_AVX2; +#endif +#if DSP_ENABLED_8BPP_AVX2(SelfGuidedFilter) + dsp->loop_restorations[1] = SelfGuidedFilter_AVX2; +#endif +} + +} // namespace +} // namespace low_bitdepth + +void LoopRestorationInit_AVX2() { low_bitdepth::Init8bpp(); } + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_TARGETING_AVX2 +namespace libgav1 { +namespace dsp { + +void LoopRestorationInit_AVX2() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_AVX2 diff --git a/src/dsp/x86/loop_restoration_avx2.h b/src/dsp/x86/loop_restoration_avx2.h new file mode 100644 index 0000000..2c3534a --- /dev/null +++ b/src/dsp/x86/loop_restoration_avx2.h @@ -0,0 +1,56 @@ +/* + * Copyright 2020 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_AVX2_H_ +#define LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_AVX2_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::loop_restorations, see the defines below for specifics. +// These functions are not thread-safe. +void LoopRestorationInit_AVX2(); +void LoopRestorationInit10bpp_AVX2(); + +} // namespace dsp +} // namespace libgav1 + +// If avx2 is enabled and the baseline isn't set due to a higher level of +// optimization being enabled, signal the avx2 implementation should be used. 
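The #ifndef blocks that follow are how this header participates in libgav1's compile-time DSP dispatch: each LIBGAV1_Dsp8bpp_*/LIBGAV1_Dsp10bpp_* macro records the best implementation compiled in for that function slot, and Init8bpp() above only installs a function pointer when the matching DSP_ENABLED_8BPP_AVX2() check passes. A minimal sketch of how a caller reaches this code path; the init function and GetDspTable() are the real entry points, while the surrounding wiring is illustrative only:

    #include "src/dsp/dsp.h"
    #include "src/dsp/x86/loop_restoration_avx2.h"

    void UseAvx2LoopRestoration() {
      // Normally done once by libgav1's dsp initialization.
      libgav1::dsp::LoopRestorationInit_AVX2();
      const libgav1::dsp::Dsp* const dsp = libgav1::dsp::GetDspTable(8);
      // loop_restorations[0] is the Wiener filter, loop_restorations[1] the
      // self-guided filter; both have the signature of SelfGuidedFilter_AVX2.
      // dsp->loop_restorations[1](restoration_info, src, stride, ...);
    }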
+#if LIBGAV1_TARGETING_AVX2
+
+#ifndef LIBGAV1_Dsp8bpp_WienerFilter
+#define LIBGAV1_Dsp8bpp_WienerFilter LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WienerFilter
+#define LIBGAV1_Dsp10bpp_WienerFilter LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_SelfGuidedFilter
+#define LIBGAV1_Dsp8bpp_SelfGuidedFilter LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_SelfGuidedFilter
+#define LIBGAV1_Dsp10bpp_SelfGuidedFilter LIBGAV1_CPU_AVX2
+#endif
+
+#endif  // LIBGAV1_TARGETING_AVX2
+
+#endif  // LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_AVX2_H_
diff --git a/src/dsp/x86/loop_restoration_sse4.cc b/src/dsp/x86/loop_restoration_sse4.cc
new file mode 100644
index 0000000..3363f0e
--- /dev/null
+++ b/src/dsp/x86/loop_restoration_sse4.cc
@@ -0,0 +1,2582 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+inline void WienerHorizontalClip(const __m128i s[2], const __m128i s_3x128,
+                                 int16_t* const wiener_buffer) {
+  constexpr int offset =
+      1 << (8 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+  constexpr int limit =
+      (1 << (8 + 1 + kWienerFilterBits - kInterRoundBitsHorizontal)) - 1;
+  const __m128i offsets = _mm_set1_epi16(-offset);
+  const __m128i limits = _mm_set1_epi16(limit - offset);
+  // The sum range here is [-128 * 255 + 4, 90 * 255 + 4].
+  const __m128i sum = _mm_add_epi16(s[0], s[1]);
+  const __m128i rounded_sum0 = _mm_srai_epi16(sum, kInterRoundBitsHorizontal);
+  // Add back scaled down offset correction.
+ const __m128i rounded_sum1 = _mm_add_epi16(rounded_sum0, s_3x128); + const __m128i d0 = _mm_max_epi16(rounded_sum1, offsets); + const __m128i d1 = _mm_min_epi16(d0, limits); + StoreAligned16(wiener_buffer, d1); +} + +inline void WienerHorizontalTap7Kernel(const __m128i s[4], + const __m128i filter[4], + int16_t* const wiener_buffer) { + __m128i madds[4]; + madds[0] = _mm_maddubs_epi16(s[0], filter[0]); + madds[1] = _mm_maddubs_epi16(s[1], filter[1]); + madds[2] = _mm_maddubs_epi16(s[2], filter[2]); + madds[3] = _mm_maddubs_epi16(s[3], filter[3]); + madds[0] = _mm_add_epi16(madds[0], madds[2]); + madds[1] = _mm_add_epi16(madds[1], madds[3]); + const __m128i s_3x128 = + _mm_slli_epi16(_mm_srli_epi16(s[1], 8), 7 - kInterRoundBitsHorizontal); + WienerHorizontalClip(madds, s_3x128, wiener_buffer); +} + +inline void WienerHorizontalTap5Kernel(const __m128i s[5], + const __m128i filter[3], + int16_t* const wiener_buffer) { + __m128i madds[3]; + madds[0] = _mm_maddubs_epi16(s[0], filter[0]); + madds[1] = _mm_maddubs_epi16(s[1], filter[1]); + madds[2] = _mm_maddubs_epi16(s[2], filter[2]); + madds[0] = _mm_add_epi16(madds[0], madds[2]); + const __m128i s_3x128 = + _mm_srli_epi16(_mm_slli_epi16(s[1], 8), kInterRoundBitsHorizontal + 1); + WienerHorizontalClip(madds, s_3x128, wiener_buffer); +} + +inline void WienerHorizontalTap3Kernel(const __m128i s[2], + const __m128i filter[2], + int16_t* const wiener_buffer) { + __m128i madds[2]; + madds[0] = _mm_maddubs_epi16(s[0], filter[0]); + madds[1] = _mm_maddubs_epi16(s[1], filter[1]); + const __m128i s_3x128 = + _mm_slli_epi16(_mm_srli_epi16(s[0], 8), 7 - kInterRoundBitsHorizontal); + WienerHorizontalClip(madds, s_3x128, wiener_buffer); +} + +// loading all and unpacking is about 7% faster than using _mm_alignr_epi8(). 
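Each horizontal kernel above pairs unsigned pixels with signed taps so that a single _mm_maddubs_epi16() yields two-tap partial sums, with the centre tap pre-offset by -128 (see the comment in WienerFilter_SSE4_1 further down) and the (pixel * 128) >> kInterRoundBitsHorizontal term added back as |s_3x128|. A scalar model of what one output lane computes, assuming 8-bit pixels and libgav1's kWienerFilterBits = 7, kInterRoundBitsHorizontal = 3; this is an illustration, not library code:

    #include <algorithm>
    #include <cstdint>

    // f[] holds the seven Wiener taps that the SIMD path spreads across
    // filter[0..3]; src points at the first of the seven input pixels.
    int16_t WienerHorizontalTap7Scalar(const uint8_t* src, const int16_t f[7]) {
      int sum = 1 << (3 - 1);  // rounding bias, folded into |round| above
      for (int k = 0; k < 7; ++k) sum += f[k] * src[k];
      sum >>= 3;                                     // kInterRoundBitsHorizontal
      const int offset = 1 << (8 + 7 - 3 - 1);       // 2048
      const int limit = (1 << (8 + 1 + 7 - 3)) - 1;  // 8191
      // Same clamp as WienerHorizontalClip(): values land in [-2048, 6143],
      // i.e. the 13-bit signed range the vertical pass expects.
      return static_cast<int16_t>(std::clamp(sum, -offset, limit - offset));
    }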
+inline void WienerHorizontalTap7(const uint8_t* src, const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 const int coefficient0,
+                                 const __m128i coefficients,
+                                 int16_t** const wiener_buffer) {
+  const __m128i round = _mm_set1_epi8(1 << (kInterRoundBitsHorizontal - 1));
+  __m128i filter[4];
+  filter[0] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0200));
+  filter[1] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0604));
+  filter[2] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0204));
+  filter[3] = _mm_set1_epi16((1 << 8) | static_cast<uint8_t>(coefficient0));
+  for (int y = height; y != 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m128i s[7], ss[4];
+      s[0] = LoadUnaligned16(src + x + 0);
+      s[1] = LoadUnaligned16(src + x + 1);
+      s[2] = LoadUnaligned16(src + x + 2);
+      s[3] = LoadUnaligned16(src + x + 3);
+      s[4] = LoadUnaligned16(src + x + 4);
+      s[5] = LoadUnaligned16(src + x + 5);
+      s[6] = LoadUnaligned16(src + x + 6);
+      ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+      ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
+      ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
+      ss[3] = _mm_unpacklo_epi8(s[6], round);
+      WienerHorizontalTap7Kernel(ss, filter, *wiener_buffer + x + 0);
+      ss[0] = _mm_unpackhi_epi8(s[0], s[1]);
+      ss[1] = _mm_unpackhi_epi8(s[2], s[3]);
+      ss[2] = _mm_unpackhi_epi8(s[4], s[5]);
+      ss[3] = _mm_unpackhi_epi8(s[6], round);
+      WienerHorizontalTap7Kernel(ss, filter, *wiener_buffer + x + 8);
+      x += 16;
+    } while (x < width);
+    src += src_stride;
+    *wiener_buffer += width;
+  }
+}
+
+inline void WienerHorizontalTap5(const uint8_t* src, const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 const int coefficient1,
+                                 const __m128i coefficients,
+                                 int16_t** const wiener_buffer) {
+  const __m128i round = _mm_set1_epi8(1 << (kInterRoundBitsHorizontal - 1));
+  __m128i filter[3];
+  filter[0] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0402));
+  filter[1] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0406));
+  filter[2] = _mm_set1_epi16((1 << 8) | static_cast<uint8_t>(coefficient1));
+  for (int y = height; y != 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m128i s[5], ss[3];
+      s[0] = LoadUnaligned16(src + x + 0);
+      s[1] = LoadUnaligned16(src + x + 1);
+      s[2] = LoadUnaligned16(src + x + 2);
+      s[3] = LoadUnaligned16(src + x + 3);
+      s[4] = LoadUnaligned16(src + x + 4);
+      ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+      ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
+      ss[2] = _mm_unpacklo_epi8(s[4], round);
+      WienerHorizontalTap5Kernel(ss, filter, *wiener_buffer + x + 0);
+      ss[0] = _mm_unpackhi_epi8(s[0], s[1]);
+      ss[1] = _mm_unpackhi_epi8(s[2], s[3]);
+      ss[2] = _mm_unpackhi_epi8(s[4], round);
+      WienerHorizontalTap5Kernel(ss, filter, *wiener_buffer + x + 8);
+      x += 16;
+    } while (x < width);
+    src += src_stride;
+    *wiener_buffer += width;
+  }
+}
+
+inline void WienerHorizontalTap3(const uint8_t* src, const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 const int coefficient2,
+                                 const __m128i coefficients,
+                                 int16_t** const wiener_buffer) {
+  const __m128i round = _mm_set1_epi8(1 << (kInterRoundBitsHorizontal - 1));
+  __m128i filter[2];
+  filter[0] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0604));
+  filter[1] = _mm_set1_epi16((1 << 8) | static_cast<uint8_t>(coefficient2));
+  for (int y = height; y != 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m128i s[3], ss[2];
+      s[0] = LoadUnaligned16(src + x + 0);
+      s[1] = LoadUnaligned16(src + x + 1);
+      s[2] = LoadUnaligned16(src + x + 2);
+      ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+      ss[1] = _mm_unpacklo_epi8(s[2], round);
+      WienerHorizontalTap3Kernel(ss,
filter, *wiener_buffer + x + 0); + ss[0] = _mm_unpackhi_epi8(s[0], s[1]); + ss[1] = _mm_unpackhi_epi8(s[2], round); + WienerHorizontalTap3Kernel(ss, filter, *wiener_buffer + x + 8); + x += 16; + } while (x < width); + src += src_stride; + *wiener_buffer += width; + } +} + +inline void WienerHorizontalTap1(const uint8_t* src, const ptrdiff_t src_stride, + const ptrdiff_t width, const int height, + int16_t** const wiener_buffer) { + for (int y = height; y != 0; --y) { + ptrdiff_t x = 0; + do { + const __m128i s = LoadUnaligned16(src + x); + const __m128i s0 = _mm_unpacklo_epi8(s, _mm_setzero_si128()); + const __m128i s1 = _mm_unpackhi_epi8(s, _mm_setzero_si128()); + const __m128i d0 = _mm_slli_epi16(s0, 4); + const __m128i d1 = _mm_slli_epi16(s1, 4); + StoreAligned16(*wiener_buffer + x + 0, d0); + StoreAligned16(*wiener_buffer + x + 8, d1); + x += 16; + } while (x < width); + src += src_stride; + *wiener_buffer += width; + } +} + +inline __m128i WienerVertical7(const __m128i a[2], const __m128i filter[2]) { + const __m128i round = _mm_set1_epi32(1 << (kInterRoundBitsVertical - 1)); + const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]); + const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]); + const __m128i sum0 = _mm_add_epi32(round, madd0); + const __m128i sum1 = _mm_add_epi32(sum0, madd1); + return _mm_srai_epi32(sum1, kInterRoundBitsVertical); +} + +inline __m128i WienerVertical5(const __m128i a[2], const __m128i filter[2]) { + const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]); + const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]); + const __m128i sum = _mm_add_epi32(madd0, madd1); + return _mm_srai_epi32(sum, kInterRoundBitsVertical); +} + +inline __m128i WienerVertical3(const __m128i a, const __m128i filter) { + const __m128i round = _mm_set1_epi32(1 << (kInterRoundBitsVertical - 1)); + const __m128i madd = _mm_madd_epi16(a, filter); + const __m128i sum = _mm_add_epi32(round, madd); + return _mm_srai_epi32(sum, kInterRoundBitsVertical); +} + +inline __m128i WienerVerticalFilter7(const __m128i a[7], + const __m128i filter[2]) { + __m128i b[2]; + const __m128i a06 = _mm_add_epi16(a[0], a[6]); + const __m128i a15 = _mm_add_epi16(a[1], a[5]); + const __m128i a24 = _mm_add_epi16(a[2], a[4]); + b[0] = _mm_unpacklo_epi16(a06, a15); + b[1] = _mm_unpacklo_epi16(a24, a[3]); + const __m128i sum0 = WienerVertical7(b, filter); + b[0] = _mm_unpackhi_epi16(a06, a15); + b[1] = _mm_unpackhi_epi16(a24, a[3]); + const __m128i sum1 = WienerVertical7(b, filter); + return _mm_packs_epi32(sum0, sum1); +} + +inline __m128i WienerVerticalFilter5(const __m128i a[5], + const __m128i filter[2]) { + const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1)); + __m128i b[2]; + const __m128i a04 = _mm_add_epi16(a[0], a[4]); + const __m128i a13 = _mm_add_epi16(a[1], a[3]); + b[0] = _mm_unpacklo_epi16(a04, a13); + b[1] = _mm_unpacklo_epi16(a[2], round); + const __m128i sum0 = WienerVertical5(b, filter); + b[0] = _mm_unpackhi_epi16(a04, a13); + b[1] = _mm_unpackhi_epi16(a[2], round); + const __m128i sum1 = WienerVertical5(b, filter); + return _mm_packs_epi32(sum0, sum1); +} + +inline __m128i WienerVerticalFilter3(const __m128i a[3], const __m128i filter) { + __m128i b; + const __m128i a02 = _mm_add_epi16(a[0], a[2]); + b = _mm_unpacklo_epi16(a02, a[1]); + const __m128i sum0 = WienerVertical3(b, filter); + b = _mm_unpackhi_epi16(a02, a[1]); + const __m128i sum1 = WienerVertical3(b, filter); + return _mm_packs_epi32(sum0, sum1); +} + +inline __m128i WienerVerticalTap7Kernel(const int16_t* 
wiener_buffer, + const ptrdiff_t wiener_stride, + const __m128i filter[2], __m128i a[7]) { + a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride); + a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride); + a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride); + a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride); + a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride); + a[5] = LoadAligned16(wiener_buffer + 5 * wiener_stride); + a[6] = LoadAligned16(wiener_buffer + 6 * wiener_stride); + return WienerVerticalFilter7(a, filter); +} + +inline __m128i WienerVerticalTap5Kernel(const int16_t* wiener_buffer, + const ptrdiff_t wiener_stride, + const __m128i filter[2], __m128i a[5]) { + a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride); + a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride); + a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride); + a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride); + a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride); + return WienerVerticalFilter5(a, filter); +} + +inline __m128i WienerVerticalTap3Kernel(const int16_t* wiener_buffer, + const ptrdiff_t wiener_stride, + const __m128i filter, __m128i a[3]) { + a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride); + a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride); + a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride); + return WienerVerticalFilter3(a, filter); +} + +inline void WienerVerticalTap7Kernel2(const int16_t* wiener_buffer, + const ptrdiff_t wiener_stride, + const __m128i filter[2], __m128i d[2]) { + __m128i a[8]; + d[0] = WienerVerticalTap7Kernel(wiener_buffer, wiener_stride, filter, a); + a[7] = LoadAligned16(wiener_buffer + 7 * wiener_stride); + d[1] = WienerVerticalFilter7(a + 1, filter); +} + +inline void WienerVerticalTap5Kernel2(const int16_t* wiener_buffer, + const ptrdiff_t wiener_stride, + const __m128i filter[2], __m128i d[2]) { + __m128i a[6]; + d[0] = WienerVerticalTap5Kernel(wiener_buffer, wiener_stride, filter, a); + a[5] = LoadAligned16(wiener_buffer + 5 * wiener_stride); + d[1] = WienerVerticalFilter5(a + 1, filter); +} + +inline void WienerVerticalTap3Kernel2(const int16_t* wiener_buffer, + const ptrdiff_t wiener_stride, + const __m128i filter, __m128i d[2]) { + __m128i a[4]; + d[0] = WienerVerticalTap3Kernel(wiener_buffer, wiener_stride, filter, a); + a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride); + d[1] = WienerVerticalFilter3(a + 1, filter); +} + +inline void WienerVerticalTap7(const int16_t* wiener_buffer, + const ptrdiff_t width, const int height, + const int16_t coefficients[4], uint8_t* dst, + const ptrdiff_t dst_stride) { + const __m128i c = LoadLo8(coefficients); + __m128i filter[2]; + filter[0] = _mm_shuffle_epi32(c, 0x0); + filter[1] = _mm_shuffle_epi32(c, 0x55); + for (int y = height >> 1; y > 0; --y) { + ptrdiff_t x = 0; + do { + __m128i d[2][2]; + WienerVerticalTap7Kernel2(wiener_buffer + x + 0, width, filter, d[0]); + WienerVerticalTap7Kernel2(wiener_buffer + x + 8, width, filter, d[1]); + StoreAligned16(dst + x, _mm_packus_epi16(d[0][0], d[1][0])); + StoreAligned16(dst + dst_stride + x, _mm_packus_epi16(d[0][1], d[1][1])); + x += 16; + } while (x < width); + dst += 2 * dst_stride; + wiener_buffer += 2 * width; + } + + if ((height & 1) != 0) { + ptrdiff_t x = 0; + do { + __m128i a[7]; + const __m128i d0 = + WienerVerticalTap7Kernel(wiener_buffer + x + 0, width, filter, a); + const __m128i d1 = + WienerVerticalTap7Kernel(wiener_buffer + x + 8, width, filter, a); + StoreAligned16(dst + x, _mm_packus_epi16(d0, d1)); + 
x += 16; + } while (x < width); + } +} + +inline void WienerVerticalTap5(const int16_t* wiener_buffer, + const ptrdiff_t width, const int height, + const int16_t coefficients[3], uint8_t* dst, + const ptrdiff_t dst_stride) { + const __m128i c = Load4(coefficients); + __m128i filter[2]; + filter[0] = _mm_shuffle_epi32(c, 0); + filter[1] = + _mm_set1_epi32((1 << 16) | static_cast(coefficients[2])); + for (int y = height >> 1; y > 0; --y) { + ptrdiff_t x = 0; + do { + __m128i d[2][2]; + WienerVerticalTap5Kernel2(wiener_buffer + x + 0, width, filter, d[0]); + WienerVerticalTap5Kernel2(wiener_buffer + x + 8, width, filter, d[1]); + StoreAligned16(dst + x, _mm_packus_epi16(d[0][0], d[1][0])); + StoreAligned16(dst + dst_stride + x, _mm_packus_epi16(d[0][1], d[1][1])); + x += 16; + } while (x < width); + dst += 2 * dst_stride; + wiener_buffer += 2 * width; + } + + if ((height & 1) != 0) { + ptrdiff_t x = 0; + do { + __m128i a[5]; + const __m128i d0 = + WienerVerticalTap5Kernel(wiener_buffer + x + 0, width, filter, a); + const __m128i d1 = + WienerVerticalTap5Kernel(wiener_buffer + x + 8, width, filter, a); + StoreAligned16(dst + x, _mm_packus_epi16(d0, d1)); + x += 16; + } while (x < width); + } +} + +inline void WienerVerticalTap3(const int16_t* wiener_buffer, + const ptrdiff_t width, const int height, + const int16_t coefficients[2], uint8_t* dst, + const ptrdiff_t dst_stride) { + const __m128i filter = + _mm_set1_epi32(*reinterpret_cast(coefficients)); + for (int y = height >> 1; y > 0; --y) { + ptrdiff_t x = 0; + do { + __m128i d[2][2]; + WienerVerticalTap3Kernel2(wiener_buffer + x + 0, width, filter, d[0]); + WienerVerticalTap3Kernel2(wiener_buffer + x + 8, width, filter, d[1]); + StoreAligned16(dst + x, _mm_packus_epi16(d[0][0], d[1][0])); + StoreAligned16(dst + dst_stride + x, _mm_packus_epi16(d[0][1], d[1][1])); + x += 16; + } while (x < width); + dst += 2 * dst_stride; + wiener_buffer += 2 * width; + } + + if ((height & 1) != 0) { + ptrdiff_t x = 0; + do { + __m128i a[3]; + const __m128i d0 = + WienerVerticalTap3Kernel(wiener_buffer + x + 0, width, filter, a); + const __m128i d1 = + WienerVerticalTap3Kernel(wiener_buffer + x + 8, width, filter, a); + StoreAligned16(dst + x, _mm_packus_epi16(d0, d1)); + x += 16; + } while (x < width); + } +} + +inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer, + uint8_t* const dst) { + const __m128i a0 = LoadAligned16(wiener_buffer + 0); + const __m128i a1 = LoadAligned16(wiener_buffer + 8); + const __m128i b0 = _mm_add_epi16(a0, _mm_set1_epi16(8)); + const __m128i b1 = _mm_add_epi16(a1, _mm_set1_epi16(8)); + const __m128i c0 = _mm_srai_epi16(b0, 4); + const __m128i c1 = _mm_srai_epi16(b1, 4); + const __m128i d = _mm_packus_epi16(c0, c1); + StoreAligned16(dst, d); +} + +inline void WienerVerticalTap1(const int16_t* wiener_buffer, + const ptrdiff_t width, const int height, + uint8_t* dst, const ptrdiff_t dst_stride) { + for (int y = height >> 1; y > 0; --y) { + ptrdiff_t x = 0; + do { + WienerVerticalTap1Kernel(wiener_buffer + x, dst + x); + WienerVerticalTap1Kernel(wiener_buffer + width + x, dst + dst_stride + x); + x += 16; + } while (x < width); + dst += 2 * dst_stride; + wiener_buffer += 2 * width; + } + + if ((height & 1) != 0) { + ptrdiff_t x = 0; + do { + WienerVerticalTap1Kernel(wiener_buffer + x, dst + x); + x += 16; + } while (x < width); + } +} + +void WienerFilter_SSE4_1( + const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info, + const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride, + const void* 
LIBGAV1_RESTRICT const top_border,
+    const ptrdiff_t top_border_stride,
+    const void* LIBGAV1_RESTRICT const bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+    void* LIBGAV1_RESTRICT const dest) {
+  const int16_t* const number_leading_zero_coefficients =
+      restoration_info.wiener_info.number_leading_zero_coefficients;
+  const int number_rows_to_skip = std::max(
+      static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+      1);
+  const ptrdiff_t wiener_stride = Align(width, 16);
+  int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
+  // The values are saturated to 13 bits before storing.
+  int16_t* wiener_buffer_horizontal =
+      wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
+
+  // horizontal filtering.
+  // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
+  const int height_horizontal =
+      height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+  const int height_extra = (height_horizontal - height) >> 1;
+  assert(height_extra <= 2);
+  const auto* const src = static_cast<const uint8_t*>(source);
+  const auto* const top = static_cast<const uint8_t*>(top_border);
+  const auto* const bottom = static_cast<const uint8_t*>(bottom_border);
+  const int16_t* const filter_horizontal =
+      restoration_info.wiener_info.filter[WienerInfo::kHorizontal];
+  const __m128i c = LoadLo8(filter_horizontal);
+  // In order to keep the horizontal pass intermediate values within 16 bits we
+  // offset |filter[3]| by 128. The 128 offset will be added back in the loop.
+  const __m128i coefficients_horizontal =
+      _mm_sub_epi16(c, _mm_setr_epi16(0, 0, 0, 128, 0, 0, 0, 0));
+  if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+    WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
+                         top_border_stride, wiener_stride, height_extra,
+                         filter_horizontal[0], coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+                         filter_horizontal[0], coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride,
+                         height_extra, filter_horizontal[0],
+                         coefficients_horizontal, &wiener_buffer_horizontal);
+  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+    WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
+                         top_border_stride, wiener_stride, height_extra,
+                         filter_horizontal[1], coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+                         filter_horizontal[1], coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride,
+                         height_extra, filter_horizontal[1],
+                         coefficients_horizontal, &wiener_buffer_horizontal);
+  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+    // The maximum over-reads happen here.
+    WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
+                         top_border_stride, wiener_stride, height_extra,
+                         filter_horizontal[2], coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+                         filter_horizontal[2], coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
+                         height_extra, filter_horizontal[2],
+                         coefficients_horizontal, &wiener_buffer_horizontal);
+  } else {
+    assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+    WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
+                         top_border_stride, wiener_stride, height_extra,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap1(src, stride, wiener_stride, height,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
+                         height_extra, &wiener_buffer_horizontal);
+  }
+
+  // vertical filtering.
+  // Over-writes up to 15 values.
+  const int16_t* const filter_vertical =
+      restoration_info.wiener_info.filter[WienerInfo::kVertical];
+  auto* dst = static_cast<uint8_t*>(dest);
+  if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+    // Because the top row of |source| is a duplicate of the second row, and the
+    // bottom row of |source| is a duplicate of its above row, we can duplicate
+    // the top and bottom row of |wiener_buffer| accordingly.
+    memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
+           sizeof(*wiener_buffer_horizontal) * wiener_stride);
+    memcpy(restoration_buffer->wiener_buffer,
+           restoration_buffer->wiener_buffer + wiener_stride,
+           sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
+    WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
+                       filter_vertical, dst, stride);
+  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+    WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
+                       height, filter_vertical + 1, dst, stride);
+  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+    WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
+                       wiener_stride, height, filter_vertical + 2, dst, stride);
+  } else {
+    assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+    WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
+                       wiener_stride, height, dst, stride);
+  }
+}
+
+//------------------------------------------------------------------------------
+// SGR
+
+// SIMD overreads 16 - (width % 16) - 2 * padding pixels, where padding is 3 for
+// Pass 1 and 2 for Pass 2.
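The two constants that follow drop out of that formula at the worst case, when width is already a multiple of 16 so a full 16-byte load hangs past the padded row. A compile-time restatement of the arithmetic; kPadPass1 and kPadPass2 are local names for this sketch, not library constants:

    constexpr int kPadPass1 = 3;  // pass 1 uses 3 pixels of edge padding
    constexpr int kPadPass2 = 2;  // pass 2 uses 2
    static_assert(16 - 2 * kPadPass1 == 10, "matches kOverreadInBytesPass1");
    static_assert(16 - 2 * kPadPass2 == 12, "matches kOverreadInBytesPass2");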
+constexpr int kOverreadInBytesPass1 = 10; +constexpr int kOverreadInBytesPass2 = 12; + +inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x, + __m128i dst[2]) { + dst[0] = LoadAligned16(src[0] + x); + dst[1] = LoadAligned16(src[1] + x); +} + +inline void LoadAligned16x2U16Msan(const uint16_t* const src[2], + const ptrdiff_t x, const ptrdiff_t border, + __m128i dst[2]) { + dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border)); + dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border)); +} + +inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x, + __m128i dst[3]) { + dst[0] = LoadAligned16(src[0] + x); + dst[1] = LoadAligned16(src[1] + x); + dst[2] = LoadAligned16(src[2] + x); +} + +inline void LoadAligned16x3U16Msan(const uint16_t* const src[3], + const ptrdiff_t x, const ptrdiff_t border, + __m128i dst[3]) { + dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border)); + dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border)); + dst[2] = LoadAligned16Msan(src[2] + x, sizeof(**src) * (x + 8 - border)); +} + +inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) { + dst[0] = LoadAligned16(src + 0); + dst[1] = LoadAligned16(src + 4); +} + +inline void LoadAligned32U32Msan(const uint32_t* const src, const ptrdiff_t x, + const ptrdiff_t border, __m128i dst[2]) { + dst[0] = LoadAligned16Msan(src + x + 0, sizeof(*src) * (x + 4 - border)); + dst[1] = LoadAligned16Msan(src + x + 4, sizeof(*src) * (x + 8 - border)); +} + +inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x, + __m128i dst[2][2]) { + LoadAligned32U32(src[0] + x, dst[0]); + LoadAligned32U32(src[1] + x, dst[1]); +} + +inline void LoadAligned32x2U32Msan(const uint32_t* const src[2], + const ptrdiff_t x, const ptrdiff_t border, + __m128i dst[2][2]) { + LoadAligned32U32Msan(src[0], x, border, dst[0]); + LoadAligned32U32Msan(src[1], x, border, dst[1]); +} + +inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x, + __m128i dst[3][2]) { + LoadAligned32U32(src[0] + x, dst[0]); + LoadAligned32U32(src[1] + x, dst[1]); + LoadAligned32U32(src[2] + x, dst[2]); +} + +inline void LoadAligned32x3U32Msan(const uint32_t* const src[3], + const ptrdiff_t x, const ptrdiff_t border, + __m128i dst[3][2]) { + LoadAligned32U32Msan(src[0], x, border, dst[0]); + LoadAligned32U32Msan(src[1], x, border, dst[1]); + LoadAligned32U32Msan(src[2], x, border, dst[2]); +} + +inline void StoreAligned32U16(uint16_t* const dst, const __m128i src[2]) { + StoreAligned16(dst + 0, src[0]); + StoreAligned16(dst + 8, src[1]); +} + +inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) { + StoreAligned16(dst + 0, src[0]); + StoreAligned16(dst + 4, src[1]); +} + +inline void StoreAligned64U32(uint32_t* const dst, const __m128i src[4]) { + StoreAligned32U32(dst + 0, src + 0); + StoreAligned32U32(dst + 8, src + 2); +} + +// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following +// functions. Some compilers may generate super inefficient code and the whole +// decoder could be 15% slower. 
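The helpers below borrow NEON's naming: VaddlLo8() is vaddl_u8 on the low eight lanes (a widening add), VaddwLo8() is vaddw_u8 (wide plus narrow), and VmullLo16() stands in for a widening multiply. Per the comment above they zero-extend by unpacking against zero rather than with the cvtepu intrinsics; a sketch of the idiom, equivalent for the low half to _mm_cvtepu8_epi16():

    #include <smmintrin.h>

    // Zero-extend the low eight bytes of |v| to eight uint16 lanes. Matches
    // _mm_cvtepu8_epi16(v) but avoids the poor codegen noted above.
    inline __m128i WidenLo8(const __m128i v) {
      return _mm_unpacklo_epi8(v, _mm_setzero_si128());
    }
    // VaddlLo8(a, b) below is then WidenLo8(a) + WidenLo8(b).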
+
+inline __m128i VaddlLo8(const __m128i src0, const __m128i src1) {
+  const __m128i s0 = _mm_unpacklo_epi8(src0, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
+  return _mm_add_epi16(s0, s1);
+}
+
+inline __m128i VaddlHi8(const __m128i src0, const __m128i src1) {
+  const __m128i s0 = _mm_unpackhi_epi8(src0, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128());
+  return _mm_add_epi16(s0, s1);
+}
+
+inline __m128i VaddlLo16(const __m128i src0, const __m128i src1) {
+  const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+  return _mm_add_epi32(s0, s1);
+}
+
+inline __m128i VaddlHi16(const __m128i src0, const __m128i src1) {
+  const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+  return _mm_add_epi32(s0, s1);
+}
+
+inline __m128i VaddwLo8(const __m128i src0, const __m128i src1) {
+  const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
+  return _mm_add_epi16(src0, s1);
+}
+
+inline __m128i VaddwHi8(const __m128i src0, const __m128i src1) {
+  const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128());
+  return _mm_add_epi16(src0, s1);
+}
+
+inline __m128i VaddwLo16(const __m128i src0, const __m128i src1) {
+  const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+  return _mm_add_epi32(src0, s1);
+}
+
+inline __m128i VaddwHi16(const __m128i src0, const __m128i src1) {
+  const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+  return _mm_add_epi32(src0, s1);
+}
+
+inline __m128i VmullNLo8(const __m128i src0, const int src1) {
+  const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+  return _mm_madd_epi16(s0, _mm_set1_epi32(src1));
+}
+
+inline __m128i VmullNHi8(const __m128i src0, const int src1) {
+  const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+  return _mm_madd_epi16(s0, _mm_set1_epi32(src1));
+}
+
+inline __m128i VmullLo16(const __m128i src0, const __m128i src1) {
+  const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+  return _mm_madd_epi16(s0, s1);
+}
+
+inline __m128i VmullHi16(const __m128i src0, const __m128i src1) {
+  const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+  return _mm_madd_epi16(s0, s1);
+}
+
+inline __m128i VrshrS32(const __m128i src0, const int src1) {
+  const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
+  return _mm_srai_epi32(sum, src1);
+}
+
+inline __m128i VrshrU32(const __m128i src0, const int src1) {
+  const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
+  return _mm_srli_epi32(sum, src1);
+}
+
+inline __m128i SquareLo8(const __m128i src) {
+  const __m128i s = _mm_unpacklo_epi8(src, _mm_setzero_si128());
+  return _mm_mullo_epi16(s, s);
+}
+
+inline __m128i SquareHi8(const __m128i src) {
+  const __m128i s = _mm_unpackhi_epi8(src, _mm_setzero_si128());
+  return _mm_mullo_epi16(s, s);
+}
+
+inline void Prepare3Lo8(const __m128i src, __m128i dst[3]) {
+  dst[0] = src;
+  dst[1] = _mm_srli_si128(src, 1);
+  dst[2] = _mm_srli_si128(src, 2);
+}
+
+template <int offset>
+inline void Prepare3_8(const __m128i src[2], __m128i dst[3]) {
+  dst[0] = _mm_alignr_epi8(src[1], src[0], offset + 0);
+  dst[1] = _mm_alignr_epi8(src[1], src[0], offset + 1);
+  dst[2] = _mm_alignr_epi8(src[1], src[0], offset + 2);
+}
+
+inline void Prepare3_16(const __m128i src[2], __m128i dst[3]) {
+  dst[0] = src[0];
+  dst[1] = _mm_alignr_epi8(src[1], src[0], 2);
+  dst[2] = _mm_alignr_epi8(src[1], src[0], 4);
+}
+
+inline void Prepare5Lo8(const __m128i src, __m128i dst[5]) {
+  dst[0] = src;
+  dst[1] = _mm_srli_si128(src, 1);
+  dst[2] = _mm_srli_si128(src, 2);
+  dst[3] = _mm_srli_si128(src, 3);
+  dst[4] = _mm_srli_si128(src, 4);
+}
+
+template <int offset>
+inline void Prepare5_8(const __m128i src[2], __m128i dst[5]) {
+  dst[0] = _mm_alignr_epi8(src[1], src[0], offset + 0);
+  dst[1] = _mm_alignr_epi8(src[1], src[0], offset + 1);
+  dst[2] = _mm_alignr_epi8(src[1], src[0], offset + 2);
+  dst[3] = _mm_alignr_epi8(src[1], src[0], offset + 3);
+  dst[4] = _mm_alignr_epi8(src[1], src[0], offset + 4);
+}
+
+inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) {
+  Prepare3_16(src, dst);
+  dst[3] = _mm_alignr_epi8(src[1], src[0], 6);
+  dst[4] = _mm_alignr_epi8(src[1], src[0], 8);
+}
+
+inline __m128i Sum3_16(const __m128i src0, const __m128i src1,
+                       const __m128i src2) {
+  const __m128i sum = _mm_add_epi16(src0, src1);
+  return _mm_add_epi16(sum, src2);
+}
+
+inline __m128i Sum3_16(const __m128i src[3]) {
+  return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline __m128i Sum3_32(const __m128i src0, const __m128i src1,
+                       const __m128i src2) {
+  const __m128i sum = _mm_add_epi32(src0, src1);
+  return _mm_add_epi32(sum, src2);
+}
+
+inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) {
+  dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+  dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline __m128i Sum3WLo16(const __m128i src[3]) {
+  const __m128i sum = VaddlLo8(src[0], src[1]);
+  return VaddwLo8(sum, src[2]);
+}
+
+inline __m128i Sum3WHi16(const __m128i src[3]) {
+  const __m128i sum = VaddlHi8(src[0], src[1]);
+  return VaddwHi8(sum, src[2]);
+}
+
+inline __m128i Sum3WLo32(const __m128i src[3]) {
+  const __m128i sum = VaddlLo16(src[0], src[1]);
+  return VaddwLo16(sum, src[2]);
+}
+
+inline __m128i Sum3WHi32(const __m128i src[3]) {
+  const __m128i sum = VaddlHi16(src[0], src[1]);
+  return VaddwHi16(sum, src[2]);
+}
+
+inline __m128i Sum5_16(const __m128i src[5]) {
+  const __m128i sum01 = _mm_add_epi16(src[0], src[1]);
+  const __m128i sum23 = _mm_add_epi16(src[2], src[3]);
+  const __m128i sum = _mm_add_epi16(sum01, sum23);
+  return _mm_add_epi16(sum, src[4]);
+}
+
+inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1,
+                       const __m128i* const src2, const __m128i* const src3,
+                       const __m128i* const src4) {
+  const __m128i sum01 = _mm_add_epi32(*src0, *src1);
+  const __m128i sum23 = _mm_add_epi32(*src2, *src3);
+  const __m128i sum = _mm_add_epi32(sum01, sum23);
+  return _mm_add_epi32(sum, *src4);
+}
+
+inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) {
+  dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+  dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline __m128i Sum5WLo16(const __m128i src[5]) {
+  const __m128i sum01 = VaddlLo8(src[0], src[1]);
+  const __m128i sum23 = VaddlLo8(src[2], src[3]);
+  const __m128i sum = _mm_add_epi16(sum01, sum23);
+  return VaddwLo8(sum, src[4]);
+}
+
+inline __m128i Sum5WHi16(const __m128i src[5]) {
+  const __m128i sum01 = VaddlHi8(src[0], src[1]);
+  const __m128i sum23 = VaddlHi8(src[2], src[3]);
+  const __m128i sum = _mm_add_epi16(sum01, sum23);
+  return VaddwHi8(sum, src[4]);
+}
+
+inline __m128i Sum3Horizontal(const __m128i src) {
+  __m128i s[3];
+  Prepare3Lo8(src, s);
+  return Sum3WLo16(s);
+}
+
+template <int offset>
+inline void Sum3Horizontal(const __m128i src[2], __m128i dst[2]) {
+  __m128i s[3];
+  Prepare3_8<offset>(src, s);
+  dst[0] = Sum3WLo16(s);
+  dst[1] = Sum3WHi16(s);
+}
+
+inline void Sum3WHorizontal(const __m128i src[2], __m128i dst[2]) {
+  __m128i s[3];
+  Prepare3_16(src, s);
+  dst[0] = Sum3WLo32(s);
+  dst[1] = Sum3WHi32(s);
+}
+
+inline __m128i Sum5Horizontal(const __m128i src) {
+  __m128i s[5];
+  Prepare5Lo8(src, s);
+  return Sum5WLo16(s);
+}
+
+template <int offset>
+inline void Sum5Horizontal(const __m128i src[2], __m128i* const dst0,
+                           __m128i* const dst1) {
+  __m128i s[5];
+  Prepare5_8<offset>(src, s);
+  *dst0 = Sum5WLo16(s);
+  *dst1 = Sum5WHi16(s);
+}
+
+inline void Sum5WHorizontal(const __m128i src[2], __m128i dst[2]) {
+  __m128i s[5];
+  Prepare5_16(src, s);
+  const __m128i sum01_lo = VaddlLo16(s[0], s[1]);
+  const __m128i sum23_lo = VaddlLo16(s[2], s[3]);
+  const __m128i sum0123_lo = _mm_add_epi32(sum01_lo, sum23_lo);
+  dst[0] = VaddwLo16(sum0123_lo, s[4]);
+  const __m128i sum01_hi = VaddlHi16(s[0], s[1]);
+  const __m128i sum23_hi = VaddlHi16(s[2], s[3]);
+  const __m128i sum0123_hi = _mm_add_epi32(sum01_hi, sum23_hi);
+  dst[1] = VaddwHi16(sum0123_hi, s[4]);
+}
+
+void SumHorizontalLo(const __m128i src[5], __m128i* const row_sq3,
+                     __m128i* const row_sq5) {
+  const __m128i sum04 = VaddlLo16(src[0], src[4]);
+  *row_sq3 = Sum3WLo32(src + 1);
+  *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalHi(const __m128i src[5], __m128i* const row_sq3,
+                     __m128i* const row_sq5) {
+  const __m128i sum04 = VaddlHi16(src[0], src[4]);
+  *row_sq3 = Sum3WHi32(src + 1);
+  *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalLo(const __m128i src, __m128i* const row3,
+                     __m128i* const row5) {
+  __m128i s[5];
+  Prepare5Lo8(src, s);
+  const __m128i sum04 = VaddlLo8(s[0], s[4]);
+  *row3 = Sum3WLo16(s + 1);
+  *row5 = _mm_add_epi16(sum04, *row3);
+}
+
+template <int offset>
+void SumHorizontal(const __m128i src[2], __m128i* const row3_0,
+                   __m128i* const row3_1, __m128i* const row5_0,
+                   __m128i* const row5_1) {
+  __m128i s[5];
+  Prepare5_8<offset>(src, s);
+  const __m128i sum04_lo = VaddlLo8(s[0], s[4]);
+  const __m128i sum04_hi = VaddlHi8(s[0], s[4]);
+  *row3_0 = Sum3WLo16(s + 1);
+  *row3_1 = Sum3WHi16(s + 1);
+  *row5_0 = _mm_add_epi16(sum04_lo, *row3_0);
+  *row5_1 = _mm_add_epi16(sum04_hi, *row3_1);
+}
+
+inline void SumHorizontal(const __m128i src[2], __m128i* const row_sq3_0,
+                          __m128i* const row_sq3_1, __m128i* const row_sq5_0,
+                          __m128i* const row_sq5_1) {
+  __m128i s[5];
+  Prepare5_16(src, s);
+  SumHorizontalLo(s, row_sq3_0, row_sq5_0);
+  SumHorizontalHi(s, row_sq3_1, row_sq5_1);
+}
+
+// Sum343() computes the "343" weighting 3 * a + 4 * b + 3 * c, as
+// 3 * (a + b + c) + b.
+inline __m128i Sum343Lo(const __m128i ma3[3]) {
+  const __m128i sum = Sum3WLo16(ma3);
+  const __m128i sum3 = Sum3_16(sum, sum, sum);
+  return VaddwLo8(sum3, ma3[1]);
+}
+
+inline __m128i Sum343Hi(const __m128i ma3[3]) {
+  const __m128i sum = Sum3WHi16(ma3);
+  const __m128i sum3 = Sum3_16(sum, sum, sum);
+  return VaddwHi8(sum3, ma3[1]);
+}
+
+inline __m128i Sum343WLo(const __m128i src[3]) {
+  const __m128i sum = Sum3WLo32(src);
+  const __m128i sum3 = Sum3_32(sum, sum, sum);
+  return VaddwLo16(sum3, src[1]);
+}
+
+inline __m128i Sum343WHi(const __m128i src[3]) {
+  const __m128i sum = Sum3WHi32(src);
+  const __m128i sum3 = Sum3_32(sum, sum, sum);
+  return VaddwHi16(sum3, src[1]);
+}
+
+inline void Sum343W(const __m128i src[2], __m128i dst[2]) {
+  __m128i s[3];
+  Prepare3_16(src, s);
+  dst[0] = Sum343WLo(s);
+  dst[1] = Sum343WHi(s);
+}
+
+// Sum565() computes the "565" weighting 5 * a + 6 * b + 5 * c, as
+// 5 * (a + b + c) + b.
+inline __m128i Sum565Lo(const __m128i src[3]) {
+  const __m128i sum = Sum3WLo16(src);
+  const __m128i sum4 = _mm_slli_epi16(sum, 2);
+  const __m128i sum5 = _mm_add_epi16(sum4, sum);
+  return VaddwLo8(sum5, src[1]);
+}
+
+inline __m128i Sum565Hi(const __m128i src[3]) {
+  const __m128i sum = Sum3WHi16(src);
+  const __m128i sum4 = _mm_slli_epi16(sum, 2);
+  const __m128i sum5 = _mm_add_epi16(sum4, sum);
+  return VaddwHi8(sum5, src[1]);
+}
+
+inline __m128i Sum565WLo(const __m128i src[3]) {
+  const __m128i sum = Sum3WLo32(src);
+  const __m128i sum4 = _mm_slli_epi32(sum, 2);
+  const __m128i sum5 = _mm_add_epi32(sum4, sum);
+  return VaddwLo16(sum5, src[1]);
+}
+
+inline __m128i Sum565WHi(const __m128i src[3]) {
+  const __m128i sum = Sum3WHi32(src);
+  const __m128i sum4 = _mm_slli_epi32(sum, 2);
+  const __m128i sum5 = _mm_add_epi32(sum4, sum);
+  return VaddwHi16(sum5, src[1]);
+}
+
+inline void Sum565W(const __m128i src[2], __m128i dst[2]) {
+  __m128i s[3];
+  Prepare3_16(src, s);
+  dst[0] = Sum565WLo(s);
+  dst[1] = Sum565WHi(s);
+}
+
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+                   const ptrdiff_t width, const ptrdiff_t sum_stride,
+                   const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
+                   uint32_t* square_sum3, uint32_t* square_sum5) {
+  int y = 2;
+  do {
+    __m128i s[2], sq[3];
+    s[0] = LoadUnaligned16Msan(src, kOverreadInBytesPass1 - width);
+    sq[0] = SquareLo8(s[0]);
+    ptrdiff_t x = sum_width;
+    do {
+      __m128i row3[2], row5[2], row_sq3[2], row_sq5[2];
+      x -= 16;
+      src += 16;
+      s[1] = LoadUnaligned16Msan(src,
+                                 sum_width - x + kOverreadInBytesPass1 - width);
+      sq[1] = SquareHi8(s[0]);
+      sq[2] = SquareLo8(s[1]);
+      SumHorizontal<0>(s, &row3[0], &row3[1], &row5[0], &row5[1]);
+      StoreAligned32U16(sum3, row3);
+      StoreAligned32U16(sum5, row5);
+      SumHorizontal(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0],
+                    &row_sq5[1]);
+      StoreAligned32U32(square_sum3 + 0, row_sq3);
+      StoreAligned32U32(square_sum5 + 0, row_sq5);
+      SumHorizontal(sq + 1, &row_sq3[0], &row_sq3[1], &row_sq5[0],
+                    &row_sq5[1]);
+      StoreAligned32U32(square_sum3 + 8, row_sq3);
+      StoreAligned32U32(square_sum5 + 8, row_sq5);
+      s[0] = s[1];
+      sq[0] = sq[2];
+      sum3 += 16;
+      sum5 += 16;
+      square_sum3 += 16;
+      square_sum5 += 16;
+    } while (x != 0);
+    src += src_stride - sum_width;
+    sum3 += sum_stride - sum_width;
+    sum5 += sum_stride - sum_width;
+    square_sum3 += sum_stride - sum_width;
+    square_sum5 += sum_stride - sum_width;
+  } while (--y != 0);
+}
+
+template <int size>
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+                   const ptrdiff_t width, const ptrdiff_t sum_stride,
+                   const ptrdiff_t sum_width, uint16_t* sums,
+                   uint32_t* square_sums) {
+  static_assert(size == 3 || size == 5, "");
+  constexpr int kOverreadInBytes =
+      (size == 5) ? kOverreadInBytesPass1 : kOverreadInBytesPass2;
+  int y = 2;
+  do {
+    __m128i s[2], sq[3];
+    s[0] = LoadUnaligned16Msan(src, kOverreadInBytes - width);
+    sq[0] = SquareLo8(s[0]);
+    ptrdiff_t x = sum_width;
+    do {
+      __m128i row[2], row_sq[4];
+      x -= 16;
+      src += 16;
+      s[1] = LoadUnaligned16Msan(src, sum_width - x + kOverreadInBytes - width);
+      sq[1] = SquareHi8(s[0]);
+      sq[2] = SquareLo8(s[1]);
+      if (size == 3) {
+        Sum3Horizontal<0>(s, row);
+        Sum3WHorizontal(sq + 0, row_sq + 0);
+        Sum3WHorizontal(sq + 1, row_sq + 2);
+      } else {
+        Sum5Horizontal<0>(s, &row[0], &row[1]);
+        Sum5WHorizontal(sq + 0, row_sq + 0);
+        Sum5WHorizontal(sq + 1, row_sq + 2);
+      }
+      StoreAligned32U16(sums, row);
+      StoreAligned64U32(square_sums, row_sq);
+      s[0] = s[1];
+      sq[0] = sq[2];
+      sums += 16;
+      square_sums += 16;
+    } while (x != 0);
+    src += src_stride - sum_width;
+    sums += sum_stride - sum_width;
+    square_sums += sum_stride - sum_width;
+  } while (--y != 0);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq,
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
+  // a = |sum_sq|
+  // d = |sum|
+  // p = (a * n < d * d) ? 0 : a * n - d * d;
+  const __m128i dxd = _mm_madd_epi16(sum, sum);
+  // _mm_mullo_epi32() has high latency. Using shifts and additions instead.
+  // Some compilers could do this for us but we make this explicit.
+  // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n));
+  // 9 * x == x + 8 * x, and 25 * x == x + 8 * x + 16 * x.
+  __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3));
+  if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4));
+  const __m128i sub = _mm_sub_epi32(axn, dxd);
+  const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128());
+  const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale));
+  return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2],
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
+  const __m128i sum_lo = _mm_unpacklo_epi16(sum, _mm_setzero_si128());
+  const __m128i sum_hi = _mm_unpackhi_epi16(sum, _mm_setzero_si128());
+  const __m128i z0 = CalculateMa<n>(sum_lo, sum_sq[0], scale);
+  const __m128i z1 = CalculateMa<n>(sum_hi, sum_sq[1], scale);
+  return _mm_packus_epi32(z0, z1);
+}
+
+inline __m128i CalculateB5(const __m128i sum, const __m128i ma) {
+  // one_over_n == 164.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+  // one_over_n_quarter == 41.
+  constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+  static_assert(one_over_n == one_over_n_quarter << 2, "");
+  // |ma| is in range [0, 255].
+  const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter));
+  const __m128i m0 = VmullLo16(m, sum);
+  const __m128i m1 = VmullHi16(m, sum);
+  const __m128i b_lo = VrshrU32(m0, kSgrProjReciprocalBits - 2);
+  const __m128i b_hi = VrshrU32(m1, kSgrProjReciprocalBits - 2);
+  return _mm_packus_epi32(b_lo, b_hi);
+}
+
+inline __m128i CalculateB3(const __m128i sum, const __m128i ma) {
+  // one_over_n == 455.
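+  // That is, ((1 << 12) + 4) / 9 == 4100 / 9 == 455 in integer division.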
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
+  const __m128i m0 = VmullLo16(ma, sum);
+  const __m128i m1 = VmullHi16(ma, sum);
+  const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
+  const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n));
+  const __m128i b_lo = VrshrU32(m2, kSgrProjReciprocalBits);
+  const __m128i b_hi = VrshrU32(m3, kSgrProjReciprocalBits);
+  return _mm_packus_epi32(b_lo, b_hi);
+}
+
+inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2],
+                                  const uint32_t scale, __m128i* const sum,
+                                  __m128i* const index) {
+  __m128i sum_sq[2];
+  *sum = Sum5_16(s5);
+  Sum5_32(sq5, sum_sq);
+  *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2],
+                                  const uint32_t scale, __m128i* const sum,
+                                  __m128i* const index) {
+  __m128i sum_sq[2];
+  *sum = Sum3_16(s3);
+  Sum3_32(sq3, sum_sq);
+  *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+template <int n, int offset>
+inline void LookupIntermediate(const __m128i sum, const __m128i index,
+                               __m128i* const ma, __m128i* const b) {
+  static_assert(n == 9 || n == 25, "");
+  static_assert(offset == 0 || offset == 8, "");
+  const __m128i idx = _mm_packus_epi16(index, index);
+  // |temp| is not actually stored and reloaded; the compiler keeps it in a
+  // 64-bit general-purpose register, which is faster than using
+  // _mm_extract_epi8().
+  uint8_t temp[8];
+  StoreLo8(temp, idx);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[0]], offset + 0);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], offset + 1);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], offset + 2);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], offset + 3);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], offset + 4);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], offset + 5);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], offset + 6);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], offset + 7);
+  // b = ma * b * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+  __m128i maq;
+  if (offset == 0) {
+    maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+  } else {
+    maq = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
+  }
+  *b = (n == 9) ? CalculateB3(sum, maq) : CalculateB5(sum, maq);
+}
+
+// Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b
+// to get value 0 as the shuffle result. The most significant bit 1 comes
+// either from the comparison instruction, or from the sign bit of the index.
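+// _mm_shuffle_epi8() writes 0 to an output byte whenever the corresponding
+// control byte has its most significant bit set.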
+inline __m128i ShuffleIndex(const __m128i table, const __m128i index) {
+  __m128i mask;
+  mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(15));
+  mask = _mm_or_si128(mask, index);
+  return _mm_shuffle_epi8(table, mask);
+}
+
+// The comparison result is 0 or -1 (all ones) per lane, so the add subtracts
+// 1 from |value| in every lane where |index| > |thresholds|.
+inline __m128i AdjustValue(const __m128i value, const __m128i index,
+                           const int threshold) {
+  const __m128i thresholds = _mm_set1_epi8(threshold - 128);
+  const __m128i offset = _mm_cmpgt_epi8(index, thresholds);
+  return _mm_add_epi8(value, offset);
+}
+
+inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
+                                  __m128i* const ma, __m128i* const b0,
+                                  __m128i* const b1) {
+  // Use table lookup to read elements whose indices are less than 48.
+  const __m128i c0 = LoadAligned16(kSgrMaLookup + 0 * 16);
+  const __m128i c1 = LoadAligned16(kSgrMaLookup + 1 * 16);
+  const __m128i c2 = LoadAligned16(kSgrMaLookup + 2 * 16);
+  const __m128i indices = _mm_packus_epi16(index[0], index[1]);
+  __m128i idx;
+  // Clip idx to 127 to apply signed comparison instructions.
+  idx = _mm_min_epu8(indices, _mm_set1_epi8(127));
+  // All elements whose indices are larger than 47 end up 0 after the three
+  // shuffles below.
+  // Get shuffle results for indices in range [0, 15].
+  *ma = ShuffleIndex(c0, idx);
+  // Get shuffle results for indices in range [16, 31].
+  // Subtract 16 to utilize the sign bit of the index.
+  idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+  const __m128i res1 = ShuffleIndex(c1, idx);
+  // Use OR instruction to combine shuffle results together.
+  *ma = _mm_or_si128(*ma, res1);
+  // Get shuffle results for indices in range [32, 47].
+  // Subtract 16 to utilize the sign bit of the index.
+  idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+  const __m128i res2 = ShuffleIndex(c2, idx);
+  *ma = _mm_or_si128(*ma, res2);
+
+  // For elements whose indices are larger than 47, since their values change
+  // only rarely as the index increases, we use comparison and arithmetic
+  // operations to calculate them.
+  // Add -128 to apply signed comparison instructions.
+  idx = _mm_add_epi8(indices, _mm_set1_epi8(-128));
+  // Elements whose indices are larger than 47 (with value 0) are set to 5.
+  *ma = _mm_max_epu8(*ma, _mm_set1_epi8(5));
+  *ma = AdjustValue(*ma, idx, 55);   // 55 is the last index whose value is 5.
+  *ma = AdjustValue(*ma, idx, 72);   // 72 is the last index whose value is 4.
+  *ma = AdjustValue(*ma, idx, 101);  // 101 is the last index whose value is 3.
+  *ma = AdjustValue(*ma, idx, 169);  // 169 is the last index whose value is 2.
+  *ma = AdjustValue(*ma, idx, 254);  // 254 is the last index whose value is 1.
+
+  // b = ma * b * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
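+  // Both products fit in 16 bits, so the _mm_packus_epi32() at the end of
+  // CalculateB3()/CalculateB5() never saturates.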
+  const __m128i maq0 = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+  *b0 = CalculateB3(sum[0], maq0);
+  const __m128i maq1 = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
+  *b1 = CalculateB3(sum[1], maq1);
+}
+
+inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
+                                  __m128i ma[2], __m128i b[2]) {
+  __m128i mas;
+  CalculateIntermediate(sum, index, &mas, &b[0], &b[1]);
+  ma[0] = _mm_unpacklo_epi64(ma[0], mas);
+  ma[1] = _mm_srli_si128(mas, 8);
+}
+
+// Note: we tried replacing the slow LookupIntermediate() with
+// CalculateIntermediate() when calculating 16 intermediate data points, but
+// the compiler generates even slower code.
+template <int offset>
+inline void CalculateIntermediate5(const __m128i s5[5],
+                                   const __m128i sq5[5][2],
+                                   const uint32_t scale, __m128i* const ma,
+                                   __m128i* const b) {
+  static_assert(offset == 0 || offset == 8, "");
+  __m128i sum, index;
+  CalculateSumAndIndex5(s5, sq5, scale, &sum, &index);
+  LookupIntermediate<25, offset>(sum, index, ma, b);
+}
+
+inline void CalculateIntermediate3(const __m128i s3[3],
+                                   const __m128i sq3[3][2],
+                                   const uint32_t scale, __m128i* const ma,
+                                   __m128i* const b) {
+  __m128i sum, index;
+  CalculateSumAndIndex3(s3, sq3, scale, &sum, &index);
+  LookupIntermediate<9, 0>(sum, index, ma, b);
+}
+
+// Computes the "444" (4a + 4b + 4c) and "343" (3a + 4b + 3c) weighted sums of
+// |b3| and stores them to |b444| and |b343|.
+inline void Store343_444(const __m128i b3[2], const ptrdiff_t x,
+                         __m128i sum_b343[2], __m128i sum_b444[2],
+                         uint32_t* const b343, uint32_t* const b444) {
+  __m128i b[3], sum_b111[2];
+  Prepare3_16(b3, b);
+  sum_b111[0] = Sum3WLo32(b);
+  sum_b111[1] = Sum3WHi32(b);
+  sum_b444[0] = _mm_slli_epi32(sum_b111[0], 2);
+  sum_b444[1] = _mm_slli_epi32(sum_b111[1], 2);
+  StoreAligned32U32(b444 + x, sum_b444);
+  sum_b343[0] = _mm_sub_epi32(sum_b444[0], sum_b111[0]);
+  sum_b343[1] = _mm_sub_epi32(sum_b444[1], sum_b111[1]);
+  sum_b343[0] = VaddwLo16(sum_b343[0], b[1]);
+  sum_b343[1] = VaddwHi16(sum_b343[1], b[1]);
+  StoreAligned32U32(b343 + x, sum_b343);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
+                           const ptrdiff_t x, __m128i* const sum_ma343,
+                           __m128i* const sum_ma444, __m128i sum_b343[2],
+                           __m128i sum_b444[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  const __m128i sum_ma111 = Sum3WLo16(ma3);
+  *sum_ma444 = _mm_slli_epi16(sum_ma111, 2);
+  StoreAligned16(ma444 + x, *sum_ma444);
+  const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111);
+  *sum_ma343 = VaddwLo8(sum333, ma3[1]);
+  StoreAligned16(ma343 + x, *sum_ma343);
+  Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
+                           const ptrdiff_t x, __m128i* const sum_ma343,
+                           __m128i* const sum_ma444, __m128i sum_b343[2],
+                           __m128i sum_b444[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  const __m128i sum_ma111 = Sum3WHi16(ma3);
+  *sum_ma444 = _mm_slli_epi16(sum_ma111, 2);
+  StoreAligned16(ma444 + x, *sum_ma444);
+  const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111);
+  *sum_ma343 = VaddwHi8(sum333, ma3[1]);
+  StoreAligned16(ma343 + x, *sum_ma343);
+  Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
+                           const ptrdiff_t x, __m128i* const sum_ma343,
+                           __m128i sum_b343[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  __m128i sum_ma444, sum_b444[2];
+  Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
ma444, b343, b444); +} + +inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2], + const ptrdiff_t x, __m128i* const sum_ma343, + __m128i sum_b343[2], uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m128i sum_ma444, sum_b444[2]; + Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343, + ma444, b343, b444); +} + +inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2], + const ptrdiff_t x, uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m128i sum_ma343, sum_b343[2]; + Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444); +} + +inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2], + const ptrdiff_t x, uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m128i sum_ma343, sum_b343[2]; + Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo( + const __m128i s[2][2], const uint32_t scale, uint16_t* const sum5[5], + uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i* const ma, + __m128i* const b) { + __m128i s5[2][5], sq5[5][2]; + sq[0][1] = SquareHi8(s[0][0]); + sq[1][1] = SquareHi8(s[1][0]); + s5[0][3] = Sum5Horizontal(s[0][0]); + StoreAligned16(sum5[3], s5[0][3]); + s5[0][4] = Sum5Horizontal(s[1][0]); + StoreAligned16(sum5[4], s5[0][4]); + Sum5WHorizontal(sq[0], sq5[3]); + StoreAligned32U32(square_sum5[3], sq5[3]); + Sum5WHorizontal(sq[1], sq5[4]); + StoreAligned32U32(square_sum5[4], sq5[4]); + LoadAligned16x3U16(sum5, 0, s5[0]); + LoadAligned32x3U32(square_sum5, 0, sq5); + CalculateIntermediate5<0>(s5[0], sq5, scale, ma, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5( + const __m128i s[2][2], const ptrdiff_t sum_width, const ptrdiff_t x, + const uint32_t scale, uint16_t* const sum5[5], + uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i ma[2], + __m128i b[3]) { + __m128i s5[2][5], sq5[5][2]; + sq[0][2] = SquareLo8(s[0][1]); + sq[1][2] = SquareLo8(s[1][1]); + Sum5Horizontal<8>(s[0], &s5[0][3], &s5[1][3]); + StoreAligned16(sum5[3] + x + 0, s5[0][3]); + StoreAligned16(sum5[3] + x + 8, s5[1][3]); + Sum5Horizontal<8>(s[1], &s5[0][4], &s5[1][4]); + StoreAligned16(sum5[4] + x + 0, s5[0][4]); + StoreAligned16(sum5[4] + x + 8, s5[1][4]); + Sum5WHorizontal(sq[0] + 1, sq5[3]); + StoreAligned32U32(square_sum5[3] + x, sq5[3]); + Sum5WHorizontal(sq[1] + 1, sq5[4]); + StoreAligned32U32(square_sum5[4] + x, sq5[4]); + LoadAligned16x3U16(sum5, x, s5[0]); + LoadAligned32x3U32(square_sum5, x, sq5); + CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[1]); + + sq[0][3] = SquareHi8(s[0][1]); + sq[1][3] = SquareHi8(s[1][1]); + Sum5WHorizontal(sq[0] + 2, sq5[3]); + StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]); + Sum5WHorizontal(sq[1] + 2, sq5[4]); + StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]); + LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]); + LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5); + CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[2]); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo( + const __m128i s, const uint32_t scale, const uint16_t* const sum5[5], + const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma, + __m128i* const b) { + __m128i s5[5], sq5[5][2]; + sq[1] = SquareHi8(s); + s5[3] = s5[4] = Sum5Horizontal(s); + Sum5WHorizontal(sq, sq5[3]); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = 
sq5[3][1]; + LoadAligned16x3U16(sum5, 0, s5); + LoadAligned32x3U32(square_sum5, 0, sq5); + CalculateIntermediate5<0>(s5, sq5, scale, ma, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow( + const __m128i s[2], const ptrdiff_t sum_width, const ptrdiff_t x, + const uint32_t scale, const uint16_t* const sum5[5], + const uint32_t* const square_sum5[5], __m128i sq[4], __m128i ma[2], + __m128i b[3]) { + __m128i s5[2][5], sq5[5][2]; + sq[2] = SquareLo8(s[1]); + Sum5Horizontal<8>(s, &s5[0][3], &s5[1][3]); + s5[0][4] = s5[0][3]; + s5[1][4] = s5[1][3]; + Sum5WHorizontal(sq + 1, sq5[3]); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + LoadAligned16x3U16(sum5, x, s5[0]); + LoadAligned32x3U32(square_sum5, x, sq5); + CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[1]); + + sq[3] = SquareHi8(s[1]); + Sum5WHorizontal(sq + 2, sq5[3]); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]); + LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5); + CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[2]); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo( + const __m128i s, const uint32_t scale, uint16_t* const sum3[3], + uint32_t* const square_sum3[3], __m128i sq[2], __m128i* const ma, + __m128i* const b) { + __m128i s3[3], sq3[3][2]; + sq[1] = SquareHi8(s); + s3[2] = Sum3Horizontal(s); + StoreAligned16(sum3[2], s3[2]); + Sum3WHorizontal(sq, sq3[2]); + StoreAligned32U32(square_sum3[2], sq3[2]); + LoadAligned16x2U16(sum3, 0, s3); + LoadAligned32x2U32(square_sum3, 0, sq3); + CalculateIntermediate3(s3, sq3, scale, ma, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3( + const __m128i s[2], const ptrdiff_t x, const ptrdiff_t sum_width, + const uint32_t scale, uint16_t* const sum3[3], + uint32_t* const square_sum3[3], __m128i sq[4], __m128i ma[2], + __m128i b[3]) { + __m128i s3[4], sq3[3][2], sum[2], index[2]; + sq[2] = SquareLo8(s[1]); + Sum3Horizontal<8>(s, s3 + 2); + StoreAligned32U16(sum3[2] + x, s3 + 2); + Sum3WHorizontal(sq + 1, sq3[2]); + StoreAligned32U32(square_sum3[2] + x + 0, sq3[2]); + LoadAligned16x2U16(sum3, x, s3); + LoadAligned32x2U32(square_sum3, x, sq3); + CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]); + + sq[3] = SquareHi8(s[1]); + Sum3WHorizontal(sq + 2, sq3[2]); + StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]); + LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3 + 1); + LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3); + CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]); + CalculateIntermediate(sum, index, ma, b + 1); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo( + const __m128i s[2][2], const uint16_t scales[2], uint16_t* const sum3[4], + uint16_t* const sum5[5], uint32_t* const square_sum3[4], + uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i ma3[2][2], + __m128i b3[2][3], __m128i* const ma5, __m128i* const b5) { + __m128i s3[4], s5[5], sq3[4][2], sq5[5][2], sum[2], index[2]; + sq[0][1] = SquareHi8(s[0][0]); + sq[1][1] = SquareHi8(s[1][0]); + SumHorizontalLo(s[0][0], &s3[2], &s5[3]); + SumHorizontalLo(s[1][0], &s3[3], &s5[4]); + StoreAligned16(sum3[2], s3[2]); + StoreAligned16(sum3[3], s3[3]); + StoreAligned16(sum5[3], s5[3]); + StoreAligned16(sum5[4], s5[4]); + SumHorizontal(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + StoreAligned32U32(square_sum3[2], sq3[2]); + StoreAligned32U32(square_sum5[3], sq5[3]); + SumHorizontal(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]); + StoreAligned32U32(square_sum3[3], sq3[3]); 
+ StoreAligned32U32(square_sum5[4], sq5[4]); + LoadAligned16x2U16(sum3, 0, s3); + LoadAligned32x2U32(square_sum3, 0, sq3); + LoadAligned16x3U16(sum5, 0, s5); + LoadAligned32x3U32(square_sum5, 0, sq5); + CalculateSumAndIndex3(s3 + 0, sq3 + 0, scales[1], &sum[0], &index[0]); + CalculateSumAndIndex3(s3 + 1, sq3 + 1, scales[1], &sum[1], &index[1]); + CalculateIntermediate(sum, index, &ma3[0][0], &b3[0][0], &b3[1][0]); + ma3[1][0] = _mm_srli_si128(ma3[0][0], 8); + CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess( + const __m128i s[2][2], const ptrdiff_t x, const uint16_t scales[2], + uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + const ptrdiff_t sum_width, __m128i sq[2][4], __m128i ma3[2][2], + __m128i b3[2][3], __m128i ma5[2], __m128i b5[3]) { + __m128i s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sum[2][2], index[2][2]; + SumHorizontal<8>(s[0], &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]); + StoreAligned16(sum3[2] + x + 0, s3[0][2]); + StoreAligned16(sum3[2] + x + 8, s3[1][2]); + StoreAligned16(sum5[3] + x + 0, s5[0][3]); + StoreAligned16(sum5[3] + x + 8, s5[1][3]); + SumHorizontal<8>(s[1], &s3[0][3], &s3[1][3], &s5[0][4], &s5[1][4]); + StoreAligned16(sum3[3] + x + 0, s3[0][3]); + StoreAligned16(sum3[3] + x + 8, s3[1][3]); + StoreAligned16(sum5[4] + x + 0, s5[0][4]); + StoreAligned16(sum5[4] + x + 8, s5[1][4]); + sq[0][2] = SquareLo8(s[0][1]); + sq[1][2] = SquareLo8(s[1][1]); + SumHorizontal(sq[0] + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + StoreAligned32U32(square_sum3[2] + x, sq3[2]); + StoreAligned32U32(square_sum5[3] + x, sq5[3]); + SumHorizontal(sq[1] + 1, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]); + StoreAligned32U32(square_sum3[3] + x, sq3[3]); + StoreAligned32U32(square_sum5[4] + x, sq5[4]); + LoadAligned16x2U16(sum3, x, s3[0]); + LoadAligned32x2U32(square_sum3, x, sq3); + CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0][0], &index[0][0]); + CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum[1][0], + &index[1][0]); + LoadAligned16x3U16(sum5, x, s5[0]); + LoadAligned32x3U32(square_sum5, x, sq5); + CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], &b5[1]); + + sq[0][3] = SquareHi8(s[0][1]); + sq[1][3] = SquareHi8(s[1][1]); + SumHorizontal(sq[0] + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]); + StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]); + SumHorizontal(sq[1] + 2, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]); + StoreAligned32U32(square_sum3[3] + x + 8, sq3[3]); + StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]); + LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]); + LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3); + CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[0][1], &index[0][1]); + CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum[1][1], + &index[1][1]); + CalculateIntermediate(sum[0], index[0], ma3[0], b3[0] + 1); + CalculateIntermediate(sum[1], index[1], ma3[1], b3[1] + 1); + LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]); + LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5); + CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], &b5[2]); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo( + const __m128i s, const uint16_t scales[2], const uint16_t* const sum3[4], + const uint16_t* const sum5[5], const uint32_t* const square_sum3[4], + const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* 
const ma3,
+    __m128i* const ma5, __m128i* const b3, __m128i* const b5) {
+  __m128i s3[3], s5[5], sq3[3][2], sq5[5][2];
+  sq[1] = SquareHi8(s);
+  SumHorizontalLo(s, &s3[2], &s5[3]);
+  SumHorizontal(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned16x3U16(sum5, 0, s5);
+  s5[4] = s5[3];
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
+    const __m128i s[2], const ptrdiff_t sum_width, const ptrdiff_t x,
+    const uint16_t scales[2], const uint16_t* const sum3[4],
+    const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+    const uint32_t* const square_sum5[5], __m128i sq[4], __m128i ma3[2],
+    __m128i ma5[2], __m128i b3[3], __m128i b5[3]) {
+  __m128i s3[2][3], s5[2][5], sq3[3][2], sq5[5][2], sum[2], index[2];
+  sq[2] = SquareLo8(s[1]);
+  SumHorizontal<8>(s, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+  SumHorizontal(sq + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned16x3U16(sum5, x, s5[0]);
+  s5[0][4] = s5[0][3];
+  LoadAligned32x3U32(square_sum5, x, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateIntermediate5<8>(s5[0], sq5, scales[0], ma5, b5 + 1);
+  LoadAligned16x2U16(sum3, x, s3[0]);
+  LoadAligned32x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0], &index[0]);
+
+  sq[3] = SquareHi8(s[1]);
+  SumHorizontal(sq + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+  s5[1][4] = s5[1][3];
+  LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateIntermediate5<0>(s5[1], sq5, scales[0], ma5 + 1, b5 + 2);
+  LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
+  LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+  CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[1], &index[1]);
+  CalculateIntermediate(sum, index, ma3, b3 + 1);
+}
+
+inline void BoxSumFilterPreProcess5(const uint8_t* const src0,
+                                    const uint8_t* const src1, const int width,
+                                    const uint32_t scale,
+                                    uint16_t* const sum5[5],
+                                    uint32_t* const square_sum5[5],
+                                    const ptrdiff_t sum_width, uint16_t* ma565,
+                                    uint32_t* b565) {
+  __m128i s[2][2], mas[2], sq[2][4], bs[3];
+  s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+  s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
+  sq[0][0] = SquareLo8(s[0][0]);
+  sq[1][0] = SquareLo8(s[1][0]);
+  BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], &bs[0]);
+
+  int x = 0;
+  do {
+    __m128i ma5[3], ma[2], b[4];
+    s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
+                                  x + 16 + kOverreadInBytesPass1 - width);
+    s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
+                                  x + 16 + kOverreadInBytesPass1 - width);
+    BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
+                         bs);
+    Prepare3_8<0>(mas, ma5);
+    ma[0] = Sum565Lo(ma5);
+    ma[1] = Sum565Hi(ma5);
+    StoreAligned32U16(ma565, ma);
+    Sum565W(bs + 0, b + 0);
+    Sum565W(bs + 1, b + 2);
+    StoreAligned64U32(b565, b);
+    s[0][0] = s[0][1];
+    s[1][0] = s[1][1];
+    sq[0][1] = sq[0][3];
+    sq[1][1] = sq[1][3];
+    mas[0] = mas[1];
+    bs[0] = bs[2];
+    ma565 += 16;
+    b565 += 16;
+    x += 16;
+  } while (x < width);
+}
+
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+    const uint8_t* const src, const int width, const uint32_t scale,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+    const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444,
+    uint32_t* b343, uint32_t* b444) {
+  __m128i s[2], mas[2], sq[4], bs[3];
+  s[0] = LoadUnaligned16Msan(src, kOverreadInBytesPass2 - width);
+  sq[0] = SquareLo8(s[0]);
+  BoxFilterPreProcess3Lo(s[0], scale, sum3, square_sum3, sq, &mas[0], &bs[0]);
+
+  int x = 0;
+  do {
+    s[1] = LoadUnaligned16Msan(src + x + 16,
+                               x + 16 + kOverreadInBytesPass2 - width);
+    BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+                         bs);
+    __m128i ma3[3];
+    Prepare3_8<0>(mas, ma3);
+    if (calculate444) {  // NOLINT(readability-simplify-boolean-expr)
+      Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444);
+      Store343_444Hi(ma3, bs + 1, 8, ma343, ma444, b343, b444);
+      ma444 += 16;
+      b444 += 16;
+    } else {
+      __m128i ma[2], b[4];
+      ma[0] = Sum343Lo(ma3);
+      ma[1] = Sum343Hi(ma3);
+      StoreAligned32U16(ma343, ma);
+      Sum343W(bs + 0, b + 0);
+      Sum343W(bs + 1, b + 2);
+      StoreAligned64U32(b343, b);
+    }
+    s[0] = s[1];
+    sq[1] = sq[3];
+    mas[0] = mas[1];
+    bs[0] = bs[2];
+    ma343 += 16;
+    b343 += 16;
+    x += 16;
+  } while (x < width);
+}
+
+inline void BoxSumFilterPreProcess(
+    const uint8_t* const src0, const uint8_t* const src1, const int width,
+    const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444,
+    uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444,
+    uint32_t* b565) {
+  __m128i s[2][2], ma3[2][2], ma5[2], sq[2][4], b3[2][3], b5[3];
+  s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+  s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
+  sq[0][0] = SquareLo8(s[0][0]);
+  sq[1][0] = SquareLo8(s[1][0]);
+  BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+                        ma3, b3, &ma5[0], &b5[0]);
+
+  int x = 0;
+  do {
+    __m128i ma[2], b[4], ma3x[3], ma5x[3];
+    s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
+                                  x + 16 + kOverreadInBytesPass1 - width);
+    s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
+                                  x + 16 + kOverreadInBytesPass1 - width);
+    BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+                        sum_width, sq, ma3, b3, ma5, b5);
+
+    Prepare3_8<0>(ma3[0], ma3x);
+    ma[0] = Sum343Lo(ma3x);
+    ma[1] = Sum343Hi(ma3x);
+    StoreAligned32U16(ma343[0] + x, ma);
+    Sum343W(b3[0] + 0, b + 0);
+    Sum343W(b3[0] + 1, b + 2);
+    StoreAligned64U32(b343[0] + x, b);
+    Sum565W(b5 + 0, b + 0);
+    Sum565W(b5 + 1, b + 2);
+    StoreAligned64U32(b565, b);
+    Prepare3_8<0>(ma3[1], ma3x);
+    Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
+    Store343_444Hi(ma3x, b3[1] + 1, x + 8, ma343[1], ma444, b343[1], b444);
+    Prepare3_8<0>(ma5, ma5x);
+    ma[0] = Sum565Lo(ma5x);
+    ma[1] = Sum565Hi(ma5x);
+    StoreAligned32U16(ma565, ma);
+    s[0][0] = s[0][1];
+    s[1][0] = s[1][1];
+    sq[0][1] = sq[0][3];
+    sq[1][1] = sq[1][3];
+    ma3[0][0] = ma3[0][1];
+    ma3[1][0] = ma3[1][1];
+    ma5[0] = ma5[1];
+    b3[0][0] = b3[0][2];
+    b3[1][0] = b3[1][2];
+    b5[0] = b5[2];
+    ma565 += 16;
+    b565 += 16;
+    x += 16;
+  } while (x < width);
+}
+
+template <int shift>
+inline __m128i FilterOutput(const __m128i ma_x_src, const __m128i b) {
+  // ma: 255 * 32 = 8160 (13 bits)
+  // b: 65088 * 32 = 2082816 (21 bits)
+  // v: b - ma * 255 (22 bits)
+  const __m128i v = _mm_sub_epi32(b, ma_x_src);
+  // kSgrProjSgrBits = 8
+  // kSgrProjRestoreBits = 4
+  // shift = 4 or 5
+  // v >> 8 or 9 (13 bits)
+  return VrshrS32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <int shift>
+inline __m128i CalculateFilteredOutput(const __m128i src, const __m128i ma,
+                                       const __m128i b[2]) {
+  const __m128i ma_x_src_lo = VmullLo16(ma, src);
+  const __m128i ma_x_src_hi = VmullHi16(ma, src);
+  const __m128i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
+  const __m128i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
+  return _mm_packs_epi32(dst_lo, dst_hi);  // 13 bits
+}
+
+inline __m128i CalculateFilteredOutputPass1(const __m128i src,
+                                            const __m128i ma[2],
+                                            const __m128i b[2][2]) {
+  const __m128i ma_sum = _mm_add_epi16(ma[0], ma[1]);
+  __m128i b_sum[2];
+  b_sum[0] = _mm_add_epi32(b[0][0], b[1][0]);
+  b_sum[1] = _mm_add_epi32(b[0][1], b[1][1]);
+  return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m128i CalculateFilteredOutputPass2(const __m128i src,
+                                            const __m128i ma[3],
+                                            const __m128i b[3][2]) {
+  const __m128i ma_sum = Sum3_16(ma);
+  __m128i b_sum[2];
+  Sum3_32(b, b_sum);
+  return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m128i SelfGuidedFinal(const __m128i src, const __m128i v[2]) {
+  const __m128i v_lo =
+      VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  const __m128i v_hi =
+      VrshrS32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  const __m128i vv = _mm_packs_epi32(v_lo, v_hi);
+  return _mm_add_epi16(src, vv);
+}
+
+inline __m128i SelfGuidedDoubleMultiplier(const __m128i src,
+                                          const __m128i filter[2], const int w0,
+                                          const int w2) {
+  __m128i v[2];
+  // Each 32-bit lane of the _mm_madd_epi16() results computes
+  // w0 * filter[0] + w2 * filter[1].
+  const __m128i w0_w2 =
+      _mm_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0));
+  const __m128i f_lo = _mm_unpacklo_epi16(filter[0], filter[1]);
+  const __m128i f_hi = _mm_unpackhi_epi16(filter[0], filter[1]);
+  v[0] = _mm_madd_epi16(w0_w2, f_lo);
+  v[1] = _mm_madd_epi16(w0_w2, f_hi);
+  return SelfGuidedFinal(src, v);
+}
+
+inline __m128i SelfGuidedSingleMultiplier(const __m128i src,
+                                          const __m128i filter, const int w0) {
+  // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+  __m128i v[2];
+  v[0] = VmullNLo8(filter, w0);
+  v[1] = VmullNHi8(filter, w0);
+  return SelfGuidedFinal(src, v);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
+    const uint8_t* const src, const uint8_t* const src0,
+    const uint8_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width,
+    const uint32_t scale, const int16_t w0, uint16_t* const ma565[2],
+    uint32_t* const b565[2], uint8_t* const dst) {
+  __m128i s[2][2], mas[2], sq[2][4], bs[3];
+  s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+  s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
+  sq[0][0] = SquareLo8(s[0][0]);
+  sq[1][0] = SquareLo8(s[1][0]);
+  BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], &bs[0]);
+
+  int x = 0;
+  do {
+    __m128i ma[2], ma5[3], b[2][2], sr[2], p[2];
+    s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
+                                  x + 16 + kOverreadInBytesPass1 - width);
+    s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
+                                  x + 16 + kOverreadInBytesPass1 - width);
+    BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
+                         bs);
+    Prepare3_8<0>(mas, ma5);
+    ma[1] = Sum565Lo(ma5);
+    StoreAligned16(ma565[1] + x, ma[1]);
+    Sum565W(bs, b[1]);
+    StoreAligned32U32(b565[1] + x, b[1]);
+    sr[0] = LoadAligned16(src + x);
+    sr[1] = LoadAligned16(src + stride + x);
+    const __m128i sr0_lo = _mm_unpacklo_epi8(sr[0], _mm_setzero_si128());
+    const __m128i sr1_lo = _mm_unpacklo_epi8(sr[1], _mm_setzero_si128());
+    ma[0] =
LoadAligned16(ma565[0] + x); + LoadAligned32U32(b565[0] + x, b[0]); + p[0] = CalculateFilteredOutputPass1(sr0_lo, ma, b); + p[1] = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[1]); + const __m128i d00 = SelfGuidedSingleMultiplier(sr0_lo, p[0], w0); + const __m128i d10 = SelfGuidedSingleMultiplier(sr1_lo, p[1], w0); + + ma[1] = Sum565Hi(ma5); + StoreAligned16(ma565[1] + x + 8, ma[1]); + Sum565W(bs + 1, b[1]); + StoreAligned32U32(b565[1] + x + 8, b[1]); + const __m128i sr0_hi = _mm_unpackhi_epi8(sr[0], _mm_setzero_si128()); + const __m128i sr1_hi = _mm_unpackhi_epi8(sr[1], _mm_setzero_si128()); + ma[0] = LoadAligned16(ma565[0] + x + 8); + LoadAligned32U32(b565[0] + x + 8, b[0]); + p[0] = CalculateFilteredOutputPass1(sr0_hi, ma, b); + p[1] = CalculateFilteredOutput<4>(sr1_hi, ma[1], b[1]); + const __m128i d01 = SelfGuidedSingleMultiplier(sr0_hi, p[0], w0); + StoreAligned16(dst + x, _mm_packus_epi16(d00, d01)); + const __m128i d11 = SelfGuidedSingleMultiplier(sr1_hi, p[1], w0); + StoreAligned16(dst + stride + x, _mm_packus_epi16(d10, d11)); + s[0][0] = s[0][1]; + s[1][0] = s[1][1]; + sq[0][1] = sq[0][3]; + sq[1][1] = sq[1][3]; + mas[0] = mas[1]; + bs[0] = bs[2]; + x += 16; + } while (x < width); +} + +inline void BoxFilterPass1LastRow( + const uint8_t* const src, const uint8_t* const src0, const int width, + const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0, + uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565, + uint32_t* b565, uint8_t* const dst) { + __m128i s[2], mas[2], sq[4], bs[3]; + s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width); + sq[0] = SquareLo8(s[0]); + BoxFilterPreProcess5LastRowLo(s[0], scale, sum5, square_sum5, sq, &mas[0], + &bs[0]); + + int x = 0; + do { + __m128i ma[2], ma5[3], b[2][2]; + s[1] = LoadUnaligned16Msan(src0 + x + 16, + x + 16 + kOverreadInBytesPass1 - width); + BoxFilterPreProcess5LastRow(s, sum_width, x + 8, scale, sum5, square_sum5, + sq, mas, bs); + Prepare3_8<0>(mas, ma5); + ma[1] = Sum565Lo(ma5); + Sum565W(bs, b[1]); + ma[0] = LoadAligned16(ma565); + LoadAligned32U32(b565, b[0]); + const __m128i sr = LoadAligned16(src + x); + const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128()); + __m128i p = CalculateFilteredOutputPass1(sr_lo, ma, b); + const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p, w0); + + ma[1] = Sum565Hi(ma5); + Sum565W(bs + 1, b[1]); + ma[0] = LoadAligned16(ma565 + 8); + LoadAligned32U32(b565 + 8, b[0]); + const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128()); + p = CalculateFilteredOutputPass1(sr_hi, ma, b); + const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p, w0); + StoreAligned16(dst + x, _mm_packus_epi16(d0, d1)); + s[0] = s[1]; + sq[1] = sq[3]; + mas[0] = mas[1]; + bs[0] = bs[2]; + ma565 += 16; + b565 += 16; + x += 16; + } while (x < width); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPass2( + const uint8_t* const src, const uint8_t* const src0, const int width, + const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0, + uint16_t* const sum3[3], uint32_t* const square_sum3[3], + uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3], + uint32_t* const b444[2], uint8_t* const dst) { + __m128i s[2], mas[2], sq[4], bs[3]; + s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass2 - width); + sq[0] = SquareLo8(s[0]); + BoxFilterPreProcess3Lo(s[0], scale, sum3, square_sum3, sq, &mas[0], &bs[0]); + + int x = 0; + do { + s[1] = LoadUnaligned16Msan(src0 + x + 16, + x + 16 + kOverreadInBytesPass2 - width); + BoxFilterPreProcess3(s, x + 8, 
sum_width, scale, sum3, square_sum3, sq, mas, + bs); + __m128i ma[3], b[3][2], ma3[3]; + Prepare3_8<0>(mas, ma3); + Store343_444Lo(ma3, bs + 0, x, &ma[2], b[2], ma343[2], ma444[1], b343[2], + b444[1]); + const __m128i sr = LoadAligned16(src + x); + const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128()); + ma[0] = LoadAligned16(ma343[0] + x); + ma[1] = LoadAligned16(ma444[0] + x); + LoadAligned32U32(b343[0] + x, b[0]); + LoadAligned32U32(b444[0] + x, b[1]); + const __m128i p0 = CalculateFilteredOutputPass2(sr_lo, ma, b); + + Store343_444Hi(ma3, bs + 1, x + 8, &ma[2], b[2], ma343[2], ma444[1], + b343[2], b444[1]); + const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128()); + ma[0] = LoadAligned16(ma343[0] + x + 8); + ma[1] = LoadAligned16(ma444[0] + x + 8); + LoadAligned32U32(b343[0] + x + 8, b[0]); + LoadAligned32U32(b444[0] + x + 8, b[1]); + const __m128i p1 = CalculateFilteredOutputPass2(sr_hi, ma, b); + const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0); + const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0); + StoreAligned16(dst + x, _mm_packus_epi16(d0, d1)); + s[0] = s[1]; + sq[1] = sq[3]; + mas[0] = mas[1]; + bs[0] = bs[2]; + x += 16; + } while (x < width); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilter( + const uint8_t* const src, const uint8_t* const src0, + const uint8_t* const src1, const ptrdiff_t stride, const int width, + const uint16_t scales[2], const int16_t w0, const int16_t w2, + uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + const ptrdiff_t sum_width, uint16_t* const ma343[4], + uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4], + uint32_t* const b444[3], uint32_t* const b565[2], uint8_t* const dst) { + __m128i s[2][2], ma3[2][2], ma5[2], sq[2][4], b3[2][3], b5[3]; + s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width); + s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width); + sq[0][0] = SquareLo8(s[0][0]); + sq[1][0] = SquareLo8(s[1][0]); + BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq, + ma3, b3, &ma5[0], &b5[0]); + + int x = 0; + do { + __m128i ma[3][3], b[3][3][2], p[2][2], ma3x[2][3], ma5x[3]; + s[0][1] = LoadUnaligned16Msan(src0 + x + 16, + x + 16 + kOverreadInBytesPass1 - width); + s[1][1] = LoadUnaligned16Msan(src1 + x + 16, + x + 16 + kOverreadInBytesPass1 - width); + BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5, + sum_width, sq, ma3, b3, ma5, b5); + Prepare3_8<0>(ma3[0], ma3x[0]); + Prepare3_8<0>(ma3[1], ma3x[1]); + Prepare3_8<0>(ma5, ma5x); + Store343_444Lo(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], b[1][2], b[2][1], + ma343[2], ma444[1], b343[2], b444[1]); + Store343_444Lo(ma3x[1], b3[1], x, &ma[2][2], b[2][2], ma343[3], ma444[2], + b343[3], b444[2]); + ma[0][1] = Sum565Lo(ma5x); + StoreAligned16(ma565[1] + x, ma[0][1]); + Sum565W(b5, b[0][1]); + StoreAligned32U32(b565[1] + x, b[0][1]); + const __m128i sr0 = LoadAligned16(src + x); + const __m128i sr1 = LoadAligned16(src + stride + x); + const __m128i sr0_lo = _mm_unpacklo_epi8(sr0, _mm_setzero_si128()); + const __m128i sr1_lo = _mm_unpacklo_epi8(sr1, _mm_setzero_si128()); + ma[0][0] = LoadAligned16(ma565[0] + x); + LoadAligned32U32(b565[0] + x, b[0][0]); + p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]); + p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]); + ma[1][0] = LoadAligned16(ma343[0] + x); + ma[1][1] = LoadAligned16(ma444[0] + x); + LoadAligned32U32(b343[0] + x, 
b[1][0]); + LoadAligned32U32(b444[0] + x, b[1][1]); + p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]); + const __m128i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2); + ma[2][0] = LoadAligned16(ma343[1] + x); + LoadAligned32U32(b343[1] + x, b[2][0]); + p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]); + const __m128i d10 = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2); + + Store343_444Hi(ma3x[0], b3[0] + 1, x + 8, &ma[1][2], &ma[2][1], b[1][2], + b[2][1], ma343[2], ma444[1], b343[2], b444[1]); + Store343_444Hi(ma3x[1], b3[1] + 1, x + 8, &ma[2][2], b[2][2], ma343[3], + ma444[2], b343[3], b444[2]); + ma[0][1] = Sum565Hi(ma5x); + StoreAligned16(ma565[1] + x + 8, ma[0][1]); + Sum565W(b5 + 1, b[0][1]); + StoreAligned32U32(b565[1] + x + 8, b[0][1]); + const __m128i sr0_hi = _mm_unpackhi_epi8(sr0, _mm_setzero_si128()); + const __m128i sr1_hi = _mm_unpackhi_epi8(sr1, _mm_setzero_si128()); + ma[0][0] = LoadAligned16(ma565[0] + x + 8); + LoadAligned32U32(b565[0] + x + 8, b[0][0]); + p[0][0] = CalculateFilteredOutputPass1(sr0_hi, ma[0], b[0]); + p[1][0] = CalculateFilteredOutput<4>(sr1_hi, ma[0][1], b[0][1]); + ma[1][0] = LoadAligned16(ma343[0] + x + 8); + ma[1][1] = LoadAligned16(ma444[0] + x + 8); + LoadAligned32U32(b343[0] + x + 8, b[1][0]); + LoadAligned32U32(b444[0] + x + 8, b[1][1]); + p[0][1] = CalculateFilteredOutputPass2(sr0_hi, ma[1], b[1]); + const __m128i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2); + StoreAligned16(dst + x, _mm_packus_epi16(d00, d01)); + ma[2][0] = LoadAligned16(ma343[1] + x + 8); + LoadAligned32U32(b343[1] + x + 8, b[2][0]); + p[1][1] = CalculateFilteredOutputPass2(sr1_hi, ma[2], b[2]); + const __m128i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2); + StoreAligned16(dst + stride + x, _mm_packus_epi16(d10, d11)); + s[0][0] = s[0][1]; + s[1][0] = s[1][1]; + sq[0][1] = sq[0][3]; + sq[1][1] = sq[1][3]; + ma3[0][0] = ma3[0][1]; + ma3[1][0] = ma3[1][1]; + ma5[0] = ma5[1]; + b3[0][0] = b3[0][2]; + b3[1][0] = b3[1][2]; + b5[0] = b5[2]; + x += 16; + } while (x < width); +} + +inline void BoxFilterLastRow( + const uint8_t* const src, const uint8_t* const src0, const int width, + const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0, + const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565, + uint32_t* const b343, uint32_t* const b444, uint32_t* const b565, + uint8_t* const dst) { + __m128i s[2], ma3[2], ma5[2], sq[4], b3[3], b5[3], ma[3], b[3][2]; + s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width); + sq[0] = SquareLo8(s[0]); + BoxFilterPreProcessLastRowLo(s[0], scales, sum3, sum5, square_sum3, + square_sum5, sq, &ma3[0], &ma5[0], &b3[0], + &b5[0]); + + int x = 0; + do { + __m128i ma3x[3], ma5x[3], p[2]; + s[1] = LoadUnaligned16Msan(src0 + x + 16, + x + 16 + kOverreadInBytesPass1 - width); + BoxFilterPreProcessLastRow(s, sum_width, x + 8, scales, sum3, sum5, + square_sum3, square_sum5, sq, ma3, ma5, b3, b5); + Prepare3_8<0>(ma3, ma3x); + Prepare3_8<0>(ma5, ma5x); + ma[1] = Sum565Lo(ma5x); + Sum565W(b5, b[1]); + ma[2] = Sum343Lo(ma3x); + Sum343W(b3, b[2]); + const __m128i sr = LoadAligned16(src + x); + const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128()); + ma[0] = LoadAligned16(ma565 + x); + LoadAligned32U32(b565 + x, b[0]); + p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b); + ma[0] = LoadAligned16(ma343 + x); + ma[1] = 
LoadAligned16(ma444 + x); + LoadAligned32U32(b343 + x, b[0]); + LoadAligned32U32(b444 + x, b[1]); + p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b); + const __m128i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2); + + ma[1] = Sum565Hi(ma5x); + Sum565W(b5 + 1, b[1]); + ma[2] = Sum343Hi(ma3x); + Sum343W(b3 + 1, b[2]); + const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128()); + ma[0] = LoadAligned16(ma565 + x + 8); + LoadAligned32U32(b565 + x + 8, b[0]); + p[0] = CalculateFilteredOutputPass1(sr_hi, ma, b); + ma[0] = LoadAligned16(ma343 + x + 8); + ma[1] = LoadAligned16(ma444 + x + 8); + LoadAligned32U32(b343 + x + 8, b[0]); + LoadAligned32U32(b444 + x + 8, b[1]); + p[1] = CalculateFilteredOutputPass2(sr_hi, ma, b); + const __m128i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2); + StoreAligned16(dst + x, _mm_packus_epi16(d0, d1)); + s[0] = s[1]; + sq[1] = sq[3]; + ma3[0] = ma3[1]; + ma5[0] = ma5[1]; + b3[0] = b3[2]; + b5[0] = b5[2]; + x += 16; + } while (x < width); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( + const RestorationUnitInfo& restoration_info, const uint8_t* src, + const ptrdiff_t stride, const uint8_t* const top_border, + const ptrdiff_t top_border_stride, const uint8_t* bottom_border, + const ptrdiff_t bottom_border_stride, const int width, const int height, + SgrBuffer* const sgr_buffer, uint8_t* dst) { + const auto temp_stride = Align(width, 16); + const auto sum_width = Align(width + 8, 16); + const auto sum_stride = temp_stride + 16; + const int sgr_proj_index = restoration_info.sgr_proj_info.index; + const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12. + const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0]; + const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1]; + const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1; + uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2]; + uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2]; + sum3[0] = sgr_buffer->sum3; + square_sum3[0] = sgr_buffer->square_sum3; + ma343[0] = sgr_buffer->ma343; + b343[0] = sgr_buffer->b343; + for (int i = 1; i <= 3; ++i) { + sum3[i] = sum3[i - 1] + sum_stride; + square_sum3[i] = square_sum3[i - 1] + sum_stride; + ma343[i] = ma343[i - 1] + temp_stride; + b343[i] = b343[i - 1] + temp_stride; + } + sum5[0] = sgr_buffer->sum5; + square_sum5[0] = sgr_buffer->square_sum5; + for (int i = 1; i <= 4; ++i) { + sum5[i] = sum5[i - 1] + sum_stride; + square_sum5[i] = square_sum5[i - 1] + sum_stride; + } + ma444[0] = sgr_buffer->ma444; + b444[0] = sgr_buffer->b444; + for (int i = 1; i <= 2; ++i) { + ma444[i] = ma444[i - 1] + temp_stride; + b444[i] = b444[i - 1] + temp_stride; + } + ma565[0] = sgr_buffer->ma565; + ma565[1] = ma565[0] + temp_stride; + b565[0] = sgr_buffer->b565; + b565[1] = b565[0] + temp_stride; + assert(scales[0] != 0); + assert(scales[1] != 0); + BoxSum(top_border, top_border_stride, width, sum_stride, sum_width, sum3[0], + sum5[1], square_sum3[0], square_sum5[1]); + sum5[0] = sum5[1]; + square_sum5[0] = square_sum5[1]; + const uint8_t* const s = (height > 1) ? 
src + stride : bottom_border; + BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3, + square_sum5, sum_width, ma343, ma444[0], ma565[0], + b343, b444[0], b565[0]); + sum5[0] = sgr_buffer->sum5; + square_sum5[0] = sgr_buffer->square_sum5; + + for (int y = (height >> 1) - 1; y > 0; --y) { + Circulate4PointersBy2(sum3); + Circulate4PointersBy2(square_sum3); + Circulate5PointersBy2(sum5); + Circulate5PointersBy2(square_sum5); + BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width, + scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width, + ma343, ma444, ma565, b343, b444, b565, dst); + src += 2 * stride; + dst += 2 * stride; + Circulate4PointersBy2(ma343); + Circulate4PointersBy2(b343); + std::swap(ma444[0], ma444[2]); + std::swap(b444[0], b444[2]); + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + } + + Circulate4PointersBy2(sum3); + Circulate4PointersBy2(square_sum3); + Circulate5PointersBy2(sum5); + Circulate5PointersBy2(square_sum5); + if ((height & 1) == 0 || height > 1) { + const uint8_t* sr[2]; + if ((height & 1) == 0) { + sr[0] = bottom_border; + sr[1] = bottom_border + bottom_border_stride; + } else { + sr[0] = src + 2 * stride; + sr[1] = bottom_border; + } + BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5, + square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343, + b444, b565, dst); + } + if ((height & 1) != 0) { + if (height > 1) { + src += 2 * stride; + dst += 2 * stride; + Circulate4PointersBy2(sum3); + Circulate4PointersBy2(square_sum3); + Circulate5PointersBy2(sum5); + Circulate5PointersBy2(square_sum5); + Circulate4PointersBy2(ma343); + Circulate4PointersBy2(b343); + std::swap(ma444[0], ma444[2]); + std::swap(b444[0], b444[2]); + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + } + BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width, + sum_width, scales, w0, w2, sum3, sum5, square_sum3, + square_sum5, ma343[0], ma444[0], ma565[0], b343[0], + b444[0], b565[0], dst); + } +} + +inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, + const uint8_t* src, const ptrdiff_t stride, + const uint8_t* const top_border, + const ptrdiff_t top_border_stride, + const uint8_t* bottom_border, + const ptrdiff_t bottom_border_stride, + const int width, const int height, + SgrBuffer* const sgr_buffer, uint8_t* dst) { + const auto temp_stride = Align(width, 16); + const auto sum_width = Align(width + 8, 16); + const auto sum_stride = temp_stride + 16; + const int sgr_proj_index = restoration_info.sgr_proj_info.index; + const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12. + const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0]; + uint16_t *sum5[5], *ma565[2]; + uint32_t *square_sum5[5], *b565[2]; + sum5[0] = sgr_buffer->sum5; + square_sum5[0] = sgr_buffer->square_sum5; + for (int i = 1; i <= 4; ++i) { + sum5[i] = sum5[i - 1] + sum_stride; + square_sum5[i] = square_sum5[i - 1] + sum_stride; + } + ma565[0] = sgr_buffer->ma565; + ma565[1] = ma565[0] + temp_stride; + b565[0] = sgr_buffer->b565; + b565[1] = b565[0] + temp_stride; + assert(scale != 0); + BoxSum<5>(top_border, top_border_stride, width, sum_stride, sum_width, + sum5[1], square_sum5[1]); + sum5[0] = sum5[1]; + square_sum5[0] = square_sum5[1]; + const uint8_t* const s = (height > 1) ? 
src + stride : bottom_border; + BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width, + ma565[0], b565[0]); + sum5[0] = sgr_buffer->sum5; + square_sum5[0] = sgr_buffer->square_sum5; + + for (int y = (height >> 1) - 1; y > 0; --y) { + Circulate5PointersBy2(sum5); + Circulate5PointersBy2(square_sum5); + BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5, + square_sum5, width, sum_width, scale, w0, ma565, b565, dst); + src += 2 * stride; + dst += 2 * stride; + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + } + + Circulate5PointersBy2(sum5); + Circulate5PointersBy2(square_sum5); + if ((height & 1) == 0 || height > 1) { + const uint8_t* sr[2]; + if ((height & 1) == 0) { + sr[0] = bottom_border; + sr[1] = bottom_border + bottom_border_stride; + } else { + sr[0] = src + 2 * stride; + sr[1] = bottom_border; + } + BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width, + sum_width, scale, w0, ma565, b565, dst); + } + if ((height & 1) != 0) { + src += 3; + if (height > 1) { + src += 2 * stride; + dst += 2 * stride; + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + Circulate5PointersBy2(sum5); + Circulate5PointersBy2(square_sum5); + } + BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width, + sum_width, scale, w0, sum5, square_sum5, ma565[0], + b565[0], dst); + } +} + +inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, + const uint8_t* src, const ptrdiff_t stride, + const uint8_t* const top_border, + const ptrdiff_t top_border_stride, + const uint8_t* bottom_border, + const ptrdiff_t bottom_border_stride, + const int width, const int height, + SgrBuffer* const sgr_buffer, uint8_t* dst) { + assert(restoration_info.sgr_proj_info.multiplier[0] == 0); + const auto temp_stride = Align(width, 16); + const auto sum_width = Align(width + 8, 16); + const auto sum_stride = temp_stride + 16; + const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1]; + const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1; + const int sgr_proj_index = restoration_info.sgr_proj_info.index; + const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1]; // < 2^12. 
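+  // Weight arithmetic above, spelled out: kSgrProjPrecisionBits is 7, so the
+  // two projection weights always sum to 1 << 7 == 128. For example, a
+  // multiplier[1] of 47 yields w1 == 47 and w0 == 81.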
+  uint16_t *sum3[3], *ma343[3], *ma444[2];
+  uint32_t *square_sum3[3], *b343[3], *b444[2];
+  sum3[0] = sgr_buffer->sum3;
+  square_sum3[0] = sgr_buffer->square_sum3;
+  ma343[0] = sgr_buffer->ma343;
+  b343[0] = sgr_buffer->b343;
+  for (int i = 1; i <= 2; ++i) {
+    sum3[i] = sum3[i - 1] + sum_stride;
+    square_sum3[i] = square_sum3[i - 1] + sum_stride;
+    ma343[i] = ma343[i - 1] + temp_stride;
+    b343[i] = b343[i - 1] + temp_stride;
+  }
+  ma444[0] = sgr_buffer->ma444;
+  ma444[1] = ma444[0] + temp_stride;
+  b444[0] = sgr_buffer->b444;
+  b444[1] = b444[0] + temp_stride;
+  assert(scale != 0);
+  BoxSum<3>(top_border, top_border_stride, width, sum_stride, sum_width,
+            sum3[0], square_sum3[0]);
+  BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
+                                 sum_width, ma343[0], nullptr, b343[0],
+                                 nullptr);
+  Circulate3PointersBy1<uint16_t>(sum3);
+  Circulate3PointersBy1<uint32_t>(square_sum3);
+  const uint8_t* s;
+  if (height > 1) {
+    s = src + stride;
+  } else {
+    s = bottom_border;
+    bottom_border += bottom_border_stride;
+  }
+  BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
+                                ma343[1], ma444[0], b343[1], b444[0]);
+
+  for (int y = height - 2; y > 0; --y) {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3,
+                   square_sum3, ma343, ma444, b343, b444, dst);
+    src += stride;
+    dst += stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  }
+
+  int y = std::min(height, 2);
+  src += 2;
+  do {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3,
+                   square_sum3, ma343, ma444, b343, b444, dst);
+    src += stride;
+    dst += stride;
+    bottom_border += bottom_border_stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  } while (--y != 0);
+}
+
+// If |width| is not a multiple of 16, up to 15 more pixels are written to
+// |dest| at the end of each row. It is safe to overwrite the output as it
+// will not be part of the visible frame.
+void SelfGuidedFilter_SSE4_1(
+    const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
+    const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
+    const void* LIBGAV1_RESTRICT const top_border,
+    const ptrdiff_t top_border_stride,
+    const void* LIBGAV1_RESTRICT const bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
+    void* LIBGAV1_RESTRICT const dest) {
+  const int index = restoration_info.sgr_proj_info.index;
+  const int radius_pass_0 = kSgrProjParams[index][0];  // 2 or 0
+  const int radius_pass_1 = kSgrProjParams[index][2];  // 1 or 0
+  const auto* const src = static_cast<const uint8_t*>(source);
+  const auto* top = static_cast<const uint8_t*>(top_border);
+  const auto* bottom = static_cast<const uint8_t*>(bottom_border);
+  auto* const dst = static_cast<uint8_t*>(dest);
+  SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+  if (radius_pass_1 == 0) {
+    // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+    // following assertion.
+    assert(radius_pass_0 != 0);
+    BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3,
+                          top_border_stride, bottom - 3, bottom_border_stride,
+                          width, height, sgr_buffer, dst);
+  } else if (radius_pass_0 == 0) {
+    BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2,
+                          top_border_stride, bottom - 2, bottom_border_stride,
+                          width, height, sgr_buffer, dst);
+  } else {
+    BoxFilterProcess(restoration_info, src - 3, stride, top - 3,
+                     top_border_stride, bottom - 3, bottom_border_stride, width,
+                     height, sgr_buffer, dst);
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  static_cast<void>(dsp);
+#if DSP_ENABLED_8BPP_SSE4_1(WienerFilter)
+  dsp->loop_restorations[0] = WienerFilter_SSE4_1;
+#else
+  static_cast<void>(WienerFilter_SSE4_1);
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(SelfGuidedFilter)
+  dsp->loop_restorations[1] = SelfGuidedFilter_SSE4_1;
+#else
+  static_cast<void>(SelfGuidedFilter_SSE4_1);
+#endif
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+void LoopRestorationInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void LoopRestorationInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/loop_restoration_sse4.h b/src/dsp/x86/loop_restoration_sse4.h
new file mode 100644
index 0000000..00df3af
--- /dev/null
+++ b/src/dsp/x86/loop_restoration_sse4.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::loop_restorations, see the defines below for specifics.
+// These functions are not thread-safe.
+void LoopRestorationInit_SSE4_1();
+void LoopRestorationInit10bpp_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the sse4 implementation should be used.
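+//
+// For example, when a build also targets AVX2 and that header has already
+// defined LIBGAV1_Dsp8bpp_SelfGuidedFilter as LIBGAV1_CPU_AVX2, the matching
+// #ifndef below is skipped and this SSE4 implementation is kept only as a
+// runtime fallback. (Illustrative scenario; the actual values come from
+// src/utils/cpu.h.)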
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_WienerFilter
+#define LIBGAV1_Dsp8bpp_WienerFilter LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_SelfGuidedFilter
+#define LIBGAV1_Dsp8bpp_SelfGuidedFilter LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WienerFilter
+#define LIBGAV1_Dsp10bpp_WienerFilter LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_SelfGuidedFilter
+#define LIBGAV1_Dsp10bpp_SelfGuidedFilter LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_SSE4_H_
diff --git a/src/dsp/x86/mask_blend_sse4.cc b/src/dsp/x86/mask_blend_sse4.cc
new file mode 100644
index 0000000..a18444b
--- /dev/null
+++ b/src/dsp/x86/mask_blend_sse4.cc
@@ -0,0 +1,959 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/mask_blend.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Width can only be 4 when it is subsampled from a block of width 8, hence
+// subsampling_x is always 1 when this function is called.
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetMask4x2(const uint8_t* LIBGAV1_RESTRICT mask,
+                          ptrdiff_t mask_stride) {
+  if (subsampling_x == 1) {
+    const __m128i mask_val_0 = _mm_cvtepu8_epi16(LoadLo8(mask));
+    const __m128i mask_val_1 =
+        _mm_cvtepu8_epi16(LoadLo8(mask + (mask_stride << subsampling_y)));
+    __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+    if (subsampling_y == 1) {
+      const __m128i next_mask_val_0 =
+          _mm_cvtepu8_epi16(LoadLo8(mask + mask_stride));
+      const __m128i next_mask_val_1 =
+          _mm_cvtepu8_epi16(LoadLo8(mask + mask_stride * 3));
+      subsampled_mask = _mm_add_epi16(
+          subsampled_mask, _mm_hadd_epi16(next_mask_val_0, next_mask_val_1));
+    }
+    return RightShiftWithRounding_U16(subsampled_mask, 1 + subsampling_y);
+  }
+  const __m128i mask_val_0 = Load4(mask);
+  const __m128i mask_val_1 = Load4(mask + mask_stride);
+  return _mm_cvtepu8_epi16(
+      _mm_or_si128(mask_val_0, _mm_slli_si128(mask_val_1, 4)));
+}
+
+// This function returns a 16-bit packed mask to fit in _mm_madd_epi16.
+// 16-bit is also the lowest packing for hadd, but without subsampling there is
+// an unfortunate conversion required.
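+//
+// Scalar sketch of the subsampling done here and in GetMask4x2() (an
+// illustration, not library code): with 4:2:0, each output mask value is the
+// rounded average of a 2x2 neighborhood,
+//   m = (mask[2 * y][2 * x] + mask[2 * y][2 * x + 1] +
+//        mask[2 * y + 1][2 * x] + mask[2 * y + 1][2 * x + 1] + 2) >> 2;
+// _mm_hadd_epi16 forms the horizontal pairs and RightShiftWithRounding_U16
+// applies the final rounded shift.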
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetMask8(const uint8_t* LIBGAV1_RESTRICT mask,
+                        ptrdiff_t stride) {
+  if (subsampling_x == 1) {
+    const __m128i row_vals = LoadUnaligned16(mask);
+
+    const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals);
+    const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8));
+    __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+
+    if (subsampling_y == 1) {
+      const __m128i next_row_vals = LoadUnaligned16(mask + stride);
+      const __m128i next_mask_val_0 = _mm_cvtepu8_epi16(next_row_vals);
+      const __m128i next_mask_val_1 =
+          _mm_cvtepu8_epi16(_mm_srli_si128(next_row_vals, 8));
+      subsampled_mask = _mm_add_epi16(
+          subsampled_mask, _mm_hadd_epi16(next_mask_val_0, next_mask_val_1));
+    }
+    return RightShiftWithRounding_U16(subsampled_mask, 1 + subsampling_y);
+  }
+  assert(subsampling_y == 0 && subsampling_x == 0);
+  const __m128i mask_val = LoadLo8(mask);
+  return _mm_cvtepu8_epi16(mask_val);
+}
+
+// This version returns 8-bit packed values to fit in _mm_maddubs_epi16 because,
+// when is_inter_intra is true, the prediction values are brought to 8-bit
+// packing as well.
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetInterIntraMask8(const uint8_t* LIBGAV1_RESTRICT mask,
+                                  ptrdiff_t stride) {
+  if (subsampling_x == 1) {
+    const __m128i row_vals = LoadUnaligned16(mask);
+
+    const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals);
+    const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8));
+    __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+
+    if (subsampling_y == 1) {
+      const __m128i next_row_vals = LoadUnaligned16(mask + stride);
+      const __m128i next_mask_val_0 = _mm_cvtepu8_epi16(next_row_vals);
+      const __m128i next_mask_val_1 =
+          _mm_cvtepu8_epi16(_mm_srli_si128(next_row_vals, 8));
+      subsampled_mask = _mm_add_epi16(
+          subsampled_mask, _mm_hadd_epi16(next_mask_val_0, next_mask_val_1));
+    }
+    const __m128i ret =
+        RightShiftWithRounding_U16(subsampled_mask, 1 + subsampling_y);
+    return _mm_packus_epi16(ret, ret);
+  }
+  assert(subsampling_y == 0 && subsampling_x == 0);
+  // Unfortunately there is no shift operation for 8-bit packing, or else we
+  // could return everything with 8-bit packing.
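+  // (There is no per-byte shift such as a hypothetical _mm_srli_epi8;
+  // emulating one takes a 16-bit shift plus a mask, so the subsampled paths
+  // above do the rounded shift at 16 bits and pack down afterwards.)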
+  const __m128i mask_val = LoadLo8(mask);
+  return mask_val;
+}
+
+inline void WriteMaskBlendLine4x2(const int16_t* LIBGAV1_RESTRICT const pred_0,
+                                  const int16_t* LIBGAV1_RESTRICT const pred_1,
+                                  const __m128i pred_mask_0,
+                                  const __m128i pred_mask_1,
+                                  uint8_t* LIBGAV1_RESTRICT dst,
+                                  const ptrdiff_t dst_stride) {
+  const __m128i pred_val_0 = LoadAligned16(pred_0);
+  const __m128i pred_val_1 = LoadAligned16(pred_1);
+  const __m128i mask_lo = _mm_unpacklo_epi16(pred_mask_0, pred_mask_1);
+  const __m128i mask_hi = _mm_unpackhi_epi16(pred_mask_0, pred_mask_1);
+  const __m128i pred_lo = _mm_unpacklo_epi16(pred_val_0, pred_val_1);
+  const __m128i pred_hi = _mm_unpackhi_epi16(pred_val_0, pred_val_1);
+
+  // int res = (mask_value * prediction_0[x] +
+  //            (64 - mask_value) * prediction_1[x]) >> 6;
+  const __m128i compound_pred_lo = _mm_madd_epi16(pred_lo, mask_lo);
+  const __m128i compound_pred_hi = _mm_madd_epi16(pred_hi, mask_hi);
+  const __m128i compound_pred = _mm_packus_epi32(
+      _mm_srli_epi32(compound_pred_lo, 6), _mm_srli_epi32(compound_pred_hi, 6));
+
+  // dst[x] = static_cast<Pixel>(
+  //     Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+  //           (1 << kBitdepth8) - 1));
+  const __m128i result = RightShiftWithRounding_S16(compound_pred, 4);
+  const __m128i res = _mm_packus_epi16(result, result);
+  Store4(dst, res);
+  Store4(dst + dst_stride, _mm_srli_si128(res, 4));
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlending4x4_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0,
+                                 const int16_t* LIBGAV1_RESTRICT pred_1,
+                                 const uint8_t* LIBGAV1_RESTRICT mask,
+                                 const ptrdiff_t mask_stride,
+                                 uint8_t* LIBGAV1_RESTRICT dst,
+                                 const ptrdiff_t dst_stride) {
+  const __m128i mask_inverter = _mm_set1_epi16(64);
+  __m128i pred_mask_0 =
+      GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+  __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+  WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+                        dst_stride);
+  pred_0 += 4 << 1;
+  pred_1 += 4 << 1;
+  mask += mask_stride << (1 + subsampling_y);
+  dst += dst_stride << 1;
+
+  pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+  pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+  WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+                        dst_stride);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlending4xH_SSE4(const int16_t* LIBGAV1_RESTRICT pred_0,
+                                 const int16_t* LIBGAV1_RESTRICT pred_1,
+                                 const uint8_t* LIBGAV1_RESTRICT const mask_ptr,
+                                 const ptrdiff_t mask_stride, const int height,
+                                 uint8_t* LIBGAV1_RESTRICT dst,
+                                 const ptrdiff_t dst_stride) {
+  const uint8_t* mask = mask_ptr;
+  if (height == 4) {
+    MaskBlending4x4_SSE4<subsampling_x, subsampling_y>(
+        pred_0, pred_1, mask, mask_stride, dst, dst_stride);
+    return;
+  }
+  const __m128i mask_inverter = _mm_set1_epi16(64);
+  int y = 0;
+  do {
+    __m128i pred_mask_0 =
+        GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+    __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+
+    WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+                          dst_stride);
+    pred_0 += 4 << 1;
+    pred_1 += 4 << 1;
+    mask += mask_stride << (1 + subsampling_y);
+    dst += dst_stride << 1;
+
+    pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+    pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+    WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+                          dst_stride);
+    pred_0 += 4 << 1;
+    pred_1 += 4 << 1;
+    mask += mask_stride << (1 + subsampling_y);
+    dst += dst_stride << 1;
+
+    pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+    pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+    WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+                          dst_stride);
+    pred_0 += 4 << 1;
+    pred_1 += 4 << 1;
+    mask += mask_stride << (1 + subsampling_y);
+    dst += dst_stride << 1;
+
+    pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+    pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+    WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+                          dst_stride);
+    pred_0 += 4 << 1;
+    pred_1 += 4 << 1;
+    mask += mask_stride << (1 + subsampling_y);
+    dst += dst_stride << 1;
+    y += 8;
+  } while (y < height);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlend_SSE4(const void* LIBGAV1_RESTRICT prediction_0,
+                           const void* LIBGAV1_RESTRICT prediction_1,
+                           const ptrdiff_t /*prediction_stride_1*/,
+                           const uint8_t* LIBGAV1_RESTRICT const mask_ptr,
+                           const ptrdiff_t mask_stride, const int width,
+                           const int height, void* LIBGAV1_RESTRICT dest,
+                           const ptrdiff_t dst_stride) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  const ptrdiff_t pred_stride_0 = width;
+  const ptrdiff_t pred_stride_1 = width;
+  if (width == 4) {
+    MaskBlending4xH_SSE4<subsampling_x, subsampling_y>(
+        pred_0, pred_1, mask_ptr, mask_stride, height, dst, dst_stride);
+    return;
+  }
+  const uint8_t* mask = mask_ptr;
+  const __m128i mask_inverter = _mm_set1_epi16(64);
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      const __m128i pred_mask_0 = GetMask8<subsampling_x, subsampling_y>(
+          mask + (x << subsampling_x), mask_stride);
+      // 64 - mask
+      const __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+      const __m128i mask_lo = _mm_unpacklo_epi16(pred_mask_0, pred_mask_1);
+      const __m128i mask_hi = _mm_unpackhi_epi16(pred_mask_0, pred_mask_1);
+
+      const __m128i pred_val_0 = LoadAligned16(pred_0 + x);
+      const __m128i pred_val_1 = LoadAligned16(pred_1 + x);
+      const __m128i pred_lo = _mm_unpacklo_epi16(pred_val_0, pred_val_1);
+      const __m128i pred_hi = _mm_unpackhi_epi16(pred_val_0, pred_val_1);
+      // int res = (mask_value * prediction_0[x] +
+      //            (64 - mask_value) * prediction_1[x]) >> 6;
+      const __m128i compound_pred_lo = _mm_madd_epi16(pred_lo, mask_lo);
+      const __m128i compound_pred_hi = _mm_madd_epi16(pred_hi, mask_hi);
+
+      const __m128i res = _mm_packus_epi32(_mm_srli_epi32(compound_pred_lo, 6),
+                                           _mm_srli_epi32(compound_pred_hi, 6));
+      // dst[x] = static_cast<Pixel>(
+      //     Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+      //           (1 << kBitdepth8) - 1));
+      const __m128i result = RightShiftWithRounding_S16(res, 4);
+      StoreLo8(dst + x, _mm_packus_epi16(result, result));
+
+      x += 8;
+    } while (x < width);
+    dst += dst_stride;
+    pred_0 += pred_stride_0;
+    pred_1 += pred_stride_1;
+    mask += mask_stride << subsampling_y;
+  } while (++y < height);
+}
+
+inline void InterIntraWriteMaskBlendLine8bpp4x2(
+    const uint8_t* LIBGAV1_RESTRICT const pred_0,
+    uint8_t* LIBGAV1_RESTRICT const pred_1, const ptrdiff_t pred_stride_1,
+    const __m128i pred_mask_0, const __m128i pred_mask_1) {
+  const __m128i pred_mask = _mm_unpacklo_epi8(pred_mask_0, pred_mask_1);
+
+  const __m128i pred_val_0 = LoadLo8(pred_0);
+  // TODO(b/150326556): One load.
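+  // (The TODO refers to replacing the two Load4() calls below with one
+  // 8-byte load, which is only possible when the two rows of |pred_1| are
+  // known to be contiguous; this is the intent, not current behavior.)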
+ __m128i pred_val_1 = Load4(pred_1); + pred_val_1 = _mm_or_si128(_mm_slli_si128(Load4(pred_1 + pred_stride_1), 4), + pred_val_1); + const __m128i pred = _mm_unpacklo_epi8(pred_val_0, pred_val_1); + // int res = (mask_value * prediction_1[x] + + // (64 - mask_value) * prediction_0[x]) >> 6; + const __m128i compound_pred = _mm_maddubs_epi16(pred, pred_mask); + const __m128i result = RightShiftWithRounding_U16(compound_pred, 6); + const __m128i res = _mm_packus_epi16(result, result); + + Store4(pred_1, res); + Store4(pred_1 + pred_stride_1, _mm_srli_si128(res, 4)); +} + +template +inline void InterIntraMaskBlending8bpp4x4_SSE4( + const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1, + const ptrdiff_t pred_stride_1, const uint8_t* LIBGAV1_RESTRICT mask, + const ptrdiff_t mask_stride) { + const __m128i mask_inverter = _mm_set1_epi8(64); + const __m128i pred_mask_u16_first = + GetMask4x2(mask, mask_stride); + mask += mask_stride << (1 + subsampling_y); + const __m128i pred_mask_u16_second = + GetMask4x2(mask, mask_stride); + mask += mask_stride << (1 + subsampling_y); + __m128i pred_mask_1 = + _mm_packus_epi16(pred_mask_u16_first, pred_mask_u16_second); + __m128i pred_mask_0 = _mm_sub_epi8(mask_inverter, pred_mask_1); + InterIntraWriteMaskBlendLine8bpp4x2(pred_0, pred_1, pred_stride_1, + pred_mask_0, pred_mask_1); + pred_0 += 4 << 1; + pred_1 += pred_stride_1 << 1; + + pred_mask_1 = _mm_srli_si128(pred_mask_1, 8); + pred_mask_0 = _mm_sub_epi8(mask_inverter, pred_mask_1); + InterIntraWriteMaskBlendLine8bpp4x2(pred_0, pred_1, pred_stride_1, + pred_mask_0, pred_mask_1); +} + +template +inline void InterIntraMaskBlending8bpp4xH_SSE4( + const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1, + const ptrdiff_t pred_stride_1, + const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride, + const int height) { + const uint8_t* mask = mask_ptr; + if (height == 4) { + InterIntraMaskBlending8bpp4x4_SSE4( + pred_0, pred_1, pred_stride_1, mask, mask_stride); + return; + } + int y = 0; + do { + InterIntraMaskBlending8bpp4x4_SSE4( + pred_0, pred_1, pred_stride_1, mask, mask_stride); + pred_0 += 4 << 2; + pred_1 += pred_stride_1 << 2; + mask += mask_stride << (2 + subsampling_y); + + InterIntraMaskBlending8bpp4x4_SSE4( + pred_0, pred_1, pred_stride_1, mask, mask_stride); + pred_0 += 4 << 2; + pred_1 += pred_stride_1 << 2; + mask += mask_stride << (2 + subsampling_y); + y += 8; + } while (y < height); +} + +template +void InterIntraMaskBlend8bpp_SSE4( + const uint8_t* LIBGAV1_RESTRICT prediction_0, + uint8_t* LIBGAV1_RESTRICT prediction_1, const ptrdiff_t prediction_stride_1, + const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride, + const int width, const int height) { + if (width == 4) { + InterIntraMaskBlending8bpp4xH_SSE4( + prediction_0, prediction_1, prediction_stride_1, mask_ptr, mask_stride, + height); + return; + } + const uint8_t* mask = mask_ptr; + const __m128i mask_inverter = _mm_set1_epi8(64); + int y = 0; + do { + int x = 0; + do { + const __m128i pred_mask_1 = + GetInterIntraMask8( + mask + (x << subsampling_x), mask_stride); + // 64 - mask + const __m128i pred_mask_0 = _mm_sub_epi8(mask_inverter, pred_mask_1); + const __m128i pred_mask = _mm_unpacklo_epi8(pred_mask_0, pred_mask_1); + + const __m128i pred_val_0 = LoadLo8(prediction_0 + x); + const __m128i pred_val_1 = LoadLo8(prediction_1 + x); + const __m128i pred = _mm_unpacklo_epi8(pred_val_0, pred_val_1); + // int res = (mask_value * prediction_1[x] + + // (64 - 
mask_value) * prediction_0[x]) >> 6; + const __m128i compound_pred = _mm_maddubs_epi16(pred, pred_mask); + const __m128i result = RightShiftWithRounding_U16(compound_pred, 6); + const __m128i res = _mm_packus_epi16(result, result); + + StoreLo8(prediction_1 + x, res); + + x += 8; + } while (x < width); + prediction_0 += width; + prediction_1 += prediction_stride_1; + mask += mask_stride << subsampling_y; + } while (++y < height); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); +#if DSP_ENABLED_8BPP_SSE4_1(MaskBlend444) + dsp->mask_blend[0][0] = MaskBlend_SSE4<0, 0>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(MaskBlend422) + dsp->mask_blend[1][0] = MaskBlend_SSE4<1, 0>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(MaskBlend420) + dsp->mask_blend[2][0] = MaskBlend_SSE4<1, 1>; +#endif + // The is_inter_intra index of mask_blend[][] is replaced by + // inter_intra_mask_blend_8bpp[] in 8-bit. +#if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp444) + dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_SSE4<0, 0>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp422) + dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_SSE4<1, 0>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp420) + dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_SSE4<1, 1>; +#endif +} + +} // namespace +} // namespace low_bitdepth + +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +constexpr int kMax10bppSample = (1 << 10) - 1; +constexpr int kMaskInverse = 64; +constexpr int kRoundBitsMaskBlend = 4; + +inline __m128i RightShiftWithRoundingZero_U16(const __m128i v_val_d, int bits, + const __m128i zero) { + // Shift out all but the last bit. + const __m128i v_tmp_d = _mm_srli_epi16(v_val_d, bits - 1); + // Avg with zero will shift by 1 and round. 
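+  // _mm_avg_epu16(v, 0) computes (v + 0 + 1) >> 1, a rounded halving, so the
+  // two shifts together implement RightShiftWithRounding. Worked example for
+  // bits == 2 and v_val_d == 7: 7 >> 1 == 3, then avg(3, 0) == 2, which
+  // matches (7 + 2) >> 2 == 2.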
+  return _mm_avg_epu16(v_tmp_d, zero);
+}
+
+inline __m128i RightShiftWithRoundingConst_S32(const __m128i v_val_d, int bits,
+                                               const __m128i shift) {
+  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, shift);
+  return _mm_srai_epi32(v_tmp_d, bits);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride,
+                          const __m128i zero) {
+  if (subsampling_x == 1) {
+    if (subsampling_y == 0) {
+      const __m128i mask_val_0 = _mm_cvtepu8_epi16(LoadLo8(mask));
+      const __m128i mask_val_1 =
+          _mm_cvtepu8_epi16(LoadLo8(mask + (mask_stride << subsampling_y)));
+      __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+      return RightShiftWithRoundingZero_U16(subsampled_mask, 1, zero);
+    }
+    const __m128i one = _mm_set1_epi8(1);
+    const __m128i mask_val_0 =
+        LoadHi8(LoadLo8(mask), mask + (mask_stride << 1));
+    const __m128i mask_val_1 = LoadHi8(LoadLo8(mask + mask_stride),
+                                       mask + (mask_stride << 1) + mask_stride);
+    const __m128i add = _mm_adds_epu8(mask_val_0, mask_val_1);
+    const __m128i subsampled_mask = _mm_maddubs_epi16(add, one);
+    return RightShiftWithRoundingZero_U16(subsampled_mask, 2, zero);
+  }
+  assert(subsampling_y == 0 && subsampling_x == 0);
+  const __m128i mask_val_0 = Load4(mask);
+  const __m128i mask_val_1 = Load4(mask + mask_stride);
+  return _mm_cvtepu8_epi16(
+      _mm_or_si128(mask_val_0, _mm_slli_si128(mask_val_1, 4)));
+}
+
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetMask8(const uint8_t* mask, const ptrdiff_t stride,
+                        const __m128i zero) {
+  if (subsampling_x == 1) {
+    if (subsampling_y == 0) {
+      const __m128i row_vals = LoadUnaligned16(mask);
+      const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals);
+      const __m128i mask_val_1 =
+          _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8));
+      __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+      return RightShiftWithRoundingZero_U16(subsampled_mask, 1, zero);
+    }
+    const __m128i one = _mm_set1_epi8(1);
+    const __m128i mask_val_0 = LoadUnaligned16(mask);
+    const __m128i mask_val_1 = LoadUnaligned16(mask + stride);
+    const __m128i add_0 = _mm_adds_epu8(mask_val_0, mask_val_1);
+    const __m128i mask_0 = _mm_maddubs_epi16(add_0, one);
+    return RightShiftWithRoundingZero_U16(mask_0, 2, zero);
+  }
+  assert(subsampling_y == 0 && subsampling_x == 0);
+  const __m128i mask_val = LoadLo8(mask);
+  return _mm_cvtepu8_epi16(mask_val);
+}
+
+inline void WriteMaskBlendLine10bpp4x2_SSE4_1(
+    const uint16_t* LIBGAV1_RESTRICT pred_0,
+    const uint16_t* LIBGAV1_RESTRICT pred_1, const ptrdiff_t pred_stride_1,
+    const __m128i& pred_mask_0, const __m128i& pred_mask_1,
+    const __m128i& offset, const __m128i& max, const __m128i& shift4,
+    uint16_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) {
+  const __m128i pred_val_0 = LoadUnaligned16(pred_0);
+  const __m128i pred_val_1 = LoadHi8(LoadLo8(pred_1), pred_1 + pred_stride_1);
+
+  // int res = (mask_value * pred_0[x] + (64 - mask_value) * pred_1[x]) >> 6;
+  const __m128i compound_pred_lo_0 = _mm_mullo_epi16(pred_val_0, pred_mask_0);
+  const __m128i compound_pred_hi_0 = _mm_mulhi_epu16(pred_val_0, pred_mask_0);
+  const __m128i compound_pred_lo_1 = _mm_mullo_epi16(pred_val_1, pred_mask_1);
+  const __m128i compound_pred_hi_1 = _mm_mulhi_epu16(pred_val_1, pred_mask_1);
+  const __m128i pack0_lo =
+      _mm_unpacklo_epi16(compound_pred_lo_0, compound_pred_hi_0);
+  const __m128i pack0_hi =
+      _mm_unpackhi_epi16(compound_pred_lo_0, compound_pred_hi_0);
+  const __m128i pack1_lo =
+      _mm_unpacklo_epi16(compound_pred_lo_1, compound_pred_hi_1);
+  const __m128i pack1_hi =
_mm_unpackhi_epi16(compound_pred_lo_1, compound_pred_hi_1); + const __m128i compound_pred_lo = _mm_add_epi32(pack0_lo, pack1_lo); + const __m128i compound_pred_hi = _mm_add_epi32(pack0_hi, pack1_hi); + // res -= (bitdepth == 8) ? 0 : kCompoundOffset; + const __m128i sub_0 = + _mm_sub_epi32(_mm_srli_epi32(compound_pred_lo, 6), offset); + const __m128i sub_1 = + _mm_sub_epi32(_mm_srli_epi32(compound_pred_hi, 6), offset); + + // dst[x] = static_cast( + // Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0, + // (1 << kBitdepth8) - 1)); + const __m128i shift_0 = + RightShiftWithRoundingConst_S32(sub_0, kRoundBitsMaskBlend, shift4); + const __m128i shift_1 = + RightShiftWithRoundingConst_S32(sub_1, kRoundBitsMaskBlend, shift4); + const __m128i result = _mm_min_epi16(_mm_packus_epi32(shift_0, shift_1), max); + StoreLo8(dst, result); + StoreHi8(dst + dst_stride, result); +} + +template +inline void MaskBlend10bpp4x4_SSE4_1(const uint16_t* LIBGAV1_RESTRICT pred_0, + const uint16_t* LIBGAV1_RESTRICT pred_1, + const ptrdiff_t pred_stride_1, + const uint8_t* LIBGAV1_RESTRICT mask, + const ptrdiff_t mask_stride, + uint16_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t dst_stride) { + const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); + const __m128i zero = _mm_setzero_si128(); + const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1); + const __m128i offset = _mm_set1_epi32(kCompoundOffset); + const __m128i max = _mm_set1_epi16(kMax10bppSample); + __m128i pred_mask_0 = + GetMask4x2(mask, mask_stride, zero); + __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, + pred_mask_1, offset, max, shift4, dst, + dst_stride); + pred_0 += 4 << 1; + pred_1 += pred_stride_1 << 1; + mask += mask_stride << (1 + subsampling_y); + dst += dst_stride << 1; + + pred_mask_0 = + GetMask4x2(mask, mask_stride, zero); + pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0, + pred_mask_1, offset, max, shift4, dst, + dst_stride); +} + +template +inline void MaskBlend10bpp4xH_SSE4_1( + const uint16_t* LIBGAV1_RESTRICT pred_0, + const uint16_t* LIBGAV1_RESTRICT pred_1, const ptrdiff_t pred_stride_1, + const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride, + const int height, uint16_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t dst_stride) { + const uint8_t* mask = mask_ptr; + if (height == 4) { + MaskBlend10bpp4x4_SSE4_1( + pred_0, pred_1, pred_stride_1, mask, mask_stride, dst, dst_stride); + return; + } + const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); + const __m128i zero = _mm_setzero_si128(); + const uint8_t pred0_stride2 = 4 << 1; + const ptrdiff_t pred1_stride2 = pred_stride_1 << 1; + const ptrdiff_t mask_stride2 = mask_stride << (1 + subsampling_y); + const ptrdiff_t dst_stride2 = dst_stride << 1; + const __m128i offset = _mm_set1_epi32(kCompoundOffset); + const __m128i max = _mm_set1_epi16(kMax10bppSample); + const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1); + int y = height; + do { + __m128i pred_mask_0 = + GetMask4x2(mask, mask_stride, zero); + __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + + WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, + pred_mask_0, pred_mask_1, offset, max, + shift4, dst, dst_stride); + pred_0 += pred0_stride2; + pred_1 += pred1_stride2; + mask += mask_stride2; + dst += dst_stride2; + + pred_mask_0 = + 
GetMask4x2(mask, mask_stride, zero); + pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, + pred_mask_0, pred_mask_1, offset, max, + shift4, dst, dst_stride); + pred_0 += pred0_stride2; + pred_1 += pred1_stride2; + mask += mask_stride2; + dst += dst_stride2; + + pred_mask_0 = + GetMask4x2(mask, mask_stride, zero); + pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, + pred_mask_0, pred_mask_1, offset, max, + shift4, dst, dst_stride); + pred_0 += pred0_stride2; + pred_1 += pred1_stride2; + mask += mask_stride2; + dst += dst_stride2; + + pred_mask_0 = + GetMask4x2(mask, mask_stride, zero); + pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, + pred_mask_0, pred_mask_1, offset, max, + shift4, dst, dst_stride); + pred_0 += pred0_stride2; + pred_1 += pred1_stride2; + mask += mask_stride2; + dst += dst_stride2; + y -= 8; + } while (y != 0); +} + +template +inline void MaskBlend10bpp_SSE4_1( + const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + const ptrdiff_t prediction_stride_1, + const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride, + const int width, const int height, void* LIBGAV1_RESTRICT dest, + const ptrdiff_t dest_stride) { + auto* dst = static_cast(dest); + const ptrdiff_t dst_stride = dest_stride / sizeof(dst[0]); + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + const ptrdiff_t pred_stride_0 = width; + const ptrdiff_t pred_stride_1 = prediction_stride_1; + if (width == 4) { + MaskBlend10bpp4xH_SSE4_1( + pred_0, pred_1, pred_stride_1, mask_ptr, mask_stride, height, dst, + dst_stride); + return; + } + const uint8_t* mask = mask_ptr; + const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); + const __m128i zero = _mm_setzero_si128(); + const ptrdiff_t mask_stride_ss = mask_stride << subsampling_y; + const __m128i offset = _mm_set1_epi32(kCompoundOffset); + const __m128i max = _mm_set1_epi16(kMax10bppSample); + const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1); + int y = height; + do { + int x = 0; + do { + const __m128i pred_mask_0 = GetMask8( + mask + (x << subsampling_x), mask_stride, zero); + const __m128i pred_val_0 = LoadUnaligned16(pred_0 + x); + const __m128i pred_val_1 = LoadUnaligned16(pred_1 + x); + // 64 - mask + const __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + + const __m128i compound_pred_lo_0 = + _mm_mullo_epi16(pred_val_0, pred_mask_0); + const __m128i compound_pred_hi_0 = + _mm_mulhi_epu16(pred_val_0, pred_mask_0); + const __m128i compound_pred_lo_1 = + _mm_mullo_epi16(pred_val_1, pred_mask_1); + const __m128i compound_pred_hi_1 = + _mm_mulhi_epu16(pred_val_1, pred_mask_1); + const __m128i pack0_lo = + _mm_unpacklo_epi16(compound_pred_lo_0, compound_pred_hi_0); + const __m128i pack0_hi = + _mm_unpackhi_epi16(compound_pred_lo_0, compound_pred_hi_0); + const __m128i pack1_lo = + _mm_unpacklo_epi16(compound_pred_lo_1, compound_pred_hi_1); + const __m128i pack1_hi = + _mm_unpackhi_epi16(compound_pred_lo_1, compound_pred_hi_1); + const __m128i compound_pred_lo = _mm_add_epi32(pack0_lo, pack1_lo); + const __m128i compound_pred_hi = _mm_add_epi32(pack0_hi, pack1_hi); + + const __m128i sub_0 = + _mm_sub_epi32(_mm_srli_epi32(compound_pred_lo, 6), offset); + const __m128i sub_1 = + 
_mm_sub_epi32(_mm_srli_epi32(compound_pred_hi, 6), offset); + const __m128i shift_0 = + RightShiftWithRoundingConst_S32(sub_0, kRoundBitsMaskBlend, shift4); + const __m128i shift_1 = + RightShiftWithRoundingConst_S32(sub_1, kRoundBitsMaskBlend, shift4); + const __m128i result = + _mm_min_epi16(_mm_packus_epi32(shift_0, shift_1), max); + StoreUnaligned16(dst + x, result); + x += 8; + } while (x < width); + dst += dst_stride; + pred_0 += pred_stride_0; + pred_1 += pred_stride_1; + mask += mask_stride_ss; + } while (--y != 0); +} + +inline void InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1( + const uint16_t* LIBGAV1_RESTRICT prediction_0, + const uint16_t* LIBGAV1_RESTRICT prediction_1, + const ptrdiff_t pred_stride_1, const __m128i& pred_mask_0, + const __m128i& pred_mask_1, const __m128i& shift6, + uint16_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) { + const __m128i pred_val_0 = LoadUnaligned16(prediction_0); + const __m128i pred_val_1 = + LoadHi8(LoadLo8(prediction_1), prediction_1 + pred_stride_1); + + const __m128i mask_0 = _mm_unpacklo_epi16(pred_mask_1, pred_mask_0); + const __m128i mask_1 = _mm_unpackhi_epi16(pred_mask_1, pred_mask_0); + const __m128i pred_0 = _mm_unpacklo_epi16(pred_val_0, pred_val_1); + const __m128i pred_1 = _mm_unpackhi_epi16(pred_val_0, pred_val_1); + + const __m128i compound_pred_0 = _mm_madd_epi16(pred_0, mask_0); + const __m128i compound_pred_1 = _mm_madd_epi16(pred_1, mask_1); + const __m128i shift_0 = + RightShiftWithRoundingConst_S32(compound_pred_0, 6, shift6); + const __m128i shift_1 = + RightShiftWithRoundingConst_S32(compound_pred_1, 6, shift6); + const __m128i res = _mm_packus_epi32(shift_0, shift_1); + StoreLo8(dst, res); + StoreHi8(dst + dst_stride, res); +} + +template +inline void InterIntraMaskBlend10bpp4x4_SSE4_1( + const uint16_t* LIBGAV1_RESTRICT pred_0, + const uint16_t* LIBGAV1_RESTRICT pred_1, const ptrdiff_t pred_stride_1, + const uint8_t* LIBGAV1_RESTRICT mask, const ptrdiff_t mask_stride, + uint16_t* LIBGAV1_RESTRICT dst, const ptrdiff_t dst_stride) { + const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); + const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1); + const __m128i zero = _mm_setzero_si128(); + __m128i pred_mask_0 = + GetMask4x2(mask, mask_stride, zero); + __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, + pred_mask_0, pred_mask_1, shift6, + dst, dst_stride); + pred_0 += 4 << 1; + pred_1 += pred_stride_1 << 1; + mask += mask_stride << (1 + subsampling_y); + dst += dst_stride << 1; + + pred_mask_0 = + GetMask4x2(mask, mask_stride, zero); + pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, + pred_mask_0, pred_mask_1, shift6, + dst, dst_stride); +} + +template +inline void InterIntraMaskBlend10bpp4xH_SSE4_1( + const uint16_t* LIBGAV1_RESTRICT pred_0, + const uint16_t* LIBGAV1_RESTRICT pred_1, const ptrdiff_t pred_stride_1, + const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride, + const int height, uint16_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t dst_stride) { + const uint8_t* mask = mask_ptr; + if (height == 4) { + InterIntraMaskBlend10bpp4x4_SSE4_1( + pred_0, pred_1, pred_stride_1, mask, mask_stride, dst, dst_stride); + return; + } + const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); + const __m128i zero = _mm_setzero_si128(); + const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1); + const uint8_t 
pred0_stride2 = 4 << 1; + const ptrdiff_t pred1_stride2 = pred_stride_1 << 1; + const ptrdiff_t mask_stride2 = mask_stride << (1 + subsampling_y); + const ptrdiff_t dst_stride2 = dst_stride << 1; + int y = height; + do { + __m128i pred_mask_0 = + GetMask4x2(mask, mask_stride, zero); + __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, + pred_mask_0, pred_mask_1, + shift6, dst, dst_stride); + pred_0 += pred0_stride2; + pred_1 += pred1_stride2; + mask += mask_stride2; + dst += dst_stride2; + + pred_mask_0 = + GetMask4x2(mask, mask_stride, zero); + pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, + pred_mask_0, pred_mask_1, + shift6, dst, dst_stride); + pred_0 += pred0_stride2; + pred_1 += pred1_stride2; + mask += mask_stride2; + dst += dst_stride2; + + pred_mask_0 = + GetMask4x2(mask, mask_stride, zero); + pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, + pred_mask_0, pred_mask_1, + shift6, dst, dst_stride); + pred_0 += pred0_stride2; + pred_1 += pred1_stride2; + mask += mask_stride2; + dst += dst_stride2; + + pred_mask_0 = + GetMask4x2(mask, mask_stride, zero); + pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, + pred_mask_0, pred_mask_1, + shift6, dst, dst_stride); + pred_0 += pred0_stride2; + pred_1 += pred1_stride2; + mask += mask_stride2; + dst += dst_stride2; + y -= 8; + } while (y != 0); +} + +template +inline void InterIntraMaskBlend10bpp_SSE4_1( + const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + const ptrdiff_t prediction_stride_1, + const uint8_t* LIBGAV1_RESTRICT const mask_ptr, const ptrdiff_t mask_stride, + const int width, const int height, void* LIBGAV1_RESTRICT dest, + const ptrdiff_t dest_stride) { + auto* dst = static_cast(dest); + const ptrdiff_t dst_stride = dest_stride / sizeof(dst[0]); + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + const ptrdiff_t pred_stride_0 = width; + const ptrdiff_t pred_stride_1 = prediction_stride_1; + if (width == 4) { + InterIntraMaskBlend10bpp4xH_SSE4_1( + pred_0, pred_1, pred_stride_1, mask_ptr, mask_stride, height, dst, + dst_stride); + return; + } + const uint8_t* mask = mask_ptr; + const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse); + const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1); + const __m128i zero = _mm_setzero_si128(); + const ptrdiff_t mask_stride_ss = mask_stride << subsampling_y; + int y = height; + do { + int x = 0; + do { + const __m128i pred_mask_0 = GetMask8( + mask + (x << subsampling_x), mask_stride, zero); + const __m128i pred_val_0 = LoadUnaligned16(pred_0 + x); + const __m128i pred_val_1 = LoadUnaligned16(pred_1 + x); + // 64 - mask + const __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0); + const __m128i mask_0 = _mm_unpacklo_epi16(pred_mask_1, pred_mask_0); + const __m128i mask_1 = _mm_unpackhi_epi16(pred_mask_1, pred_mask_0); + const __m128i pred_0 = _mm_unpacklo_epi16(pred_val_0, pred_val_1); + const __m128i pred_1 = _mm_unpackhi_epi16(pred_val_0, pred_val_1); + + const __m128i compound_pred_0 = _mm_madd_epi16(pred_0, mask_0); + const __m128i compound_pred_1 = _mm_madd_epi16(pred_1, mask_1); + const __m128i shift_0 = + 
RightShiftWithRoundingConst_S32(compound_pred_0, 6, shift6); + const __m128i shift_1 = + RightShiftWithRoundingConst_S32(compound_pred_1, 6, shift6); + StoreUnaligned16(dst + x, _mm_packus_epi32(shift_0, shift_1)); + x += 8; + } while (x < width); + dst += dst_stride; + pred_0 += pred_stride_0; + pred_1 += pred_stride_1; + mask += mask_stride_ss; + } while (--y != 0); +} + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + +#if DSP_ENABLED_10BPP_SSE4_1(MaskBlend444) + dsp->mask_blend[0][0] = MaskBlend10bpp_SSE4_1<0, 0>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(MaskBlend422) + dsp->mask_blend[1][0] = MaskBlend10bpp_SSE4_1<1, 0>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(MaskBlend420) + dsp->mask_blend[2][0] = MaskBlend10bpp_SSE4_1<1, 1>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(MaskBlendInterIntra444) + dsp->mask_blend[0][1] = InterIntraMaskBlend10bpp_SSE4_1<0, 0>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(MaskBlendInterIntra422) + dsp->mask_blend[1][1] = InterIntraMaskBlend10bpp_SSE4_1<1, 0>; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(MaskBlendInterIntra420) + dsp->mask_blend[2][1] = InterIntraMaskBlend10bpp_SSE4_1<1, 1>; +#endif +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void MaskBlendInit_SSE4_1() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 +} + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_TARGETING_SSE4_1 + +namespace libgav1 { +namespace dsp { + +void MaskBlendInit_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_SSE4_1 diff --git a/src/dsp/x86/mask_blend_sse4.h b/src/dsp/x86/mask_blend_sse4.h new file mode 100644 index 0000000..4a95f0c --- /dev/null +++ b/src/dsp/x86/mask_blend_sse4.h @@ -0,0 +1,84 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_MASK_BLEND_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_MASK_BLEND_SSE4_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::mask_blend. This function is not thread-safe. 
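+//
+// Usage sketch (assuming the usual dsp dispatch flow; not a code sample from
+// this library):
+//   libgav1::dsp::MaskBlendInit_SSE4_1();
+//   const libgav1::dsp::Dsp* dsp = libgav1::dsp::GetDspTable(8);
+//   dsp->mask_blend[0][0](...);  // 4:4:4, non-inter-intra blend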
+void MaskBlendInit_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend444
+#define LIBGAV1_Dsp8bpp_MaskBlend444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend422
+#define LIBGAV1_Dsp8bpp_MaskBlend422 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend420
+#define LIBGAV1_Dsp8bpp_MaskBlend420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp444
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp422
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp422 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp420
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_MaskBlend444
+#define LIBGAV1_Dsp10bpp_MaskBlend444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_MaskBlend422
+#define LIBGAV1_Dsp10bpp_MaskBlend422 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_MaskBlend420
+#define LIBGAV1_Dsp10bpp_MaskBlend420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra444
+#define LIBGAV1_Dsp10bpp_MaskBlendInterIntra444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra422
+#define LIBGAV1_Dsp10bpp_MaskBlendInterIntra422 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra420
+#define LIBGAV1_Dsp10bpp_MaskBlendInterIntra420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_MASK_BLEND_SSE4_H_
diff --git a/src/dsp/x86/motion_field_projection_sse4.cc b/src/dsp/x86/motion_field_projection_sse4.cc
new file mode 100644
index 0000000..5641531
--- /dev/null
+++ b/src/dsp/x86/motion_field_projection_sse4.cc
@@ -0,0 +1,382 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_field_projection.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+inline __m128i LoadDivision(const __m128i division_table,
+                            const __m128i reference_offset) {
+  const __m128i kOne = _mm_set1_epi16(0x0100);
+  const __m128i t = _mm_add_epi8(reference_offset, reference_offset);
+  const __m128i tt = _mm_unpacklo_epi8(t, t);
+  const __m128i idx = _mm_add_epi8(tt, kOne);
+  return _mm_shuffle_epi8(division_table, idx);
+}
+
+inline __m128i MvProjection(const __m128i mv, const __m128i denominator,
+                            const int numerator) {
+  const __m128i m0 = _mm_madd_epi16(mv, denominator);
+  const __m128i m = _mm_mullo_epi32(m0, _mm_set1_epi32(numerator));
+  // Add the sign (0 or -1) to round towards zero.
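+  // Folding the sign in makes the rounded shift symmetric about zero:
+  // projecting -mv gives exactly the negation of projecting mv. E.g. for
+  // m == 8192 (one half in the 1 << 14 domain) the result is 1, and for
+  // m == -8192 it is -1 rather than 0.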
+  const __m128i sign = _mm_srai_epi32(m, 31);
+  const __m128i add_sign = _mm_add_epi32(m, sign);
+  const __m128i sum = _mm_add_epi32(add_sign, _mm_set1_epi32(1 << 13));
+  return _mm_srai_epi32(sum, 14);
+}
+
+inline __m128i MvProjectionClip(const __m128i mv, const __m128i denominator,
+                                const int numerator) {
+  const __m128i mv0 = _mm_unpacklo_epi16(mv, _mm_setzero_si128());
+  const __m128i mv1 = _mm_unpackhi_epi16(mv, _mm_setzero_si128());
+  const __m128i denorm0 = _mm_unpacklo_epi16(denominator, _mm_setzero_si128());
+  const __m128i denorm1 = _mm_unpackhi_epi16(denominator, _mm_setzero_si128());
+  const __m128i s0 = MvProjection(mv0, denorm0, numerator);
+  const __m128i s1 = MvProjection(mv1, denorm1, numerator);
+  const __m128i projection = _mm_packs_epi32(s0, s1);
+  const __m128i projection_mv_clamp = _mm_set1_epi16(kProjectionMvClamp);
+  const __m128i projection_mv_clamp_negative =
+      _mm_set1_epi16(-kProjectionMvClamp);
+  const __m128i clamp = _mm_min_epi16(projection, projection_mv_clamp);
+  return _mm_max_epi16(clamp, projection_mv_clamp_negative);
+}
+
+inline __m128i Project_SSE4_1(const __m128i delta, const __m128i dst_sign) {
+  // Add 63 to negative delta so that it shifts towards zero.
+  const __m128i delta_sign = _mm_srai_epi16(delta, 15);
+  const __m128i delta_sign_63 = _mm_srli_epi16(delta_sign, 10);
+  const __m128i delta_adjust = _mm_add_epi16(delta, delta_sign_63);
+  const __m128i offset0 = _mm_srai_epi16(delta_adjust, 6);
+  const __m128i offset1 = _mm_xor_si128(offset0, dst_sign);
+  return _mm_sub_epi16(offset1, dst_sign);
+}
+
+inline void GetPosition(
+    const __m128i division_table, const MotionVector* const mv,
+    const int numerator, const int x8_start, const int x8_end, const int x8,
+    const __m128i& r_offsets, const __m128i& source_reference_type8,
+    const __m128i& skip_r, const __m128i& y8_floor8, const __m128i& y8_ceiling8,
+    const __m128i& d_sign, const int delta, __m128i* const r,
+    __m128i* const position_xy, int64_t* const skip_64, __m128i mvs[2]) {
+  const auto* const mv_int = reinterpret_cast<const int32_t*>(mv + x8);
+  *r = _mm_shuffle_epi8(r_offsets, source_reference_type8);
+  const __m128i denorm = LoadDivision(division_table, source_reference_type8);
+  __m128i projection_mv[2];
+  mvs[0] = LoadUnaligned16(mv_int + 0);
+  mvs[1] = LoadUnaligned16(mv_int + 4);
+  // Deinterleave x and y components.
+  const __m128i kShuffle =
+      _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
+  const __m128i mv0 = _mm_shuffle_epi8(mvs[0], kShuffle);
+  const __m128i mv1 = _mm_shuffle_epi8(mvs[1], kShuffle);
+  const __m128i mv_y = _mm_unpacklo_epi64(mv0, mv1);
+  const __m128i mv_x = _mm_unpackhi_epi64(mv0, mv1);
+  // numerator could be 0.
+  projection_mv[0] = MvProjectionClip(mv_y, denorm, numerator);
+  projection_mv[1] = MvProjectionClip(mv_x, denorm, numerator);
+  // Do not update the motion vector if the block position is not valid or
+  // if position_x8 is outside the current range of x8_start and x8_end.
+  // Note that position_y8 will always be within the range of y8_start and
+  // y8_end.
+  // After subtracting the base, valid projections are within 8-bit.
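+  // That 8-bit guarantee is what permits the _mm_packs_epi16 below: a lane
+  // that saturates in the 16- to 8-bit pack was already outside the valid
+  // projection range and is rejected by the floor/ceiling compares that
+  // follow.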
+  const __m128i position_y = Project_SSE4_1(projection_mv[0], d_sign);
+  const __m128i position_x = Project_SSE4_1(projection_mv[1], d_sign);
+  const __m128i positions = _mm_packs_epi16(position_x, position_y);
+  const __m128i k01234567 =
+      _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0);
+  *position_xy = _mm_add_epi8(positions, k01234567);
+  const int x8_floor = std::max(
+      x8_start - x8, delta - kProjectionMvMaxHorizontalOffset);  // [-8, 8]
+  const int x8_ceiling =
+      std::min(x8_end - x8, delta + 8 + kProjectionMvMaxHorizontalOffset) -
+      1;  // [-1, 15]
+  const __m128i x8_floor8 = _mm_set1_epi8(x8_floor);
+  const __m128i x8_ceiling8 = _mm_set1_epi8(x8_ceiling);
+  const __m128i floor_xy = _mm_unpacklo_epi64(x8_floor8, y8_floor8);
+  const __m128i ceiling_xy = _mm_unpacklo_epi64(x8_ceiling8, y8_ceiling8);
+  const __m128i underflow = _mm_cmplt_epi8(*position_xy, floor_xy);
+  const __m128i overflow = _mm_cmpgt_epi8(*position_xy, ceiling_xy);
+  const __m128i out = _mm_or_si128(underflow, overflow);
+  const __m128i skip_low = _mm_or_si128(skip_r, out);
+  const __m128i skip = _mm_or_si128(skip_low, _mm_srli_si128(out, 8));
+  StoreLo8(skip_64, skip);
+}
+
+template <int idx>
+inline void Store(const __m128i position, const __m128i reference_offset,
+                  const __m128i mv, int8_t* dst_reference_offset,
+                  MotionVector* dst_mv) {
+  const ptrdiff_t offset =
+      static_cast<int16_t>(_mm_extract_epi16(position, idx));
+  if ((idx & 3) == 0) {
+    dst_mv[offset].mv32 = static_cast<int32_t>(_mm_cvtsi128_si32(mv));
+  } else {
+    dst_mv[offset].mv32 = static_cast<int32_t>(_mm_extract_epi32(mv, idx & 3));
+  }
+  dst_reference_offset[offset] = _mm_extract_epi8(reference_offset, idx);
+}
+
+template <int idx>
+inline void CheckStore(const int8_t* skips, const __m128i position,
+                       const __m128i reference_offset, const __m128i mv,
+                       int8_t* dst_reference_offset, MotionVector* dst_mv) {
+  if (skips[idx] == 0) {
+    Store<idx>(position, reference_offset, mv, dst_reference_offset, dst_mv);
+  }
+}
+
+// 7.9.2.
+void MotionFieldProjectionKernel_SSE4_1(
+    const ReferenceInfo& reference_info,
+    const int reference_to_current_with_sign, const int dst_sign,
+    const int y8_start, const int y8_end, const int x8_start, const int x8_end,
+    TemporalMotionField* const motion_field) {
+  const ptrdiff_t stride = motion_field->mv.columns();
+  // The column range has to be offset by kProjectionMvMaxHorizontalOffset
+  // since coordinates in that range could end up being position_x8 because
+  // of projection.
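+  // For example, with x8_start == 16 a source block at column 8 can still
+  // project into the target range, because |position_x8 - x8| is bounded by
+  // kProjectionMvMaxHorizontalOffset (8, matching the [-8, 8] bound noted in
+  // GetPosition), so the scan below widens the column range by that constant
+  // on each side.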
+  const int adjusted_x8_start =
+      std::max(x8_start - kProjectionMvMaxHorizontalOffset, 0);
+  const int adjusted_x8_end = std::min(
+      x8_end + kProjectionMvMaxHorizontalOffset, static_cast<int>(stride));
+  const int adjusted_x8_end8 = adjusted_x8_end & ~7;
+  const int leftover = adjusted_x8_end - adjusted_x8_end8;
+  const int8_t* const reference_offsets =
+      reference_info.relative_distance_to.data();
+  const bool* const skip_references = reference_info.skip_references.data();
+  const int16_t* const projection_divisions =
+      reference_info.projection_divisions.data();
+  const ReferenceFrameType* source_reference_types =
+      &reference_info.motion_field_reference_frame[y8_start][0];
+  const MotionVector* mv = &reference_info.motion_field_mv[y8_start][0];
+  int8_t* dst_reference_offset = motion_field->reference_offset[y8_start];
+  MotionVector* dst_mv = motion_field->mv[y8_start];
+  const __m128i d_sign = _mm_set1_epi16(dst_sign);
+
+  static_assert(sizeof(int8_t) == sizeof(bool), "");
+  static_assert(sizeof(int8_t) == sizeof(ReferenceFrameType), "");
+  static_assert(sizeof(int32_t) == sizeof(MotionVector), "");
+  assert(dst_sign == 0 || dst_sign == -1);
+  assert(stride == motion_field->reference_offset.columns());
+  assert((y8_start & 7) == 0);
+  assert((adjusted_x8_start & 7) == 0);
+  // The final position calculation is represented with int16_t. Valid
+  // position_y8 from its base is at most 7. After considering the horizontal
+  // offset which is at most |stride - 1|, we have the following assertion,
+  // which means this optimization works for frame width up to 32K (each
+  // position is an 8x8 block).
+  assert(8 * stride <= 32768);
+  const __m128i skip_reference = LoadLo8(skip_references);
+  const __m128i r_offsets = LoadLo8(reference_offsets);
+  const __m128i division_table = LoadUnaligned16(projection_divisions);
+
+  int y8 = y8_start;
+  do {
+    const int y8_floor = (y8 & ~7) - y8;  // [-7, 0]
+    const int y8_ceiling = std::min(y8_end - y8, y8_floor + 8) - 1;  // [0, 7]
+    const __m128i y8_floor8 = _mm_set1_epi8(y8_floor);
+    const __m128i y8_ceiling8 = _mm_set1_epi8(y8_ceiling);
+    int x8;
+
+    for (x8 = adjusted_x8_start; x8 < adjusted_x8_end8; x8 += 8) {
+      const __m128i source_reference_type8 =
+          LoadLo8(source_reference_types + x8);
+      const __m128i skip_r =
+          _mm_shuffle_epi8(skip_reference, source_reference_type8);
+      int64_t early_skip;
+      StoreLo8(&early_skip, skip_r);
+      // Early termination #1 if all are skips. Chance is typically ~30-40%.
+      if (early_skip == -1) continue;
+      int64_t skip_64;
+      __m128i r, position_xy, mvs[2];
+      GetPosition(division_table, mv, reference_to_current_with_sign, x8_start,
+                  x8_end, x8, r_offsets, source_reference_type8, skip_r,
+                  y8_floor8, y8_ceiling8, d_sign, 0, &r, &position_xy, &skip_64,
+                  mvs);
+      // Early termination #2 if all are skips.
+      // Chance is typically ~15-25% after Early termination #1.
+      if (skip_64 == -1) continue;
+      const __m128i p_y = _mm_cvtepi8_epi16(_mm_srli_si128(position_xy, 8));
+      const __m128i p_x = _mm_cvtepi8_epi16(position_xy);
+      const __m128i p_y_offset = _mm_mullo_epi16(p_y, _mm_set1_epi16(stride));
+      const __m128i pos = _mm_add_epi16(p_y_offset, p_x);
+      const __m128i position = _mm_add_epi16(pos, _mm_set1_epi16(x8));
+      if (skip_64 == 0) {
+        // Store all. Chance is typically ~70-85% after Early termination #2.
+        Store<0>(position, r, mvs[0], dst_reference_offset, dst_mv);
+        Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv);
+        Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv);
+        Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv);
+        Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv);
+        Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv);
+        Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv);
+        Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv);
+      } else {
+        // Check and store each.
+        // Chance is typically ~15-30% after Early termination #2.
+        // The compiler is smart enough to not create the local buffer skips[].
+        int8_t skips[8];
+        memcpy(skips, &skip_64, sizeof(skips));
+        CheckStore<0>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+        CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+        CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+        CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+        CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+        CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+        CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+        CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+      }
+    }
+
+    // The following leftover processing cannot be moved out of the do...while
+    // loop. Doing so may change the order in which results are stored to the
+    // same position.
+    if (leftover > 0) {
+      // Use SIMD only when leftover is at least 4, and there are at least 8
+      // elements in a row.
+      if (leftover >= 4 && adjusted_x8_start < adjusted_x8_end8) {
+        // Process the last 8 elements to avoid loading invalid memory. Some
+        // elements may have been processed in the above loop, which is OK.
+        const int delta = 8 - leftover;
+        x8 = adjusted_x8_end - 8;
+        const __m128i source_reference_type8 =
+            LoadLo8(source_reference_types + x8);
+        const __m128i skip_r =
+            _mm_shuffle_epi8(skip_reference, source_reference_type8);
+        int64_t early_skip;
+        StoreLo8(&early_skip, skip_r);
+        // Early termination #1 if all are skips.
+        if (early_skip != -1) {
+          int64_t skip_64;
+          __m128i r, position_xy, mvs[2];
+          GetPosition(division_table, mv, reference_to_current_with_sign,
+                      x8_start, x8_end, x8, r_offsets, source_reference_type8,
+                      skip_r, y8_floor8, y8_ceiling8, d_sign, delta, &r,
+                      &position_xy, &skip_64, mvs);
+          // Early termination #2 if all are skips.
+          if (skip_64 != -1) {
+            const __m128i p_y =
+                _mm_cvtepi8_epi16(_mm_srli_si128(position_xy, 8));
+            const __m128i p_x = _mm_cvtepi8_epi16(position_xy);
+            const __m128i p_y_offset =
+                _mm_mullo_epi16(p_y, _mm_set1_epi16(stride));
+            const __m128i pos = _mm_add_epi16(p_y_offset, p_x);
+            const __m128i position = _mm_add_epi16(pos, _mm_set1_epi16(x8));
+            // Store up to 7 elements since leftover is at most 7.
+            if (skip_64 == 0) {
+              // Store all.
+              Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv);
+              Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv);
+              Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv);
+              Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv);
+              Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv);
+              Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv);
+              Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv);
+            } else {
+              // Check and store each.
+              // The compiler is smart enough to not create the local buffer
+              // skips[].
+ int8_t skips[8]; + memcpy(skips, &skip_64, sizeof(skips)); + CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset, + dst_mv); + CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset, + dst_mv); + CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset, + dst_mv); + CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset, + dst_mv); + CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset, + dst_mv); + CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset, + dst_mv); + CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset, + dst_mv); + } + } + } + } else { + for (; x8 < adjusted_x8_end; ++x8) { + const int source_reference_type = source_reference_types[x8]; + if (skip_references[source_reference_type]) continue; + MotionVector projection_mv; + // reference_to_current_with_sign could be 0. + GetMvProjection(mv[x8], reference_to_current_with_sign, + projection_divisions[source_reference_type], + &projection_mv); + // Do not update the motion vector if the block position is not valid + // or if position_x8 is outside the current range of x8_start and + // x8_end. Note that position_y8 will always be within the range of + // y8_start and y8_end. + const int position_y8 = Project(0, projection_mv.mv[0], dst_sign); + if (position_y8 < y8_floor || position_y8 > y8_ceiling) continue; + const int x8_base = x8 & ~7; + const int x8_floor = + std::max(x8_start, x8_base - kProjectionMvMaxHorizontalOffset); + const int x8_ceiling = + std::min(x8_end, x8_base + 8 + kProjectionMvMaxHorizontalOffset); + const int position_x8 = Project(x8, projection_mv.mv[1], dst_sign); + if (position_x8 < x8_floor || position_x8 >= x8_ceiling) continue; + dst_mv[position_y8 * stride + position_x8] = mv[x8]; + dst_reference_offset[position_y8 * stride + position_x8] = + reference_offsets[source_reference_type]; + } + } + } + + source_reference_types += stride; + mv += stride; + dst_reference_offset += stride; + dst_mv += stride; + } while (++y8 < y8_end); +} + +} // namespace + +void MotionFieldProjectionInit_SSE4_1() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_SSE4_1; +} + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_TARGETING_SSE4_1 +namespace libgav1 { +namespace dsp { + +void MotionFieldProjectionInit_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_SSE4_1 diff --git a/src/dsp/x86/motion_field_projection_sse4.h b/src/dsp/x86/motion_field_projection_sse4.h new file mode 100644 index 0000000..c05422c --- /dev/null +++ b/src/dsp/x86/motion_field_projection_sse4.h @@ -0,0 +1,41 @@ +/* + * Copyright 2020 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_MOTION_FIELD_PROJECTION_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_MOTION_FIELD_PROJECTION_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::motion_field_projection_kernel. This function is not
+// thread-safe.
+void MotionFieldProjectionInit_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel
+#define LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_MOTION_FIELD_PROJECTION_SSE4_H_
diff --git a/src/dsp/x86/motion_vector_search_sse4.cc b/src/dsp/x86/motion_vector_search_sse4.cc
new file mode 100644
index 0000000..dacc6ec
--- /dev/null
+++ b/src/dsp/x86/motion_vector_search_sse4.cc
@@ -0,0 +1,251 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_vector_search.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kProjectionMvDivisionLookup_32bit[kMaxFrameDistance + 1] = {
+    0,    16384, 8192, 5461, 4096, 3276, 2730, 2340, 2048, 1820, 1638,
+    1489, 1365,  1260, 1170, 1092, 1024, 963,  910,  862,  819,  780,
+    744,  712,   682,  655,  630,  606,  585,  564,  546,  528};
+
+inline __m128i MvProjection(const __m128i mv, const __m128i denominator,
+                            const __m128i numerator) {
+  const __m128i m0 = _mm_madd_epi16(mv, denominator);
+  const __m128i m = _mm_mullo_epi32(m0, numerator);
+  // Add the sign (0 or -1) to round towards zero.
+  const __m128i sign = _mm_srai_epi32(m, 31);
+  const __m128i add_sign = _mm_add_epi32(m, sign);
+  const __m128i sum = _mm_add_epi32(add_sign, _mm_set1_epi32(1 << 13));
+  return _mm_srai_epi32(sum, 14);
+}
+
+inline __m128i MvProjectionClip(const __m128i mvs[2],
+                                const __m128i denominators[2],
+                                const __m128i numerator) {
+  const __m128i s0 = MvProjection(mvs[0], denominators[0], numerator);
+  const __m128i s1 = MvProjection(mvs[1], denominators[1], numerator);
+  const __m128i mv = _mm_packs_epi32(s0, s1);
+  const __m128i projection_mv_clamp = _mm_set1_epi16(kProjectionMvClamp);
+  const __m128i projection_mv_clamp_negative =
+      _mm_set1_epi16(-kProjectionMvClamp);
+  const __m128i clamp = _mm_min_epi16(mv, projection_mv_clamp);
+  return _mm_max_epi16(clamp, projection_mv_clamp_negative);
+}
+
+inline __m128i MvProjectionCompoundClip(
+    const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
+    const int8_t temporal_reference_offsets[2],
+    const int reference_offsets[2]) {
+  const auto* const tmvs = reinterpret_cast<const int32_t*>(temporal_mvs);
+  const __m128i temporal_mv = LoadLo8(tmvs);
+  const __m128i temporal_mv_0 = _mm_cvtepu16_epi32(temporal_mv);
+  __m128i mvs[2], denominators[2];
+  mvs[0] = _mm_unpacklo_epi64(temporal_mv_0, temporal_mv_0);
+  mvs[1] = _mm_unpackhi_epi64(temporal_mv_0, temporal_mv_0);
+  denominators[0] = _mm_set1_epi32(
+      kProjectionMvDivisionLookup[temporal_reference_offsets[0]]);
+  denominators[1] = _mm_set1_epi32(
+      kProjectionMvDivisionLookup[temporal_reference_offsets[1]]);
+  const __m128i offsets = LoadLo8(reference_offsets);
+  const __m128i numerator = _mm_unpacklo_epi32(offsets, offsets);
+  return MvProjectionClip(mvs, denominators, numerator);
+}
+
+inline __m128i MvProjectionSingleClip(
+    const MotionVector* LIBGAV1_RESTRICT const temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT const temporal_reference_offsets,
+    const int reference_offset) {
+  const auto* const tmvs = reinterpret_cast<const int32_t*>(temporal_mvs);
+  const __m128i temporal_mv = LoadAligned16(tmvs);
+  __m128i lookup = _mm_cvtsi32_si128(
+      kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[0]]);
+  lookup = _mm_insert_epi32(
+      lookup, kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[1]],
+      1);
+  lookup = _mm_insert_epi32(
+      lookup, kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[2]],
+      2);
+  lookup = _mm_insert_epi32(
+      lookup, kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[3]],
+      3);
+  __m128i mvs[2], denominators[2];
+  mvs[0] = _mm_unpacklo_epi16(temporal_mv, _mm_setzero_si128());
+  mvs[1] = _mm_unpackhi_epi16(temporal_mv, _mm_setzero_si128());
+  denominators[0] = _mm_unpacklo_epi32(lookup, lookup);
+  denominators[1] = _mm_unpackhi_epi32(lookup, lookup);
+  const __m128i numerator = _mm_set1_epi32(reference_offset);
+  return MvProjectionClip(mvs, denominators, numerator);
+}
+
+inline void LowPrecision(const __m128i mv, void* const candidate_mvs) {
+  const __m128i kRoundDownMask = _mm_set1_epi16(~1);
+  const __m128i sign = _mm_srai_epi16(mv, 15);
+  const __m128i sub_sign = _mm_sub_epi16(mv, sign);
+  const __m128i d = _mm_and_si128(sub_sign, kRoundDownMask);
+  StoreAligned16(candidate_mvs, d);
+}
+
+inline void ForceInteger(const __m128i mv, void* const candidate_mvs) {
+  const __m128i kRoundDownMask = _mm_set1_epi16(~7);
+  const __m128i sign = _mm_srai_epi16(mv, 15);
+  const __m128i mv1 = _mm_add_epi16(mv, _mm_set1_epi16(3));
+  const __m128i mv2 = _mm_sub_epi16(mv1, sign);
+  const __m128i mv3 = _mm_and_si128(mv2, kRoundDownMask);
+  StoreAligned16(candidate_mvs, mv3);
+}
+
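+// For reference, the two helpers above are scalar-equivalent to the
+// following per-component operations (an illustrative sketch only, not part
+// of the library):
+//   LowPrecision:  mv = (mv - (mv >> 15)) & ~1;
+//     clears the 1/8-pel bit, rounding odd values toward zero.
+//   ForceInteger:  mv = (mv + 3 - (mv >> 15)) & ~7;
+//     rounds to the nearest multiple of 8 (whole pel), ties toward zero.
+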
+void MvProjectionCompoundLowPrecision_SSE4_1(
+    const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
+    const int reference_offsets[2], const int count,
+    CompoundMotionVector* LIBGAV1_RESTRICT candidate_mvs) {
+  // The |reference_offsets| non-zero check is usually true, so it is skipped
+  // here. To help the compiler, make a local copy of |reference_offsets|.
+  const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+  // One element beyond |count| may be calculated.
+  int i = 0;
+  do {
+    const __m128i mv = MvProjectionCompoundClip(
+        temporal_mvs + i, temporal_reference_offsets + i, offsets);
+    LowPrecision(mv, candidate_mvs + i);
+    i += 2;
+  } while (i < count);
+}
+
+void MvProjectionCompoundForceInteger_SSE4_1(
+    const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
+    const int reference_offsets[2], const int count,
+    CompoundMotionVector* LIBGAV1_RESTRICT candidate_mvs) {
+  // The |reference_offsets| non-zero check is usually true, so it is skipped
+  // here. To help the compiler, make a local copy of |reference_offsets|.
+  const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+  // One element beyond |count| may be calculated.
+  int i = 0;
+  do {
+    const __m128i mv = MvProjectionCompoundClip(
+        temporal_mvs + i, temporal_reference_offsets + i, offsets);
+    ForceInteger(mv, candidate_mvs + i);
+    i += 2;
+  } while (i < count);
+}
+
+void MvProjectionCompoundHighPrecision_SSE4_1(
+    const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
+    const int reference_offsets[2], const int count,
+    CompoundMotionVector* LIBGAV1_RESTRICT candidate_mvs) {
+  // The |reference_offsets| non-zero check is usually true, so it is skipped
+  // here. To help the compiler, make a local copy of |reference_offsets|.
+  const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+  // One element beyond |count| may be calculated.
+  int i = 0;
+  do {
+    const __m128i mv = MvProjectionCompoundClip(
+        temporal_mvs + i, temporal_reference_offsets + i, offsets);
+    StoreAligned16(candidate_mvs + i, mv);
+    i += 2;
+  } while (i < count);
+}
+
+void MvProjectionSingleLowPrecision_SSE4_1(
+    const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
+    const int reference_offset, const int count,
+    MotionVector* LIBGAV1_RESTRICT candidate_mvs) {
+  // Up to three elements beyond |count| may be calculated.
+  int i = 0;
+  do {
+    const __m128i mv = MvProjectionSingleClip(
+        temporal_mvs + i, temporal_reference_offsets + i, reference_offset);
+    LowPrecision(mv, candidate_mvs + i);
+    i += 4;
+  } while (i < count);
+}
+
+void MvProjectionSingleForceInteger_SSE4_1(
+    const MotionVector* LIBGAV1_RESTRICT temporal_mvs,
+    const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets,
+    const int reference_offset, const int count,
+    MotionVector* LIBGAV1_RESTRICT candidate_mvs) {
+  // Up to three elements beyond |count| may be calculated.
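+  // (The loop below advances four MVs per iteration, so |candidate_mvs| must
+  // be sized for |count| rounded up to a multiple of four.)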
+ int i = 0; + do { + const __m128i mv = MvProjectionSingleClip( + temporal_mvs + i, temporal_reference_offsets + i, reference_offset); + ForceInteger(mv, candidate_mvs + i); + i += 4; + } while (i < count); +} + +void MvProjectionSingleHighPrecision_SSE4_1( + const MotionVector* LIBGAV1_RESTRICT temporal_mvs, + const int8_t* LIBGAV1_RESTRICT temporal_reference_offsets, + const int reference_offset, const int count, + MotionVector* LIBGAV1_RESTRICT candidate_mvs) { + // Up to three more elements could be calculated. + int i = 0; + do { + const __m128i mv = MvProjectionSingleClip( + temporal_mvs + i, temporal_reference_offsets + i, reference_offset); + StoreAligned16(candidate_mvs + i, mv); + i += 4; + } while (i < count); +} + +} // namespace + +void MotionVectorSearchInit_SSE4_1() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_SSE4_1; + dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_SSE4_1; + dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_SSE4_1; + dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_SSE4_1; + dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_SSE4_1; + dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_SSE4_1; +} + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_TARGETING_SSE4_1 +namespace libgav1 { +namespace dsp { + +void MotionVectorSearchInit_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_SSE4_1 diff --git a/src/dsp/x86/motion_vector_search_sse4.h b/src/dsp/x86/motion_vector_search_sse4.h new file mode 100644 index 0000000..d65b392 --- /dev/null +++ b/src/dsp/x86/motion_vector_search_sse4.h @@ -0,0 +1,41 @@ +/* + * Copyright 2020 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_MOTION_VECTOR_SEARCH_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_MOTION_VECTOR_SEARCH_SSE4_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::mv_projection_compound and Dsp::mv_projection_single. This +// function is not thread-safe. +void MotionVectorSearchInit_SSE4_1(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_TARGETING_SSE4_1 + +#ifndef LIBGAV1_Dsp8bpp_MotionVectorSearch +#define LIBGAV1_Dsp8bpp_MotionVectorSearch LIBGAV1_CPU_SSE4_1 +#endif + +#endif // LIBGAV1_TARGETING_SSE4_1 + +#endif // LIBGAV1_SRC_DSP_X86_MOTION_VECTOR_SEARCH_SSE4_H_ diff --git a/src/dsp/x86/obmc_sse4.cc b/src/dsp/x86/obmc_sse4.cc new file mode 100644 index 0000000..8ce23b4 --- /dev/null +++ b/src/dsp/x86/obmc_sse4.cc @@ -0,0 +1,578 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/obmc.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+#include "src/dsp/obmc.inc"
+
+inline void OverlapBlendFromLeft2xH_SSE4_1(
+    uint8_t* LIBGAV1_RESTRICT const prediction,
+    const ptrdiff_t prediction_stride, const int height,
+    const uint8_t* LIBGAV1_RESTRICT const obmc_prediction,
+    const ptrdiff_t obmc_prediction_stride) {
+  uint8_t* pred = prediction;
+  const uint8_t* obmc_pred = obmc_prediction;
+  const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
+  const __m128i mask_val = _mm_shufflelo_epi16(Load4(kObmcMask), 0);
+  // 64 - mask
+  const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+  const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+  int y = height;
+  do {
+    const __m128i pred_val = Load2x2(pred, pred + prediction_stride);
+    const __m128i obmc_pred_val =
+        Load2x2(obmc_pred, obmc_pred + obmc_prediction_stride);
+
+    const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+    const __m128i result =
+        RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
+    const __m128i packed_result = _mm_packus_epi16(result, result);
+    Store2(pred, packed_result);
+    pred += prediction_stride;
+    const int16_t second_row_result = _mm_extract_epi16(packed_result, 1);
+    memcpy(pred, &second_row_result, sizeof(second_row_result));
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride << 1;
+    y -= 2;
+  } while (y != 0);
+}
+
+inline void OverlapBlendFromLeft4xH_SSE4_1(
+    uint8_t* LIBGAV1_RESTRICT const prediction,
+    const ptrdiff_t prediction_stride, const int height,
+    const uint8_t* LIBGAV1_RESTRICT const obmc_prediction,
+    const ptrdiff_t obmc_prediction_stride) {
+  uint8_t* pred = prediction;
+  const uint8_t* obmc_pred = obmc_prediction;
+  const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
+  const __m128i mask_val = Load4(kObmcMask + 2);
+  // 64 - mask
+  const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+  // Duplicate first half of vector.
+  const __m128i masks =
+      _mm_shuffle_epi32(_mm_unpacklo_epi8(mask_val, obmc_mask_val), 0x44);
+  int y = height;
+  do {
+    const __m128i pred_val0 = Load4(pred);
+    const __m128i obmc_pred_val0 = Load4(obmc_pred);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+
+    // Place the second row of each source in the second four bytes.
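+    // (The shift-and-align pair does this: _mm_slli_si128 moves the first
+    // row's 4 bytes to lanes 12..15, then _mm_alignr_epi8 with an offset of
+    // 12 concatenates them with the freshly loaded second row, leaving row 0
+    // in bytes 0..3 and row 1 in bytes 4..7.)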
+    const __m128i pred_val =
+        _mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12);
+    const __m128i obmc_pred_val = _mm_alignr_epi8(
+        Load4(obmc_pred), _mm_slli_si128(obmc_pred_val0, 12), 12);
+    const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+    const __m128i result =
+        RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
+    const __m128i packed_result = _mm_packus_epi16(result, result);
+    Store4(pred - prediction_stride, packed_result);
+    const int second_row_result = _mm_extract_epi32(packed_result, 1);
+    memcpy(pred, &second_row_result, sizeof(second_row_result));
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+    y -= 2;
+  } while (y != 0);
+}
+
+inline void OverlapBlendFromLeft8xH_SSE4_1(
+    uint8_t* LIBGAV1_RESTRICT const prediction,
+    const ptrdiff_t prediction_stride, const int height,
+    const uint8_t* LIBGAV1_RESTRICT const obmc_prediction,
+    const ptrdiff_t obmc_prediction_stride) {
+  uint8_t* pred = prediction;
+  const uint8_t* obmc_pred = obmc_prediction;
+  const __m128i mask_inverter = _mm_set1_epi8(64);
+  const __m128i mask_val = LoadLo8(kObmcMask + 6);
+  // 64 - mask
+  const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+  const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+  int y = height;
+  do {
+    const __m128i pred_val = LoadLo8(pred);
+    const __m128i obmc_pred_val = LoadLo8(obmc_pred);
+    const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+    const __m128i result =
+        RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
+
+    StoreLo8(pred, _mm_packus_epi16(result, result));
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+  } while (--y != 0);
+}
+
+void OverlapBlendFromLeft_SSE4_1(
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
+    const int width, const int height,
+    const void* LIBGAV1_RESTRICT const obmc_prediction,
+    const ptrdiff_t obmc_prediction_stride) {
+  auto* pred = static_cast<uint8_t*>(prediction);
+  const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
+  assert(width >= 2);
+  assert(height >= 4);
+
+  if (width == 2) {
+    OverlapBlendFromLeft2xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
+                                   obmc_prediction_stride);
+    return;
+  }
+  if (width == 4) {
+    OverlapBlendFromLeft4xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
+                                   obmc_prediction_stride);
+    return;
+  }
+  if (width == 8) {
+    OverlapBlendFromLeft8xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
+                                   obmc_prediction_stride);
+    return;
+  }
+  const __m128i mask_inverter = _mm_set1_epi8(64);
+  const uint8_t* mask = kObmcMask + width - 2;
+  int x = 0;
+  do {
+    pred = static_cast<uint8_t*>(prediction) + x;
+    obmc_pred = static_cast<const uint8_t*>(obmc_prediction) + x;
+    const __m128i mask_val = LoadUnaligned16(mask + x);
+    // 64 - mask
+    const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+    const __m128i masks_lo = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+    const __m128i masks_hi = _mm_unpackhi_epi8(mask_val, obmc_mask_val);
+
+    int y = 0;
+    do {
+      const __m128i pred_val = LoadUnaligned16(pred);
+      const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
+      const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+      const __m128i result_lo =
+          RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks_lo), 6);
+      const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val);
+      const __m128i result_hi =
+          RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks_hi), 6);
+      StoreUnaligned16(pred, _mm_packus_epi16(result_lo, result_hi));
+
+      pred += prediction_stride;
+      obmc_pred += obmc_prediction_stride;
+    } while (++y < height);
+    x += 16;
+  } while (x < width);
+}
+
+inline void OverlapBlendFromTop4xH_SSE4_1(
+    uint8_t* LIBGAV1_RESTRICT const prediction,
+    const ptrdiff_t prediction_stride, const int height,
+    const uint8_t* LIBGAV1_RESTRICT const obmc_prediction,
+    const ptrdiff_t obmc_prediction_stride) {
+  uint8_t* pred = prediction;
+  const uint8_t* obmc_pred = obmc_prediction;
+  const __m128i mask_inverter = _mm_set1_epi16(64);
+  const __m128i mask_shuffler = _mm_set_epi32(0x01010101, 0x01010101, 0, 0);
+  const __m128i mask_preinverter = _mm_set1_epi16(-256 | 1);
+
+  const uint8_t* mask = kObmcMask + height - 2;
+  const int compute_height = height - (height >> 2);
+  int y = 0;
+  do {
+    // First mask in the first half, second mask in the second half.
+    const __m128i mask_val = _mm_shuffle_epi8(
+        _mm_cvtsi32_si128(*reinterpret_cast<const uint32_t*>(mask + y)),
+        mask_shuffler);
+    const __m128i masks =
+        _mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter));
+    const __m128i pred_val0 = Load4(pred);
+
+    const __m128i obmc_pred_val0 = Load4(obmc_pred);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+    const __m128i pred_val =
+        _mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12);
+    const __m128i obmc_pred_val = _mm_alignr_epi8(
+        Load4(obmc_pred), _mm_slli_si128(obmc_pred_val0, 12), 12);
+    const __m128i terms = _mm_unpacklo_epi8(obmc_pred_val, pred_val);
+    const __m128i result =
+        RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
+
+    const __m128i packed_result = _mm_packus_epi16(result, result);
+    Store4(pred - prediction_stride, packed_result);
+    Store4(pred, _mm_srli_si128(packed_result, 4));
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+    y += 2;
+  } while (y < compute_height);
+}
+
+inline void OverlapBlendFromTop8xH_SSE4_1(
+    uint8_t* LIBGAV1_RESTRICT const prediction,
+    const ptrdiff_t prediction_stride, const int height,
+    const uint8_t* LIBGAV1_RESTRICT const obmc_prediction,
+    const ptrdiff_t obmc_prediction_stride) {
+  uint8_t* pred = prediction;
+  const uint8_t* obmc_pred = obmc_prediction;
+  const uint8_t* mask = kObmcMask + height - 2;
+  const __m128i mask_inverter = _mm_set1_epi8(64);
+  const int compute_height = height - (height >> 2);
+  int y = compute_height;
+  do {
+    const __m128i mask_val = _mm_set1_epi8(mask[compute_height - y]);
+    // 64 - mask
+    const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+    const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+    const __m128i pred_val = LoadLo8(pred);
+    const __m128i obmc_pred_val = LoadLo8(obmc_pred);
+    const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+    const __m128i result =
+        RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
+
+    StoreLo8(pred, _mm_packus_epi16(result, result));
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+  } while (--y != 0);
+}
+
+void OverlapBlendFromTop_SSE4_1(
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
+    const int width, const int height,
+    const void* LIBGAV1_RESTRICT const obmc_prediction,
+    const ptrdiff_t obmc_prediction_stride) {
+  auto* pred = static_cast<uint8_t*>(prediction);
+  const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
+  assert(width >= 4);
+  assert(height >= 2);
+
+  if (width == 4) {
+    OverlapBlendFromTop4xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
+                                  obmc_prediction_stride);
+    return;
+  }
+  if (width == 8) {
OverlapBlendFromTop8xH_SSE4_1(pred, prediction_stride, height, obmc_pred, + obmc_prediction_stride); + return; + } + + // Stop when mask value becomes 64. + const int compute_height = height - (height >> 2); + const __m128i mask_inverter = _mm_set1_epi8(64); + int y = 0; + const uint8_t* mask = kObmcMask + height - 2; + do { + const __m128i mask_val = _mm_set1_epi8(mask[y]); + // 64 - mask + const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val); + const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val); + int x = 0; + do { + const __m128i pred_val = LoadUnaligned16(pred + x); + const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred + x); + const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val); + const __m128i result_lo = + RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks), 6); + const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val); + const __m128i result_hi = + RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks), 6); + StoreUnaligned16(pred + x, _mm_packus_epi16(result_lo, result_hi)); + x += 16; + } while (x < width); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + } while (++y < compute_height); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); +#if DSP_ENABLED_8BPP_SSE4_1(ObmcVertical) + dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(ObmcHorizontal) + dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft_SSE4_1; +#endif +} + +} // namespace +} // namespace low_bitdepth + +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +#include "src/dsp/obmc.inc" + +constexpr int kRoundBitsObmcBlend = 6; + +inline void OverlapBlendFromLeft2xH_SSE4_1( + uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride, + const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction, + const ptrdiff_t obmc_pred_stride) { + uint16_t* pred = prediction; + const uint16_t* obmc_pred = obmc_prediction; + const ptrdiff_t pred_stride2 = pred_stride << 1; + const ptrdiff_t obmc_pred_stride2 = obmc_pred_stride << 1; + const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040); + const __m128i mask_val = _mm_shufflelo_epi16(Load2(kObmcMask), 0x00); + // 64 - mask. 
+  const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+  const __m128i masks =
+      _mm_cvtepi8_epi16(_mm_unpacklo_epi8(mask_val, obmc_mask_val));
+  int y = height;
+  do {
+    const __m128i pred_val = Load4x2(pred, pred + pred_stride);
+    const __m128i obmc_pred_val =
+        Load4x2(obmc_pred, obmc_pred + obmc_pred_stride);
+    const __m128i terms = _mm_unpacklo_epi16(pred_val, obmc_pred_val);
+    const __m128i result = RightShiftWithRounding_U32(
+        _mm_madd_epi16(terms, masks), kRoundBitsObmcBlend);
+    const __m128i packed_result = _mm_packus_epi32(result, result);
+    Store4(pred, packed_result);
+    Store4(pred + pred_stride, _mm_srli_si128(packed_result, 4));
+    pred += pred_stride2;
+    obmc_pred += obmc_pred_stride2;
+    y -= 2;
+  } while (y != 0);
+}
+
+inline void OverlapBlendFromLeft4xH_SSE4_1(
+    uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride,
+    const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction,
+    const ptrdiff_t obmc_pred_stride) {
+  uint16_t* pred = prediction;
+  const uint16_t* obmc_pred = obmc_prediction;
+  const ptrdiff_t pred_stride2 = pred_stride << 1;
+  const ptrdiff_t obmc_pred_stride2 = obmc_pred_stride << 1;
+  const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
+  const __m128i mask_val = Load4(kObmcMask + 2);
+  // 64 - mask.
+  const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+  const __m128i masks =
+      _mm_cvtepi8_epi16(_mm_unpacklo_epi8(mask_val, obmc_mask_val));
+  int y = height;
+  do {
+    const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride);
+    const __m128i obmc_pred_val =
+        LoadHi8(LoadLo8(obmc_pred), obmc_pred + obmc_pred_stride);
+    const __m128i terms_lo = _mm_unpacklo_epi16(pred_val, obmc_pred_val);
+    const __m128i terms_hi = _mm_unpackhi_epi16(pred_val, obmc_pred_val);
+    const __m128i result_lo = RightShiftWithRounding_U32(
+        _mm_madd_epi16(terms_lo, masks), kRoundBitsObmcBlend);
+    const __m128i result_hi = RightShiftWithRounding_U32(
+        _mm_madd_epi16(terms_hi, masks), kRoundBitsObmcBlend);
+    const __m128i packed_result = _mm_packus_epi32(result_lo, result_hi);
+    StoreLo8(pred, packed_result);
+    StoreHi8(pred + pred_stride, packed_result);
+    pred += pred_stride2;
+    obmc_pred += obmc_pred_stride2;
+    y -= 2;
+  } while (y != 0);
+}
+
+void OverlapBlendFromLeft10bpp_SSE4_1(
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
+    const int width, const int height,
+    const void* LIBGAV1_RESTRICT const obmc_prediction,
+    const ptrdiff_t obmc_prediction_stride) {
+  auto* pred = static_cast<uint16_t*>(prediction);
+  const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction);
+  const ptrdiff_t pred_stride = prediction_stride / sizeof(pred[0]);
+  const ptrdiff_t obmc_pred_stride =
+      obmc_prediction_stride / sizeof(obmc_pred[0]);
+  assert(width >= 2);
+  assert(height >= 4);
+
+  if (width == 2) {
+    OverlapBlendFromLeft2xH_SSE4_1(pred, pred_stride, height, obmc_pred,
+                                   obmc_pred_stride);
+    return;
+  }
+  if (width == 4) {
+    OverlapBlendFromLeft4xH_SSE4_1(pred, pred_stride, height, obmc_pred,
+                                   obmc_pred_stride);
+    return;
+  }
+  const __m128i mask_inverter = _mm_set1_epi8(64);
+  const uint8_t* mask = kObmcMask + width - 2;
+  int x = 0;
+  do {
+    pred = static_cast<uint16_t*>(prediction) + x;
+    obmc_pred = static_cast<const uint16_t*>(obmc_prediction) + x;
+    const __m128i mask_val = LoadLo8(mask + x);
+    // 64 - mask
+    const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+    const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+    const __m128i masks_lo = _mm_cvtepi8_epi16(masks);
+    const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8));
+    int y = height;
+    do {
+      const __m128i pred_val = LoadUnaligned16(pred);
+      const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
+      const __m128i terms_lo = _mm_unpacklo_epi16(pred_val, obmc_pred_val);
+      const __m128i terms_hi = _mm_unpackhi_epi16(pred_val, obmc_pred_val);
+      const __m128i result_lo = RightShiftWithRounding_U32(
+          _mm_madd_epi16(terms_lo, masks_lo), kRoundBitsObmcBlend);
+      const __m128i result_hi = RightShiftWithRounding_U32(
+          _mm_madd_epi16(terms_hi, masks_hi), kRoundBitsObmcBlend);
+      StoreUnaligned16(pred, _mm_packus_epi32(result_lo, result_hi));
+
+      pred += pred_stride;
+      obmc_pred += obmc_pred_stride;
+    } while (--y != 0);
+    x += 8;
+  } while (x < width);
+}
+
+inline void OverlapBlendFromTop4xH_SSE4_1(
+    uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride,
+    const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction,
+    const ptrdiff_t obmc_pred_stride) {
+  uint16_t* pred = prediction;
+  const uint16_t* obmc_pred = obmc_prediction;
+  const __m128i mask_inverter = _mm_set1_epi16(64);
+  const __m128i mask_shuffler = _mm_set_epi32(0x01010101, 0x01010101, 0, 0);
+  const __m128i mask_preinverter = _mm_set1_epi16(-256 | 1);
+  const uint8_t* mask = kObmcMask + height - 2;
+  const int compute_height = height - (height >> 2);
+  const ptrdiff_t pred_stride2 = pred_stride << 1;
+  const ptrdiff_t obmc_pred_stride2 = obmc_pred_stride << 1;
+  int y = 0;
+  do {
+    // First mask in the first half, second mask in the second half.
+    const __m128i mask_val = _mm_shuffle_epi8(Load4(mask + y), mask_shuffler);
+    const __m128i masks =
+        _mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter));
+    const __m128i masks_lo = _mm_cvtepi8_epi16(masks);
+    const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8));
+
+    const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride);
+    const __m128i obmc_pred_val =
+        LoadHi8(LoadLo8(obmc_pred), obmc_pred + obmc_pred_stride);
+    const __m128i terms_lo = _mm_unpacklo_epi16(obmc_pred_val, pred_val);
+    const __m128i terms_hi = _mm_unpackhi_epi16(obmc_pred_val, pred_val);
+    const __m128i result_lo = RightShiftWithRounding_U32(
+        _mm_madd_epi16(terms_lo, masks_lo), kRoundBitsObmcBlend);
+    const __m128i result_hi = RightShiftWithRounding_U32(
+        _mm_madd_epi16(terms_hi, masks_hi), kRoundBitsObmcBlend);
+    const __m128i packed_result = _mm_packus_epi32(result_lo, result_hi);
+
+    StoreLo8(pred, packed_result);
+    StoreHi8(pred + pred_stride, packed_result);
+    pred += pred_stride2;
+    obmc_pred += obmc_pred_stride2;
+    y += 2;
+  } while (y < compute_height);
+}
+
+void OverlapBlendFromTop10bpp_SSE4_1(
+    void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
+    const int width, const int height,
+    const void* LIBGAV1_RESTRICT const obmc_prediction,
+    const ptrdiff_t obmc_prediction_stride) {
+  auto* pred = static_cast<uint16_t*>(prediction);
+  const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction);
+  const ptrdiff_t pred_stride = prediction_stride / sizeof(pred[0]);
+  const ptrdiff_t obmc_pred_stride =
+      obmc_prediction_stride / sizeof(obmc_pred[0]);
+  assert(width >= 4);
+  assert(height >= 2);
+
+  if (width == 4) {
+    OverlapBlendFromTop4xH_SSE4_1(pred, pred_stride, height, obmc_pred,
+                                  obmc_pred_stride);
+    return;
+  }
+
+  const __m128i mask_inverter = _mm_set1_epi8(64);
+  const int compute_height = height - (height >> 2);
+  const uint8_t* mask = kObmcMask + height - 2;
+  pred = static_cast<uint16_t*>(prediction);
+  obmc_pred = static_cast<const uint16_t*>(obmc_prediction);
+  int y = 0;
+  do {
+    const __m128i mask_val = _mm_set1_epi8(mask[y]);
+    // 64 - mask
+    const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+    const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+    const __m128i masks_lo = _mm_cvtepi8_epi16(masks);
+    const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8));
+    int x = 0;
+    do {
+      const __m128i pred_val = LoadUnaligned16(pred + x);
+      const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred + x);
+      const __m128i terms_lo = _mm_unpacklo_epi16(pred_val, obmc_pred_val);
+      const __m128i terms_hi = _mm_unpackhi_epi16(pred_val, obmc_pred_val);
+      const __m128i result_lo = RightShiftWithRounding_U32(
+          _mm_madd_epi16(terms_lo, masks_lo), kRoundBitsObmcBlend);
+      const __m128i result_hi = RightShiftWithRounding_U32(
+          _mm_madd_epi16(terms_hi, masks_hi), kRoundBitsObmcBlend);
+      StoreUnaligned16(pred + x, _mm_packus_epi32(result_lo, result_hi));
+      x += 8;
+    } while (x < width);
+    pred += pred_stride;
+    obmc_pred += obmc_pred_stride;
+  } while (++y < compute_height);
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+#if DSP_ENABLED_10BPP_SSE4_1(ObmcVertical)
+  dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop10bpp_SSE4_1;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(ObmcHorizontal)
+  dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft10bpp_SSE4_1;
+#endif
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void ObmcInit_SSE4_1() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else  // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void ObmcInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/obmc_sse4.h b/src/dsp/x86/obmc_sse4.h
new file mode 100644
index 0000000..448d2cf
--- /dev/null
+++ b/src/dsp/x86/obmc_sse4.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_OBMC_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_OBMC_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::obmc_blend[]. This function is not thread-safe.
+void ObmcInit_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the sse4 implementation should be used.
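+// For example, if a hypothetical obmc_avx2.h had already claimed
+// LIBGAV1_Dsp8bpp_ObmcVertical with LIBGAV1_CPU_AVX2, the #ifndef guards
+// below would leave that choice in place; the DSP_ENABLED_* checks in the
+// corresponding .cc files consult these macros when filling the function
+// table.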
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_ObmcVertical
+#define LIBGAV1_Dsp8bpp_ObmcVertical LIBGAV1_CPU_SSE4_1
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ObmcHorizontal
+#define LIBGAV1_Dsp8bpp_ObmcHorizontal LIBGAV1_CPU_SSE4_1
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ObmcVertical
+#define LIBGAV1_Dsp10bpp_ObmcVertical LIBGAV1_CPU_SSE4_1
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ObmcHorizontal
+#define LIBGAV1_Dsp10bpp_ObmcHorizontal LIBGAV1_CPU_SSE4_1
+#endif
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_OBMC_SSE4_H_
diff --git a/src/dsp/x86/super_res_sse4.cc b/src/dsp/x86/super_res_sse4.cc
new file mode 100644
index 0000000..458d94e
--- /dev/null
+++ b/src/dsp/x86/super_res_sse4.cc
@@ -0,0 +1,323 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/super_res.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Upscale_Filter as defined in AV1 Section 7.16
+// Negative to make them fit in 8-bit.
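+// (The largest tap magnitude is 128, which does not fit in int8_t, so the
+// table stores the negated filter. _mm_maddubs_epi16 therefore produces
+// negated sums, which SuperRes_SSE4_1 undoes by computing rounding - sum
+// rather than sum + rounding before the final shift.)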
+alignas(16) const int8_t
+    kNegativeUpscaleFilter[kSuperResFilterShifts][kSuperResFilterTaps] = {
+        {0, 0, 0, -128, 0, 0, 0, 0},      {0, 0, 1, -128, -2, 1, 0, 0},
+        {0, -1, 3, -127, -4, 2, -1, 0},   {0, -1, 4, -127, -6, 3, -1, 0},
+        {0, -2, 6, -126, -8, 3, -1, 0},   {0, -2, 7, -125, -11, 4, -1, 0},
+        {1, -2, 8, -125, -13, 5, -2, 0},  {1, -3, 9, -124, -15, 6, -2, 0},
+        {1, -3, 10, -123, -18, 6, -2, 1}, {1, -3, 11, -122, -20, 7, -3, 1},
+        {1, -4, 12, -121, -22, 8, -3, 1}, {1, -4, 13, -120, -25, 9, -3, 1},
+        {1, -4, 14, -118, -28, 9, -3, 1}, {1, -4, 15, -117, -30, 10, -4, 1},
+        {1, -5, 16, -116, -32, 11, -4, 1}, {1, -5, 16, -114, -35, 12, -4, 1},
+        {1, -5, 17, -112, -38, 12, -4, 1}, {1, -5, 18, -111, -40, 13, -5, 1},
+        {1, -5, 18, -109, -43, 14, -5, 1}, {1, -6, 19, -107, -45, 14, -5, 1},
+        {1, -6, 19, -105, -48, 15, -5, 1}, {1, -6, 19, -103, -51, 16, -5, 1},
+        {1, -6, 20, -101, -53, 16, -6, 1}, {1, -6, 20, -99, -56, 17, -6, 1},
+        {1, -6, 20, -97, -58, 17, -6, 1},  {1, -6, 20, -95, -61, 18, -6, 1},
+        {2, -7, 20, -93, -64, 18, -6, 2},  {2, -7, 20, -91, -66, 19, -6, 1},
+        {2, -7, 20, -88, -69, 19, -6, 1},  {2, -7, 20, -86, -71, 19, -6, 1},
+        {2, -7, 20, -84, -74, 20, -7, 2},  {2, -7, 20, -81, -76, 20, -7, 1},
+        {2, -7, 20, -79, -79, 20, -7, 2},  {1, -7, 20, -76, -81, 20, -7, 2},
+        {2, -7, 20, -74, -84, 20, -7, 2},  {1, -6, 19, -71, -86, 20, -7, 2},
+        {1, -6, 19, -69, -88, 20, -7, 2},  {1, -6, 19, -66, -91, 20, -7, 2},
+        {2, -6, 18, -64, -93, 20, -7, 2},  {1, -6, 18, -61, -95, 20, -6, 1},
+        {1, -6, 17, -58, -97, 20, -6, 1},  {1, -6, 17, -56, -99, 20, -6, 1},
+        {1, -6, 16, -53, -101, 20, -6, 1}, {1, -5, 16, -51, -103, 19, -6, 1},
+        {1, -5, 15, -48, -105, 19, -6, 1}, {1, -5, 14, -45, -107, 19, -6, 1},
+        {1, -5, 14, -43, -109, 18, -5, 1}, {1, -5, 13, -40, -111, 18, -5, 1},
+        {1, -4, 12, -38, -112, 17, -5, 1}, {1, -4, 12, -35, -114, 16, -5, 1},
+        {1, -4, 11, -32, -116, 16, -5, 1}, {1, -4, 10, -30, -117, 15, -4, 1},
+        {1, -3, 9, -28, -118, 14, -4, 1},  {1, -3, 9, -25, -120, 13, -4, 1},
+        {1, -3, 8, -22, -121, 12, -4, 1},  {1, -3, 7, -20, -122, 11, -3, 1},
+        {1, -2, 6, -18, -123, 10, -3, 1},  {0, -2, 6, -15, -124, 9, -3, 1},
+        {0, -2, 5, -13, -125, 8, -2, 1},   {0, -1, 4, -11, -125, 7, -2, 0},
+        {0, -1, 3, -8, -126, 6, -2, 0},    {0, -1, 3, -6, -127, 4, -1, 0},
+        {0, -1, 2, -4, -127, 3, -1, 0},    {0, 0, 1, -2, -128, 1, 0, 0},
+};
+
+void SuperResCoefficients_SSE4_1(const int upscaled_width,
+                                 const int initial_subpixel_x, const int step,
+                                 void* const coefficients) {
+  auto* dst = static_cast<uint8_t*>(coefficients);
+  int subpixel_x = initial_subpixel_x;
+  int x = RightShiftWithCeiling(upscaled_width, 4);
+  do {
+    for (int i = 0; i < 8; ++i, dst += 16) {
+      int remainder = subpixel_x & kSuperResScaleMask;
+      __m128i filter =
+          LoadLo8(kNegativeUpscaleFilter[remainder >> kSuperResExtraBits]);
+      subpixel_x += step;
+      remainder = subpixel_x & kSuperResScaleMask;
+      filter = LoadHi8(filter,
+                       kNegativeUpscaleFilter[remainder >> kSuperResExtraBits]);
+      subpixel_x += step;
+      StoreAligned16(dst, filter);
+    }
+  } while (--x != 0);
+}
+
+void SuperRes_SSE4_1(const void* LIBGAV1_RESTRICT const coefficients,
+                     void* LIBGAV1_RESTRICT const source,
+                     const ptrdiff_t source_stride, const int height,
+                     const int downscaled_width, const int upscaled_width,
+                     const int initial_subpixel_x, const int step,
+                     void* LIBGAV1_RESTRICT const dest,
+                     const ptrdiff_t dest_stride) {
+  auto* src = static_cast<uint8_t*>(source) - DivideBy2(kSuperResFilterTaps);
+  auto* dst = static_cast<uint8_t*>(dest);
+  int y = height;
+  do {
+    const auto* filter = static_cast<const uint8_t*>(coefficients);
+    uint8_t* dst_ptr = dst;
+    ExtendLine<uint8_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+                        kSuperResHorizontalBorder, kSuperResHorizontalBorder);
+    int subpixel_x = initial_subpixel_x;
+    // The below code calculates up to 15 extra upscaled pixels which will
+    // over-read up to 15 downscaled pixels in the end of each row.
+    // kSuperResHorizontalPadding protects this behavior from segmentation
+    // faults and threading issues.
+    int x = RightShiftWithCeiling(upscaled_width, 4);
+    do {
+      __m128i weighted_src[8];
+      for (int i = 0; i < 8; ++i, filter += 16) {
+        // TODO(b/178652672): Remove Msan loads when hadd bug is resolved.
+        // It's fine to write uninitialized bytes outside the frame, but the
+        // inside-frame pixels are incorrectly labeled uninitialized if
+        // uninitialized values go through the hadd intrinsics.
+        // |src| is offset 4 pixels to the left, and there are 4 extended
+        // border pixels, so a difference of 0 from |downscaled_width|
+        // indicates 8 good bytes. A difference of 1 indicates 7 good bytes.
+        const int msan_bytes_lo =
+            (subpixel_x >> kSuperResScaleBits) - downscaled_width;
+        __m128i s =
+            LoadLo8Msan(&src[subpixel_x >> kSuperResScaleBits], msan_bytes_lo);
+        subpixel_x += step;
+        const int msan_bytes_hi =
+            (subpixel_x >> kSuperResScaleBits) - downscaled_width;
+        s = LoadHi8Msan(s, &src[subpixel_x >> kSuperResScaleBits],
+                        msan_bytes_hi);
+        subpixel_x += step;
+        const __m128i f = LoadAligned16(filter);
+        weighted_src[i] = _mm_maddubs_epi16(s, f);
+      }
+
+      __m128i a[4];
+      a[0] = _mm_hadd_epi16(weighted_src[0], weighted_src[1]);
+      a[1] = _mm_hadd_epi16(weighted_src[2], weighted_src[3]);
+      a[2] = _mm_hadd_epi16(weighted_src[4], weighted_src[5]);
+      a[3] = _mm_hadd_epi16(weighted_src[6], weighted_src[7]);
+      Transpose2x16_U16(a, a);
+      a[0] = _mm_adds_epi16(a[0], a[1]);
+      a[1] = _mm_adds_epi16(a[2], a[3]);
+      const __m128i rounding = _mm_set1_epi16(1 << (kFilterBits - 1));
+      a[0] = _mm_subs_epi16(rounding, a[0]);
+      a[1] = _mm_subs_epi16(rounding, a[1]);
+      a[0] = _mm_srai_epi16(a[0], kFilterBits);
+      a[1] = _mm_srai_epi16(a[1], kFilterBits);
+      StoreAligned16(dst_ptr, _mm_packus_epi16(a[0], a[1]));
+      dst_ptr += 16;
+    } while (--x != 0);
+    src += source_stride;
+    dst += dest_stride;
+  } while (--y != 0);
+}
+
+void Init8bpp() {
+  Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+#if DSP_ENABLED_8BPP_SSE4_1(SuperResCoefficients)
+  dsp->super_res_coefficients = SuperResCoefficients_SSE4_1;
+#endif  // DSP_ENABLED_8BPP_SSE4_1(SuperResCoefficients)
+#if DSP_ENABLED_8BPP_SSE4_1(SuperRes)
+  dsp->super_res = SuperRes_SSE4_1;
+#endif  // DSP_ENABLED_8BPP_SSE4_1(SuperRes)
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+// Upscale_Filter as defined in AV1 Section 7.16
+alignas(16) const int16_t
+    kUpscaleFilter[kSuperResFilterShifts][kSuperResFilterTaps] = {
+        {0, 0, 0, 128, 0, 0, 0, 0},       {0, 0, -1, 128, 2, -1, 0, 0},
+        {0, 1, -3, 127, 4, -2, 1, 0},     {0, 1, -4, 127, 6, -3, 1, 0},
+        {0, 2, -6, 126, 8, -3, 1, 0},     {0, 2, -7, 125, 11, -4, 1, 0},
+        {-1, 2, -8, 125, 13, -5, 2, 0},   {-1, 3, -9, 124, 15, -6, 2, 0},
+        {-1, 3, -10, 123, 18, -6, 2, -1}, {-1, 3, -11, 122, 20, -7, 3, -1},
+        {-1, 4, -12, 121, 22, -8, 3, -1}, {-1, 4, -13, 120, 25, -9, 3, -1},
+        {-1, 4, -14, 118, 28, -9, 3, -1}, {-1, 4, -15, 117, 30, -10, 4, -1},
+        {-1, 5, -16, 116, 32, -11, 4, -1}, {-1, 5, -16, 114, 35, -12, 4, -1},
+        {-1, 5, -17, 112, 38, -12, 4, -1}, {-1, 5, -18, 111, 40, -13, 5, -1},
+        {-1, 5, -18, 109, 43, -14, 5, -1}, {-1, 6, -19, 107, 45, -14, 5, -1},
+        {-1, 6, -19, 105, 48, -15, 5, -1}, {-1, 6, -19, 103, 51, -16, 5, -1},
+        {-1, 6, -20, 101, 53, -16, 6, -1}, {-1, 6, -20, 99, 56, -17, 6, -1},
+        {-1, 6, -20, 97, 58, -17, 6, -1},  {-1, 6, -20, 95, 61, -18, 6, -1},
+        {-2, 7, -20, 93, 64, -18, 6, -2},  {-2, 7, -20, 91, 66, -19, 6, -1},
+        {-2, 7, -20, 88, 69, -19, 6, -1},  {-2, 7, -20, 86, 71, -19, 6, -1},
+        {-2, 7, -20, 84, 74, -20, 7, -2},  {-2, 7, -20, 81, 76, -20, 7, -1},
+        {-2, 7, -20, 79, 79, -20, 7, -2},  {-1, 7, -20, 76, 81, -20, 7, -2},
+        {-2, 7, -20, 74, 84, -20, 7, -2},  {-1, 6, -19, 71, 86, -20, 7, -2},
+        {-1, 6, -19, 69, 88, -20, 7, -2},  {-1, 6, -19, 66, 91, -20, 7, -2},
+        {-2, 6, -18, 64, 93, -20, 7, -2},  {-1, 6, -18, 61, 95, -20, 6, -1},
+        {-1, 6, -17, 58, 97, -20, 6, -1},  {-1, 6, -17, 56, 99, -20, 6, -1},
+        {-1, 6, -16, 53, 101, -20, 6, -1}, {-1, 5, -16, 51, 103, -19, 6, -1},
+        {-1, 5, -15, 48, 105, -19, 6, -1}, {-1, 5, -14, 45, 107, -19, 6, -1},
+        {-1, 5, -14, 43, 109, -18, 5, -1}, {-1, 5, -13, 40, 111, -18, 5, -1},
+        {-1, 4, -12, 38, 112, -17, 5, -1}, {-1, 4, -12, 35, 114, -16, 5, -1},
+        {-1, 4, -11, 32, 116, -16, 5, -1}, {-1, 4, -10, 30, 117, -15, 4, -1},
+        {-1, 3, -9, 28, 118, -14, 4, -1},  {-1, 3, -9, 25, 120, -13, 4, -1},
+        {-1, 3, -8, 22, 121, -12, 4, -1},  {-1, 3, -7, 20, 122, -11, 3, -1},
+        {-1, 2, -6, 18, 123, -10, 3, -1},  {0, 2, -6, 15, 124, -9, 3, -1},
+        {0, 2, -5, 13, 125, -8, 2, -1},    {0, 1, -4, 11, 125, -7, 2, 0},
+        {0, 1, -3, 8, 126, -6, 2, 0},      {0, 1, -3, 6, 127, -4, 1, 0},
+        {0, 1, -2, 4, 127, -3, 1, 0},      {0, 0, -1, 2, 128, -1, 0, 0},
+};
+
+void SuperResCoefficients_SSE4_1(const int upscaled_width,
+                                 const int initial_subpixel_x, const int step,
+                                 void* const coefficients) {
+  auto* dst = static_cast<uint16_t*>(coefficients);
+  int subpixel_x = initial_subpixel_x;
+  int x = RightShiftWithCeiling(upscaled_width, 3);
+  do {
+    for (int i = 0; i < 8; ++i, dst += 8) {
+      int remainder = subpixel_x & kSuperResScaleMask;
+      __m128i filter =
+          LoadAligned16(kUpscaleFilter[remainder >> kSuperResExtraBits]);
+      subpixel_x += step;
+      StoreAligned16(dst, filter);
+    }
+  } while (--x != 0);
+}
+
+template <int bitdepth>
+void SuperRes_SSE4_1(const void* LIBGAV1_RESTRICT const coefficients,
+                     void* LIBGAV1_RESTRICT const source,
+                     const ptrdiff_t source_stride, const int height,
+                     const int downscaled_width, const int upscaled_width,
+                     const int initial_subpixel_x, const int step,
+                     void* LIBGAV1_RESTRICT const dest,
+                     const ptrdiff_t dest_stride) {
+  auto* src = static_cast<uint16_t*>(source) - DivideBy2(kSuperResFilterTaps);
+  auto* dst = static_cast<uint16_t*>(dest);
+  int y = height;
+  do {
+    const auto* filter = static_cast<const uint16_t*>(coefficients);
+    uint16_t* dst_ptr = dst;
+    ExtendLine<uint16_t>(src + DivideBy2(kSuperResFilterTaps),
+                         downscaled_width, kSuperResHorizontalBorder,
+                         kSuperResHorizontalPadding);
+    int subpixel_x = initial_subpixel_x;
+    // The below code calculates up to 7 extra upscaled
+    // pixels which will over-read up to 7 downscaled pixels in the end of
+    // each row. kSuperResHorizontalPadding accounts for this.
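+    // (The row is emitted in groups of 8 output pixels, ceil(width / 8)
+    // iterations below, so the final group may run past |upscaled_width| by
+    // up to 7 pixels.)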
+    int x = RightShiftWithCeiling(upscaled_width, 3);
+    do {
+      __m128i weighted_src[8];
+      for (int i = 0; i < 8; ++i, filter += 8) {
+        const __m128i s =
+            LoadUnaligned16(&src[subpixel_x >> kSuperResScaleBits]);
+        subpixel_x += step;
+        const __m128i f = LoadAligned16(filter);
+        weighted_src[i] = _mm_madd_epi16(s, f);
+      }
+
+      __m128i a[4];
+      a[0] = _mm_hadd_epi32(weighted_src[0], weighted_src[1]);
+      a[1] = _mm_hadd_epi32(weighted_src[2], weighted_src[3]);
+      a[2] = _mm_hadd_epi32(weighted_src[4], weighted_src[5]);
+      a[3] = _mm_hadd_epi32(weighted_src[6], weighted_src[7]);
+
+      a[0] = _mm_hadd_epi32(a[0], a[1]);
+      a[1] = _mm_hadd_epi32(a[2], a[3]);
+      a[0] = RightShiftWithRounding_S32(a[0], kFilterBits);
+      a[1] = RightShiftWithRounding_S32(a[1], kFilterBits);
+
+      // Clip the values at (1 << bd) - 1
+      const __m128i clipped_16 = _mm_min_epi16(
+          _mm_packus_epi32(a[0], a[1]), _mm_set1_epi16((1 << bitdepth) - 1));
+      StoreAligned16(dst_ptr, clipped_16);
+      dst_ptr += 8;
+    } while (--x != 0);
+    src += source_stride;
+    dst += dest_stride;
+  } while (--y != 0);
+}
+
+void Init10bpp() {
+  Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  static_cast<void>(dsp);
+#if DSP_ENABLED_10BPP_SSE4_1(SuperResCoefficients)
+  dsp->super_res_coefficients = SuperResCoefficients_SSE4_1;
+#else
+  static_cast<void>(SuperResCoefficients_SSE4_1);
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(SuperRes)
+  dsp->super_res = SuperRes_SSE4_1<10>;
+#else
+  static_cast<void>(SuperRes_SSE4_1<10>);
+#endif
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void SuperResInit_SSE4_1() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void SuperResInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/super_res_sse4.h b/src/dsp/x86/super_res_sse4.h
new file mode 100644
index 0000000..07a7ef4
--- /dev/null
+++ b/src/dsp/x86/super_res_sse4.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_SUPER_RES_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_SUPER_RES_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::super_res_coefficients and Dsp::super_res. This function
+// is not thread-safe.
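// Usage sketch: call this once before any decoding begins; the
// std::call_once wrapper below is illustrative, not this library's actual
// initialization path.
//   static std::once_flag once;  // requires <mutex>
//   std::call_once(once, [] { libgav1::dsp::SuperResInit_SSE4_1(); });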
+void SuperResInit_SSE4_1(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_TARGETING_SSE4_1 +#ifndef LIBGAV1_Dsp8bpp_SuperResCoefficients +#define LIBGAV1_Dsp8bpp_SuperResCoefficients LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_SuperRes +#define LIBGAV1_Dsp8bpp_SuperRes LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_SuperResCoefficients +#define LIBGAV1_Dsp10bpp_SuperResCoefficients LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_SuperRes +#define LIBGAV1_Dsp10bpp_SuperRes LIBGAV1_CPU_SSE4_1 +#endif +#endif // LIBGAV1_TARGETING_SSE4_1 + +#endif // LIBGAV1_SRC_DSP_X86_SUPER_RES_SSE4_H_ diff --git a/src/dsp/x86/transpose_sse4.h b/src/dsp/x86/transpose_sse4.h new file mode 100644 index 0000000..9726495 --- /dev/null +++ b/src/dsp/x86/transpose_sse4.h @@ -0,0 +1,307 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_TRANSPOSE_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_TRANSPOSE_SSE4_H_ + +#include "src/utils/compiler_attributes.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_TARGETING_SSE4_1 +#include + +namespace libgav1 { +namespace dsp { + +LIBGAV1_ALWAYS_INLINE void Transpose2x16_U16(const __m128i* const in, + __m128i* const out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 00 01 10 11 20 21 30 31 + // in[1]: 40 41 50 51 60 61 70 71 + // in[2]: 80 81 90 91 a0 a1 b0 b1 + // in[3]: c0 c1 d0 d1 e0 e1 f0 f1 + // to: + // a0: 00 40 01 41 10 50 11 51 + // a1: 20 60 21 61 30 70 31 71 + // a2: 80 c0 81 c1 90 d0 91 d1 + // a3: a0 e0 a1 e1 b0 f0 b1 f1 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i a2 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a3 = _mm_unpackhi_epi16(in[2], in[3]); + // b0: 00 20 40 60 01 21 41 61 + // b1: 10 30 50 70 11 31 51 71 + // b2: 80 a0 c0 e0 81 a1 c1 e1 + // b3: 90 b0 d0 f0 91 b1 d1 f1 + const __m128i b0 = _mm_unpacklo_epi16(a0, a1); + const __m128i b1 = _mm_unpackhi_epi16(a0, a1); + const __m128i b2 = _mm_unpacklo_epi16(a2, a3); + const __m128i b3 = _mm_unpackhi_epi16(a2, a3); + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 80 90 a0 b0 c0 d0 e0 f0 + // out[3]: 81 91 a1 b1 c1 d1 e1 f1 + out[0] = _mm_unpacklo_epi16(b0, b1); + out[1] = _mm_unpackhi_epi16(b0, b1); + out[2] = _mm_unpacklo_epi16(b2, b3); + out[3] = _mm_unpackhi_epi16(b2, b3); +} + +LIBGAV1_ALWAYS_INLINE __m128i Transpose4x4_U8(const __m128i* const in) { + // Unpack 8 bit elements. 
Goes from: + // in[0]: 00 01 02 03 + // in[1]: 10 11 12 13 + // in[2]: 20 21 22 23 + // in[3]: 30 31 32 33 + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]); + + // Unpack 32 bit elements resulting in: + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + return _mm_unpacklo_epi16(a0, a1); +} + +LIBGAV1_ALWAYS_INLINE void Transpose8x8To4x16_U8(const __m128i* const in, + __m128i* out) { + // Unpack 8 bit elements. Goes from: + // in[0]: 00 01 02 03 04 05 06 07 + // in[1]: 10 11 12 13 14 15 16 17 + // in[2]: 20 21 22 23 24 25 26 27 + // in[3]: 30 31 32 33 34 35 36 37 + // in[4]: 40 41 42 43 44 45 46 47 + // in[5]: 50 51 52 53 54 55 56 57 + // in[6]: 60 61 62 63 64 65 66 67 + // in[7]: 70 71 72 73 74 75 76 77 + // to: + // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]); + + // b0: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + // b1: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + // b2: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + // b3: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 + const __m128i b0 = _mm_unpacklo_epi16(a0, a1); + const __m128i b1 = _mm_unpacklo_epi16(a2, a3); + const __m128i b2 = _mm_unpackhi_epi16(a0, a1); + const __m128i b3 = _mm_unpackhi_epi16(a2, a3); + + // out[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + // out[1]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + // out[2]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + // out[3]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 + out[0] = _mm_unpacklo_epi32(b0, b1); + out[1] = _mm_unpackhi_epi32(b0, b1); + out[2] = _mm_unpacklo_epi32(b2, b3); + out[3] = _mm_unpackhi_epi32(b2, b3); +} + +LIBGAV1_ALWAYS_INLINE void Transpose4x4_U16(const __m128i* in, __m128i* out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 00 01 02 03 XX XX XX XX + // in[1]: 10 11 12 13 XX XX XX XX + // in[2]: 20 21 22 23 XX XX XX XX + // in[3]: 30 31 32 33 XX XX XX XX + // to: + // ba: 00 10 01 11 02 12 03 13 + // dc: 20 30 21 31 22 32 23 33 + const __m128i ba = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i dc = _mm_unpacklo_epi16(in[2], in[3]); + // Unpack 32 bit elements resulting in: + // dcba_lo: 00 10 20 30 01 11 21 31 + // dcba_hi: 02 12 22 32 03 13 23 33 + const __m128i dcba_lo = _mm_unpacklo_epi32(ba, dc); + const __m128i dcba_hi = _mm_unpackhi_epi32(ba, dc); + // Assign or shift right by 8 bytes resulting in: + // out[0]: 00 10 20 30 01 11 21 31 + // out[1]: 01 11 21 31 XX XX XX XX + // out[2]: 02 12 22 32 03 13 23 33 + // out[3]: 03 13 23 33 XX XX XX XX + out[0] = dcba_lo; + out[1] = _mm_srli_si128(dcba_lo, 8); + out[2] = dcba_hi; + out[3] = _mm_srli_si128(dcba_hi, 8); +} + +LIBGAV1_ALWAYS_INLINE void Transpose4x8To8x4_U16(const __m128i* in, + __m128i* out) { + // Unpack 16 bit elements. 
Goes from: + // in[0]: 00 01 02 03 XX XX XX XX + // in[1]: 10 11 12 13 XX XX XX XX + // in[2]: 20 21 22 23 XX XX XX XX + // in[3]: 30 31 32 33 XX XX XX XX + // in[4]: 40 41 42 43 XX XX XX XX + // in[5]: 50 51 52 53 XX XX XX XX + // in[6]: 60 61 62 63 XX XX XX XX + // in[7]: 70 71 72 73 XX XX XX XX + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + // a2: 40 50 41 51 42 52 43 53 + // a3: 60 70 61 71 62 72 63 73 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); + + // Unpack 32 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 + // b1: 40 50 60 70 41 51 61 71 + // b2: 02 12 22 32 03 13 23 33 + // b3: 42 52 62 72 43 53 63 73 + const __m128i b0 = _mm_unpacklo_epi32(a0, a1); + const __m128i b1 = _mm_unpacklo_epi32(a2, a3); + const __m128i b2 = _mm_unpackhi_epi32(a0, a1); + const __m128i b3 = _mm_unpackhi_epi32(a2, a3); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + out[0] = _mm_unpacklo_epi64(b0, b1); + out[1] = _mm_unpackhi_epi64(b0, b1); + out[2] = _mm_unpacklo_epi64(b2, b3); + out[3] = _mm_unpackhi_epi64(b2, b3); +} + +LIBGAV1_ALWAYS_INLINE void Transpose8x4To4x8_U16(const __m128i* in, + __m128i* out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 00 01 02 03 04 05 06 07 + // in[1]: 10 11 12 13 14 15 16 17 + // in[2]: 20 21 22 23 24 25 26 27 + // in[3]: 30 31 32 33 34 35 36 37 + + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + // a4: 04 14 05 15 06 16 07 17 + // a5: 24 34 25 35 26 36 27 37 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]); + + // Unpack 32 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 + // b2: 04 14 24 34 05 15 25 35 + // b4: 02 12 22 32 03 13 23 33 + // b6: 06 16 26 36 07 17 27 37 + const __m128i b0 = _mm_unpacklo_epi32(a0, a1); + const __m128i b2 = _mm_unpacklo_epi32(a4, a5); + const __m128i b4 = _mm_unpackhi_epi32(a0, a1); + const __m128i b6 = _mm_unpackhi_epi32(a4, a5); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 XX XX XX XX + // out[1]: 01 11 21 31 XX XX XX XX + // out[2]: 02 12 22 32 XX XX XX XX + // out[3]: 03 13 23 33 XX XX XX XX + // out[4]: 04 14 24 34 XX XX XX XX + // out[5]: 05 15 25 35 XX XX XX XX + // out[6]: 06 16 26 36 XX XX XX XX + // out[7]: 07 17 27 37 XX XX XX XX + const __m128i zeros = _mm_setzero_si128(); + out[0] = _mm_unpacklo_epi64(b0, zeros); + out[1] = _mm_unpackhi_epi64(b0, zeros); + out[2] = _mm_unpacklo_epi64(b4, zeros); + out[3] = _mm_unpackhi_epi64(b4, zeros); + out[4] = _mm_unpacklo_epi64(b2, zeros); + out[5] = _mm_unpackhi_epi64(b2, zeros); + out[6] = _mm_unpacklo_epi64(b6, zeros); + out[7] = _mm_unpackhi_epi64(b6, zeros); +} + +LIBGAV1_ALWAYS_INLINE void Transpose8x8_U16(const __m128i* const in, + __m128i* const out) { + // Unpack 16 bit elements. 
Goes from: + // in[0]: 00 01 02 03 04 05 06 07 + // in[1]: 10 11 12 13 14 15 16 17 + // in[2]: 20 21 22 23 24 25 26 27 + // in[3]: 30 31 32 33 34 35 36 37 + // in[4]: 40 41 42 43 44 45 46 47 + // in[5]: 50 51 52 53 54 55 56 57 + // in[6]: 60 61 62 63 64 65 66 67 + // in[7]: 70 71 72 73 74 75 76 77 + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + // a2: 40 50 41 51 42 52 43 53 + // a3: 60 70 61 71 62 72 63 73 + // a4: 04 14 05 15 06 16 07 17 + // a5: 24 34 25 35 26 36 27 37 + // a6: 44 54 45 55 46 56 47 57 + // a7: 64 74 65 75 66 76 67 77 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); + const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]); + const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]); + const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]); + + // Unpack 32 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 + // b1: 40 50 60 70 41 51 61 71 + // b2: 04 14 24 34 05 15 25 35 + // b3: 44 54 64 74 45 55 65 75 + // b4: 02 12 22 32 03 13 23 33 + // b5: 42 52 62 72 43 53 63 73 + // b6: 06 16 26 36 07 17 27 37 + // b7: 46 56 66 76 47 57 67 77 + const __m128i b0 = _mm_unpacklo_epi32(a0, a1); + const __m128i b1 = _mm_unpacklo_epi32(a2, a3); + const __m128i b2 = _mm_unpacklo_epi32(a4, a5); + const __m128i b3 = _mm_unpacklo_epi32(a6, a7); + const __m128i b4 = _mm_unpackhi_epi32(a0, a1); + const __m128i b5 = _mm_unpackhi_epi32(a2, a3); + const __m128i b6 = _mm_unpackhi_epi32(a4, a5); + const __m128i b7 = _mm_unpackhi_epi32(a6, a7); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + // out[4]: 04 14 24 34 44 54 64 74 + // out[5]: 05 15 25 35 45 55 65 75 + // out[6]: 06 16 26 36 46 56 66 76 + // out[7]: 07 17 27 37 47 57 67 77 + out[0] = _mm_unpacklo_epi64(b0, b1); + out[1] = _mm_unpackhi_epi64(b0, b1); + out[2] = _mm_unpacklo_epi64(b4, b5); + out[3] = _mm_unpackhi_epi64(b4, b5); + out[4] = _mm_unpacklo_epi64(b2, b3); + out[5] = _mm_unpackhi_epi64(b2, b3); + out[6] = _mm_unpacklo_epi64(b6, b7); + out[7] = _mm_unpackhi_epi64(b6, b7); +} + +} // namespace dsp +} // namespace libgav1 + +#endif // LIBGAV1_TARGETING_SSE4_1 +#endif // LIBGAV1_SRC_DSP_X86_TRANSPOSE_SSE4_H_ diff --git a/src/dsp/x86/warp_sse4.cc b/src/dsp/x86/warp_sse4.cc new file mode 100644 index 0000000..5830894 --- /dev/null +++ b/src/dsp/x86/warp_sse4.cc @@ -0,0 +1,535 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
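// Usage sketch for the transpose helpers declared above (a hypothetical
// harness, not part of these files): transpose an 8x8 block of uint16_t
// values in place. Transpose8x8_U16 reads all of its inputs before writing
// any output, so in == out is safe, as warp_sse4.cc below relies on.
//   alignas(16) uint16_t block[8][8];  // filled by the caller
//   __m128i v[8];
//   for (int i = 0; i < 8; ++i) v[i] = LoadAligned16(block[i]);
//   Transpose8x8_U16(v, v);  // v[c] now holds original column c.
//   for (int i = 0; i < 8; ++i) StoreAligned16(block[i], v[i]);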
+
+#include "src/dsp/warp.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <type_traits>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Number of extra bits of precision in warped filtering.
+constexpr int kWarpedDiffPrecisionBits = 10;
+
+// This assumes the two filters contain filter[x] and filter[x+2].
+inline __m128i AccumulateFilter(const __m128i sum, const __m128i filter_0,
+                                const __m128i filter_1,
+                                const __m128i& src_window) {
+  const __m128i filter_taps = _mm_unpacklo_epi8(filter_0, filter_1);
+  const __m128i src =
+      _mm_unpacklo_epi8(src_window, _mm_srli_si128(src_window, 2));
+  return _mm_add_epi16(sum, _mm_maddubs_epi16(src, filter_taps));
+}
+
+constexpr int kFirstPassOffset = 1 << 14;
+constexpr int kOffsetRemoval =
+    (kFirstPassOffset >> kInterRoundBitsHorizontal) * 128;
+
+// Applies the horizontal filter to one source row and stores the result in
+// |intermediate_result_row|. |intermediate_result_row| is a row in the 15x8
+// |intermediate_result| two-dimensional array.
+inline void HorizontalFilter(const int sx4, const int16_t alpha,
+                             const __m128i src_row,
+                             int16_t intermediate_result_row[8]) {
+  int sx = sx4 - MultiplyBy4(alpha);
+  __m128i filter[8];
+  for (__m128i& f : filter) {
+    const int offset = RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) +
+                       kWarpedPixelPrecisionShifts;
+    f = LoadLo8(kWarpedFilters8[offset]);
+    sx += alpha;
+  }
+  Transpose8x8To4x16_U8(filter, filter);
+  // |filter| now contains two filters per register.
+  // Staggered combinations allow us to take advantage of _mm_maddubs_epi16
+  // without overflowing the sign bit. The sign bit is hit only where two taps
+  // paired in a single madd add up to more than 128. This is only possible
+  // with two adjacent "inner" taps. Therefore, pairing odd with odd and even
+  // with even guarantees safety. |sum| is given a negative offset to allow
+  // for large intermediate values.
+  // k = 0, 2.
+  __m128i src_row_window = src_row;
+  __m128i sum = _mm_set1_epi16(-kFirstPassOffset);
+  sum = AccumulateFilter(sum, filter[0], filter[1], src_row_window);
+
+  // k = 1, 3.
+  src_row_window = _mm_srli_si128(src_row_window, 1);
+  sum = AccumulateFilter(sum, _mm_srli_si128(filter[0], 8),
+                         _mm_srli_si128(filter[1], 8), src_row_window);
+  // k = 4, 6.
+  src_row_window = _mm_srli_si128(src_row_window, 3);
+  sum = AccumulateFilter(sum, filter[2], filter[3], src_row_window);
+
+  // k = 5, 7.
+  src_row_window = _mm_srli_si128(src_row_window, 1);
+  sum = AccumulateFilter(sum, _mm_srli_si128(filter[2], 8),
+                         _mm_srli_si128(filter[3], 8), src_row_window);
+
+  sum = RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal);
+  StoreUnaligned16(intermediate_result_row, sum);
+}
+
+template <bool is_compound>
+inline void WriteVerticalFilter(const __m128i filter[8],
+                                const int16_t intermediate_result[15][8],
+                                int y, void* LIBGAV1_RESTRICT dst_row) {
+  constexpr int kRoundBitsVertical =
+      is_compound ?
kInterRoundBitsCompoundVertical : kInterRoundBitsVertical; + __m128i sum_low = _mm_set1_epi32(kOffsetRemoval); + __m128i sum_high = sum_low; + for (int k = 0; k < 8; k += 2) { + const __m128i filters_low = _mm_unpacklo_epi16(filter[k], filter[k + 1]); + const __m128i filters_high = _mm_unpackhi_epi16(filter[k], filter[k + 1]); + const __m128i intermediate_0 = LoadUnaligned16(intermediate_result[y + k]); + const __m128i intermediate_1 = + LoadUnaligned16(intermediate_result[y + k + 1]); + const __m128i intermediate_low = + _mm_unpacklo_epi16(intermediate_0, intermediate_1); + const __m128i intermediate_high = + _mm_unpackhi_epi16(intermediate_0, intermediate_1); + + const __m128i product_low = _mm_madd_epi16(filters_low, intermediate_low); + const __m128i product_high = + _mm_madd_epi16(filters_high, intermediate_high); + sum_low = _mm_add_epi32(sum_low, product_low); + sum_high = _mm_add_epi32(sum_high, product_high); + } + sum_low = RightShiftWithRounding_S32(sum_low, kRoundBitsVertical); + sum_high = RightShiftWithRounding_S32(sum_high, kRoundBitsVertical); + if (is_compound) { + const __m128i sum = _mm_packs_epi32(sum_low, sum_high); + StoreUnaligned16(static_cast(dst_row), sum); + } else { + const __m128i sum = _mm_packus_epi32(sum_low, sum_high); + StoreLo8(static_cast(dst_row), _mm_packus_epi16(sum, sum)); + } +} + +template +inline void WriteVerticalFilter(const __m128i filter[8], + const int16_t* LIBGAV1_RESTRICT + intermediate_result_column, + void* LIBGAV1_RESTRICT dst_row) { + constexpr int kRoundBitsVertical = + is_compound ? kInterRoundBitsCompoundVertical : kInterRoundBitsVertical; + __m128i sum_low = _mm_setzero_si128(); + __m128i sum_high = _mm_setzero_si128(); + for (int k = 0; k < 8; k += 2) { + const __m128i filters_low = _mm_unpacklo_epi16(filter[k], filter[k + 1]); + const __m128i filters_high = _mm_unpackhi_epi16(filter[k], filter[k + 1]); + // Equivalent to unpacking two vectors made by duplicating int16_t values. 
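// In scalar terms, packing (col[k + 1] << 16) | col[k] into every 32-bit
// lane lets the madd below compute, per lane (illustrative):
//   sum += filter[k][lane] * col[k] + filter[k + 1][lane] * col[k + 1];
// The OR-packing is safe here because the column values are non-negative,
// so no sign extension can corrupt the upper halfword.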
+ const __m128i intermediate = + _mm_set1_epi32((intermediate_result_column[k + 1] << 16) | + intermediate_result_column[k]); + const __m128i product_low = _mm_madd_epi16(filters_low, intermediate); + const __m128i product_high = _mm_madd_epi16(filters_high, intermediate); + sum_low = _mm_add_epi32(sum_low, product_low); + sum_high = _mm_add_epi32(sum_high, product_high); + } + sum_low = RightShiftWithRounding_S32(sum_low, kRoundBitsVertical); + sum_high = RightShiftWithRounding_S32(sum_high, kRoundBitsVertical); + if (is_compound) { + const __m128i sum = _mm_packs_epi32(sum_low, sum_high); + StoreUnaligned16(static_cast(dst_row), sum); + } else { + const __m128i sum = _mm_packus_epi32(sum_low, sum_high); + StoreLo8(static_cast(dst_row), _mm_packus_epi16(sum, sum)); + } +} + +template +inline void VerticalFilter(const int16_t source[15][8], int y4, int gamma, + int delta, DestType* LIBGAV1_RESTRICT dest_row, + ptrdiff_t dest_stride) { + int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); + for (int y = 0; y < 8; ++y) { + int sy = sy4 - MultiplyBy4(gamma); + __m128i filter[8]; + for (__m128i& f : filter) { + const int offset = RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) + + kWarpedPixelPrecisionShifts; + f = LoadUnaligned16(kWarpedFilters[offset]); + sy += gamma; + } + Transpose8x8_U16(filter, filter); + WriteVerticalFilter(filter, source, y, dest_row); + dest_row += dest_stride; + sy4 += delta; + } +} + +template +inline void VerticalFilter(const int16_t* LIBGAV1_RESTRICT source_cols, int y4, + int gamma, int delta, + DestType* LIBGAV1_RESTRICT dest_row, + ptrdiff_t dest_stride) { + int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); + for (int y = 0; y < 8; ++y) { + int sy = sy4 - MultiplyBy4(gamma); + __m128i filter[8]; + for (__m128i& f : filter) { + const int offset = RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) + + kWarpedPixelPrecisionShifts; + f = LoadUnaligned16(kWarpedFilters[offset]); + sy += gamma; + } + Transpose8x8_U16(filter, filter); + WriteVerticalFilter(filter, &source_cols[y], dest_row); + dest_row += dest_stride; + sy4 += delta; + } +} + +template +inline void WarpRegion1(const uint8_t* LIBGAV1_RESTRICT src, + ptrdiff_t source_stride, int source_width, + int source_height, int ix4, int iy4, + DestType* LIBGAV1_RESTRICT dst_row, + ptrdiff_t dest_stride) { + // Region 1 + // Points to the left or right border of the first row of |src|. + const uint8_t* first_row_border = + (ix4 + 7 <= 0) ? src : src + source_width - 1; + // In general, for y in [-7, 8), the row number iy4 + y is clipped: + // const int row = Clip3(iy4 + y, 0, source_height - 1); + // In two special cases, iy4 + y is clipped to either 0 or + // source_height - 1 for all y. In the rest of the cases, iy4 + y is + // bounded and we can avoid clipping iy4 + y by relying on a reference + // frame's boundary extension on the top and bottom. + // Region 1. + // Every sample used to calculate the prediction block has the same + // value. So the whole prediction block has the same value. + const int row = (iy4 + 7 <= 0) ? 
0 : source_height - 1; + const uint8_t row_border_pixel = first_row_border[row * source_stride]; + + if (is_compound) { + const __m128i sum = + _mm_set1_epi16(row_border_pixel << (kInterRoundBitsVertical - + kInterRoundBitsCompoundVertical)); + StoreUnaligned16(dst_row, sum); + } else { + memset(dst_row, row_border_pixel, 8); + } + const DestType* const first_dst_row = dst_row; + dst_row += dest_stride; + for (int y = 1; y < 8; ++y) { + memcpy(dst_row, first_dst_row, 8 * sizeof(*dst_row)); + dst_row += dest_stride; + } +} + +template +inline void WarpRegion2(const uint8_t* LIBGAV1_RESTRICT src, + ptrdiff_t source_stride, int source_width, int y4, + int ix4, int iy4, int gamma, int delta, + int16_t intermediate_result_column[15], + DestType* LIBGAV1_RESTRICT dst_row, + ptrdiff_t dest_stride) { + // Region 2. + // Points to the left or right border of the first row of |src|. + const uint8_t* first_row_border = + (ix4 + 7 <= 0) ? src : src + source_width - 1; + // In general, for y in [-7, 8), the row number iy4 + y is clipped: + // const int row = Clip3(iy4 + y, 0, source_height - 1); + // In two special cases, iy4 + y is clipped to either 0 or + // source_height - 1 for all y. In the rest of the cases, iy4 + y is + // bounded and we can avoid clipping iy4 + y by relying on a reference + // frame's boundary extension on the top and bottom. + + // Region 2. + // Horizontal filter. + // The input values in this region are generated by extending the border + // which makes them identical in the horizontal direction. This + // computation could be inlined in the vertical pass but most + // implementations will need a transpose of some sort. + // It is not necessary to use the offset values here because the + // horizontal pass is a simple shift and the vertical pass will always + // require using 32 bits. + for (int y = -7; y < 8; ++y) { + // We may over-read up to 13 pixels above the top source row, or up + // to 13 pixels below the bottom source row. This is proved in + // warp.cc. + const int row = iy4 + y; + int sum = first_row_border[row * source_stride]; + sum <<= (kFilterBits - kInterRoundBitsHorizontal); + intermediate_result_column[y + 7] = sum; + } + // Region 2 vertical filter. + VerticalFilter(intermediate_result_column, y4, gamma, + delta, dst_row, dest_stride); +} + +template +inline void WarpRegion3(const uint8_t* LIBGAV1_RESTRICT src, + ptrdiff_t source_stride, int source_height, int alpha, + int beta, int x4, int ix4, int iy4, + int16_t intermediate_result[15][8]) { + // Region 3 + // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0. + + // In general, for y in [-7, 8), the row number iy4 + y is clipped: + // const int row = Clip3(iy4 + y, 0, source_height - 1); + // In two special cases, iy4 + y is clipped to either 0 or + // source_height - 1 for all y. In the rest of the cases, iy4 + y is + // bounded and we can avoid clipping iy4 + y by relying on a reference + // frame's boundary extension on the top and bottom. + // Horizontal filter. + const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1; + const uint8_t* const src_row = src + row * source_stride; + // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also + // read but is ignored. + // + // NOTE: This may read up to 13 bytes before src_row[0] or up to 14 + // bytes after src_row[source_width - 1]. We assume the source frame + // has left and right borders of at least 13 bytes that extend the + // frame boundary pixels. 
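// (Worked example of the over-read bound: regions 3 and 4 require
// ix4 + 7 > 0, so the smallest possible ix4 is -6, and the 16-byte load
// then starts at src_row[ix4 - 7] = src_row[-13], exactly 13 bytes into
// the left border extension.)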
We also assume there is at least one extra + // padding byte after the right border of the last source row. + const __m128i src_row_v = LoadUnaligned16(&src_row[ix4 - 7]); + int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7; + for (int y = -7; y < 8; ++y) { + HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]); + sx4 += beta; + } +} + +template +inline void WarpRegion4(const uint8_t* LIBGAV1_RESTRICT src, + ptrdiff_t source_stride, int alpha, int beta, int x4, + int ix4, int iy4, int16_t intermediate_result[15][8]) { + // Region 4. + // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0. + + // In general, for y in [-7, 8), the row number iy4 + y is clipped: + // const int row = Clip3(iy4 + y, 0, source_height - 1); + // In two special cases, iy4 + y is clipped to either 0 or + // source_height - 1 for all y. In the rest of the cases, iy4 + y is + // bounded and we can avoid clipping iy4 + y by relying on a reference + // frame's boundary extension on the top and bottom. + // Horizontal filter. + int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7; + for (int y = -7; y < 8; ++y) { + // We may over-read up to 13 pixels above the top source row, or up + // to 13 pixels below the bottom source row. This is proved in + // warp.cc. + const int row = iy4 + y; + const uint8_t* const src_row = src + row * source_stride; + // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also + // read but is ignored. + // + // NOTE: This may read up to 13 bytes before src_row[0] or up to 14 + // bytes after src_row[source_width - 1]. We assume the source frame + // has left and right borders of at least 13 bytes that extend the + // frame boundary pixels. We also assume there is at least one extra + // padding byte after the right border of the last source row. + const __m128i src_row_v = LoadUnaligned16(&src_row[ix4 - 7]); + // Convert src_row_v to int8 (subtract 128). + HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]); + sx4 += beta; + } +} + +template +inline void HandleWarpBlock(const uint8_t* LIBGAV1_RESTRICT src, + ptrdiff_t source_stride, int source_width, + int source_height, + const int* LIBGAV1_RESTRICT warp_params, + int subsampling_x, int subsampling_y, int src_x, + int src_y, int16_t alpha, int16_t beta, + int16_t gamma, int16_t delta, + DestType* LIBGAV1_RESTRICT dst_row, + ptrdiff_t dest_stride) { + union { + // Intermediate_result is the output of the horizontal filtering and + // rounding. The range is within 13 (= bitdepth + kFilterBits + 1 - + // kInterRoundBitsHorizontal) bits (unsigned). We use the signed int16_t + // type so that we can start with a negative offset and restore it on the + // final filter sum. + int16_t intermediate_result[15][8]; // 15 rows, 8 columns. + // In the simple special cases where the samples in each row are all the + // same, store one sample per row in a column vector. + int16_t intermediate_result_column[15]; + }; + + const int dst_x = + src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0]; + const int dst_y = + src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1]; + const int x4 = dst_x >> subsampling_x; + const int y4 = dst_y >> subsampling_y; + const int ix4 = x4 >> kWarpedModelPrecisionBits; + const int iy4 = y4 >> kWarpedModelPrecisionBits; + // A prediction block may fall outside the frame's boundaries. If a + // prediction block is calculated using only samples outside the frame's + // boundary, the filtering can be simplified. 
We can divide the plane + // into several regions and handle them differently. + // + // | | + // 1 | 3 | 1 + // | | + // -------+-----------+------- + // |***********| + // 2 |*****4*****| 2 + // |***********| + // -------+-----------+------- + // | | + // 1 | 3 | 1 + // | | + // + // At the center, region 4 represents the frame and is the general case. + // + // In regions 1 and 2, the prediction block is outside the frame's + // boundary horizontally. Therefore the horizontal filtering can be + // simplified. Furthermore, in the region 1 (at the four corners), the + // prediction is outside the frame's boundary both horizontally and + // vertically, so we get a constant prediction block. + // + // In region 3, the prediction block is outside the frame's boundary + // vertically. Unfortunately because we apply the horizontal filters + // first, by the time we apply the vertical filters, they no longer see + // simple inputs. So the only simplification is that all the rows are + // the same, but we still need to apply all the horizontal and vertical + // filters. + + // Check for two simple special cases, where the horizontal filter can + // be significantly simplified. + // + // In general, for each row, the horizontal filter is calculated as + // follows: + // for (int x = -4; x < 4; ++x) { + // const int offset = ...; + // int sum = first_pass_offset; + // for (int k = 0; k < 8; ++k) { + // const int column = Clip3(ix4 + x + k - 3, 0, source_width - 1); + // sum += kWarpedFilters[offset][k] * src_row[column]; + // } + // ... + // } + // The column index before clipping, ix4 + x + k - 3, varies in the range + // ix4 - 7 <= ix4 + x + k - 3 <= ix4 + 7. If ix4 - 7 >= source_width - 1 + // or ix4 + 7 <= 0, then all the column indexes are clipped to the same + // border index (source_width - 1 or 0, respectively). Then for each x, + // the inner for loop of the horizontal filter is reduced to multiplying + // the border pixel by the sum of the filter coefficients. + if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) { + if ((iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0)) { + // Outside the frame in both directions. One repeated value. + WarpRegion1(src, source_stride, source_width, + source_height, ix4, iy4, dst_row, + dest_stride); + return; + } + // Outside the frame horizontally. Rows repeated. + WarpRegion2( + src, source_stride, source_width, y4, ix4, iy4, gamma, delta, + intermediate_result_column, dst_row, dest_stride); + return; + } + + if ((iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0)) { + // Outside the frame vertically. + WarpRegion3(src, source_stride, source_height, alpha, + beta, x4, ix4, iy4, intermediate_result); + } else { + // Inside the frame. + WarpRegion4(src, source_stride, alpha, beta, x4, ix4, + iy4, intermediate_result); + } + // Region 3 and 4 vertical filter. + VerticalFilter(intermediate_result, y4, gamma, delta, + dst_row, dest_stride); +} + +template +void Warp_SSE4_1(const void* LIBGAV1_RESTRICT source, ptrdiff_t source_stride, + int source_width, int source_height, + const int* LIBGAV1_RESTRICT warp_params, int subsampling_x, + int subsampling_y, int block_start_x, int block_start_y, + int block_width, int block_height, int16_t alpha, int16_t beta, + int16_t gamma, int16_t delta, void* LIBGAV1_RESTRICT dest, + ptrdiff_t dest_stride) { + const auto* const src = static_cast(source); + using DestType = + typename std::conditional::type; + auto* dst = static_cast(dest); + + // Warp process applies for each 8x8 block. 
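// As a concrete example of the traversal below, assume 4:2:0 chroma
// (subsampling_x = subsampling_y = 1) and block_start_x = block_start_y = 8:
// the first block is sampled at its center, src_x = (8 + 4) << 1 = 24 and
// src_y = 24 in luma coordinates, and each subsequent 8x8 block advances by
// 8 << 1 = 16 luma units.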
+ assert(block_width >= 8); + assert(block_height >= 8); + const int block_end_x = block_start_x + block_width; + const int block_end_y = block_start_y + block_height; + + const int start_x = block_start_x; + const int start_y = block_start_y; + int src_x = (start_x + 4) << subsampling_x; + int src_y = (start_y + 4) << subsampling_y; + const int end_x = (block_end_x + 4) << subsampling_x; + const int end_y = (block_end_y + 4) << subsampling_y; + do { + DestType* dst_row = dst; + src_x = (start_x + 4) << subsampling_x; + do { + HandleWarpBlock( + src, source_stride, source_width, source_height, warp_params, + subsampling_x, subsampling_y, src_x, src_y, alpha, beta, gamma, delta, + dst_row, dest_stride); + src_x += (8 << subsampling_x); + dst_row += 8; + } while (src_x < end_x); + dst += 8 * dest_stride; + src_y += (8 << subsampling_y); + } while (src_y < end_y); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + dsp->warp = Warp_SSE4_1; + dsp->warp_compound = Warp_SSE4_1; +} + +} // namespace +} // namespace low_bitdepth + +void WarpInit_SSE4_1() { low_bitdepth::Init8bpp(); } + +} // namespace dsp +} // namespace libgav1 +#else // !LIBGAV1_TARGETING_SSE4_1 + +namespace libgav1 { +namespace dsp { + +void WarpInit_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_SSE4_1 diff --git a/src/dsp/x86/warp_sse4.h b/src/dsp/x86/warp_sse4.h new file mode 100644 index 0000000..a2dc5ca --- /dev/null +++ b/src/dsp/x86/warp_sse4.h @@ -0,0 +1,44 @@ +/* + * Copyright 2020 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_WARP_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_WARP_SSE4_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::warp. This function is not thread-safe. +void WarpInit_SSE4_1(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_TARGETING_SSE4_1 + +#ifndef LIBGAV1_Dsp8bpp_Warp +#define LIBGAV1_Dsp8bpp_Warp LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_WarpCompound +#define LIBGAV1_Dsp8bpp_WarpCompound LIBGAV1_CPU_SSE4_1 +#endif + +#endif // LIBGAV1_TARGETING_SSE4_1 + +#endif // LIBGAV1_SRC_DSP_X86_WARP_SSE4_H_ diff --git a/src/dsp/x86/weight_mask_sse4.cc b/src/dsp/x86/weight_mask_sse4.cc new file mode 100644 index 0000000..69cb784 --- /dev/null +++ b/src/dsp/x86/weight_mask_sse4.cc @@ -0,0 +1,1007 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/x86/weight_mask_sse4.h" + +#include "src/utils/cpu.h" + +#if LIBGAV1_TARGETING_SSE4_1 + +#include + +#include +#include +#include + +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/dsp/x86/common_sse4.h" +#include "src/utils/common.h" + +namespace libgav1 { +namespace dsp { +namespace low_bitdepth { +namespace { + +constexpr int kRoundingBits8bpp = 4; + +template +inline void WeightMask16_SSE4(const int16_t* LIBGAV1_RESTRICT prediction_0, + const int16_t* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const __m128i pred_00 = LoadAligned16(prediction_0); + const __m128i pred_10 = LoadAligned16(prediction_1); + const __m128i difference_0 = RightShiftWithRounding_U16( + _mm_abs_epi16(_mm_sub_epi16(pred_00, pred_10)), kRoundingBits8bpp); + const __m128i scaled_difference_0 = _mm_srli_epi16(difference_0, 4); + + const __m128i pred_01 = LoadAligned16(prediction_0 + 8); + const __m128i pred_11 = LoadAligned16(prediction_1 + 8); + const __m128i difference_1 = RightShiftWithRounding_U16( + _mm_abs_epi16(_mm_sub_epi16(pred_01, pred_11)), kRoundingBits8bpp); + const __m128i scaled_difference_1 = _mm_srli_epi16(difference_1, 4); + + const __m128i difference_offset = _mm_set1_epi8(38); + const __m128i adjusted_difference = + _mm_adds_epu8(_mm_packus_epi16(scaled_difference_0, scaled_difference_1), + difference_offset); + const __m128i mask_ceiling = _mm_set1_epi8(64); + const __m128i mask_value = _mm_min_epi8(adjusted_difference, mask_ceiling); + if (mask_is_inverse) { + const __m128i inverted_mask_value = _mm_sub_epi8(mask_ceiling, mask_value); + if (is_store_16) { + StoreAligned16(mask, inverted_mask_value); + } else { + StoreLo8(mask, inverted_mask_value); + StoreHi8(mask + mask_stride, inverted_mask_value); + } + } else { + if (is_store_16) { + StoreAligned16(mask, mask_value); + } else { + StoreLo8(mask, mask_value); + StoreHi8(mask + mask_stride, mask_value); + } + } +} + +#define WEIGHT8_PAIR_WITHOUT_STRIDE \ + WeightMask16_SSE4(pred_0, pred_1, mask, mask_stride) + +#define WEIGHT8_PAIR_AND_STRIDE \ + WEIGHT8_PAIR_WITHOUT_STRIDE; \ + pred_0 += 8 << 1; \ + pred_1 += 8 << 1; \ + mask += mask_stride << 1 + +template +void WeightMask8x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + + WEIGHT8_PAIR_AND_STRIDE; + WEIGHT8_PAIR_AND_STRIDE; + WEIGHT8_PAIR_AND_STRIDE; + WEIGHT8_PAIR_WITHOUT_STRIDE; +} + +template +void WeightMask8x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y3 = 3; + do { + WEIGHT8_PAIR_AND_STRIDE; + WEIGHT8_PAIR_AND_STRIDE; + } while (--y3 != 0); + WEIGHT8_PAIR_AND_STRIDE; + WEIGHT8_PAIR_WITHOUT_STRIDE; +} + +template +void WeightMask8x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y5 = 5; + do { + WEIGHT8_PAIR_AND_STRIDE; + WEIGHT8_PAIR_AND_STRIDE; + 
WEIGHT8_PAIR_AND_STRIDE; + } while (--y5 != 0); + WEIGHT8_PAIR_WITHOUT_STRIDE; +} + +#define WEIGHT16_WITHOUT_STRIDE \ + WeightMask16_SSE4(pred_0, pred_1, mask, mask_stride) + +#define WEIGHT16_AND_STRIDE \ + WEIGHT16_WITHOUT_STRIDE; \ + pred_0 += 16; \ + pred_1 += 16; \ + mask += mask_stride + +template +void WeightMask16x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y = 7; + do { + WEIGHT16_AND_STRIDE; + } while (--y != 0); + WEIGHT16_WITHOUT_STRIDE; +} + +template +void WeightMask16x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y3 = 5; + do { + WEIGHT16_AND_STRIDE; + WEIGHT16_AND_STRIDE; + WEIGHT16_AND_STRIDE; + } while (--y3 != 0); + WEIGHT16_WITHOUT_STRIDE; +} + +template +void WeightMask16x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y5 = 6; + do { + WEIGHT16_AND_STRIDE; + WEIGHT16_AND_STRIDE; + WEIGHT16_AND_STRIDE; + WEIGHT16_AND_STRIDE; + WEIGHT16_AND_STRIDE; + } while (--y5 != 0); + WEIGHT16_AND_STRIDE; + WEIGHT16_WITHOUT_STRIDE; +} + +template +void WeightMask16x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y3 = 21; + do { + WEIGHT16_AND_STRIDE; + WEIGHT16_AND_STRIDE; + WEIGHT16_AND_STRIDE; + } while (--y3 != 0); + WEIGHT16_WITHOUT_STRIDE; +} + +#define WEIGHT32_WITHOUT_STRIDE \ + WeightMask16_SSE4(pred_0, pred_1, mask, mask_stride); \ + WeightMask16_SSE4(pred_0 + 16, pred_1 + 16, \ + mask + 16, mask_stride) + +#define WEIGHT32_AND_STRIDE \ + WEIGHT32_WITHOUT_STRIDE; \ + pred_0 += 32; \ + pred_1 += 32; \ + mask += mask_stride + +template +void WeightMask32x8_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + WEIGHT32_AND_STRIDE; + WEIGHT32_AND_STRIDE; + WEIGHT32_AND_STRIDE; + WEIGHT32_AND_STRIDE; + WEIGHT32_AND_STRIDE; + WEIGHT32_AND_STRIDE; + WEIGHT32_AND_STRIDE; + WEIGHT32_WITHOUT_STRIDE; +} + +template +void WeightMask32x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y3 = 5; + do { + WEIGHT32_AND_STRIDE; + WEIGHT32_AND_STRIDE; + WEIGHT32_AND_STRIDE; + } while (--y3 != 0); + WEIGHT32_WITHOUT_STRIDE; +} + +template +void WeightMask32x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y5 = 6; + do { + 
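// Row accounting for this 32x32 case: 6 iterations x 5 strided rows = 30,
// plus one strided row and one final row without a stride advance after
// the loop, covering all 32 mask rows.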
WEIGHT32_AND_STRIDE; + WEIGHT32_AND_STRIDE; + WEIGHT32_AND_STRIDE; + WEIGHT32_AND_STRIDE; + WEIGHT32_AND_STRIDE; + } while (--y5 != 0); + WEIGHT32_AND_STRIDE; + WEIGHT32_WITHOUT_STRIDE; +} + +template +void WeightMask32x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y3 = 21; + do { + WEIGHT32_AND_STRIDE; + WEIGHT32_AND_STRIDE; + WEIGHT32_AND_STRIDE; + } while (--y3 != 0); + WEIGHT32_WITHOUT_STRIDE; +} + +#define WEIGHT64_WITHOUT_STRIDE \ + WeightMask16_SSE4(pred_0, pred_1, mask, mask_stride); \ + WeightMask16_SSE4(pred_0 + 16, pred_1 + 16, \ + mask + 16, mask_stride); \ + WeightMask16_SSE4(pred_0 + 32, pred_1 + 32, \ + mask + 32, mask_stride); \ + WeightMask16_SSE4(pred_0 + 48, pred_1 + 48, \ + mask + 48, mask_stride) + +#define WEIGHT64_AND_STRIDE \ + WEIGHT64_WITHOUT_STRIDE; \ + pred_0 += 64; \ + pred_1 += 64; \ + mask += mask_stride + +template +void WeightMask64x16_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y3 = 0; + do { + WEIGHT64_AND_STRIDE; + WEIGHT64_AND_STRIDE; + WEIGHT64_AND_STRIDE; + } while (++y3 < 5); + WEIGHT64_WITHOUT_STRIDE; +} + +template +void WeightMask64x32_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y5 = 0; + do { + WEIGHT64_AND_STRIDE; + WEIGHT64_AND_STRIDE; + WEIGHT64_AND_STRIDE; + WEIGHT64_AND_STRIDE; + WEIGHT64_AND_STRIDE; + } while (++y5 < 6); + WEIGHT64_AND_STRIDE; + WEIGHT64_WITHOUT_STRIDE; +} + +template +void WeightMask64x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y3 = 0; + do { + WEIGHT64_AND_STRIDE; + WEIGHT64_AND_STRIDE; + WEIGHT64_AND_STRIDE; + } while (++y3 < 21); + WEIGHT64_WITHOUT_STRIDE; +} + +template +void WeightMask64x128_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y3 = 0; + do { + WEIGHT64_AND_STRIDE; + WEIGHT64_AND_STRIDE; + WEIGHT64_AND_STRIDE; + } while (++y3 < 42); + WEIGHT64_AND_STRIDE; + WEIGHT64_WITHOUT_STRIDE; +} + +template +void WeightMask128x64_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y3 = 0; + const ptrdiff_t adjusted_mask_stride = mask_stride - 64; + do { + WEIGHT64_WITHOUT_STRIDE; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + + WEIGHT64_WITHOUT_STRIDE; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE; + pred_0 += 64; + pred_1 += 64; + mask 
+= adjusted_mask_stride; + + WEIGHT64_WITHOUT_STRIDE; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + } while (++y3 < 21); + WEIGHT64_WITHOUT_STRIDE; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE; +} + +template +void WeightMask128x128_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y3 = 0; + const ptrdiff_t adjusted_mask_stride = mask_stride - 64; + do { + WEIGHT64_WITHOUT_STRIDE; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + + WEIGHT64_WITHOUT_STRIDE; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + + WEIGHT64_WITHOUT_STRIDE; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + } while (++y3 < 42); + WEIGHT64_WITHOUT_STRIDE; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + + WEIGHT64_WITHOUT_STRIDE; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE; +} + +#define INIT_WEIGHT_MASK_8BPP(width, height, w_index, h_index) \ + dsp->weight_mask[w_index][h_index][0] = \ + WeightMask##width##x##height##_SSE4<0>; \ + dsp->weight_mask[w_index][h_index][1] = WeightMask##width##x##height##_SSE4<1> +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + INIT_WEIGHT_MASK_8BPP(8, 8, 0, 0); + INIT_WEIGHT_MASK_8BPP(8, 16, 0, 1); + INIT_WEIGHT_MASK_8BPP(8, 32, 0, 2); + INIT_WEIGHT_MASK_8BPP(16, 8, 1, 0); + INIT_WEIGHT_MASK_8BPP(16, 16, 1, 1); + INIT_WEIGHT_MASK_8BPP(16, 32, 1, 2); + INIT_WEIGHT_MASK_8BPP(16, 64, 1, 3); + INIT_WEIGHT_MASK_8BPP(32, 8, 2, 0); + INIT_WEIGHT_MASK_8BPP(32, 16, 2, 1); + INIT_WEIGHT_MASK_8BPP(32, 32, 2, 2); + INIT_WEIGHT_MASK_8BPP(32, 64, 2, 3); + INIT_WEIGHT_MASK_8BPP(64, 16, 3, 1); + INIT_WEIGHT_MASK_8BPP(64, 32, 3, 2); + INIT_WEIGHT_MASK_8BPP(64, 64, 3, 3); + INIT_WEIGHT_MASK_8BPP(64, 128, 3, 4); + INIT_WEIGHT_MASK_8BPP(128, 64, 4, 3); + INIT_WEIGHT_MASK_8BPP(128, 128, 4, 4); +} + +} // namespace +} // namespace low_bitdepth + +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +constexpr int kRoundingBits10bpp = 6; +constexpr int kScaledDiffShift = 4; + +template +inline void WeightMask16_10bpp_SSE4( + const uint16_t* LIBGAV1_RESTRICT prediction_0, + const uint16_t* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, ptrdiff_t mask_stride) { + const __m128i diff_offset = _mm_set1_epi8(38); + const __m128i mask_ceiling = _mm_set1_epi8(64); + const __m128i zero = _mm_setzero_si128(); + + // Range of prediction: [3988, 61532]. 
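// Scalar form of the mask value computed below (illustrative; the constants
// 38 and 64 come from the AV1 weight-mask derivation):
//   const int d = std::abs(p0 - p1);
//   const int m = std::min(38 + (((d + 32) >> 6) >> 4), 64);
// The widening to 32 bits (_mm_cvtepu16_epi32 and _mm_unpackhi_epi16 with
// zero) is required because |p0 - p1| can exceed the int16_t range at
// 10 bpp, given the prediction range noted above.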
+ const __m128i pred_00 = LoadAligned16(prediction_0); + const __m128i pred_10 = LoadAligned16(prediction_1); + const __m128i pred_lo_00 = _mm_cvtepu16_epi32(pred_00); + const __m128i pred_lo_10 = _mm_cvtepu16_epi32(pred_10); + const __m128i diff_lo_0 = RightShiftWithRounding_U32( + _mm_abs_epi32(_mm_sub_epi32(pred_lo_00, pred_lo_10)), kRoundingBits10bpp); + + const __m128i pred_hi_00 = _mm_unpackhi_epi16(pred_00, zero); + const __m128i pred_hi_10 = _mm_unpackhi_epi16(pred_10, zero); + const __m128i diff_hi_0 = RightShiftWithRounding_U32( + _mm_abs_epi32(_mm_sub_epi32(pred_hi_00, pred_hi_10)), kRoundingBits10bpp); + + const __m128i diff_0 = _mm_packus_epi32(diff_lo_0, diff_hi_0); + const __m128i scaled_diff_0 = _mm_srli_epi16(diff_0, kScaledDiffShift); + + const __m128i pred_01 = LoadAligned16(prediction_0 + 8); + const __m128i pred_11 = LoadAligned16(prediction_1 + 8); + const __m128i pred_lo_01 = _mm_cvtepu16_epi32(pred_01); + const __m128i pred_lo_11 = _mm_cvtepu16_epi32(pred_11); + const __m128i diff_lo_1 = RightShiftWithRounding_U32( + _mm_abs_epi32(_mm_sub_epi32(pred_lo_01, pred_lo_11)), kRoundingBits10bpp); + + const __m128i pred_hi_01 = _mm_unpackhi_epi16(pred_01, zero); + const __m128i pred_hi_11 = _mm_unpackhi_epi16(pred_11, zero); + const __m128i diff_hi_1 = RightShiftWithRounding_U32( + _mm_abs_epi32(_mm_sub_epi32(pred_hi_01, pred_hi_11)), kRoundingBits10bpp); + + const __m128i diff_1 = _mm_packus_epi32(diff_lo_1, diff_hi_1); + const __m128i scaled_diff_1 = _mm_srli_epi16(diff_1, kScaledDiffShift); + + const __m128i adjusted_diff = _mm_adds_epu8( + _mm_packus_epi16(scaled_diff_0, scaled_diff_1), diff_offset); + const __m128i mask_value = _mm_min_epi8(adjusted_diff, mask_ceiling); + + if (mask_is_inverse) { + const __m128i inverted_mask_value = _mm_sub_epi8(mask_ceiling, mask_value); + if (is_store_16) { + StoreAligned16(mask, inverted_mask_value); + } else { + StoreLo8(mask, inverted_mask_value); + StoreHi8(mask + mask_stride, inverted_mask_value); + } + } else { + if (is_store_16) { + StoreAligned16(mask, mask_value); + } else { + StoreLo8(mask, mask_value); + StoreHi8(mask + mask_stride, mask_value); + } + } +} + +#define WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP \ + WeightMask16_10bpp_SSE4(pred_0, pred_1, mask, \ + mask_stride) + +#define WEIGHT8_PAIR_AND_STRIDE_10BPP \ + WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP; \ + pred_0 += 8 << 1; \ + pred_1 += 8 << 1; \ + mask += mask_stride << 1 + +template +void WeightMask8x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + + WEIGHT8_PAIR_AND_STRIDE_10BPP; + WEIGHT8_PAIR_AND_STRIDE_10BPP; + WEIGHT8_PAIR_AND_STRIDE_10BPP; + WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP; +} + +template +void WeightMask8x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y3 = 3; + do { + WEIGHT8_PAIR_AND_STRIDE_10BPP; + WEIGHT8_PAIR_AND_STRIDE_10BPP; + } while (--y3 != 0); + WEIGHT8_PAIR_AND_STRIDE_10BPP; + WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP; +} + +template +void WeightMask8x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = 
static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y5 = 5; + do { + WEIGHT8_PAIR_AND_STRIDE_10BPP; + WEIGHT8_PAIR_AND_STRIDE_10BPP; + WEIGHT8_PAIR_AND_STRIDE_10BPP; + } while (--y5 != 0); + WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP; +} + +#define WEIGHT16_WITHOUT_STRIDE_10BPP \ + WeightMask16_10bpp_SSE4(pred_0, pred_1, mask, \ + mask_stride) + +#define WEIGHT16_AND_STRIDE_10BPP \ + WEIGHT16_WITHOUT_STRIDE_10BPP; \ + pred_0 += 16; \ + pred_1 += 16; \ + mask += mask_stride + +template +void WeightMask16x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y = 7; + do { + WEIGHT16_AND_STRIDE_10BPP; + } while (--y != 0); + WEIGHT16_WITHOUT_STRIDE_10BPP; +} + +template +void WeightMask16x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y3 = 5; + do { + WEIGHT16_AND_STRIDE_10BPP; + WEIGHT16_AND_STRIDE_10BPP; + WEIGHT16_AND_STRIDE_10BPP; + } while (--y3 != 0); + WEIGHT16_WITHOUT_STRIDE_10BPP; +} + +template +void WeightMask16x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y5 = 6; + do { + WEIGHT16_AND_STRIDE_10BPP; + WEIGHT16_AND_STRIDE_10BPP; + WEIGHT16_AND_STRIDE_10BPP; + WEIGHT16_AND_STRIDE_10BPP; + WEIGHT16_AND_STRIDE_10BPP; + } while (--y5 != 0); + WEIGHT16_AND_STRIDE_10BPP; + WEIGHT16_WITHOUT_STRIDE_10BPP; +} + +template +void WeightMask16x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y3 = 21; + do { + WEIGHT16_AND_STRIDE_10BPP; + WEIGHT16_AND_STRIDE_10BPP; + WEIGHT16_AND_STRIDE_10BPP; + } while (--y3 != 0); + WEIGHT16_WITHOUT_STRIDE_10BPP; +} + +#define WEIGHT32_WITHOUT_STRIDE_10BPP \ + WeightMask16_10bpp_SSE4(pred_0, pred_1, mask, \ + mask_stride); \ + WeightMask16_10bpp_SSE4(pred_0 + 16, pred_1 + 16, \ + mask + 16, mask_stride) + +#define WEIGHT32_AND_STRIDE_10BPP \ + WEIGHT32_WITHOUT_STRIDE_10BPP; \ + pred_0 += 32; \ + pred_1 += 32; \ + mask += mask_stride + +template +void WeightMask32x8_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_WITHOUT_STRIDE_10BPP; +} + +template +void WeightMask32x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y3 = 5; + do { + 
WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + } while (--y3 != 0); + WEIGHT32_WITHOUT_STRIDE_10BPP; +} + +template +void WeightMask32x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y5 = 6; + do { + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + } while (--y5 != 0); + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_WITHOUT_STRIDE_10BPP; +} + +template +void WeightMask32x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y3 = 21; + do { + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + WEIGHT32_AND_STRIDE_10BPP; + } while (--y3 != 0); + WEIGHT32_WITHOUT_STRIDE_10BPP; +} + +#define WEIGHT64_WITHOUT_STRIDE_10BPP \ + WeightMask16_10bpp_SSE4(pred_0, pred_1, mask, \ + mask_stride); \ + WeightMask16_10bpp_SSE4(pred_0 + 16, pred_1 + 16, \ + mask + 16, mask_stride); \ + WeightMask16_10bpp_SSE4(pred_0 + 32, pred_1 + 32, \ + mask + 32, mask_stride); \ + WeightMask16_10bpp_SSE4(pred_0 + 48, pred_1 + 48, \ + mask + 48, mask_stride) + +#define WEIGHT64_AND_STRIDE_10BPP \ + WEIGHT64_WITHOUT_STRIDE_10BPP; \ + pred_0 += 64; \ + pred_1 += 64; \ + mask += mask_stride + +template +void WeightMask64x16_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y3 = 5; + do { + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_AND_STRIDE_10BPP; + } while (--y3 != 0); + WEIGHT64_WITHOUT_STRIDE_10BPP; +} + +template +void WeightMask64x32_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y5 = 6; + do { + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_AND_STRIDE_10BPP; + } while (--y5 != 0); + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_WITHOUT_STRIDE_10BPP; +} + +template +void WeightMask64x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y3 = 21; + do { + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_AND_STRIDE_10BPP; + } while (--y3 != 0); + WEIGHT64_WITHOUT_STRIDE_10BPP; +} + +template +void WeightMask64x128_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y3 = 42; + do { + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_AND_STRIDE_10BPP; + } while (--y3 != 0); + 
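// The drivers above are fully unrolled forms of one simple loop: each row of
// an NxH block is N / 16 calls into the 16-wide kernel at column offsets 0,
// 16, 32, 48, and every row but the last advances the prediction and mask
// pointers. The fixed counters (y3 = 21 for height 64, y5 = 6 plus two
// trailing rows for height 32, and so on) are just that loop unrolled by
// three or five. A hypothetical generic driver, with |row16| standing in for
// a call to the templated 16-wide kernel:
#include <cstddef>
#include <cstdint>

template <typename Row16>
void WeightMaskGeneric(const uint16_t* pred_0, const uint16_t* pred_1,
                       uint8_t* mask, ptrdiff_t mask_stride, int width,
                       int height, Row16 row16) {
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; x += 16) {
      row16(pred_0 + x, pred_1 + x, mask + x, mask_stride);
    }
    if (y < height - 1) {  // The final row leaves the pointers untouched.
      pred_0 += width;
      pred_1 += width;
      mask += mask_stride;
    }
  }
}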
WEIGHT64_AND_STRIDE_10BPP; + WEIGHT64_WITHOUT_STRIDE_10BPP; +} + +template +void WeightMask128x64_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y3 = 21; + const ptrdiff_t adjusted_mask_stride = mask_stride - 64; + do { + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + } while (--y3 != 0); + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE_10BPP; +} + +template +void WeightMask128x128_10bpp_SSE4(const void* LIBGAV1_RESTRICT prediction_0, + const void* LIBGAV1_RESTRICT prediction_1, + uint8_t* LIBGAV1_RESTRICT mask, + ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y3 = 42; + const ptrdiff_t adjusted_mask_stride = mask_stride - 64; + do { + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + } while (--y3 != 0); + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += adjusted_mask_stride; + + WEIGHT64_WITHOUT_STRIDE_10BPP; + pred_0 += 64; + pred_1 += 64; + mask += 64; + WEIGHT64_WITHOUT_STRIDE_10BPP; +} + +#define INIT_WEIGHT_MASK_10BPP(width, height, w_index, h_index) \ + dsp->weight_mask[w_index][h_index][0] = \ + WeightMask##width##x##height##_10bpp_SSE4<0>; \ + dsp->weight_mask[w_index][h_index][1] = \ + WeightMask##width##x##height##_10bpp_SSE4<1> +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + INIT_WEIGHT_MASK_10BPP(8, 8, 0, 0); + INIT_WEIGHT_MASK_10BPP(8, 16, 0, 1); + INIT_WEIGHT_MASK_10BPP(8, 32, 0, 2); + INIT_WEIGHT_MASK_10BPP(16, 8, 1, 0); + INIT_WEIGHT_MASK_10BPP(16, 16, 1, 1); + INIT_WEIGHT_MASK_10BPP(16, 32, 1, 2); + INIT_WEIGHT_MASK_10BPP(16, 64, 1, 3); + INIT_WEIGHT_MASK_10BPP(32, 8, 2, 0); + INIT_WEIGHT_MASK_10BPP(32, 16, 2, 1); + INIT_WEIGHT_MASK_10BPP(32, 32, 2, 2); + INIT_WEIGHT_MASK_10BPP(32, 64, 2, 3); + INIT_WEIGHT_MASK_10BPP(64, 16, 3, 1); + INIT_WEIGHT_MASK_10BPP(64, 32, 3, 2); + INIT_WEIGHT_MASK_10BPP(64, 64, 3, 3); + INIT_WEIGHT_MASK_10BPP(64, 128, 3, 4); + INIT_WEIGHT_MASK_10BPP(128, 64, 4, 3); + INIT_WEIGHT_MASK_10BPP(128, 128, 4, 4); +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void WeightMaskInit_SSE4_1() { + low_bitdepth::Init8bpp(); +#if 
LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_TARGETING_SSE4_1 + +namespace libgav1 { +namespace dsp { + +void WeightMaskInit_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_SSE4_1 diff --git a/src/dsp/x86/weight_mask_sse4.h b/src/dsp/x86/weight_mask_sse4.h new file mode 100644 index 0000000..e5d9d70 --- /dev/null +++ b/src/dsp/x86/weight_mask_sse4.h @@ -0,0 +1,171 @@ +/* + * Copyright 2020 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_ARM_WEIGHT_MASK_SSE4_H_ +#define LIBGAV1_SRC_DSP_ARM_WEIGHT_MASK_SSE4_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::weight_mask. This function is not thread-safe. +void WeightMaskInit_SSE4_1(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_TARGETING_SSE4_1 + +#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x8 +#define LIBGAV1_Dsp8bpp_WeightMask_8x8 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x16 +#define LIBGAV1_Dsp8bpp_WeightMask_8x16 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x32 +#define LIBGAV1_Dsp8bpp_WeightMask_8x32 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x8 +#define LIBGAV1_Dsp8bpp_WeightMask_16x8 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x16 +#define LIBGAV1_Dsp8bpp_WeightMask_16x16 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x32 +#define LIBGAV1_Dsp8bpp_WeightMask_16x32 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x64 +#define LIBGAV1_Dsp8bpp_WeightMask_16x64 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x8 +#define LIBGAV1_Dsp8bpp_WeightMask_32x8 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x16 +#define LIBGAV1_Dsp8bpp_WeightMask_32x16 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x32 +#define LIBGAV1_Dsp8bpp_WeightMask_32x32 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x64 +#define LIBGAV1_Dsp8bpp_WeightMask_32x64 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x16 +#define LIBGAV1_Dsp8bpp_WeightMask_64x16 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x32 +#define LIBGAV1_Dsp8bpp_WeightMask_64x32 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x64 +#define LIBGAV1_Dsp8bpp_WeightMask_64x64 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x128 +#define LIBGAV1_Dsp8bpp_WeightMask_64x128 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_WeightMask_128x64 +#define LIBGAV1_Dsp8bpp_WeightMask_128x64 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_WeightMask_128x128 +#define LIBGAV1_Dsp8bpp_WeightMask_128x128 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x8 +#define LIBGAV1_Dsp10bpp_WeightMask_8x8 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef 
LIBGAV1_Dsp10bpp_WeightMask_8x16 +#define LIBGAV1_Dsp10bpp_WeightMask_8x16 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x32 +#define LIBGAV1_Dsp10bpp_WeightMask_8x32 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x8 +#define LIBGAV1_Dsp10bpp_WeightMask_16x8 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x16 +#define LIBGAV1_Dsp10bpp_WeightMask_16x16 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x32 +#define LIBGAV1_Dsp10bpp_WeightMask_16x32 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x64 +#define LIBGAV1_Dsp10bpp_WeightMask_16x64 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x8 +#define LIBGAV1_Dsp10bpp_WeightMask_32x8 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x16 +#define LIBGAV1_Dsp10bpp_WeightMask_32x16 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x32 +#define LIBGAV1_Dsp10bpp_WeightMask_32x32 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x64 +#define LIBGAV1_Dsp10bpp_WeightMask_32x64 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x16 +#define LIBGAV1_Dsp10bpp_WeightMask_64x16 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x32 +#define LIBGAV1_Dsp10bpp_WeightMask_64x32 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x64 +#define LIBGAV1_Dsp10bpp_WeightMask_64x64 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x128 +#define LIBGAV1_Dsp10bpp_WeightMask_64x128 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_128x64 +#define LIBGAV1_Dsp10bpp_WeightMask_128x64 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WeightMask_128x128 +#define LIBGAV1_Dsp10bpp_WeightMask_128x128 LIBGAV1_CPU_SSE4_1 +#endif +#endif // LIBGAV1_TARGETING_SSE4_1 + +#endif // LIBGAV1_SRC_DSP_ARM_WEIGHT_MASK_SSE4_H_ diff --git a/src/film_grain.cc b/src/film_grain.cc new file mode 100644 index 0000000..5c64ff2 --- /dev/null +++ b/src/film_grain.cc @@ -0,0 +1,828 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/film_grain.h" + +#include +#include +#include +#include +#include +#include + +#include "src/dsp/common.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/dsp/film_grain_common.h" +#include "src/utils/array_2d.h" +#include "src/utils/blocking_counter.h" +#include "src/utils/common.h" +#include "src/utils/compiler_attributes.h" +#include "src/utils/constants.h" +#include "src/utils/logging.h" +#include "src/utils/threadpool.h" + +namespace libgav1 { + +namespace { + +// The kGaussianSequence array contains random samples from a Gaussian +// distribution with zero mean and standard deviation of about 512 clipped to +// the range of [-2048, 2047] (representable by a signed integer using 12 bits +// of precision) and rounded to the nearest multiple of 4. 
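// A worked check of the headroom requirement spelled out in the note that
// follows: with grain_scale_shift == 0, the grain shift for bitdepth 8 is
// 12 - 8 = 4, and RightShiftWithRounding adds 1 << 3 = 8 before shifting.
// 2036 is the largest multiple of 4 below 2040, and it is the largest value
// that still fits the int8_t grain type after rounding:
static_assert((2036 + 8) >> 4 == 127, "largest usable sequence value fits");
static_assert((2040 + 8) >> 4 == 128, "a value of 2040 would overflow int8_t");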
+// +// Note: It is important that every element in the kGaussianSequence array be +// less than 2040, so that RightShiftWithRounding(kGaussianSequence[i], 4) is +// less than 128 for bitdepth=8 (GrainType=int8_t). +constexpr int16_t kGaussianSequence[/*2048*/] = { + 56, 568, -180, 172, 124, -84, 172, -64, -900, 24, 820, + 224, 1248, 996, 272, -8, -916, -388, -732, -104, -188, 800, + 112, -652, -320, -376, 140, -252, 492, -168, 44, -788, 588, + -584, 500, -228, 12, 680, 272, -476, 972, -100, 652, 368, + 432, -196, -720, -192, 1000, -332, 652, -136, -552, -604, -4, + 192, -220, -136, 1000, -52, 372, -96, -624, 124, -24, 396, + 540, -12, -104, 640, 464, 244, -208, -84, 368, -528, -740, + 248, -968, -848, 608, 376, -60, -292, -40, -156, 252, -292, + 248, 224, -280, 400, -244, 244, -60, 76, -80, 212, 532, + 340, 128, -36, 824, -352, -60, -264, -96, -612, 416, -704, + 220, -204, 640, -160, 1220, -408, 900, 336, 20, -336, -96, + -792, 304, 48, -28, -1232, -1172, -448, 104, -292, -520, 244, + 60, -948, 0, -708, 268, 108, 356, -548, 488, -344, -136, + 488, -196, -224, 656, -236, -1128, 60, 4, 140, 276, -676, + -376, 168, -108, 464, 8, 564, 64, 240, 308, -300, -400, + -456, -136, 56, 120, -408, -116, 436, 504, -232, 328, 844, + -164, -84, 784, -168, 232, -224, 348, -376, 128, 568, 96, + -1244, -288, 276, 848, 832, -360, 656, 464, -384, -332, -356, + 728, -388, 160, -192, 468, 296, 224, 140, -776, -100, 280, + 4, 196, 44, -36, -648, 932, 16, 1428, 28, 528, 808, + 772, 20, 268, 88, -332, -284, 124, -384, -448, 208, -228, + -1044, -328, 660, 380, -148, -300, 588, 240, 540, 28, 136, + -88, -436, 256, 296, -1000, 1400, 0, -48, 1056, -136, 264, + -528, -1108, 632, -484, -592, -344, 796, 124, -668, -768, 388, + 1296, -232, -188, -200, -288, -4, 308, 100, -168, 256, -500, + 204, -508, 648, -136, 372, -272, -120, -1004, -552, -548, -384, + 548, -296, 428, -108, -8, -912, -324, -224, -88, -112, -220, + -100, 996, -796, 548, 360, -216, 180, 428, -200, -212, 148, + 96, 148, 284, 216, -412, -320, 120, -300, -384, -604, -572, + -332, -8, -180, -176, 696, 116, -88, 628, 76, 44, -516, + 240, -208, -40, 100, -592, 344, -308, -452, -228, 20, 916, + -1752, -136, -340, -804, 140, 40, 512, 340, 248, 184, -492, + 896, -156, 932, -628, 328, -688, -448, -616, -752, -100, 560, + -1020, 180, -800, -64, 76, 576, 1068, 396, 660, 552, -108, + -28, 320, -628, 312, -92, -92, -472, 268, 16, 560, 516, + -672, -52, 492, -100, 260, 384, 284, 292, 304, -148, 88, + -152, 1012, 1064, -228, 164, -376, -684, 592, -392, 156, 196, + -524, -64, -884, 160, -176, 636, 648, 404, -396, -436, 864, + 424, -728, 988, -604, 904, -592, 296, -224, 536, -176, -920, + 436, -48, 1176, -884, 416, -776, -824, -884, 524, -548, -564, + -68, -164, -96, 692, 364, -692, -1012, -68, 260, -480, 876, + -1116, 452, -332, -352, 892, -1088, 1220, -676, 12, -292, 244, + 496, 372, -32, 280, 200, 112, -440, -96, 24, -644, -184, + 56, -432, 224, -980, 272, -260, 144, -436, 420, 356, 364, + -528, 76, 172, -744, -368, 404, -752, -416, 684, -688, 72, + 540, 416, 92, 444, 480, -72, -1416, 164, -1172, -68, 24, + 424, 264, 1040, 128, -912, -524, -356, 64, 876, -12, 4, + -88, 532, 272, -524, 320, 276, -508, 940, 24, -400, -120, + 756, 60, 236, -412, 100, 376, -484, 400, -100, -740, -108, + -260, 328, -268, 224, -200, -416, 184, -604, -564, -20, 296, + 60, 892, -888, 60, 164, 68, -760, 216, -296, 904, -336, + -28, 404, -356, -568, -208, -1480, -512, 296, 328, -360, -164, + -1560, -776, 1156, -428, 164, -504, -112, 120, -216, -148, -264, + 308, 32, 64, -72, 72, 116, 176, -64, 
-272, 460, -536, + -784, -280, 348, 108, -752, -132, 524, -540, -776, 116, -296, + -1196, -288, -560, 1040, -472, 116, -848, -1116, 116, 636, 696, + 284, -176, 1016, 204, -864, -648, -248, 356, 972, -584, -204, + 264, 880, 528, -24, -184, 116, 448, -144, 828, 524, 212, + -212, 52, 12, 200, 268, -488, -404, -880, 824, -672, -40, + 908, -248, 500, 716, -576, 492, -576, 16, 720, -108, 384, + 124, 344, 280, 576, -500, 252, 104, -308, 196, -188, -8, + 1268, 296, 1032, -1196, 436, 316, 372, -432, -200, -660, 704, + -224, 596, -132, 268, 32, -452, 884, 104, -1008, 424, -1348, + -280, 4, -1168, 368, 476, 696, 300, -8, 24, 180, -592, + -196, 388, 304, 500, 724, -160, 244, -84, 272, -256, -420, + 320, 208, -144, -156, 156, 364, 452, 28, 540, 316, 220, + -644, -248, 464, 72, 360, 32, -388, 496, -680, -48, 208, + -116, -408, 60, -604, -392, 548, -840, 784, -460, 656, -544, + -388, -264, 908, -800, -628, -612, -568, 572, -220, 164, 288, + -16, -308, 308, -112, -636, -760, 280, -668, 432, 364, 240, + -196, 604, 340, 384, 196, 592, -44, -500, 432, -580, -132, + 636, -76, 392, 4, -412, 540, 508, 328, -356, -36, 16, + -220, -64, -248, -60, 24, -192, 368, 1040, 92, -24, -1044, + -32, 40, 104, 148, 192, -136, -520, 56, -816, -224, 732, + 392, 356, 212, -80, -424, -1008, -324, 588, -1496, 576, 460, + -816, -848, 56, -580, -92, -1372, -112, -496, 200, 364, 52, + -140, 48, -48, -60, 84, 72, 40, 132, -356, -268, -104, + -284, -404, 732, -520, 164, -304, -540, 120, 328, -76, -460, + 756, 388, 588, 236, -436, -72, -176, -404, -316, -148, 716, + -604, 404, -72, -88, -888, -68, 944, 88, -220, -344, 960, + 472, 460, -232, 704, 120, 832, -228, 692, -508, 132, -476, + 844, -748, -364, -44, 1116, -1104, -1056, 76, 428, 552, -692, + 60, 356, 96, -384, -188, -612, -576, 736, 508, 892, 352, + -1132, 504, -24, -352, 324, 332, -600, -312, 292, 508, -144, + -8, 484, 48, 284, -260, -240, 256, -100, -292, -204, -44, + 472, -204, 908, -188, -1000, -256, 92, 1164, -392, 564, 356, + 652, -28, -884, 256, 484, -192, 760, -176, 376, -524, -452, + -436, 860, -736, 212, 124, 504, -476, 468, 76, -472, 552, + -692, -944, -620, 740, -240, 400, 132, 20, 192, -196, 264, + -668, -1012, -60, 296, -316, -828, 76, -156, 284, -768, -448, + -832, 148, 248, 652, 616, 1236, 288, -328, -400, -124, 588, + 220, 520, -696, 1032, 768, -740, -92, -272, 296, 448, -464, + 412, -200, 392, 440, -200, 264, -152, -260, 320, 1032, 216, + 320, -8, -64, 156, -1016, 1084, 1172, 536, 484, -432, 132, + 372, -52, -256, 84, 116, -352, 48, 116, 304, -384, 412, + 924, -300, 528, 628, 180, 648, 44, -980, -220, 1320, 48, + 332, 748, 524, -268, -720, 540, -276, 564, -344, -208, -196, + 436, 896, 88, -392, 132, 80, -964, -288, 568, 56, -48, + -456, 888, 8, 552, -156, -292, 948, 288, 128, -716, -292, + 1192, -152, 876, 352, -600, -260, -812, -468, -28, -120, -32, + -44, 1284, 496, 192, 464, 312, -76, -516, -380, -456, -1012, + -48, 308, -156, 36, 492, -156, -808, 188, 1652, 68, -120, + -116, 316, 160, -140, 352, 808, -416, 592, 316, -480, 56, + 528, -204, -568, 372, -232, 752, -344, 744, -4, 324, -416, + -600, 768, 268, -248, -88, -132, -420, -432, 80, -288, 404, + -316, -1216, -588, 520, -108, 92, -320, 368, -480, -216, -92, + 1688, -300, 180, 1020, -176, 820, -68, -228, -260, 436, -904, + 20, 40, -508, 440, -736, 312, 332, 204, 760, -372, 728, + 96, -20, -632, -520, -560, 336, 1076, -64, -532, 776, 584, + 192, 396, -728, -520, 276, -188, 80, -52, -612, -252, -48, + 648, 212, -688, 228, -52, -260, 428, -412, -272, -404, 180, + 816, -796, 48, 152, 484, -88, -216, 988, 696, 
188, -528, + 648, -116, -180, 316, 476, 12, -564, 96, 476, -252, -364, + -376, -392, 556, -256, -576, 260, -352, 120, -16, -136, -260, + -492, 72, 556, 660, 580, 616, 772, 436, 424, -32, -324, + -1268, 416, -324, -80, 920, 160, 228, 724, 32, -516, 64, + 384, 68, -128, 136, 240, 248, -204, -68, 252, -932, -120, + -480, -628, -84, 192, 852, -404, -288, -132, 204, 100, 168, + -68, -196, -868, 460, 1080, 380, -80, 244, 0, 484, -888, + 64, 184, 352, 600, 460, 164, 604, -196, 320, -64, 588, + -184, 228, 12, 372, 48, -848, -344, 224, 208, -200, 484, + 128, -20, 272, -468, -840, 384, 256, -720, -520, -464, -580, + 112, -120, 644, -356, -208, -608, -528, 704, 560, -424, 392, + 828, 40, 84, 200, -152, 0, -144, 584, 280, -120, 80, + -556, -972, -196, -472, 724, 80, 168, -32, 88, 160, -688, + 0, 160, 356, 372, -776, 740, -128, 676, -248, -480, 4, + -364, 96, 544, 232, -1032, 956, 236, 356, 20, -40, 300, + 24, -676, -596, 132, 1120, -104, 532, -1096, 568, 648, 444, + 508, 380, 188, -376, -604, 1488, 424, 24, 756, -220, -192, + 716, 120, 920, 688, 168, 44, -460, 568, 284, 1144, 1160, + 600, 424, 888, 656, -356, -320, 220, 316, -176, -724, -188, + -816, -628, -348, -228, -380, 1012, -452, -660, 736, 928, 404, + -696, -72, -268, -892, 128, 184, -344, -780, 360, 336, 400, + 344, 428, 548, -112, 136, -228, -216, -820, -516, 340, 92, + -136, 116, -300, 376, -244, 100, -316, -520, -284, -12, 824, + 164, -548, -180, -128, 116, -924, -828, 268, -368, -580, 620, + 192, 160, 0, -1676, 1068, 424, -56, -360, 468, -156, 720, + 288, -528, 556, -364, 548, -148, 504, 316, 152, -648, -620, + -684, -24, -376, -384, -108, -920, -1032, 768, 180, -264, -508, + -1268, -260, -60, 300, -240, 988, 724, -376, -576, -212, -736, + 556, 192, 1092, -620, -880, 376, -56, -4, -216, -32, 836, + 268, 396, 1332, 864, -600, 100, 56, -412, -92, 356, 180, + 884, -468, -436, 292, -388, -804, -704, -840, 368, -348, 140, + -724, 1536, 940, 372, 112, -372, 436, -480, 1136, 296, -32, + -228, 132, -48, -220, 868, -1016, -60, -1044, -464, 328, 916, + 244, 12, -736, -296, 360, 468, -376, -108, -92, 788, 368, + -56, 544, 400, -672, -420, 728, 16, 320, 44, -284, -380, + -796, 488, 132, 204, -596, -372, 88, -152, -908, -636, -572, + -624, -116, -692, -200, -56, 276, -88, 484, -324, 948, 864, + 1000, -456, -184, -276, 292, -296, 156, 676, 320, 160, 908, + -84, -1236, -288, -116, 260, -372, -644, 732, -756, -96, 84, + 344, -520, 348, -688, 240, -84, 216, -1044, -136, -676, -396, + -1500, 960, -40, 176, 168, 1516, 420, -504, -344, -364, -360, + 1216, -940, -380, -212, 252, -660, -708, 484, -444, -152, 928, + -120, 1112, 476, -260, 560, -148, -344, 108, -196, 228, -288, + 504, 560, -328, -88, 288, -1008, 460, -228, 468, -836, -196, + 76, 388, 232, 412, -1168, -716, -644, 756, -172, -356, -504, + 116, 432, 528, 48, 476, -168, -608, 448, 160, -532, -272, + 28, -676, -12, 828, 980, 456, 520, 104, -104, 256, -344, + -4, -28, -368, -52, -524, -572, -556, -200, 768, 1124, -208, + -512, 176, 232, 248, -148, -888, 604, -600, -304, 804, -156, + -212, 488, -192, -804, -256, 368, -360, -916, -328, 228, -240, + -448, -472, 856, -556, -364, 572, -12, -156, -368, -340, 432, + 252, -752, -152, 288, 268, -580, -848, -592, 108, -76, 244, + 312, -716, 592, -80, 436, 360, 4, -248, 160, 516, 584, + 732, 44, -468, -280, -292, -156, -588, 28, 308, 912, 24, + 124, 156, 180, -252, 944, -924, -772, -520, -428, -624, 300, + -212, -1144, 32, -724, 800, -1128, -212, -1288, -848, 180, -416, + 440, 192, -576, -792, -76, -1080, 80, -532, -352, -132, 380, + -820, 148, 1112, 128, 164, 
456, 700, -924, 144, -668, -384, + 648, -832, 508, 552, -52, -100, -656, 208, -568, 748, -88, + 680, 232, 300, 192, -408, -1012, -152, -252, -268, 272, -876, + -664, -648, -332, -136, 16, 12, 1152, -28, 332, -536, 320, + -672, -460, -316, 532, -260, 228, -40, 1052, -816, 180, 88, + -496, -556, -672, -368, 428, 92, 356, 404, -408, 252, 196, + -176, -556, 792, 268, 32, 372, 40, 96, -332, 328, 120, + 372, -900, -40, 472, -264, -592, 952, 128, 656, 112, 664, + -232, 420, 4, -344, -464, 556, 244, -416, -32, 252, 0, + -412, 188, -696, 508, -476, 324, -1096, 656, -312, 560, 264, + -136, 304, 160, -64, -580, 248, 336, -720, 560, -348, -288, + -276, -196, -500, 852, -544, -236, -1128, -992, -776, 116, 56, + 52, 860, 884, 212, -12, 168, 1020, 512, -552, 924, -148, + 716, 188, 164, -340, -520, -184, 880, -152, -680, -208, -1156, + -300, -528, -472, 364, 100, -744, -1056, -32, 540, 280, 144, + -676, -32, -232, -280, -224, 96, 568, -76, 172, 148, 148, + 104, 32, -296, -32, 788, -80, 32, -16, 280, 288, 944, + 428, -484}; +static_assert(sizeof(kGaussianSequence) / sizeof(kGaussianSequence[0]) == 2048, + ""); + +// The number of rows in a contiguous group computed by a single worker thread +// before checking for the next available group. +constexpr int kFrameChunkHeight = 8; + +// |width| and |height| refer to the plane, not the frame, meaning any +// subsampling should be applied by the caller. +template +inline void CopyImagePlane(const uint8_t* source_plane, ptrdiff_t source_stride, + int width, int height, uint8_t* dest_plane, + ptrdiff_t dest_stride) { + // If it's the same buffer there's nothing to do. + if (source_plane == dest_plane) return; + + int y = 0; + do { + memcpy(dest_plane, source_plane, width * sizeof(Pixel)); + source_plane += source_stride; + dest_plane += dest_stride; + } while (++y < height); +} + +} // namespace + +template +FilmGrain::FilmGrain(const FilmGrainParams& params, + bool is_monochrome, + bool color_matrix_is_identity, int subsampling_x, + int subsampling_y, int width, int height, + ThreadPool* thread_pool) + : params_(params), + is_monochrome_(is_monochrome), + color_matrix_is_identity_(color_matrix_is_identity), + subsampling_x_(subsampling_x), + subsampling_y_(subsampling_y), + width_(width), + height_(height), + template_uv_width_((subsampling_x != 0) ? kMinChromaWidth + : kMaxChromaWidth), + template_uv_height_((subsampling_y != 0) ? kMinChromaHeight + : kMaxChromaHeight), + thread_pool_(thread_pool) {} + +template +bool FilmGrain::Init() { + // Section 7.18.3.3. Generate grain process. + const dsp::Dsp& dsp = *dsp::GetDspTable(bitdepth); + // If params_.num_y_points is 0, luma_grain_ will never be read, so we don't + // need to generate it. + const bool use_luma = params_.num_y_points > 0; + if (use_luma) { + GenerateLumaGrain(params_, luma_grain_); + // If params_.auto_regression_coeff_lag is 0, the filter is the identity + // filter and therefore can be skipped. + if (params_.auto_regression_coeff_lag > 0) { + dsp.film_grain + .luma_auto_regression[params_.auto_regression_coeff_lag - 1]( + params_, luma_grain_); + } + } else { + // Have AddressSanitizer warn if luma_grain_ is used. 
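// A scalar sketch of what the dsp.film_grain.luma_auto_regression call above
// dispatches to (the AV1 grain auto-regression, spec section 7.18.3.3): each
// sample past the 3-sample border is corrected by a filtered sum of its
// causal neighborhood, scanned row-major with the coefficients in bitstream
// order, which gives lag * (2 * lag + 1) + lag taps (24 for lag 3). It
// assumes kLumaWidth = 82 and kLumaHeight = 73, with |grain_min|/|grain_max|
// the bitdepth-dependent grain limits; a reference model, not the SIMD code.
#include <algorithm>
#include <cstdint>

void LumaAutoRegressionSketch(int lag, const int8_t* coeffs, int shift,
                              int grain_min, int grain_max,
                              int16_t grain[73][82]) {
  for (int y = 3; y < 73; ++y) {
    for (int x = 3; x < 82 - 3; ++x) {
      int sum = 0;
      int pos = 0;
      for (int dy = -lag; dy <= 0; ++dy) {
        for (int dx = -lag; dx <= lag; ++dx) {
          // Only samples the filter has already produced contribute.
          if (dy == 0 && dx == 0) break;
          sum += coeffs[pos++] * grain[y + dy][x + dx];
        }
      }
      const int filtered =
          grain[y][x] + ((sum + (1 << (shift - 1))) >> shift);
      grain[y][x] = static_cast<int16_t>(
          std::min(std::max(filtered, grain_min), grain_max));
    }
  }
}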
+ ASAN_POISON_MEMORY_REGION(luma_grain_, sizeof(luma_grain_)); + } + if (!is_monochrome_) { + GenerateChromaGrains(params_, template_uv_width_, template_uv_height_, + u_grain_, v_grain_); + if (params_.auto_regression_coeff_lag > 0 || use_luma) { + dsp.film_grain.chroma_auto_regression[static_cast( + use_luma)][params_.auto_regression_coeff_lag]( + params_, luma_grain_, subsampling_x_, subsampling_y_, u_grain_, + v_grain_); + } + } + + // Section 7.18.3.4. Scaling lookup initialization process. + + // Initialize scaling_lut_y_. If params_.num_y_points > 0, scaling_lut_y_ + // is used for the Y plane. If params_.chroma_scaling_from_luma is true, + // scaling_lut_u_ and scaling_lut_v_ are the same as scaling_lut_y_ and are + // set up as aliases. So we need to initialize scaling_lut_y_ under these + // two conditions. + // + // Note: Although it does not seem to make sense, there are test vectors + // with chroma_scaling_from_luma=true and params_.num_y_points=0. +#if LIBGAV1_MSAN + // Quiet film grain / md5 msan warnings. + memset(scaling_lut_y_, 0, sizeof(scaling_lut_y_)); +#endif + if (use_luma || params_.chroma_scaling_from_luma) { + dsp.film_grain.initialize_scaling_lut( + params_.num_y_points, params_.point_y_value, params_.point_y_scaling, + scaling_lut_y_, kScalingLutLength); + } else { + ASAN_POISON_MEMORY_REGION(scaling_lut_y_, sizeof(scaling_lut_y_)); + } + if (!is_monochrome_) { + if (params_.chroma_scaling_from_luma) { + scaling_lut_u_ = scaling_lut_y_; + scaling_lut_v_ = scaling_lut_y_; + } else if (params_.num_u_points > 0 || params_.num_v_points > 0) { + const size_t buffer_size = + kScalingLutLength * (static_cast(params_.num_u_points > 0) + + static_cast(params_.num_v_points > 0)); + scaling_lut_chroma_buffer_.reset(new (std::nothrow) int16_t[buffer_size]); + if (scaling_lut_chroma_buffer_ == nullptr) return false; + + int16_t* buffer = scaling_lut_chroma_buffer_.get(); +#if LIBGAV1_MSAN + // Quiet film grain / md5 msan warnings. + memset(buffer, 0, buffer_size * 2); +#endif + if (params_.num_u_points > 0) { + scaling_lut_u_ = buffer; + dsp.film_grain.initialize_scaling_lut( + params_.num_u_points, params_.point_u_value, + params_.point_u_scaling, scaling_lut_u_, kScalingLutLength); + buffer += kScalingLutLength; + } + if (params_.num_v_points > 0) { + scaling_lut_v_ = buffer; + dsp.film_grain.initialize_scaling_lut( + params_.num_v_points, params_.point_v_value, + params_.point_v_scaling, scaling_lut_v_, kScalingLutLength); + } + } + } + return true; +} + +template +void FilmGrain::GenerateLumaGrain(const FilmGrainParams& params, + GrainType* luma_grain) { + // If params.num_y_points is equal to 0, Section 7.18.3.3 specifies we set + // the luma_grain array to all zeros. But the Note at the end of Section + // 7.18.3.3 says luma_grain "will never be read in this case". So we don't + // call GenerateLumaGrain if params.num_y_points is equal to 0. 
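// A sketch of the piecewise-linear table construction that the
// initialize_scaling_lut calls above perform (AV1 scaling lookup
// initialization, spec section 7.18.3.4), written for the 256-entry 8-bit
// domain; the 10bpp path works on an expanded table, which is what
// kScalingLutLength accounts for. The slope is Q16 fixed point, as in the
// spec pseudocode.
#include <cstdint>

void InitScalingLutSketch(int num_points, const uint8_t point_value[],
                          const uint8_t point_scaling[],
                          int16_t scaling_lut[256]) {
  if (num_points == 0) return;
  // Flat fill below the first point, linear segments between points, then a
  // flat fill above the last point.
  for (int i = 0; i < point_value[0]; ++i) scaling_lut[i] = point_scaling[0];
  for (int i = 0; i < num_points - 1; ++i) {
    const int delta_x = point_value[i + 1] - point_value[i];
    const int delta_y = point_scaling[i + 1] - point_scaling[i];
    const int delta = delta_y * ((65536 + (delta_x >> 1)) / delta_x);
    for (int x = 0; x < delta_x; ++x) {
      const int v = point_scaling[i] + ((x * delta + 32768) >> 16);
      scaling_lut[point_value[i] + x] = static_cast<int16_t>(v);
    }
  }
  for (int i = point_value[num_points - 1]; i < 256; ++i) {
    scaling_lut[i] = point_scaling[num_points - 1];
  }
}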
+ assert(params.num_y_points > 0); + const int shift = kBitdepth12 - bitdepth + params.grain_scale_shift; + uint16_t seed = params.grain_seed; + GrainType* luma_grain_row = luma_grain; + for (int y = 0; y < kLumaHeight; ++y) { + for (int x = 0; x < kLumaWidth; ++x) { + luma_grain_row[x] = RightShiftWithRounding( + kGaussianSequence[GetFilmGrainRandomNumber(11, &seed)], shift); + } + luma_grain_row += kLumaWidth; + } +} + +template +void FilmGrain::GenerateChromaGrains(const FilmGrainParams& params, + int chroma_width, + int chroma_height, + GrainType* u_grain, + GrainType* v_grain) { + const int shift = kBitdepth12 - bitdepth + params.grain_scale_shift; + if (params.num_u_points == 0 && !params.chroma_scaling_from_luma) { + memset(u_grain, 0, chroma_height * chroma_width * sizeof(*u_grain)); + } else { + uint16_t seed = params.grain_seed ^ 0xb524; + GrainType* u_grain_row = u_grain; + assert(chroma_width > 0); + assert(chroma_height > 0); + int y = 0; + do { + int x = 0; + do { + u_grain_row[x] = RightShiftWithRounding( + kGaussianSequence[GetFilmGrainRandomNumber(11, &seed)], shift); + } while (++x < chroma_width); + + u_grain_row += chroma_width; + } while (++y < chroma_height); + } + if (params.num_v_points == 0 && !params.chroma_scaling_from_luma) { + memset(v_grain, 0, chroma_height * chroma_width * sizeof(*v_grain)); + } else { + GrainType* v_grain_row = v_grain; + uint16_t seed = params.grain_seed ^ 0x49d8; + int y = 0; + do { + int x = 0; + do { + v_grain_row[x] = RightShiftWithRounding( + kGaussianSequence[GetFilmGrainRandomNumber(11, &seed)], shift); + } while (++x < chroma_width); + + v_grain_row += chroma_width; + } while (++y < chroma_height); + } +} + +template +bool FilmGrain::AllocateNoiseStripes() { + const int half_height = DivideBy2(height_ + 1); + assert(half_height > 0); + // ceil(half_height / 16.0) + const int max_luma_num = DivideBy16(half_height + 15); + constexpr int kNoiseStripeHeight = 34; + size_t noise_buffer_size = kNoiseStripePadding; + if (params_.num_y_points > 0) { + noise_buffer_size += max_luma_num * kNoiseStripeHeight * width_; + } + if (!is_monochrome_) { + noise_buffer_size += 2 * max_luma_num * + (kNoiseStripeHeight >> subsampling_y_) * + SubsampledValue(width_, subsampling_x_); + } + noise_buffer_.reset(new (std::nothrow) GrainType[noise_buffer_size]); + if (noise_buffer_ == nullptr) return false; + GrainType* noise_buffer = noise_buffer_.get(); + if (params_.num_y_points > 0) { + noise_stripes_[kPlaneY].Reset(max_luma_num, kNoiseStripeHeight * width_, + noise_buffer); + noise_buffer += max_luma_num * kNoiseStripeHeight * width_; + } + if (!is_monochrome_) { + noise_stripes_[kPlaneU].Reset(max_luma_num, + (kNoiseStripeHeight >> subsampling_y_) * + SubsampledValue(width_, subsampling_x_), + noise_buffer); + noise_buffer += max_luma_num * (kNoiseStripeHeight >> subsampling_y_) * + SubsampledValue(width_, subsampling_x_); + noise_stripes_[kPlaneV].Reset(max_luma_num, + (kNoiseStripeHeight >> subsampling_y_) * + SubsampledValue(width_, subsampling_x_), + noise_buffer); + } + return true; +} + +template +bool FilmGrain::AllocateNoiseImage() { + // When LIBGAV1_MSAN is enabled, zero initialize to quiet optimized film grain + // msan warnings. 
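// The pseudo-random source used by GenerateLumaGrain and GenerateChromaGrains
// above is the AV1 16-bit LFSR (get_random_number, spec section 7.18.3.2); a
// sketch of it is below. Eleven bits give an index in [0, 2047], exactly the
// size of kGaussianSequence, and the chroma generators reuse the same
// register with the seed XORed by the plane-specific constants 0xb524 and
// 0x49d8 shown above.
#include <cstdint>

inline int FilmGrainRandomNumberSketch(int bits, uint16_t* seed) {
  uint16_t state = *seed;
  // Taps at bits 0, 1, 3 and 12; the new bit is shifted in at the top.
  const uint16_t bit =
      ((state >> 0) ^ (state >> 1) ^ (state >> 3) ^ (state >> 12)) & 1;
  state = static_cast<uint16_t>((state >> 1) | (bit << 15));
  *seed = state;
  // Return the top |bits| bits of the updated state.
  return (state >> (16 - bits)) & ((1 << bits) - 1);
}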
+ constexpr bool zero_initialize = LIBGAV1_MSAN == 1; + if (params_.num_y_points > 0 && + !noise_image_[kPlaneY].Reset(height_, width_ + kNoiseImagePadding, + zero_initialize)) { + return false; + } + if (!is_monochrome_) { + if (!noise_image_[kPlaneU].Reset( + (height_ + subsampling_y_) >> subsampling_y_, + ((width_ + subsampling_x_) >> subsampling_x_) + kNoiseImagePadding, + zero_initialize)) { + return false; + } + if (!noise_image_[kPlaneV].Reset( + (height_ + subsampling_y_) >> subsampling_y_, + ((width_ + subsampling_x_) >> subsampling_x_) + kNoiseImagePadding, + zero_initialize)) { + return false; + } + } + return true; +} + +// Uses |overlap_flag| to skip rows that are covered by the overlap computation. +template +void FilmGrain::ConstructNoiseImage( + const Array2DView* noise_stripes, int width, int height, + int subsampling_x, int subsampling_y, int stripe_start_offset, + Array2D* noise_image) { + const int plane_width = (width + subsampling_x) >> subsampling_x; + const int plane_height = (height + subsampling_y) >> subsampling_y; + const int stripe_height = 32 >> subsampling_y; + const int stripe_mask = stripe_height - 1; + int y = 0; + // |luma_num| = y >> (5 - |subsampling_y|). Hence |luma_num| == 0 for all y up + // to either 16 or 32. + const GrainType* first_noise_stripe = (*noise_stripes)[0]; + do { + memcpy((*noise_image)[y], first_noise_stripe + y * plane_width, + plane_width * sizeof(first_noise_stripe[0])); + } while (++y < std::min(stripe_height, plane_height)); + // End special iterations for luma_num == 0. + + int luma_num = 1; + for (; y < (plane_height & ~stripe_mask); ++luma_num, y += stripe_height) { + const GrainType* noise_stripe = (*noise_stripes)[luma_num]; + int i = stripe_start_offset; + do { + memcpy((*noise_image)[y + i], noise_stripe + i * plane_width, + plane_width * sizeof(noise_stripe[0])); + } while (++i < stripe_height); + } + + // If there is a partial stripe, copy any rows beyond the overlap rows. + const int remaining_height = plane_height - y; + if (remaining_height > stripe_start_offset) { + assert(luma_num < noise_stripes->rows()); + const GrainType* noise_stripe = (*noise_stripes)[luma_num]; + int i = stripe_start_offset; + do { + memcpy((*noise_image)[y + i], noise_stripe + i * plane_width, + plane_width * sizeof(noise_stripe[0])); + } while (++i < remaining_height); + } +} + +template +void FilmGrain::BlendNoiseChromaWorker( + const dsp::Dsp& dsp, const Plane* planes, int num_planes, + std::atomic* job_counter, int min_value, int max_chroma, + const uint8_t* source_plane_y, ptrdiff_t source_stride_y, + const uint8_t* source_plane_u, const uint8_t* source_plane_v, + ptrdiff_t source_stride_uv, uint8_t* dest_plane_u, uint8_t* dest_plane_v, + ptrdiff_t dest_stride_uv) { + assert(num_planes > 0); + const int full_jobs_per_plane = height_ / kFrameChunkHeight; + const int remainder_job_height = height_ & (kFrameChunkHeight - 1); + const int total_full_jobs = full_jobs_per_plane * num_planes; + // If the frame height is not a multiple of kFrameChunkHeight, one job with + // a smaller number of rows is necessary at the end of each plane. + const int total_jobs = + total_full_jobs + ((remainder_job_height == 0) ? 0 : num_planes); + int job_index; + // Each job corresponds to a slice of kFrameChunkHeight rows in the luma + // plane. dsp->blend_noise_chroma handles subsampling. + // This loop body handles a slice of one plane or the other, depending on + // which are active. 
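// The stripe geometry that ConstructNoiseImage walks above, in closed form:
// noise stripes cover 34 luma rows but are placed every 32 luma rows (17 and
// 16 rows for vertically subsampled chroma), so neighboring stripes share
// rows. An image row maps to its stripe as below; the first
// |stripe_start_offset| rows of every stripe after the first are produced by
// the separate overlap pass when overlap_flag is set.
inline int StripeIndexForRow(int y, int subsampling_y) {
  return y >> (5 - subsampling_y);  // 32-row luma / 16-row chroma stripes.
}
// e.g. luma rows 0..31 lie in stripe 0 and rows 32..63 in stripe 1; with
// overlap enabled, rows 32 and 33 blend the bottom of stripe 0 into stripe 1.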
That way, threads working on consecutive jobs will keep + // the same region of luma source in working memory. + while ((job_index = job_counter->fetch_add(1, std::memory_order_relaxed)) < + total_jobs) { + const Plane plane = planes[job_index % num_planes]; + const int slice_index = job_index / num_planes; + const int start_height = slice_index * kFrameChunkHeight; + const int job_height = std::min(height_ - start_height, kFrameChunkHeight); + + const auto* source_cursor_y = reinterpret_cast( + source_plane_y + start_height * source_stride_y); + const int16_t* scaling_lut_uv; + const uint8_t* source_plane_uv; + uint8_t* dest_plane_uv; + + if (plane == kPlaneU) { + scaling_lut_uv = scaling_lut_u_; + source_plane_uv = source_plane_u; + dest_plane_uv = dest_plane_u; + } else { + assert(plane == kPlaneV); + scaling_lut_uv = scaling_lut_v_; + source_plane_uv = source_plane_v; + dest_plane_uv = dest_plane_v; + } + const auto* source_cursor_uv = reinterpret_cast( + source_plane_uv + (start_height >> subsampling_y_) * source_stride_uv); + auto* dest_cursor_uv = reinterpret_cast( + dest_plane_uv + (start_height >> subsampling_y_) * dest_stride_uv); + dsp.film_grain.blend_noise_chroma[params_.chroma_scaling_from_luma]( + plane, params_, noise_image_, min_value, max_chroma, width_, job_height, + start_height, subsampling_x_, subsampling_y_, scaling_lut_uv, + source_cursor_y, source_stride_y, source_cursor_uv, source_stride_uv, + dest_cursor_uv, dest_stride_uv); + } +} + +template +void FilmGrain::BlendNoiseLumaWorker( + const dsp::Dsp& dsp, std::atomic* job_counter, int min_value, + int max_luma, const uint8_t* source_plane_y, ptrdiff_t source_stride_y, + uint8_t* dest_plane_y, ptrdiff_t dest_stride_y) { + const int total_full_jobs = height_ / kFrameChunkHeight; + const int remainder_job_height = height_ & (kFrameChunkHeight - 1); + const int total_jobs = + total_full_jobs + static_cast(remainder_job_height > 0); + int job_index; + // Each job is some number of rows in a plane. + while ((job_index = job_counter->fetch_add(1, std::memory_order_relaxed)) < + total_jobs) { + const int start_height = job_index * kFrameChunkHeight; + const int job_height = std::min(height_ - start_height, kFrameChunkHeight); + + const auto* source_cursor_y = reinterpret_cast( + source_plane_y + start_height * source_stride_y); + auto* dest_cursor_y = + reinterpret_cast(dest_plane_y + start_height * dest_stride_y); + dsp.film_grain.blend_noise_luma( + noise_image_, min_value, max_luma, params_.chroma_scaling, width_, + job_height, start_height, scaling_lut_y_, source_cursor_y, + source_stride_y, dest_cursor_y, dest_stride_y); + } +} + +template +bool FilmGrain::AddNoise( + const uint8_t* source_plane_y, ptrdiff_t source_stride_y, + const uint8_t* source_plane_u, const uint8_t* source_plane_v, + ptrdiff_t source_stride_uv, uint8_t* dest_plane_y, ptrdiff_t dest_stride_y, + uint8_t* dest_plane_u, uint8_t* dest_plane_v, ptrdiff_t dest_stride_uv) { + if (!Init()) { + LIBGAV1_DLOG(ERROR, "Init() failed."); + return false; + } + if (!AllocateNoiseStripes()) { + LIBGAV1_DLOG(ERROR, "AllocateNoiseStripes() failed."); + return false; + } + + const dsp::Dsp& dsp = *dsp::GetDspTable(bitdepth); + const bool use_luma = params_.num_y_points > 0; + + // Construct noise stripes. + if (use_luma) { + // The luma plane is never subsampled. 
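// A self-contained miniature of the work distribution used by the two
// Blend*Worker methods above: workers pull job indices from one shared
// atomic counter, and with two active planes consecutive indices alternate U
// and V inside the same kFrameChunkHeight slice, which keeps the shared luma
// rows hot in cache. std::thread stands in for libgav1's ThreadPool and
// BlockingCounter here.
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
  constexpr int kChunkHeight = 8;  // kFrameChunkHeight.
  constexpr int kNumPlanes = 2;    // U and V both active.
  constexpr int kTotalJobs = 8;    // Four 8-row slices times two planes.
  std::atomic<int> job_counter(0);
  auto do_jobs = [&job_counter]() {
    int job;
    while ((job = job_counter.fetch_add(1, std::memory_order_relaxed)) <
           kTotalJobs) {
      const char plane = "UV"[job % kNumPlanes];
      const int start_row = (job / kNumPlanes) * kChunkHeight;
      std::printf("plane %c, luma rows %d..%d\n", plane, start_row,
                  start_row + kChunkHeight - 1);
    }
  };
  std::vector<std::thread> workers;
  for (int i = 0; i < 3; ++i) workers.emplace_back(do_jobs);
  do_jobs();  // As in AddNoise, the calling thread participates too.
  for (auto& worker : workers) worker.join();
  return 0;
}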
+ dsp.film_grain + .construct_noise_stripes[static_cast(params_.overlap_flag)]( + luma_grain_, params_.grain_seed, width_, height_, + /*subsampling_x=*/0, /*subsampling_y=*/0, &noise_stripes_[kPlaneY]); + } + if (!is_monochrome_) { + dsp.film_grain + .construct_noise_stripes[static_cast(params_.overlap_flag)]( + u_grain_, params_.grain_seed, width_, height_, subsampling_x_, + subsampling_y_, &noise_stripes_[kPlaneU]); + dsp.film_grain + .construct_noise_stripes[static_cast(params_.overlap_flag)]( + v_grain_, params_.grain_seed, width_, height_, subsampling_x_, + subsampling_y_, &noise_stripes_[kPlaneV]); + } + + if (!AllocateNoiseImage()) { + LIBGAV1_DLOG(ERROR, "AllocateNoiseImage() failed."); + return false; + } + + // Construct noise image. + if (use_luma) { + ConstructNoiseImage( + &noise_stripes_[kPlaneY], width_, height_, /*subsampling_x=*/0, + /*subsampling_y=*/0, static_cast(params_.overlap_flag) << 1, + &noise_image_[kPlaneY]); + if (params_.overlap_flag) { + dsp.film_grain.construct_noise_image_overlap( + &noise_stripes_[kPlaneY], width_, height_, /*subsampling_x=*/0, + /*subsampling_y=*/0, &noise_image_[kPlaneY]); + } + } + if (!is_monochrome_) { + ConstructNoiseImage(&noise_stripes_[kPlaneU], width_, height_, + subsampling_x_, subsampling_y_, + static_cast(params_.overlap_flag) + << (1 - subsampling_y_), + &noise_image_[kPlaneU]); + ConstructNoiseImage(&noise_stripes_[kPlaneV], width_, height_, + subsampling_x_, subsampling_y_, + static_cast(params_.overlap_flag) + << (1 - subsampling_y_), + &noise_image_[kPlaneV]); + if (params_.overlap_flag) { + dsp.film_grain.construct_noise_image_overlap( + &noise_stripes_[kPlaneU], width_, height_, subsampling_x_, + subsampling_y_, &noise_image_[kPlaneU]); + dsp.film_grain.construct_noise_image_overlap( + &noise_stripes_[kPlaneV], width_, height_, subsampling_x_, + subsampling_y_, &noise_image_[kPlaneV]); + } + } + + // Blend noise image. + int min_value; + int max_luma; + int max_chroma; + if (params_.clip_to_restricted_range) { + min_value = 16 << (bitdepth - kBitdepth8); + max_luma = 235 << (bitdepth - kBitdepth8); + if (color_matrix_is_identity_) { + max_chroma = max_luma; + } else { + max_chroma = 240 << (bitdepth - kBitdepth8); + } + } else { + min_value = 0; + max_luma = (256 << (bitdepth - kBitdepth8)) - 1; + max_chroma = max_luma; + } + + // Handle all chroma planes first because luma source may be altered in place. + if (!is_monochrome_) { + // This is done in a strange way but Vector can't be passed by copy to the + // lambda capture that spawns the thread. + Plane planes_to_blend[2]; + int num_planes = 0; + if (params_.chroma_scaling_from_luma) { + // Both noise planes are computed from the luma scaling lookup table. + planes_to_blend[num_planes++] = kPlaneU; + planes_to_blend[num_planes++] = kPlaneV; + } else { + const int height_uv = SubsampledValue(height_, subsampling_y_); + const int width_uv = SubsampledValue(width_, subsampling_x_); + + // Noise is applied according to a lookup table defined by pieceiwse + // linear "points." If the lookup table is empty, that corresponds to + // outputting zero noise. 
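// Worked values for the clipping ranges computed a few lines above, at
// bitdepth 10 (so bitdepth - kBitdepth8 == 2):
static_assert((16 << 2) == 64, "restricted-range minimum");
static_assert((235 << 2) == 940, "restricted-range luma maximum");
static_assert((240 << 2) == 960, "restricted-range chroma maximum");
static_assert((256 << 2) - 1 == 1023, "full-range maximum");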
+ if (params_.num_u_points == 0) { + CopyImagePlane(source_plane_u, source_stride_uv, width_uv, + height_uv, dest_plane_u, dest_stride_uv); + } else { + planes_to_blend[num_planes++] = kPlaneU; + } + if (params_.num_v_points == 0) { + CopyImagePlane(source_plane_v, source_stride_uv, width_uv, + height_uv, dest_plane_v, dest_stride_uv); + } else { + planes_to_blend[num_planes++] = kPlaneV; + } + } + if (thread_pool_ != nullptr && num_planes > 0) { + const int num_workers = thread_pool_->num_threads(); + BlockingCounter pending_workers(num_workers); + std::atomic job_counter(0); + for (int i = 0; i < num_workers; ++i) { + thread_pool_->Schedule([this, dsp, &pending_workers, &planes_to_blend, + num_planes, &job_counter, min_value, max_chroma, + source_plane_y, source_stride_y, source_plane_u, + source_plane_v, source_stride_uv, dest_plane_u, + dest_plane_v, dest_stride_uv]() { + BlendNoiseChromaWorker(dsp, planes_to_blend, num_planes, &job_counter, + min_value, max_chroma, source_plane_y, + source_stride_y, source_plane_u, + source_plane_v, source_stride_uv, dest_plane_u, + dest_plane_v, dest_stride_uv); + pending_workers.Decrement(); + }); + } + BlendNoiseChromaWorker( + dsp, planes_to_blend, num_planes, &job_counter, min_value, max_chroma, + source_plane_y, source_stride_y, source_plane_u, source_plane_v, + source_stride_uv, dest_plane_u, dest_plane_v, dest_stride_uv); + + pending_workers.Wait(); + } else { + // Single threaded. + if (params_.num_u_points > 0 || params_.chroma_scaling_from_luma) { + dsp.film_grain.blend_noise_chroma[params_.chroma_scaling_from_luma]( + kPlaneU, params_, noise_image_, min_value, max_chroma, width_, + height_, /*start_height=*/0, subsampling_x_, subsampling_y_, + scaling_lut_u_, source_plane_y, source_stride_y, source_plane_u, + source_stride_uv, dest_plane_u, dest_stride_uv); + } + if (params_.num_v_points > 0 || params_.chroma_scaling_from_luma) { + dsp.film_grain.blend_noise_chroma[params_.chroma_scaling_from_luma]( + kPlaneV, params_, noise_image_, min_value, max_chroma, width_, + height_, /*start_height=*/0, subsampling_x_, subsampling_y_, + scaling_lut_v_, source_plane_y, source_stride_y, source_plane_v, + source_stride_uv, dest_plane_v, dest_stride_uv); + } + } + } + if (use_luma) { + if (thread_pool_ != nullptr) { + const int num_workers = thread_pool_->num_threads(); + BlockingCounter pending_workers(num_workers); + std::atomic job_counter(0); + for (int i = 0; i < num_workers; ++i) { + thread_pool_->Schedule( + [this, dsp, &pending_workers, &job_counter, min_value, max_luma, + source_plane_y, source_stride_y, dest_plane_y, dest_stride_y]() { + BlendNoiseLumaWorker(dsp, &job_counter, min_value, max_luma, + source_plane_y, source_stride_y, + dest_plane_y, dest_stride_y); + pending_workers.Decrement(); + }); + } + + BlendNoiseLumaWorker(dsp, &job_counter, min_value, max_luma, + source_plane_y, source_stride_y, dest_plane_y, + dest_stride_y); + pending_workers.Wait(); + } else { + dsp.film_grain.blend_noise_luma( + noise_image_, min_value, max_luma, params_.chroma_scaling, width_, + height_, /*start_height=*/0, scaling_lut_y_, source_plane_y, + source_stride_y, dest_plane_y, dest_stride_y); + } + } else { + CopyImagePlane(source_plane_y, source_stride_y, width_, height_, + dest_plane_y, dest_stride_y); + } + + return true; +} + +// Explicit instantiations. 
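// The complete AddNoise pipeline above, from a caller's point of view: one
// FilmGrain object per frame and a single call that runs grain generation,
// stripe stitching, and blending. A sketch only, assuming bitdepth 8 and
// 4:2:0 subsampling, with |params| plus the frame pointers and strides
// supplied by the caller; source and destination may alias for in-place
// grain application.
//
//   libgav1::FilmGrain<8> film_grain(params, /*is_monochrome=*/false,
//                                    /*color_matrix_is_identity=*/false,
//                                    /*subsampling_x=*/1, /*subsampling_y=*/1,
//                                    width, height, /*thread_pool=*/nullptr);
//   const bool ok =
//       film_grain.AddNoise(src_y, stride_y, src_u, src_v, stride_uv,
//                           dst_y, stride_y, dst_u, dst_v, stride_uv);
//
// Internally that is: Init(), AllocateNoiseStripes(),
// construct_noise_stripes, AllocateNoiseImage(), ConstructNoiseImage (plus
// the overlap pass), then the chroma and luma blend stages.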
+template class FilmGrain; +#if LIBGAV1_MAX_BITDEPTH >= 10 +template class FilmGrain; +#endif + +} // namespace libgav1 diff --git a/src/film_grain.h b/src/film_grain.h new file mode 100644 index 0000000..f2c1e93 --- /dev/null +++ b/src/film_grain.h @@ -0,0 +1,195 @@ +/* + * Copyright 2020 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_FILM_GRAIN_H_ +#define LIBGAV1_SRC_FILM_GRAIN_H_ + +#include +#include +#include +#include +#include + +#include "src/dsp/common.h" +#include "src/dsp/dsp.h" +#include "src/dsp/film_grain_common.h" +#include "src/utils/array_2d.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/threadpool.h" +#include "src/utils/types.h" +#include "src/utils/vector.h" + +namespace libgav1 { + +// Film grain synthesis function signature. Section 7.18.3. +// This function generates film grain noise and blends the noise with the +// decoded frame. +// |source_plane_y|, |source_plane_u|, and |source_plane_v| are the plane +// buffers of the decoded frame. They are blended with the film grain noise and +// written to |dest_plane_y|, |dest_plane_u|, and |dest_plane_v| as final +// output for display. |source_plane_p| and |dest_plane_p| (where p is y, u, or +// v) may point to the same buffer, in which case the film grain noise is added +// in place. +// |film_grain_params| are parameters read from frame header. +// |is_monochrome| is true indicates only Y plane needs to be processed. +// |color_matrix_is_identity| is true if the matrix_coefficients field in the +// sequence header's color config is is MC_IDENTITY. +// |width| is the upscaled width of the frame. +// |height| is the frame height. +// |subsampling_x| and |subsampling_y| are subsamplings for UV planes, not used +// if |is_monochrome| is true. +// Returns true on success, or false on failure (e.g., out of memory). +using FilmGrainSynthesisFunc = bool (*)( + const void* source_plane_y, ptrdiff_t source_stride_y, + const void* source_plane_u, ptrdiff_t source_stride_u, + const void* source_plane_v, ptrdiff_t source_stride_v, + const FilmGrainParams& film_grain_params, bool is_monochrome, + bool color_matrix_is_identity, int width, int height, int subsampling_x, + int subsampling_y, void* dest_plane_y, ptrdiff_t dest_stride_y, + void* dest_plane_u, ptrdiff_t dest_stride_u, void* dest_plane_v, + ptrdiff_t dest_stride_v); + +// Section 7.18.3.5. Add noise synthesis process. +template +class FilmGrain { + public: + using GrainType = + typename std::conditional::type; + + FilmGrain(const FilmGrainParams& params, bool is_monochrome, + bool color_matrix_is_identity, int subsampling_x, int subsampling_y, + int width, int height, ThreadPool* thread_pool); + + // Note: These static methods are declared public so that the unit tests can + // call them. 
+ + static void GenerateLumaGrain(const FilmGrainParams& params, + GrainType* luma_grain); + + // Generates white noise arrays u_grain and v_grain chroma_width samples wide + // and chroma_height samples high. + static void GenerateChromaGrains(const FilmGrainParams& params, + int chroma_width, int chroma_height, + GrainType* u_grain, GrainType* v_grain); + + // Copies rows from |noise_stripes| to |noise_image|, skipping rows that are + // subject to overlap. + static void ConstructNoiseImage(const Array2DView* noise_stripes, + int width, int height, int subsampling_x, + int subsampling_y, int stripe_start_offset, + Array2D* noise_image); + + // Combines the film grain with the image data. + bool AddNoise(const uint8_t* source_plane_y, ptrdiff_t source_stride_y, + const uint8_t* source_plane_u, const uint8_t* source_plane_v, + ptrdiff_t source_stride_uv, uint8_t* dest_plane_y, + ptrdiff_t dest_stride_y, uint8_t* dest_plane_u, + uint8_t* dest_plane_v, ptrdiff_t dest_stride_uv); + + private: + using Pixel = + typename std::conditional::type; + static constexpr int kScalingLutLength = + (kScalingLookupTableSize + kScalingLookupTablePadding) << (bitdepth - 8); + + bool Init(); + + // Allocates noise_stripes_. + bool AllocateNoiseStripes(); + + bool AllocateNoiseImage(); + + void BlendNoiseChromaWorker(const dsp::Dsp& dsp, const Plane* planes, + int num_planes, std::atomic* job_counter, + int min_value, int max_chroma, + const uint8_t* source_plane_y, + ptrdiff_t source_stride_y, + const uint8_t* source_plane_u, + const uint8_t* source_plane_v, + ptrdiff_t source_stride_uv, uint8_t* dest_plane_u, + uint8_t* dest_plane_v, ptrdiff_t dest_stride_uv); + + void BlendNoiseLumaWorker(const dsp::Dsp& dsp, std::atomic* job_counter, + int min_value, int max_luma, + const uint8_t* source_plane_y, + ptrdiff_t source_stride_y, uint8_t* dest_plane_y, + ptrdiff_t dest_stride_y); + + const FilmGrainParams& params_; + const bool is_monochrome_; + const bool color_matrix_is_identity_; + const int subsampling_x_; + const int subsampling_y_; + // Frame width and height. + const int width_; + const int height_; + // Section 7.18.3.3, Dimensions of the noise templates for chroma, which are + // known as CbGrain and CrGrain. + // These templates are used to construct the noise image for each plane by + // copying 32x32 blocks with pseudorandom offsets, into "noise stripes." + // The noise template known as LumaGrain array is an 82x73 block. + // The height and width of the templates for chroma become 44 and 38 under + // subsampling, respectively. + // For more details see: + // A. Norkin and N. Birkbeck, "Film Grain Synthesis for AV1 Video Codec," 2018 + // Data Compression Conference, Snowbird, UT, 2018, pp. 3-12. + const int template_uv_width_; + const int template_uv_height_; + // LumaGrain. The luma_grain array contains white noise generated for luma. + // The array size is fixed but subject to further optimization for SIMD. + GrainType luma_grain_[kLumaHeight * kLumaWidth]; + // CbGrain and CrGrain. The maximum size of the u_grain and v_grain arrays is + // kMaxChromaHeight * kMaxChromaWidth. The actual size is + // template_uv_height_ * template_uv_width_. + GrainType u_grain_[kMaxChromaHeight * kMaxChromaWidth]; + GrainType v_grain_[kMaxChromaHeight * kMaxChromaWidth]; + // Scaling lookup tables. 
+ int16_t scaling_lut_y_[kScalingLutLength]; + int16_t* scaling_lut_u_ = nullptr; + int16_t* scaling_lut_v_ = nullptr; + // If allocated, this buffer is 256 * 2 values long and scaling_lut_u_ and + // scaling_lut_v_ point into this buffer. Otherwise, scaling_lut_u_ and + // scaling_lut_v_ point to scaling_lut_y_. + std::unique_ptr scaling_lut_chroma_buffer_; + + // A two-dimensional array of noise data for each plane. Generated for each 32 + // luma sample high stripe of the image. The first dimension is called + // luma_num. The second dimension is the size of one noise stripe. + // + // Each row of the Array2DView noise_stripes_[plane] is a conceptually + // two-dimensional array of |GrainType|s. The two-dimensional array of + // |GrainType|s is flattened into a one-dimensional buffer in this + // implementation. + // + // noise_stripes_[kPlaneY][luma_num] is an array that has 34 rows and + // |width_| columns and contains noise for the luma component. + // + // noise_stripes_[kPlaneU][luma_num] or noise_stripes_[kPlaneV][luma_num] + // is an array that has (34 >> subsampling_y_) rows and + // SubsampledValue(width_, subsampling_x_) columns and contains noise for the + // chroma components. + Array2DView noise_stripes_[kMaxPlanes]; + // Owns the memory that the elements of noise_stripes_ point to. + std::unique_ptr noise_buffer_; + + Array2D noise_image_[kMaxPlanes]; + ThreadPool* const thread_pool_; +}; + +} // namespace libgav1 + +#endif // LIBGAV1_SRC_FILM_GRAIN_H_ diff --git a/src/film_grain_test.cc b/src/film_grain_test.cc new file mode 100644 index 0000000..bf37299 --- /dev/null +++ b/src/film_grain_test.cc @@ -0,0 +1,2360 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/film_grain.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "absl/strings/match.h" +#include "absl/strings/str_format.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/common.h" +#include "src/dsp/dsp.h" +#include "src/dsp/film_grain_common.h" +#include "src/film_grain.h" +#include "src/utils/array_2d.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/cpu.h" +#include "src/utils/memory.h" +#include "src/utils/threadpool.h" +#include "src/utils/types.h" +#include "tests/block_utils.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace dsp { +namespace film_grain { +namespace { + +constexpr int kNumSpeedTests = 50; +constexpr int kNumFilmGrainTestParams = 10; +constexpr size_t kLumaBlockSize = kLumaWidth * kLumaHeight; +constexpr size_t kChromaBlockSize = kMaxChromaWidth * kMaxChromaHeight; +// Dimensions for unit tests concerning applying grain to the whole frame. 
+constexpr size_t kNumTestStripes = 64; +constexpr int kNoiseStripeHeight = 34; +constexpr size_t kFrameWidth = 1921; +constexpr size_t kFrameHeight = (kNumTestStripes - 1) * 32 + 1; + +/* + The film grain parameters for 10 frames were generated with the following + command line: + aomenc --end-usage=q --cq-level=20 --cpu-used=8 -w 1920 -h 1080 \ + --denoise-noise-level=50 --ivf breaking_bad_21m23s_10frames.1920_1080.yuv \ + -o breaking_bad_21m23s_10frames.1920_1080.noise50.ivf +*/ +constexpr FilmGrainParams kFilmGrainParams[10] = { + {/*apply_grain=*/true, + /*update_grain=*/true, + /*chroma_scaling_from_luma=*/false, + /*overlap_flag=*/true, + /*clip_to_restricted_range=*/false, + /*num_y_points=*/7, + /*num_u_points=*/8, + /*num_v_points=*/8, + /*point_y_value=*/{0, 13, 27, 40, 54, 121, 255, 0, 0, 0, 0, 0, 0, 0}, + /*point_y_scaling=*/{71, 71, 91, 99, 98, 100, 100, 0, 0, 0, 0, 0, 0, 0}, + /*point_u_value=*/{0, 13, 27, 40, 54, 67, 94, 255, 0, 0}, + /*point_u_scaling=*/{37, 37, 43, 48, 48, 50, 51, 51, 0, 0}, + /*point_v_value=*/{0, 13, 27, 40, 54, 67, 107, 255, 0, 0}, + /*point_v_scaling=*/{48, 48, 43, 33, 32, 33, 34, 34, 0, 0}, + /*chroma_scaling=*/11, + /*auto_regression_coeff_lag=*/3, + /*auto_regression_coeff_y=*/{2, -2, -2, 10, 3, -2, 1, -4, + 5, -1, -25, -13, 3, -1, 0, 7, + -20, 103, 26, -2, 1, 14, -49, 117}, + /*auto_regression_coeff_u=*/{-2, 1, -3, 4, -4, 0, 3, 5, -5, + -17, 17, 0, -10, -5, -3, -30, 14, 70, + 29, 9, -2, -10, 50, 71, -11}, + /*auto_regression_coeff_v=*/{3, -2, -7, 6, -7, -8, 3, 1, -12, + -15, 28, 5, -11, -2, -7, -27, 32, 62, + 31, 18, -2, -6, 61, 43, 2}, + /*auto_regression_shift=*/8, + /*grain_seed=*/7391, + /*reference_index=*/0, + /*grain_scale_shift=*/0, + /*u_multiplier=*/0, + /*u_luma_multiplier=*/64, + /*u_offset=*/0, + /*v_multiplier=*/0, + /*v_luma_multiplier=*/64, + /*v_offset=*/0}, + {/*apply_grain=*/true, + /*update_grain=*/true, + /*chroma_scaling_from_luma=*/false, + /*overlap_flag=*/true, + /*clip_to_restricted_range=*/false, + /*num_y_points=*/8, + /*num_u_points=*/7, + /*num_v_points=*/8, + /*point_y_value=*/{0, 13, 27, 40, 54, 94, 134, 255, 0, 0, 0, 0, 0, 0}, + /*point_y_scaling=*/{72, 72, 91, 99, 97, 100, 102, 102, 0, 0, 0, 0, 0, 0}, + /*point_u_value=*/{0, 13, 40, 54, 67, 134, 255, 0, 0, 0}, + /*point_u_scaling=*/{38, 38, 50, 49, 51, 53, 53, 0, 0, 0}, + /*point_v_value=*/{0, 13, 27, 40, 54, 67, 121, 255, 0, 0}, + /*point_v_scaling=*/{50, 50, 45, 34, 33, 35, 37, 37, 0, 0}, + /*chroma_scaling=*/11, + /*auto_regression_coeff_lag=*/3, + /*auto_regression_coeff_y=*/{2, -2, -2, 10, 3, -1, 1, -3, + 3, 1, -27, -12, 2, -1, 1, 7, + -17, 100, 27, 0, -1, 13, -50, 116}, + /*auto_regression_coeff_u=*/{-3, 1, -2, 3, -3, -1, 2, 5, -3, + -16, 16, -2, -10, -2, -1, -31, 14, 70, + 29, 9, -1, -10, 47, 70, -11}, + /*auto_regression_coeff_v=*/{1, 0, -5, 5, -6, -6, 2, 1, -10, + -14, 26, 4, -10, -3, -5, -26, 29, 63, + 31, 17, -1, -6, 55, 47, 2}, + /*auto_regression_shift=*/8, + /*grain_seed=*/10772, + /*reference_index=*/0, + /*grain_scale_shift=*/0, + /*u_multiplier=*/0, + /*u_luma_multiplier=*/64, + /*u_offset=*/0, + /*v_multiplier=*/0, + /*v_luma_multiplier=*/64, + /*v_offset=*/0}, + {/*apply_grain=*/true, + /*update_grain=*/true, + /*chroma_scaling_from_luma=*/false, + /*overlap_flag=*/true, + /*clip_to_restricted_range=*/false, + /*num_y_points=*/8, + /*num_u_points=*/7, + /*num_v_points=*/8, + /*point_y_value=*/{0, 13, 27, 40, 54, 94, 134, 255, 0, 0, 0, 0, 0, 0}, + /*point_y_scaling=*/{71, 71, 91, 99, 98, 101, 103, 103, 0, 0, 0, 0, 0, 0}, + /*point_u_value=*/{0, 
13, 40, 54, 81, 107, 255, 0, 0, 0}, + /*point_u_scaling=*/{37, 37, 49, 48, 51, 52, 52, 0, 0, 0}, + /*point_v_value=*/{0, 13, 27, 40, 54, 67, 121, 255, 0, 0}, + /*point_v_scaling=*/{49, 49, 44, 34, 32, 34, 36, 36, 0, 0}, + /*chroma_scaling=*/11, + /*auto_regression_coeff_lag=*/3, + /*auto_regression_coeff_y=*/{1, -2, -2, 10, 3, -1, 1, -4, + 4, 1, -26, -12, 2, -1, 1, 7, + -18, 101, 26, -1, 0, 13, -49, 116}, + /*auto_regression_coeff_u=*/{-3, 1, -3, 4, -3, -1, 2, 5, -4, + -16, 17, -2, -10, -3, -2, -31, 15, 70, + 28, 9, -1, -10, 48, 70, -11}, + /*auto_regression_coeff_v=*/{1, -1, -6, 5, -6, -7, 2, 2, -11, + -14, 27, 5, -11, -3, -6, -26, 30, 62, + 30, 18, -2, -6, 58, 45, 2}, + /*auto_regression_shift=*/8, + /*grain_seed=*/14153, + /*reference_index=*/0, + /*grain_scale_shift=*/0, + /*u_multiplier=*/0, + /*u_luma_multiplier=*/64, + /*u_offset=*/0, + /*v_multiplier=*/0, + /*v_luma_multiplier=*/64, + /*v_offset=*/0}, + {/*apply_grain=*/true, + /*update_grain=*/true, + /*chroma_scaling_from_luma=*/false, + /*overlap_flag=*/true, + /*clip_to_restricted_range=*/false, + /*num_y_points=*/7, + /*num_u_points=*/5, + /*num_v_points=*/7, + /*point_y_value=*/{0, 13, 27, 40, 54, 121, 255, 0, 0, 0, 0, 0, 0, 0}, + /*point_y_scaling=*/{71, 71, 90, 99, 98, 100, 100, 0, 0, 0, 0, 0, 0, 0}, + /*point_u_value=*/{0, 13, 40, 107, 255, 0, 0, 0, 0, 0}, + /*point_u_scaling=*/{37, 37, 48, 51, 51, 0, 0, 0, 0, 0}, + /*point_v_value=*/{0, 13, 27, 40, 54, 94, 255, 0, 0, 0}, + /*point_v_scaling=*/{49, 49, 43, 33, 32, 34, 34, 0, 0, 0}, + /*chroma_scaling=*/11, + /*auto_regression_coeff_lag=*/3, + /*auto_regression_coeff_y=*/{2, -2, -2, 10, 3, -1, 1, -4, + 6, 0, -26, -13, 3, -1, 1, 6, + -20, 103, 26, -2, 1, 13, -48, 117}, + /*auto_regression_coeff_u=*/{-3, 1, -2, 4, -4, -1, 2, 5, -5, + -16, 18, -1, -10, -3, -2, -30, 16, 69, + 28, 9, -2, -10, 50, 68, -11}, + /*auto_regression_coeff_v=*/{2, -1, -6, 5, -6, -7, 2, 2, -11, + -15, 29, 4, -10, -3, -6, -26, 30, 62, + 31, 18, -3, -6, 59, 45, 3}, + /*auto_regression_shift=*/8, + /*grain_seed=*/17534, + /*reference_index=*/0, + /*grain_scale_shift=*/0, + /*u_multiplier=*/0, + /*u_luma_multiplier=*/64, + /*u_offset=*/0, + /*v_multiplier=*/0, + /*v_luma_multiplier=*/64, + /*v_offset=*/0}, + {/*apply_grain=*/true, + /*update_grain=*/true, + /*chroma_scaling_from_luma=*/false, + /*overlap_flag=*/true, + /*clip_to_restricted_range=*/false, + /*num_y_points=*/8, + /*num_u_points=*/7, + /*num_v_points=*/7, + /*point_y_value=*/{0, 13, 27, 40, 54, 94, 134, 255, 0, 0, 0, 0, 0, 0}, + /*point_y_scaling=*/{71, 71, 91, 99, 98, 101, 103, 103, 0, 0, 0, 0, 0, 0}, + /*point_u_value=*/{0, 13, 40, 54, 81, 107, 255, 0, 0, 0}, + /*point_u_scaling=*/{37, 37, 49, 49, 52, 53, 53, 0, 0, 0}, + /*point_v_value=*/{0, 13, 27, 40, 54, 94, 255, 0, 0, 0}, + /*point_v_scaling=*/{50, 50, 44, 34, 33, 36, 37, 0, 0, 0}, + /*chroma_scaling=*/11, + /*auto_regression_coeff_lag=*/3, + /*auto_regression_coeff_y=*/{2, -2, -2, 10, 3, -1, 1, -4, + 3, 1, -26, -12, 2, -1, 1, 7, + -17, 101, 26, 0, 0, 13, -50, 116}, + /*auto_regression_coeff_u=*/{-2, 1, -2, 3, -3, -1, 2, 5, -4, + -16, 16, -2, -10, -3, -1, -31, 14, 70, + 28, 9, -1, -10, 48, 70, -11}, + /*auto_regression_coeff_v=*/{1, 0, -5, 5, -6, -6, 2, 2, -10, + -14, 26, 4, -10, -3, -5, -26, 29, 63, + 30, 17, -1, -6, 56, 47, 3}, + /*auto_regression_shift=*/8, + /*grain_seed=*/20915, + /*reference_index=*/0, + /*grain_scale_shift=*/0, + /*u_multiplier=*/0, + /*u_luma_multiplier=*/64, + /*u_offset=*/0, + /*v_multiplier=*/0, + /*v_luma_multiplier=*/64, + /*v_offset=*/0}, + 
{/*apply_grain=*/true, + /*update_grain=*/true, + /*chroma_scaling_from_luma=*/false, + /*overlap_flag=*/true, + /*clip_to_restricted_range=*/false, + /*num_y_points=*/7, + /*num_u_points=*/7, + /*num_v_points=*/7, + /*point_y_value=*/{0, 13, 27, 40, 54, 134, 255, 0, 0, 0, 0, 0, 0, 0}, + /*point_y_scaling=*/{72, 72, 91, 99, 97, 101, 101, 0, 0, 0, 0, 0, 0, 0}, + /*point_u_value=*/{0, 13, 40, 54, 67, 107, 255, 0, 0, 0}, + /*point_u_scaling=*/{38, 38, 51, 50, 52, 53, 54, 0, 0, 0}, + /*point_v_value=*/{0, 13, 27, 40, 54, 94, 255, 0, 0, 0}, + /*point_v_scaling=*/{51, 51, 45, 35, 33, 36, 36, 0, 0, 0}, + /*chroma_scaling=*/11, + /*auto_regression_coeff_lag=*/3, + /*auto_regression_coeff_y=*/{2, -2, -2, 9, 3, -1, 1, -3, + 2, 2, -27, -12, 2, 0, 1, 7, + -16, 100, 27, 0, -1, 13, -51, 116}, + /*auto_regression_coeff_u=*/{-3, 1, -2, 3, -3, -1, 1, 4, -2, + -17, 14, -3, -10, -2, 0, -31, 14, 71, + 29, 8, -2, -10, 45, 71, -11}, + /*auto_regression_coeff_v=*/{0, -1, -5, 4, -6, -5, 2, 1, -9, + -14, 24, 3, -10, -3, -4, -25, 29, 63, + 31, 16, -1, -7, 54, 48, 2}, + /*auto_regression_shift=*/8, + /*grain_seed=*/24296, + /*reference_index=*/0, + /*grain_scale_shift=*/0, + /*u_multiplier=*/0, + /*u_luma_multiplier=*/64, + /*u_offset=*/0, + /*v_multiplier=*/0, + /*v_luma_multiplier=*/64, + /*v_offset=*/0}, + {/*apply_grain=*/true, + /*update_grain=*/true, + /*chroma_scaling_from_luma=*/false, + /*overlap_flag=*/true, + /*clip_to_restricted_range=*/false, + /*num_y_points=*/7, + /*num_u_points=*/7, + /*num_v_points=*/8, + /*point_y_value=*/{0, 13, 27, 40, 54, 134, 255, 0, 0, 0, 0, 0, 0, 0}, + /*point_y_scaling=*/{72, 72, 91, 99, 97, 101, 101, 0, 0, 0, 0, 0, 0, 0}, + /*point_u_value=*/{0, 13, 40, 54, 67, 134, 255, 0, 0, 0}, + /*point_u_scaling=*/{38, 38, 50, 50, 51, 53, 53, 0, 0, 0}, + /*point_v_value=*/{0, 13, 27, 40, 54, 67, 121, 255, 0, 0}, + /*point_v_scaling=*/{50, 50, 45, 34, 33, 35, 36, 36, 0, 0}, + /*chroma_scaling=*/11, + /*auto_regression_coeff_lag=*/3, + /*auto_regression_coeff_y=*/{2, -2, -2, 10, 3, -1, 1, -3, + 3, 2, -27, -12, 2, 0, 1, 7, + -17, 100, 27, 0, -1, 13, -51, 116}, + /*auto_regression_coeff_u=*/{-3, 1, -2, 3, -3, -1, 1, 5, -3, + -16, 15, -2, -10, -2, -1, -31, 14, 70, + 29, 8, -1, -10, 46, 71, -11}, + /*auto_regression_coeff_v=*/{1, 0, -5, 5, -6, -5, 2, 1, -9, + -14, 25, 4, -10, -3, -5, -25, 29, 63, + 31, 17, -1, -7, 55, 47, 2}, + /*auto_regression_shift=*/8, + /*grain_seed=*/27677, + /*reference_index=*/0, + /*grain_scale_shift=*/0, + /*u_multiplier=*/0, + /*u_luma_multiplier=*/64, + /*u_offset=*/0, + /*v_multiplier=*/0, + /*v_luma_multiplier=*/64, + /*v_offset=*/0}, + {/*apply_grain=*/true, + /*update_grain=*/true, + /*chroma_scaling_from_luma=*/false, + /*overlap_flag=*/true, + /*clip_to_restricted_range=*/false, + /*num_y_points=*/7, + /*num_u_points=*/7, + /*num_v_points=*/8, + /*point_y_value=*/{0, 13, 27, 40, 54, 121, 255, 0, 0, 0, 0, 0, 0, 0}, + /*point_y_scaling=*/{72, 72, 92, 99, 97, 101, 101, 0, 0, 0, 0, 0, 0, 0}, + /*point_u_value=*/{0, 13, 40, 54, 67, 174, 255, 0, 0, 0}, + /*point_u_scaling=*/{38, 38, 51, 50, 52, 54, 54, 0, 0, 0}, + /*point_v_value=*/{0, 13, 27, 40, 54, 67, 121, 255, 0, 0}, + /*point_v_scaling=*/{51, 51, 46, 35, 33, 35, 37, 37, 0, 0}, + /*chroma_scaling=*/11, + /*auto_regression_coeff_lag=*/3, + /*auto_regression_coeff_y=*/{1, -1, -2, 9, 3, -1, 1, -3, + 2, 2, -28, -12, 2, 0, 1, 8, + -16, 99, 27, 0, -1, 13, -51, 116}, + /*auto_regression_coeff_u=*/{-3, 1, -2, 3, -3, -1, 2, 4, -2, + -16, 14, -3, -10, -2, 0, -31, 13, 71, + 29, 8, -2, -11, 44, 72, -11}, + 
/*auto_regression_coeff_v=*/{0, -1, -5, 4, -6, -4, 2, 1, -9, + -13, 23, 3, -10, -3, -4, -25, 28, 63, + 32, 16, -1, -7, 54, 49, 2}, + /*auto_regression_shift=*/8, + /*grain_seed=*/31058, + /*reference_index=*/0, + /*grain_scale_shift=*/0, + /*u_multiplier=*/0, + /*u_luma_multiplier=*/64, + /*u_offset=*/0, + /*v_multiplier=*/0, + /*v_luma_multiplier=*/64, + /*v_offset=*/0}, + {/*apply_grain=*/true, + /*update_grain=*/true, + /*chroma_scaling_from_luma=*/false, + /*overlap_flag=*/true, + /*clip_to_restricted_range=*/false, + /*num_y_points=*/7, + /*num_u_points=*/7, + /*num_v_points=*/9, + /*point_y_value=*/{0, 13, 27, 40, 54, 121, 255, 0, 0, 0, 0, 0, 0, 0}, + /*point_y_scaling=*/{72, 72, 92, 99, 98, 100, 98, 0, 0, 0, 0, 0, 0, 0}, + /*point_u_value=*/{0, 13, 40, 54, 67, 228, 255, 0, 0, 0}, + /*point_u_scaling=*/{38, 38, 51, 51, 52, 54, 54, 0, 0, 0}, + /*point_v_value=*/{0, 13, 27, 40, 54, 67, 121, 201, 255, 0}, + /*point_v_scaling=*/{51, 51, 46, 35, 34, 35, 37, 37, 37, 0}, + /*chroma_scaling=*/11, + /*auto_regression_coeff_lag=*/3, + /*auto_regression_coeff_y=*/{1, -1, -2, 9, 3, -1, 1, -3, + 2, 2, -28, -12, 2, 0, 1, 8, + -16, 99, 27, 0, -1, 13, -52, 116}, + /*auto_regression_coeff_u=*/{-3, 1, -2, 3, -3, -1, 1, 4, -2, + -16, 13, -3, -10, -2, 0, -31, 13, 71, + 29, 8, -2, -11, 44, 72, -11}, + /*auto_regression_coeff_v=*/{0, -1, -5, 4, -6, -4, 2, 2, -8, + -13, 23, 3, -10, -3, -4, -25, 28, 63, + 32, 16, -1, -7, 54, 49, 2}, + /*auto_regression_shift=*/8, + /*grain_seed=*/34439, + /*reference_index=*/0, + /*grain_scale_shift=*/0, + /*u_multiplier=*/0, + /*u_luma_multiplier=*/64, + /*u_offset=*/0, + /*v_multiplier=*/0, + /*v_luma_multiplier=*/64, + /*v_offset=*/0}, + {/*apply_grain=*/true, + /*update_grain=*/true, + /*chroma_scaling_from_luma=*/false, + /*overlap_flag=*/true, + /*clip_to_restricted_range=*/false, + /*num_y_points=*/7, + /*num_u_points=*/7, + /*num_v_points=*/9, + /*point_y_value=*/{0, 13, 27, 40, 54, 121, 255, 0, 0, 0, 0, 0, 0, 0}, + /*point_y_scaling=*/{72, 72, 92, 99, 98, 99, 95, 0, 0, 0, 0, 0, 0, 0}, + /*point_u_value=*/{0, 13, 40, 54, 67, 228, 255, 0, 0, 0}, + /*point_u_scaling=*/{39, 39, 51, 51, 52, 54, 54, 0, 0, 0}, + /*point_v_value=*/{0, 13, 27, 40, 54, 67, 121, 201, 255, 0}, + /*point_v_scaling=*/{51, 51, 46, 35, 34, 35, 36, 35, 35, 0}, + /*chroma_scaling=*/11, + /*auto_regression_coeff_lag=*/3, + /*auto_regression_coeff_y=*/{1, -1, -2, 9, 3, -1, 1, -3, + 2, 2, -28, -11, 2, 0, 1, 8, + -16, 99, 27, 0, -1, 13, -52, 116}, + /*auto_regression_coeff_u=*/{-3, 1, -2, 3, -3, -1, 1, 4, -2, + -16, 13, -3, -10, -2, 0, -30, 13, 71, + 29, 8, -2, -10, 43, 72, -11}, + /*auto_regression_coeff_v=*/{0, -1, -5, 3, -6, -4, 2, 2, -8, + -13, 23, 3, -10, -3, -4, -25, 28, 64, + 32, 16, -1, -7, 53, 49, 2}, + /*auto_regression_shift=*/8, + /*grain_seed=*/37820, + /*reference_index=*/0, + /*grain_scale_shift=*/0, + /*u_multiplier=*/0, + /*u_luma_multiplier=*/64, + /*u_offset=*/0, + /*v_multiplier=*/0, + /*v_luma_multiplier=*/64, + /*v_offset=*/0}}; + +const char* GetTestDigestLuma(int bitdepth, int param_index) { + static const char* const kTestDigestsLuma8bpp[10] = { + "80da8e849110a10c0a73f9dec0d9a2fb", "54352f02aeda541e17a4c2d208897e2b", + "2ad9021124c82aca3e7c9517d00d1236", "f6c5f64513925b09ceba31e92511f8a1", + "46c6006578c68c3c8619f7a389c7de45", "fcddbd27545254dc50f1c333c8b7e313", + "c6d4dc181bf7f2f93ae099b836685151", "2949ef836748271195914fef9acf4e46", + "524e79bb87ed550e123d00a61df94381", "182222470d7b7a80017521d0261e4474", + }; + static const char* const kTestDigestsLuma10bpp[10] = { + 
"27a49a2131fb6d4dd4b8c34da1b7642e", "4ea9134f6831dd398545c85b2a68e31f", + "4e12232a18a2b06e958d7ab6b953faad", "0ede12864ddaced2d8062ffa4225ce24", + "5fee492c4a430b2417a64aa4920b69e9", "39af842a3f9370d796e8ef047c0c42a8", + "0efbad5f9dc07391ad243232b8df1787", "2bd41882cd82960019aa2b87d5fb1fbc", + "1c66629c0c4e7b6f9b0a7a6944fbad50", "2c633a50ead62f8e844a409545f46244", + }; + + if (bitdepth == 8) { + return kTestDigestsLuma8bpp[param_index]; + } + return kTestDigestsLuma10bpp[param_index]; +} + +const char* GetTestDigestChromaU(int bitdepth, int param_index) { + static const char* const kTestDigestsChromaU8bpp[10] = { + "e56b7bbe9f39bf987770b18aeca59514", "d0b3fd3cf2901dae31b73f20c510d83e", + "800c01d58d9fb72136d21ec2bb07899a", "4cd0badba679e8edbcd60a931fce49a1", + "cabec236cc17f91f3f08d8cde867aa72", "380a2205cf2d40c6a27152585f61a3b0", + "3813526234dc7f90f80f6684772c729a", "97a43a73066d88f9cbd915d56fc9c196", + "5b70b27a43dd63b03e23aecd3a935071", "d5cc98685582ffd47a41a97d2e377ac8", + }; + static const char* const kTestDigestsChromaU10bpp[10] = { + "9a6d0369ba86317598e65913276dae6d", "2512bdc4c88f21f8185b040b7752d1db", + "1e86b779ce6555fcf5bd0ade2af67e73", "5ad463a354ffce522c52b616fb122024", + "290d53c22c2143b0882acb887da3fdf1", "54622407d865371d7e70bbf29fdda626", + "be306c6a94c55dbd9ef514f0ad4a0011", "904602329b0dec352b3b177b0a2554d2", + "58afc9497d968c67fdf2c0cf23b33aa3", "74fee7be6f62724bf901fdd04a733b46", + }; + if (bitdepth == 8) { + return kTestDigestsChromaU8bpp[param_index]; + } + return kTestDigestsChromaU10bpp[param_index]; +} + +const char* GetTestDigestChromaV(int bitdepth, int param_index) { + static const char* const kTestDigestsChromaV8bpp[10] = { + "7205ed6c07ed27b7b52d871e0559b8fa", "fad033b1482dba0ed2d450b461fa310e", + "6bb39798ec6a0f7bda0b0fcb0a555734", "08c19856e10123ae520ccfc63e2fbe7b", + "a7695a6b69fba740a50310dfa6cf1c00", "ac2eac2d13fc5b21c4f2995d5abe14b9", + "be35cb30062db628a9e1304fca8b75dc", "f5bfc7a910c76bcd5b32c40772170879", + "aca07b37d63f978d76df5cd75d0cea5e", "107c7c56d4ec21f346a1a02206301b0d", + }; + static const char* const kTestDigestsChromaV10bpp[10] = { + "910724a77710996c90e272f1c1e9ff8e", "d293f861580770a89f1e266931a012ad", + "9e4f0c85fb533e51238586f9c3e68b6e", "a5ff4478d9eeb2168262c2e955e17a4f", + "fba6b1e8f28e4e90c836d41f28a0c154", "50b9a93f9a1f3845e6903bff9270a3e6", + "7b1624c3543badf5fadaee4d1e602e6b", "3be074e4ca0eec5770748b15661aaadd", + "639197401032f272d6c30666a2d08f43", "28075dd34246bf9d5e6197b1944f646a", + }; + if (bitdepth == 8) { + return kTestDigestsChromaV8bpp[param_index]; + } + return kTestDigestsChromaV10bpp[param_index]; +} + +const char* GetARTestDigestLuma(int bitdepth, int coeff_lag, int param_index) { + static const char* const kTestDigestsLuma8bpp[3][kNumFilmGrainTestParams] = { + { + "a835127918f93478b45f1ba4d20d81bd", + "a835127918f93478b45f1ba4d20d81bd", + "e5db4da626e214bb17bcc7ecffa76303", + "a835127918f93478b45f1ba4d20d81bd", + "a835127918f93478b45f1ba4d20d81bd", + "e5db4da626e214bb17bcc7ecffa76303", + "a835127918f93478b45f1ba4d20d81bd", + "1da62b7233de502123a18546b6c97da2", + "1da62b7233de502123a18546b6c97da2", + "1da62b7233de502123a18546b6c97da2", + }, + { + "11464b880de3ecd6e6189c5c4e7f9b28", + "dfe411762e283b5f49bece02ec200951", + "5c534d92afdf0a5b53dbe4fe7271929c", + "2e1a68a18aca96c31320ba7ceab59be9", + "584c0323e6b276cb9acb1a294d462d58", + "9571eb8f1cbaa96ea3bf64a820a8d9f0", + "305285ff0df87aba3c59e3fc0818697d", + "0066d35c8818cf20230114dcd3765a4d", + "0066d35c8818cf20230114dcd3765a4d", + 
"16d61b046084ef2636eedc5a737cb6f6", + }, + { + "0c9e2cf1b6c3cad0f7668026e8ea0516", + "7d094855292d0eded9e0d1b5bab1990b", + "fbf28860a5f1285dcc6725a45256a86a", + "dccb906904160ccabbd2c9a7797a4bf9", + "46f645e17f08a3260b1ae70284e5c5b8", + "124fdc90bed11a7320a0cbdee8b94400", + "8d2978651dddeaef6282191fa146f0a0", + "28b4d5aa33f05b3fb7f9323a11936bdc", + "6a8ea684f6736a069e3612d1af6391a8", + "2781ea40a63704dbfeb3a1ac5db6f2fc", + }, + }; + + static const char* const kTestDigestsLuma10bpp[3][kNumFilmGrainTestParams] = { + { + "5e6bc8444ece2d38420f51d82238d812", + "5e6bc8444ece2d38420f51d82238d812", + "2bfaec768794af33d60a9771f971f68d", + "5e6bc8444ece2d38420f51d82238d812", + "5e6bc8444ece2d38420f51d82238d812", + "c880807a368c4e82c23bea6f035ad23f", + "5e6bc8444ece2d38420f51d82238d812", + "c576667da5286183ec3aab9a76f53a2e", + "c576667da5286183ec3aab9a76f53a2e", + "c576667da5286183ec3aab9a76f53a2e", + }, + { + "095c2dd4d4d52aff9696df9bfdb70062", + "983d14afa497060792d472a449a380c7", + "c5fdc0f7c594b2b36132cec6f45a79bd", + "acff232ac5597c1712213150552281d1", + "4dd7341923b1d260092853553b6b6246", + "0ca8afd71a4f564ea1ce69c4af14e9ab", + "9bc7565e5359d09194fcee28e4bf7b94", + "6fea7805458b9d149f238a30e2dc3f13", + "6fea7805458b9d149f238a30e2dc3f13", + "681dff5fc7a7244ba4e4a582ca7ecb14", + }, + { + "cb99352c9c6300e7e825188bb4adaee0", + "7e40674de0209bd72f8e9c6e39ee6f7c", + "3e475572f6b4ecbb2730fd16751ad7ed", + "e6e4c63abc9cb112d9d1f23886cd1415", + "1a1c953b175c105c604902877e2bab18", + "380a53072530223d4ee622e014ee4bdb", + "6137394ea1172fb7ea0cbac237ff1703", + "85ab0c813e46f97cb9f42542f44c01ad", + "68c8ac462f0e28cb35402c538bee32f1", + "0038502ffa4760c8feb6f9abd4de7250", + }, + }; + + if (bitdepth == 8) { + return kTestDigestsLuma8bpp[coeff_lag - 1][param_index]; + } + return kTestDigestsLuma10bpp[coeff_lag - 1][param_index]; +} + +const char* GetARTestDigestChromaU(int bitdepth, int coeff_lag, + int subsampling_x, int subsampling_y) { + static const char* const kTestDigestsChromaU8bpp[12] = { + "11ced66de0eaf55c1ff9bad18d7b8ed7", "0c3b77345dd4ab0915ef53693ab93ce4", + "b0645044ba080b3ceb8f299e269377d6", "50590ad5d895f0b4bc6694d878e9cd32", + "85e1bf3741100135062f5b4abfe7639b", "76955b70dde61ca5c7d079c501b90906", + "3f0995e1397fd9efd9fc46b67f7796b3", "0a0d6c3e4e1649eb101395bc97943a07", + "1878855ed8db600ccae1d39abac52ec6", "13ab2b28320ed3ac2b820f08fdfd424d", + "f3e95544a86ead5387e3dc4e043fd0f0", "ff8f5d2d97a6689e16a7e4f482f69f0b", + }; + + static const char* const kTestDigestsChromaU10bpp[12] = { + "707f2aa5aa7e77bc6e83ab08287d748d", "0bcf40c7fead9ac3a5d71b4cc1e21549", + "0c1df27053e5da7cf1276a122a8f4e8b", "782962f7425eb38923a4f87e7ab319d9", + "b4a709ae5967afef55530b9ea8ef0062", "70a971a0b9bf06212d510b396f0f9095", + "d033b89d6e31f8b13c83d94c840b7d54", "40bbe804bf3f90cee667d3b275e3c964", + "90bb2b9d518b945adcfd1b1807f7d170", "4bc34aa157fe5ad4270c611afa75e878", + "e2688d7286cd43fe0a3ea734d2ad0f77", "853193c4981bd882912171061327bdf2", + }; + + assert(!(subsampling_x == 0 && subsampling_y == 1)); + const int base_index = 3 * coeff_lag + subsampling_x + subsampling_y; + if (bitdepth == 8) { + return kTestDigestsChromaU8bpp[base_index]; + } + return kTestDigestsChromaU10bpp[base_index]; +} + +const char* GetARTestDigestChromaV(int bitdepth, int coeff_lag, + int subsampling_x, int subsampling_y) { + static const char* const kTestDigestsChromaV8bpp[12] = { + "5c2179f3d93be0a0da75d2bb90347c2f", "79b883847d7eaa7890e1d633b8e34353", + "90ade818e55808e8cf58c11debb5ddd1", "1d0f2a14bc4df2b2a1abaf8137029f92", + 
"ac753a57ade140dccb50c14f941ae1fc", "d24ab497558f6896f08dc17bcc3c50c1", + "3d74436c63920022a95c85b234db4e33", "061c2d53ed84c830f454e395c362cb16", + "05d24869d7fb952e332457a114c8b9b7", "fcee31b87a2ada8028c2a975e094856a", + "c019e2c475737abcf9c2b2a52845c646", "9cd994baa7021f8bdf1d1c468c1c8e9c", + }; + + static const char* const kTestDigestsChromaV10bpp[12] = { + "bc9e44454a05cac8571c15af5b720e79", "f0374436698d94e879c03331b1f30df4", + "4580dd009abd6eeed59485057c55f63e", "7d1f7aecd45302bb461f4467f2770f72", + "1f0d003fce6c5fedc147c6112813f43b", "4771a45c2c1a04c375400619d5536035", + "df9cf619a78907c0f6e58bc13d7d5546", "dd3715ce65d905f30070a36977c818e0", + "32de5800f76e34c128a1d89146b4010b", "db9d7c70c3f69feb68fae04398efc773", + "d3d0912e3fdb956fef416a010bd7b4c2", "a2fca8abd9fd38d2eef3c4495d9eff78", + }; + + assert(!(subsampling_x == 0 && subsampling_y == 1)); + const int base_index = 3 * coeff_lag + subsampling_x + subsampling_y; + if (bitdepth == 8) { + return kTestDigestsChromaV8bpp[base_index]; + } + return kTestDigestsChromaV10bpp[base_index]; +} + +const char* GetGrainGenerationTestDigestLuma(int bitdepth, int param_index) { + static const char* const kTestDigestsLuma8bpp[kNumFilmGrainTestParams] = { + "c48babd99e5cfcbaa13d8b6e0c12e644", "da4b971d2de19b709e2bc98d2e50caf3", + "96c72faac19a79c138afeea8b8ae8c7a", "90a2b9c8304a44d14e83ca51bfd2fe8a", + "72bd3aa85c17850acb430afb4183bf1a", "a0acf76349b9efbc9181fc31153d9ef6", + "6da74dd631a4ec8b9372c0bbec22e246", "6e11fa230f0e5fbb13084255c22cabf9", + "be1d257b762f9880d81680e9325932a2", "37e302075af8130b371de4430e8a22cf", + }; + + static const char* const kTestDigestsLuma10bpp[kNumFilmGrainTestParams] = { + "0a40fd2f261095a6154584a531328142", "9d0c8173a94a0514c769e94b6f254030", + "7894e959fdd5545895412e1512c9352d", "6802cad2748cf6db7f66f53807ee46ab", + "ea24e962b98351c3d929a8ae41e320e2", "b333dc944274a3a094073889ca6e11d6", + "7211d7ac0ff7d11b5ef1538c0d98f43d", "ef9f9cbc101a07da7bfa62637130e331", + "85a122e32648fde84b883a1f98947c60", "dee656e3791138285bc5b71e3491a177", + }; + + if (bitdepth == 8) { + return kTestDigestsLuma8bpp[param_index]; + } + return kTestDigestsLuma10bpp[param_index]; +} + +const char* GetConstructStripesTestDigest(int bitdepth, int overlap_flag, + int subsampling_x, + int subsampling_y) { + static const char* const kTestDigests8bpp[6] = { + "cd14aaa6fc1728290fa75772730a2155", "13ad4551feadccc3a3a9bd5e25878d2a", + "ed6ad9532c96ef0d79ff3228c89a429f", "82f307a7f5fc3308c3ebe268b5169e70", + "aed793d525b85349a8c2eb6d40e93969", "311c3deb727621a7d4f18e8defb65de7", + }; + + static const char* const kTestDigests10bpp[6] = { + "4fe2fa1e428737de3595be3a097d0203", "80568c3c3b53bdbbd03b820179092dcd", + "bc7b73099961a0739c36e027d6d09ea1", "e5331364e5146a6327fd94e1467f59a3", + "125bf18b7787e8f0792ea12f9210de0d", "21cf98cbce17eca77dc150cc9be0e0a0", + }; + + const int base_index = 3 * overlap_flag + subsampling_x + subsampling_y; + if (bitdepth == 8) { + return kTestDigests8bpp[base_index]; + } + return kTestDigests10bpp[base_index]; +} + +const char* GetConstructImageTestDigest(int bitdepth, int overlap_flag, + int subsampling_x, int subsampling_y) { + static const char* const kTestDigests8bpp[6] = { + "17030fc692e685557a3717f9334af7e8", "d16ea46147183cd7bc36bcfc2f936a5b", + "68152958540dbec885f71e3bcd7aa088", "bb43b420f05a122eb4780aca06055ab1", + "87567b04fbdf64f391258c0742de266b", "ce87d556048b3de32570faf6729f4010", + }; + + static const char* const kTestDigests10bpp[6] = { + "5b31b29a5e22126a9bf8cd6a01645777", 
"2bb94a25164117f2ab18dae18e2c6577", + "27e57a4ed6f0c9fe0a763a03f44805e8", "481642ab0b07437b76b169aa4eb82123", + "656a9ef056b04565bec9ca7e0873c408", "a70fff81ab28d02d99dd4f142699ba39", + }; + + const int base_index = 3 * overlap_flag + subsampling_x + subsampling_y; + if (bitdepth == 8) { + return kTestDigests8bpp[base_index]; + } + return kTestDigests10bpp[base_index]; +} + +const char* GetScalingInitTestDigest(int param_index, int bitdepth) { + static const char* const kTestDigests8bpp[kNumFilmGrainTestParams] = { + "315202ca3bf9c46eac8605e89baffd2a", "640f6408702b07ab7e832e7326cce56f", + "f75ee83e3912a3f25949e852d67326cf", "211223f5d6a4b42a8e3c662f921b71c0", + "f75ee83e3912a3f25949e852d67326cf", "e7a1de8c5a2cac2145c586ecf1f9051c", + "e7a1de8c5a2cac2145c586ecf1f9051c", "276fe5e3b30b2db2a9ff798eb6cb8e00", + "ac67f1c3aff2f50ed4b1975bde67ffe3", "8db6145a60d506cc94f07cef8b27c681", + }; + + static const char* const kTestDigests10bpp[kNumFilmGrainTestParams] = { + "c50be59c62b634ff45ddfbe5b978adfc", "7626286109a2a1eaf0a26f6b2bbab9aa", + "f2302988140c47a0724fc55ff523b6ec", "5318e33d8a59a526347ffa6a72ba6ebd", + "f2302988140c47a0724fc55ff523b6ec", "f435b5fe98e9d8b6c61fa6f457601c2c", + "f435b5fe98e9d8b6c61fa6f457601c2c", "ff07a2944dbe094d01e199098764941c", + "11b3e256c74cee2b5679f7457793869a", "89fab5c1db09e242d0494d1c696a774a", + }; + + if (bitdepth == 8) { + return kTestDigests8bpp[param_index]; + } + assert(bitdepth == 10); + return kTestDigests10bpp[param_index]; +} + +const char* GetBlendLumaTestDigest(int bitdepth) { + static const char* const kTestDigest8bpp = "de35b16c702690b1d311cdd0973835d7"; + + static const char* const kTestDigest10bpp = + "60e9f24dcaaa0207a8db5ab5f3c66608"; + + if (bitdepth == 8) { + return kTestDigest8bpp; + } + return kTestDigest10bpp; +} + +const char* GetBlendChromaUTestDigest(int bitdepth, + int chroma_scaling_from_luma, + int subsampling_x, int subsampling_y) { + static const char* const kTestDigests8bpp[6] = { + "36ca194734d45e75079baba1f3ec9e9e", "182b388061f59fd3e24ef4581c536e67", + "2e7843b4c624f03316c3cbe1cc835859", "39e6d9606915da6a41168fbb006b55e4", + "3f44a4e252d4823544ac66a900dc7983", "1860f0831841f262d66b23f6a6b5833b", + }; + + static const char* const kTestDigests10bpp[6] = { + "2054665564f55750c9588b505eb01ac0", "4d8b0e248f8a6bfc72516aa164e76b0b", + "7e549800a4f9fff6833bb7738e272baf", "8de6f30dcda99a37b359fd815e62d2f7", + "9b7958a2278a16bce2b7bc31fdd811f5", "c5c3c8cccf6a2b4e40b4a412a5bf4f08", + }; + + const int base_index = + 3 * chroma_scaling_from_luma + subsampling_x + subsampling_y; + if (bitdepth == 8) { + return kTestDigests8bpp[base_index]; + } + return kTestDigests10bpp[base_index]; +} + +const char* GetBlendChromaVTestDigest(int bitdepth, + int chroma_scaling_from_luma, + int subsampling_x, int subsampling_y) { + static const char* const kTestDigests8bpp[6] = { + "9a353e4f86d7ebaa980f7f6cfc0995ad", "17589b4039ed49ba16f32db9fae724b7", + "76ae8bed48a173b548993b6e1824ff67", "c1458ac9bdfbf0b4d6a175343b17b27b", + "fa76d1c8e48957537f26af6a5b54ec14", "313fe3c34568b7f9c5ecb09d419d4ba4", + }; + + static const char* const kTestDigests10bpp[6] = { + "8ab5a8e03f07547260033d6a0b689e3c", "275ede58d311e2f5fd76f222f45a64fc", + "ce13916e0f7b02087fd0356534d32770", "165bfc8cda0266936a67fa4ec9b215cb", + "ed4382caa936acf1158ff8049d18ffac", "942bdd1344c9182dd7572099fb9372db", + }; + + const int base_index = + 3 * chroma_scaling_from_luma + subsampling_x + subsampling_y; + if (bitdepth == 8) { + return kTestDigests8bpp[base_index]; + } + return 
      kTestDigests10bpp[base_index];
+}
+
+// GetFilmGrainRandomNumber() is only invoked with |bits| equal to 11 or 8.
+// Test both values of |bits|.
+TEST(FilmGrainTest, GetFilmGrainRandomNumber) {
+  uint16_t seed = 51968;
+  const struct {
+    int rand;
+    uint16_t seed;
+  } kExpected11[5] = {
+      {812, 25984}, {406, 12992}, {1227, 39264}, {1637, 52400}, {818, 26200},
+  };
+  for (int i = 0; i < 5; ++i) {
+    int rand = GetFilmGrainRandomNumber(11, &seed);
+    EXPECT_EQ(rand, kExpected11[i].rand) << "i = " << i;
+    EXPECT_EQ(seed, kExpected11[i].seed) << "i = " << i;
+  }
+  const struct {
+    int rand;
+    uint16_t seed;
+  } kExpected8[5] = {
+      {179, 45868}, {89, 22934}, {44, 11467}, {150, 38501}, {75, 19250},
+  };
+  for (int i = 0; i < 5; ++i) {
+    int rand = GetFilmGrainRandomNumber(8, &seed);
+    EXPECT_EQ(rand, kExpected8[i].rand) << "i = " << i;
+    EXPECT_EQ(seed, kExpected8[i].seed) << "i = " << i;
+  }
+}
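
The expected values in this test pin down the generator's behavior: it is a
16-bit linear-feedback shift register whose feedback bit XORs bits 0, 1, 3,
and 12, and the top |bits| bits of the updated seed are returned. A minimal
sketch that reproduces the (rand, seed) pairs above (e.g. seed 51968 -> 25984
with rand 812 for bits == 11):

#include <cstdint>

int FilmGrainRandomNumberSketch(int bits, uint16_t* seed) {
  uint16_t s = *seed;
  const uint16_t bit = (s ^ (s >> 1) ^ (s >> 3) ^ (s >> 12)) & 1;
  s = (s >> 1) | (bit << 15);  // shift right, feed the XOR bit back in on top
  *seed = s;
  return s >> (16 - bits);
}
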
+
+// In FilmGrainParams, if num_u_points and num_v_points are both 0 and
+// chroma_scaling_from_luma is false, GenerateChromaGrains() should set both
+// the u_grain and v_grain arrays to all zeros.
+TEST(FilmGrainTest, GenerateZeroChromaGrains) {
+  FilmGrainParams film_grain_params = {};
+  film_grain_params.apply_grain = true;
+  film_grain_params.update_grain = true;
+  film_grain_params.chroma_scaling = 8;
+  film_grain_params.auto_regression_shift = 6;
+  film_grain_params.grain_seed = 51968;
+
+  int8_t u_grain[73 * 82];
+  int8_t v_grain[73 * 82];
+  const int chroma_width = 44;
+  const int chroma_height = 38;
+
+  // Initialize u_grain and v_grain with arbitrary nonzero values.
+  memset(u_grain, 1, sizeof(u_grain));
+  memset(v_grain, 2, sizeof(v_grain));
+  for (int y = 0; y < chroma_height; ++y) {
+    for (int x = 0; x < chroma_width; ++x) {
+      EXPECT_NE(u_grain[y * chroma_width + x], 0);
+      EXPECT_NE(v_grain[y * chroma_width + x], 0);
+    }
+  }
+
+  FilmGrain<8>::GenerateChromaGrains(film_grain_params, chroma_width,
+                                     chroma_height, u_grain, v_grain);
+
+  for (int y = 0; y < chroma_height; ++y) {
+    for (int x = 0; x < chroma_width; ++x) {
+      EXPECT_EQ(u_grain[y * chroma_width + x], 0);
+      EXPECT_EQ(v_grain[y * chroma_width + x], 0);
+    }
+  }
+}
+
+// First parameter is coefficient lag. Second parameter is the index into
+// |kFilmGrainParams|.
+template <int bitdepth>
+class AutoRegressionTestLuma
+    : public testing::TestWithParam<std::tuple<int, int>> {
+ public:
+  using GrainType =
+      typename std::conditional<bitdepth == 8, int8_t, int16_t>::type;
+
+  AutoRegressionTestLuma() {
+    FilmGrainInit_C();
+    const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+    const int index = std::get<0>(GetParam()) - 1;
+    base_luma_auto_regression_func_ =
+        dsp->film_grain.luma_auto_regression[index];
+
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const char* const test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+      base_luma_auto_regression_func_ = nullptr;
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+#if LIBGAV1_ENABLE_NEON
+      FilmGrainInit_NEON();
+#endif
+    }
+    luma_auto_regression_func_ = dsp->film_grain.luma_auto_regression[index];
+  }
+
+ protected:
+  // |compare| determines whether to compare the output blocks from the SIMD
+  // implementation, if used, and the C implementation.
+  // |saturate| determines whether to set the inputs to maximum values. This
+  // is intended primarily as a way to simplify differences in output when
+  // debugging.
+  void TestAutoRegressiveFilterLuma(int coeff_lag, int param_index,
+                                    int num_runs, bool saturate, bool compare);
+  LumaAutoRegressionFunc luma_auto_regression_func_;
+  LumaAutoRegressionFunc base_luma_auto_regression_func_;
+  GrainType luma_block_buffer_[kLumaBlockSize];
+  GrainType base_luma_block_buffer_[kLumaBlockSize];
+};
+
+// First parameter is coefficient lag. Second parameter is the index into
+// |kFilmGrainParams|.
+template <int bitdepth>
+void AutoRegressionTestLuma<bitdepth>::TestAutoRegressiveFilterLuma(
+    int coeff_lag, int param_index, int num_runs, bool saturate,
+    bool compare) {
+  if (luma_auto_regression_func_ == nullptr) return;
+  // Compare is only needed for NEON tests to compare with C output.
+  if (base_luma_auto_regression_func_ == nullptr && compare) return;
+  FilmGrainParams params = kFilmGrainParams[param_index];
+  params.auto_regression_coeff_lag = coeff_lag;
+  const int grain_max = GetGrainMax<bitdepth>();
+  for (int y = 0; y < kLumaHeight; ++y) {
+    for (int x = 0; x < kLumaWidth; ++x) {
+      if (saturate) {
+        luma_block_buffer_[y * kLumaWidth + x] = grain_max;
+      } else {
+        luma_block_buffer_[y * kLumaWidth + x] =
+            std::min(x - (kLumaWidth >> 1), y - (kLumaHeight >> 1)) *
+            (1 << (bitdepth - 8));
+      }
+    }
+  }
+
+  if (saturate) {
+    memset(params.auto_regression_coeff_y, 127,
+           sizeof(params.auto_regression_coeff_y));
+  }
+  if (compare) {
+    memcpy(base_luma_block_buffer_, luma_block_buffer_,
+           sizeof(luma_block_buffer_));
+  }
+
+  const absl::Time start = absl::Now();
+  for (int i = 0; i < num_runs; ++i) {
+    luma_auto_regression_func_(params, luma_block_buffer_);
+  }
+  const absl::Duration elapsed_time = absl::Now() - start;
+  if (num_runs > 1) {
+    printf("AutoRegressionLuma lag=%d, param_index=%d: %d us\n", coeff_lag,
+           param_index,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+    return;
+  }
+  if (compare) {
+    base_luma_auto_regression_func_(params, base_luma_block_buffer_);
+    EXPECT_TRUE(test_utils::CompareBlocks(
+        luma_block_buffer_, base_luma_block_buffer_, kLumaWidth, kLumaHeight,
+        kLumaWidth, kLumaWidth, false));
+  } else {
+    test_utils::CheckMd5Digest(
+        "FilmGrain",
+        absl::StrFormat("AutoRegressionLuma lag=%d, param_index=%d", coeff_lag,
+                        param_index)
+            .c_str(),
+        GetARTestDigestLuma(bitdepth, coeff_lag, param_index),
+        luma_block_buffer_, sizeof(luma_block_buffer_), elapsed_time);
+  }
+}
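
For reference while reading the digests below, the function under test applies
the AV1 auto-regressive filter: each grain sample is refined by a weighted sum
of already-filtered neighbors (lag rows above and lag columns to each side,
past samples only), rounded by auto_regression_shift and clamped to the grain
range. A simplified sketch under assumed buffer layout, not the library's
code:

#include <algorithm>
#include <cstdint>

// For lag == 3 this consumes exactly the 24 coefficients seen in the
// auto_regression_coeff_y arrays above.
void AutoRegressionLumaSketch(const int8_t* coeffs, int lag, int shift,
                              int grain_min, int grain_max, int width,
                              int height, int* grain) {
  for (int y = lag; y < height; ++y) {
    for (int x = lag; x < width - lag; ++x) {
      int sum = 0;
      const int8_t* coeff = coeffs;
      for (int dy = -lag; dy <= 0; ++dy) {
        for (int dx = -lag; dx <= lag; ++dx) {
          if (dy == 0 && dx >= 0) break;  // future samples are excluded
          sum += *coeff++ * grain[(y + dy) * width + (x + dx)];
        }
      }
      const int delta = (sum + (1 << (shift - 1))) >> shift;
      grain[y * width + x] = std::min(
          std::max(grain[y * width + x] + delta, grain_min), grain_max);
    }
  }
}
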
+
+using AutoRegressionTestLuma8bpp = AutoRegressionTestLuma<8>;
+
+TEST_P(AutoRegressionTestLuma8bpp, AutoRegressiveFilterLuma) {
+  TestAutoRegressiveFilterLuma(std::get<0>(GetParam()), std::get<1>(GetParam()),
+                               1, /*saturate=*/false,
+                               /*compare=*/false);
+}
+
+TEST_P(AutoRegressionTestLuma8bpp, AutoRegressiveFilterLumaSaturated) {
+  TestAutoRegressiveFilterLuma(std::get<0>(GetParam()), std::get<1>(GetParam()),
+                               1, /*saturate=*/true,
+                               /*compare=*/true);
+}
+
+TEST_P(AutoRegressionTestLuma8bpp, DISABLED_Speed) {
+  TestAutoRegressiveFilterLuma(std::get<0>(GetParam()), std::get<1>(GetParam()),
+                               1e5,
+                               /*saturate=*/false, /*compare=*/false);
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using AutoRegressionTestLuma10bpp = AutoRegressionTestLuma<10>;
+
+TEST_P(AutoRegressionTestLuma10bpp, AutoRegressiveFilterLuma) {
+  TestAutoRegressiveFilterLuma(std::get<0>(GetParam()), std::get<1>(GetParam()),
+                               1, /*saturate=*/false,
+                               /*compare=*/false);
+}
+
+TEST_P(AutoRegressionTestLuma10bpp, AutoRegressiveFilterLumaSaturated) {
+  TestAutoRegressiveFilterLuma(std::get<0>(GetParam()), std::get<1>(GetParam()),
+                               1, /*saturate=*/true,
+                               /*compare=*/true);
+}
+
+TEST_P(AutoRegressionTestLuma10bpp, DISABLED_Speed) {
+  TestAutoRegressiveFilterLuma(std::get<0>(GetParam()), std::get<1>(GetParam()),
+                               1e5,
+                               /*saturate=*/false, /*compare=*/false);
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+INSTANTIATE_TEST_SUITE_P(
+    C, AutoRegressionTestLuma8bpp,
+    testing::Combine(testing::Range(1, 4) /* coeff_lag */,
+                     testing::Range(0, 10) /* param_index */));
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, AutoRegressionTestLuma8bpp,
+    testing::Combine(testing::Range(1, 4) /* coeff_lag */,
+                     testing::Range(0, 10) /* param_index */));
+#endif
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+INSTANTIATE_TEST_SUITE_P(
+    C, AutoRegressionTestLuma10bpp,
+    testing::Combine(testing::Range(1, 4) /* coeff_lag */,
+                     testing::Range(0, 10) /* param_index */));
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, AutoRegressionTestLuma10bpp,
+    testing::Combine(testing::Range(1, 4) /* coeff_lag */,
+                     testing::Range(0, 10) /* param_index */));
+#endif
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+struct AutoRegressionChromaTestParam {
+  explicit AutoRegressionChromaTestParam(const std::tuple<int, int>& in)
+      : coeff_lag(std::get<0>(in)) {
+    switch (std::get<1>(in)) {
+      case 0:
+        subsampling_x = 0;
+        subsampling_y = 0;
+        break;
+      case 1:
+        subsampling_x = 1;
+        subsampling_y = 0;
+        break;
+      default:
+        assert(std::get<1>(in) == 2);
+        subsampling_x = 1;
+        subsampling_y = 1;
+    }
+  }
+  const int coeff_lag;
+  int subsampling_x;
+  int subsampling_y;
+};
+
+template <int bitdepth>
+class AutoRegressionTestChroma
+    : public testing::TestWithParam<std::tuple<int, int>> {
+ public:
+  using GrainType =
+      typename std::conditional<bitdepth == 8, int8_t, int16_t>::type;
+
+  AutoRegressionTestChroma() {
+    AutoRegressionChromaTestParam test_param(GetParam());
+    FilmGrainInit_C();
+    const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+    // This test suite does not cover num_y_points == 0. This should be
+    // covered in the test of the full synthesis process.
+    base_chroma_auto_regression_func_ =
+        dsp->film_grain.chroma_auto_regression[1][test_param.coeff_lag];
+
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const char* const test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+      base_chroma_auto_regression_func_ = nullptr;
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+#if LIBGAV1_ENABLE_NEON
+      FilmGrainInit_NEON();
+#endif
+    }
+    chroma_auto_regression_func_ =
+        dsp->film_grain.chroma_auto_regression[1][test_param.coeff_lag];
+  }
+
+  ~AutoRegressionTestChroma() override = default;
+
+ protected:
+  // |compare| determines whether to compare the output blocks from the SIMD
+  // implementation, if used, and the C implementation.
+  // |saturate| determines whether to set the inputs to maximum values. This
+  // is intended primarily as a way to simplify differences in output when
+  // debugging.
+  void TestAutoRegressiveFilterChroma(int coeff_lag, int subsampling_x,
+                                      int subsampling_y, int num_runs,
+                                      bool saturate, bool compare);
+  ChromaAutoRegressionFunc chroma_auto_regression_func_;
+  ChromaAutoRegressionFunc base_chroma_auto_regression_func_;
+  GrainType luma_block_buffer_[kLumaBlockSize];
+  GrainType u_block_buffer_[kChromaBlockSize];
+  GrainType v_block_buffer_[kChromaBlockSize];
+  GrainType base_u_block_buffer_[kChromaBlockSize];
+  GrainType base_v_block_buffer_[kChromaBlockSize];
+};
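
The chroma variants consume one more coefficient than luma -- note the
25-entry auto_regression_coeff_u/v arrays versus 24 entries for luma in
kFilmGrainParams. When luma grain is present, that final coefficient weights
the co-located luma grain, averaged over the subsampled footprint. A sketch of
just that extra tap (hypothetical helper; rounding as I read the spec's
Round2, so treat the details as assumptions):

int LumaTapSketch(const int* luma_grain, int luma_width, int x, int y,
                  int subsampling_x, int subsampling_y, int last_coeff) {
  int luma = 0;
  for (int i = 0; i <= subsampling_y; ++i) {    // averages 1, 2 or 4 samples
    for (int j = 0; j <= subsampling_x; ++j) {
      luma += luma_grain[((y << subsampling_y) + i) * luma_width +
                         (x << subsampling_x) + j];
    }
  }
  const int n = subsampling_x + subsampling_y;
  if (n > 0) luma = (luma + (1 << (n - 1))) >> n;
  return last_coeff * luma;  // added to the AR sum before shift-and-clamp
}
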
+
+template <int bitdepth>
+void AutoRegressionTestChroma<bitdepth>::TestAutoRegressiveFilterChroma(
+    int coeff_lag, int subsampling_x, int subsampling_y, int num_runs,
+    bool saturate, bool compare) {
+  if (chroma_auto_regression_func_ == nullptr) return;
+  // Compare is only needed for NEON tests to compare with C output.
+  if (base_chroma_auto_regression_func_ == nullptr && compare) return;
+
+  // This function relies on the first set of sampled params for basics. The
+  // test param generators are used for coverage.
+  FilmGrainParams params = kFilmGrainParams[0];
+  params.auto_regression_coeff_lag = coeff_lag;
+  const int grain_max = GetGrainMax<bitdepth>();
+  const int grain_min = GetGrainMin<bitdepth>();
+  const int chroma_width =
+      (subsampling_x != 0) ? kMinChromaWidth : kMaxChromaWidth;
+  const int chroma_height =
+      (subsampling_y != 0) ? kMinChromaHeight : kMaxChromaHeight;
+  if (saturate) {
+    memset(params.auto_regression_coeff_u, 127,
+           sizeof(params.auto_regression_coeff_u));
+    memset(params.auto_regression_coeff_v, 127,
+           sizeof(params.auto_regression_coeff_v));
+    for (int y = 0; y < kLumaHeight; ++y) {
+      for (int x = 0; x < kLumaWidth; ++x) {
+        // This loop relies on the fact that kMaxChromaWidth == kLumaWidth.
+        luma_block_buffer_[y * kLumaWidth + x] = grain_max;
+        u_block_buffer_[y * kLumaWidth + x] = grain_max;
+        v_block_buffer_[y * kLumaWidth + x] = grain_max;
+      }
+    }
+  } else {
+    libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+    // Allow any valid grain values.
+    const int random_range = grain_max - grain_min + 1;
+    for (int y = 0; y < kLumaHeight; ++y) {
+      for (int x = 0; x < kLumaWidth; ++x) {
+        // This loop relies on the fact that kMaxChromaWidth == kLumaWidth.
+ const int random_y = rnd(random_range); + luma_block_buffer_[y * kLumaWidth + x] = random_y + grain_min; + const int random_u = rnd(random_range); + u_block_buffer_[y * kLumaWidth + x] = random_u + grain_min; + const int random_v = rnd(random_range); + v_block_buffer_[y * kLumaWidth + x] = random_v + grain_min; + } + } + } + if (compare) { + memcpy(base_u_block_buffer_, u_block_buffer_, sizeof(u_block_buffer_)); + memcpy(base_v_block_buffer_, v_block_buffer_, sizeof(v_block_buffer_)); + } + + const absl::Time start = absl::Now(); + for (int i = 0; i < num_runs; ++i) { + chroma_auto_regression_func_(params, luma_block_buffer_, subsampling_x, + subsampling_y, u_block_buffer_, + v_block_buffer_); + } + const absl::Duration elapsed_time = absl::Now() - start; + if (num_runs > 1) { + printf("AutoRegressionChroma lag=%d, sub_x=%d, sub_y=%d: %d us\n", + coeff_lag, subsampling_x, subsampling_y, + static_cast(absl::ToInt64Microseconds(elapsed_time))); + return; + } + if (compare) { + base_chroma_auto_regression_func_(params, luma_block_buffer_, subsampling_x, + subsampling_y, base_u_block_buffer_, + base_v_block_buffer_); + EXPECT_TRUE(test_utils::CompareBlocks(u_block_buffer_, base_u_block_buffer_, + chroma_width, chroma_height, + chroma_width, chroma_width, false)); + EXPECT_TRUE(test_utils::CompareBlocks(v_block_buffer_, base_v_block_buffer_, + chroma_width, chroma_height, + chroma_width, chroma_width, false)); + } else { + test_utils::CheckMd5Digest( + "FilmGrain", + absl::StrFormat("AutoRegressionChromaU lag=%d, sub_x=%d, sub_y=%d", + coeff_lag, subsampling_x, subsampling_y) + .c_str(), + GetARTestDigestChromaU(bitdepth, coeff_lag, subsampling_x, + subsampling_y), + u_block_buffer_, sizeof(u_block_buffer_), elapsed_time); + test_utils::CheckMd5Digest( + "FilmGrain", + absl::StrFormat("AutoRegressionChromaV lag=%d, sub_x=%d, sub_y=%d", + coeff_lag, subsampling_x, subsampling_y) + .c_str(), + GetARTestDigestChromaV(bitdepth, coeff_lag, subsampling_x, + subsampling_y), + v_block_buffer_, sizeof(v_block_buffer_), elapsed_time); + } +} + +using AutoRegressionTestChroma8bpp = AutoRegressionTestChroma<8>; + +TEST_P(AutoRegressionTestChroma8bpp, AutoRegressiveFilterChroma) { + AutoRegressionChromaTestParam test_param(GetParam()); + TestAutoRegressiveFilterChroma(test_param.coeff_lag, test_param.subsampling_x, + test_param.subsampling_y, 1, + /*saturate=*/false, + /*compare=*/false); +} + +TEST_P(AutoRegressionTestChroma8bpp, AutoRegressiveFilterChromaSaturated) { + AutoRegressionChromaTestParam test_param(GetParam()); + TestAutoRegressiveFilterChroma(test_param.coeff_lag, test_param.subsampling_x, + test_param.subsampling_y, 1, /*saturate=*/true, + /*compare=*/true); +} + +TEST_P(AutoRegressionTestChroma8bpp, DISABLED_Speed) { + AutoRegressionChromaTestParam test_param(GetParam()); + TestAutoRegressiveFilterChroma( + test_param.coeff_lag, test_param.subsampling_x, test_param.subsampling_y, + // Subsampling cuts each dimension of the chroma blocks in half, so run + // twice as many times to compensate. 
+      1e5 * (1 << (test_param.subsampling_y + test_param.subsampling_x)),
+      /*saturate=*/false, /*compare=*/false);
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using AutoRegressionTestChroma10bpp = AutoRegressionTestChroma<10>;
+
+TEST_P(AutoRegressionTestChroma10bpp, AutoRegressiveFilterChroma) {
+  AutoRegressionChromaTestParam test_param(GetParam());
+  TestAutoRegressiveFilterChroma(test_param.coeff_lag, test_param.subsampling_x,
+                                 test_param.subsampling_y, 1,
+                                 /*saturate=*/false,
+                                 /*compare=*/false);
+}
+
+TEST_P(AutoRegressionTestChroma10bpp, AutoRegressiveFilterChromaSaturated) {
+  AutoRegressionChromaTestParam test_param(GetParam());
+  TestAutoRegressiveFilterChroma(test_param.coeff_lag, test_param.subsampling_x,
+                                 test_param.subsampling_y, 1, /*saturate=*/true,
+                                 /*compare=*/true);
+}
+
+TEST_P(AutoRegressionTestChroma10bpp, DISABLED_Speed) {
+  AutoRegressionChromaTestParam test_param(GetParam());
+  TestAutoRegressiveFilterChroma(
+      test_param.coeff_lag, test_param.subsampling_x, test_param.subsampling_y,
+      // Subsampling cuts each dimension of the chroma blocks in half, so run
+      // twice as many times to compensate.
+      1e5 * (1 << (test_param.subsampling_y + test_param.subsampling_x)),
+      /*saturate=*/false, /*compare=*/false);
+}
+
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+INSTANTIATE_TEST_SUITE_P(C, AutoRegressionTestChroma8bpp,
+                         testing::Combine(testing::Range(0, 4) /* coeff_lag */,
+                                          testing::Range(0,
+                                                         3) /* subsampling */));
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+INSTANTIATE_TEST_SUITE_P(C, AutoRegressionTestChroma10bpp,
+                         testing::Combine(testing::Range(0, 4) /* coeff_lag */,
+                                          testing::Range(0,
+                                                         3) /* subsampling */));
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AutoRegressionTestChroma8bpp,
+                         testing::Combine(testing::Range(0, 4) /* coeff_lag */,
+                                          testing::Range(0,
+                                                         3) /* subsampling */));
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+INSTANTIATE_TEST_SUITE_P(NEON, AutoRegressionTestChroma10bpp,
+                         testing::Combine(testing::Range(0, 4) /* coeff_lag */,
+                                          testing::Range(0,
+                                                         3) /* subsampling */));
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+#endif  // LIBGAV1_ENABLE_NEON
+
+template <int bitdepth>
+class GrainGenerationTest : public testing::TestWithParam<int> {
+ protected:
+  using GrainType =
+      typename std::conditional<bitdepth == 8, int8_t, int16_t>::type;
+
+  void TestGenerateGrainLuma(int param_index, int num_runs);
+
+  GrainType luma_block_buffer_[kLumaBlockSize];
+};
+
+template <int bitdepth>
+void GrainGenerationTest<bitdepth>::TestGenerateGrainLuma(int param_index,
+                                                          int num_runs) {
+  FilmGrainParams params = kFilmGrainParams[param_index];
+
+  const absl::Time start = absl::Now();
+  for (int i = 0; i < num_runs; ++i) {
+    FilmGrain<bitdepth>::GenerateLumaGrain(params, luma_block_buffer_);
+  }
+  const absl::Duration elapsed_time = absl::Now() - start;
+  if (num_runs == 1) {
+    test_utils::CheckMd5Digest(
+        "FilmGrain",
+        absl::StrFormat("GenerateGrainLuma param_index=%d", param_index)
+            .c_str(),
+        GetGrainGenerationTestDigestLuma(bitdepth, param_index),
+        luma_block_buffer_, sizeof(luma_block_buffer_), elapsed_time);
+  } else {
+    printf("GenerateGrainLuma param_index=%d: %d us\n", param_index,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+  }
+}
+
+using GrainGenerationTest8bpp = GrainGenerationTest<8>;
+
+TEST_P(GrainGenerationTest8bpp, GenerateGrainLuma) {
+  TestGenerateGrainLuma(GetParam(), 1);
+}
+
+TEST_P(GrainGenerationTest8bpp, DISABLED_LumaSpeed) {
+  TestGenerateGrainLuma(GetParam(), 1e5);
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using GrainGenerationTest10bpp = GrainGenerationTest<10>;
+
+TEST_P(GrainGenerationTest10bpp, GenerateGrainLuma) {
+  TestGenerateGrainLuma(GetParam(), 1);
+}
+
+TEST_P(GrainGenerationTest10bpp, DISABLED_LumaSpeed) {
+  TestGenerateGrainLuma(GetParam(), 1e5);
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+INSTANTIATE_TEST_SUITE_P(C, GrainGenerationTest8bpp,
+                         testing::Range(0, 10) /* param_index */);
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+INSTANTIATE_TEST_SUITE_P(C, GrainGenerationTest10bpp,
+                         testing::Range(0, 10) /* param_index */);
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
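
GenerateLumaGrain() itself is little more than driving the pieces sketched
above: every template sample is drawn from the spec's fixed 2048-entry
Gaussian lookup table using an 11-bit random index, scaled down, and then the
auto-regressive pass adds spatial correlation. A rough sketch (the table name
and the shift derivation follow my reading of the spec; treat the details as
assumptions):

#include <cstdint>

extern const int16_t kGaussianSequence[2048];  // fixed table from the spec
int FilmGrainRandomNumberSketch(int bits, uint16_t* seed);  // sketched above

void GenerateWhiteNoiseSketch(int bitdepth, int grain_scale_shift,
                              uint16_t* seed, int16_t* grain, int count) {
  // shift == 4 for 8bpp, 2 for 10bpp (plus grain_scale_shift), which maps the
  // roughly +/-2047 table values into the valid grain range.
  const int shift = 12 - bitdepth + grain_scale_shift;
  for (int i = 0; i < count; ++i) {
    const int g = kGaussianSequence[FilmGrainRandomNumberSketch(11, seed)];
    grain[i] = static_cast<int16_t>((g + ((1 << shift) >> 1)) >> shift);
  }
}
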
+
+// This param type is used for both ConstructStripesTest and
+// ConstructImageTest.
+struct ConstructNoiseTestParam {
+  explicit ConstructNoiseTestParam(const std::tuple<int, int>& in)
+      : overlap_flag(std::get<0>(in)) {
+    switch (std::get<1>(in)) {
+      case 0:
+        subsampling_x = 0;
+        subsampling_y = 0;
+        break;
+      case 1:
+        subsampling_x = 1;
+        subsampling_y = 0;
+        break;
+      default:
+        assert(std::get<1>(in) == 2);
+        subsampling_x = 1;
+        subsampling_y = 1;
+    }
+  }
+  const int overlap_flag;
+  int subsampling_x;
+  int subsampling_y;
+};
+
+template <int bitdepth>
+class ConstructStripesTest
+    : public testing::TestWithParam<std::tuple<int, int>> {
+ public:
+  using GrainType =
+      typename std::conditional<bitdepth == 8, int8_t, int16_t>::type;
+
+  ConstructStripesTest() {
+    FilmGrainInit_C();
+    const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+    base_construct_noise_stripes_func_ =
+        dsp->film_grain.construct_noise_stripes[std::get<0>(GetParam())];
+
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const char* const test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+      base_construct_noise_stripes_func_ = nullptr;
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+#if LIBGAV1_ENABLE_NEON
+      FilmGrainInit_NEON();
+#endif
+    }
+    construct_noise_stripes_func_ =
+        dsp->film_grain.construct_noise_stripes[std::get<0>(GetParam())];
+  }
+
+  ~ConstructStripesTest() override = default;
+
+ protected:
+  // |compare| determines whether to compare the output blocks from the SIMD
+  // implementation, if used, and the C implementation.
+  // |saturate| determines whether to set the inputs to maximum values. This
+  // is intended primarily as a way to simplify differences in output when
+  // debugging.
+  void TestConstructNoiseStripes(int overlap_flag, int subsampling_x,
+                                 int subsampling_y, int num_runs, bool saturate,
+                                 bool compare);
+  ConstructNoiseStripesFunc construct_noise_stripes_func_;
+  ConstructNoiseStripesFunc base_construct_noise_stripes_func_;
+  GrainType grain_buffer_[kLumaBlockSize];
+  Array2DView<GrainType> noise_stripes_;
+  // Owns the memory that noise_stripes_ points to.
+  std::unique_ptr<GrainType[]> stripe_buffer_;
+  Array2DView<GrainType> base_noise_stripes_;
+  // Owns the memory that base_noise_stripes_ points to.
+  std::unique_ptr<GrainType[]> base_stripe_buffer_;
+};
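
A note on the geometry these tests rely on: stripes start every 32 luma rows
but are kNoiseStripeHeight (34) rows tall, so consecutive stripes share two
luma rows of overlap. kFrameHeight is chosen so the frame yields exactly
kNumTestStripes stripes, the last covering a single row:

static_assert((kFrameHeight + 31) / 32 == kNumTestStripes,
              "kFrameHeight must produce exactly kNumTestStripes stripes");
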
+
+template <int bitdepth>
+void ConstructStripesTest<bitdepth>::TestConstructNoiseStripes(
+    int overlap_flag, int subsampling_x, int subsampling_y, int num_runs,
+    bool saturate, bool compare) {
+  if (construct_noise_stripes_func_ == nullptr) return;
+  // Compare is only needed for NEON tests to compare with C output.
+  if (base_construct_noise_stripes_func_ == nullptr && compare) return;
+
+  const int stripe_width = ((kFrameWidth + subsampling_x) >> subsampling_x);
+  const int stripe_height = kNoiseStripeHeight;
+  const int stripe_size = stripe_height * stripe_width;
+  const int stripe_buffer_size = stripe_size * kNumTestStripes;
+  if (compare) {
+    base_stripe_buffer_.reset(new (
+        std::nothrow) GrainType[stripe_buffer_size + kNoiseStripePadding]());
+    ASSERT_NE(base_stripe_buffer_, nullptr);
+    base_noise_stripes_.Reset(kNumTestStripes, stripe_size,
+                              base_stripe_buffer_.get());
+  }
+  stripe_buffer_.reset(
+      new (std::nothrow) GrainType[stripe_buffer_size + kNoiseStripePadding]());
+  ASSERT_NE(stripe_buffer_, nullptr);
+  noise_stripes_.Reset(kNumTestStripes, stripe_size, stripe_buffer_.get());
+
+  const int grain_max = GetGrainMax<bitdepth>();
+  const int grain_min = GetGrainMin<bitdepth>();
+  if (saturate) {
+    for (int y = 0; y < kLumaHeight; ++y) {
+      for (int x = 0; x < kLumaWidth; ++x) {
+        grain_buffer_[y * kLumaWidth + x] = grain_max;
+      }
+    }
+  } else {
+    libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+    // Allow any valid grain values.
+    const int random_range = grain_max - grain_min + 1;
+    for (int y = 0; y < kLumaHeight; ++y) {
+      for (int x = 0; x < kLumaWidth; ++x) {
+        grain_buffer_[y * kLumaWidth + x] = grain_min + rnd(random_range);
+      }
+    }
+  }
+
+  const absl::Time start = absl::Now();
+  for (int i = 0; i < num_runs; ++i) {
+    construct_noise_stripes_func_(grain_buffer_, 68, kFrameWidth, kFrameHeight,
+                                  subsampling_x, subsampling_y,
+                                  &noise_stripes_);
+  }
+  const absl::Duration elapsed_time = absl::Now() - start;
+  if (num_runs > 1) {
+    printf(
+        "ConstructNoiseStripes Speed Test for overlap=%d, sub_x=%d, "
+        "sub_y=%d: %d us\n",
+        overlap_flag, subsampling_x, subsampling_y,
+        static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+    return;
+  }
+  if (compare) {
+    base_construct_noise_stripes_func_(grain_buffer_, 68, kFrameWidth,
+                                       kFrameHeight, subsampling_x,
+                                       subsampling_y, &base_noise_stripes_);
+
+    constexpr int kCompareWidth = 64;
+    for (int stripe = 0; stripe < kNumTestStripes; ++stripe) {
+      EXPECT_TRUE(test_utils::CompareBlocks(
+          noise_stripes_[stripe], base_noise_stripes_[stripe], kCompareWidth,
+          stripe_height, stripe_width, stripe_width, /*check_padding=*/false,
+          /*print_diff=*/false));
+    }
+  } else {
+    test_utils::CheckMd5Digest(
+        "FilmGrain",
+        absl::StrFormat("ConstructNoiseStripes overlap=%d, sub_x=%d, sub_y=%d",
+                        overlap_flag, subsampling_x, subsampling_y)
+            .c_str(),
+        GetConstructStripesTestDigest(bitdepth, overlap_flag, subsampling_x,
+                                      subsampling_y),
+        noise_stripes_[0], stripe_buffer_size, elapsed_time);
+  }
+}
+
+using ConstructStripesTest8bpp = ConstructStripesTest<8>;
+
+TEST_P(ConstructStripesTest8bpp, RandomValues) {
+  ConstructNoiseTestParam test_params(GetParam());
+  TestConstructNoiseStripes(test_params.overlap_flag, test_params.subsampling_x,
+                            test_params.subsampling_y, /*num_runs=*/1,
+                            /*saturate=*/false, /*compare=*/false);
+}
+
+TEST_P(ConstructStripesTest8bpp, SaturatedValues) {
+  ConstructNoiseTestParam test_params(GetParam());
+  TestConstructNoiseStripes(test_params.overlap_flag, test_params.subsampling_x,
+                            test_params.subsampling_y, /*num_runs=*/1,
+                            /*saturate=*/true, /*compare=*/true);
+}
+TEST_P(ConstructStripesTest8bpp, DISABLED_Speed) {
+  ConstructNoiseTestParam test_params(GetParam());
+  TestConstructNoiseStripes(test_params.overlap_flag, test_params.subsampling_x,
+                            test_params.subsampling_y, /*num_runs=*/500,
+                            /*saturate=*/false, /*compare=*/false);
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using ConstructStripesTest10bpp = ConstructStripesTest<10>;
+
+TEST_P(ConstructStripesTest10bpp, RandomValues) {
+  ConstructNoiseTestParam test_params(GetParam());
+  TestConstructNoiseStripes(test_params.overlap_flag, test_params.subsampling_x,
+                            test_params.subsampling_y, /*num_runs=*/1,
+                            /*saturate=*/false, /*compare=*/false);
+}
+TEST_P(ConstructStripesTest10bpp, SaturatedValues) {
+  ConstructNoiseTestParam test_params(GetParam());
+  TestConstructNoiseStripes(test_params.overlap_flag, test_params.subsampling_x,
+                            test_params.subsampling_y, /*num_runs=*/1,
+                            /*saturate=*/true, /*compare=*/true);
+}
+
+TEST_P(ConstructStripesTest10bpp, DISABLED_Speed) {
+  ConstructNoiseTestParam test_params(GetParam());
+  TestConstructNoiseStripes(test_params.overlap_flag, test_params.subsampling_x,
+                            test_params.subsampling_y, /*num_runs=*/500,
+                            /*saturate=*/false, /*compare=*/false);
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+INSTANTIATE_TEST_SUITE_P(C, ConstructStripesTest8bpp,
+                         testing::Combine(testing::Range(0, 2),
+                                          testing::Range(0, 3)));
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+INSTANTIATE_TEST_SUITE_P(C, ConstructStripesTest10bpp,
+                         testing::Combine(testing::Range(0, 2),
+                                          testing::Range(0, 3)));
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+template <int bitdepth>
+class ConstructImageTest
+    : public testing::TestWithParam<std::tuple<int, int>> {
+ public:
+  using GrainType =
+      typename std::conditional<bitdepth == 8, int8_t, int16_t>::type;
+
+  ConstructImageTest() {
+    FilmGrainInit_C();
+    const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+    base_construct_noise_image_overlap_func_ =
+        dsp->film_grain.construct_noise_image_overlap;
+
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    const char* const test_case = test_info->test_suite_name();
+    if (absl::StartsWith(test_case, "C/")) {
+      base_construct_noise_image_overlap_func_ = nullptr;
+    } else if (absl::StartsWith(test_case, "NEON/")) {
+#if LIBGAV1_ENABLE_NEON
+      FilmGrainInit_NEON();
+#endif
+    }
+    construct_noise_image_overlap_func_ =
+        dsp->film_grain.construct_noise_image_overlap;
+  }
+
+  ~ConstructImageTest() override = default;
+
+ protected:
+  // |compare| determines whether to compare the output blocks from the SIMD
+  // implementation, if used, and the C implementation.
+  // |saturate| determines whether to set the inputs to maximum values. This
+  // is intended primarily as a way to simplify differences in output when
+  // debugging.
+  void TestConstructNoiseImage(int overlap_flag, int subsampling_x,
+                               int subsampling_y, int num_runs, bool saturate,
+                               bool compare);
+  ConstructNoiseImageOverlapFunc construct_noise_image_overlap_func_;
+  ConstructNoiseImageOverlapFunc base_construct_noise_image_overlap_func_;
+  Array2DView<GrainType> noise_stripes_;
+  // Owns the memory that noise_stripes_ points to.
+  std::unique_ptr<GrainType[]> stripe_buffer_;
+  Array2D<GrainType> noise_image_;
+  Array2D<GrainType> base_noise_image_;
+};
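
When overlap_flag is set, the rows that ConstructNoiseImage() skipped are
blended by construct_noise_image_overlap: each overlap row combines the value
written by the previous stripe with the new stripe's value using fixed 5-bit
weights. A sketch of the per-sample blend (the weights are as I read them from
the spec -- (27, 17) then (17, 27) for the two unsubsampled rows, (23, 22) for
the single row when subsampling_y == 1 -- so treat them as assumptions):

#include <algorithm>

inline int BlendOverlapSampleSketch(int old_value, int new_value, int w_old,
                                    int w_new, int grain_min, int grain_max) {
  const int value = (old_value * w_old + new_value * w_new + 16) >> 5;
  return std::min(std::max(value, grain_min), grain_max);
}
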
+
+template <int bitdepth>
+void ConstructImageTest<bitdepth>::TestConstructNoiseImage(
+    int overlap_flag, int subsampling_x, int subsampling_y, int num_runs,
+    bool saturate, bool compare) {
+  if (construct_noise_image_overlap_func_ == nullptr) return;
+  // Compare is only needed for NEON tests to compare with C output.
+  if (base_construct_noise_image_overlap_func_ == nullptr && compare) return;
+
+  const int image_width = ((kFrameWidth + subsampling_x) >> subsampling_x);
+  const int image_height = ((kFrameHeight + subsampling_y) >> subsampling_y);
+  const int stripe_height =
+      ((kNoiseStripeHeight + subsampling_y) >> subsampling_y);
+  const int image_stride = image_width + kNoiseImagePadding;
+  const int stripe_size = stripe_height * image_width;
+  if (compare) {
+    ASSERT_TRUE(base_noise_image_.Reset(image_height, image_stride,
+                                        /*zero_initialize=*/false));
+  }
+  ASSERT_TRUE(noise_image_.Reset(image_height, image_stride,
+                                 /*zero_initialize=*/false));
+  // Stride between stripe rows is |image_width|. Padding is only at the
+  // end of the final row of the final stripe to protect from overreads.
+  stripe_buffer_.reset(
+      new (std::nothrow)
+          GrainType[kNumTestStripes * stripe_size + kNoiseStripePadding]);
+  ASSERT_NE(stripe_buffer_, nullptr);
+  noise_stripes_.Reset(kNumTestStripes, stripe_size, stripe_buffer_.get());
+
+  const int grain_max = GetGrainMax<bitdepth>();
+  const int grain_min = GetGrainMin<bitdepth>();
+  if (saturate) {
+    for (int i = 0; i < stripe_size; ++i) {
+      noise_stripes_[0][i] = grain_max;
+    }
+    for (int stripe = 1; stripe < kNumTestStripes; ++stripe) {
+      memcpy(noise_stripes_[stripe], noise_stripes_[0],
+             stripe_size * sizeof(noise_stripes_[0][0]));
+    }
+  } else {
+    libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+    // Allow any valid grain values.
+    const int random_range = grain_max - grain_min + 1;
+    for (int stripe = 0; stripe < kNumTestStripes; ++stripe) {
+      // Assign all allocated memory for this stripe.
+      for (int i = 0; i < stripe_height; ++i) {
+        for (int x = 0; x < image_width; ++x) {
+          noise_stripes_[stripe][i * image_width + x] =
+              grain_min + rnd(random_range);
+        }
+      }
+    }
+  }
+
+  const absl::Time start = absl::Now();
+  for (int i = 0; i < num_runs; ++i) {
+    FilmGrain<bitdepth>::ConstructNoiseImage(
+        &noise_stripes_, kFrameWidth, kFrameHeight, subsampling_x,
+        subsampling_y, overlap_flag << (1 - subsampling_y), &noise_image_);
+    if (overlap_flag == 1) {
+      construct_noise_image_overlap_func_(&noise_stripes_, kFrameWidth,
+                                          kFrameHeight, subsampling_x,
+                                          subsampling_y, &noise_image_);
+    }
+  }
+
+  const absl::Duration elapsed_time = absl::Now() - start;
+  if (num_runs > 1) {
+    printf(
+        "ConstructNoiseImage Speed Test for overlap=%d, sub_x=%d, "
+        "sub_y=%d: %d us\n",
+        overlap_flag, subsampling_x, subsampling_y,
+        static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+    return;
+  }
+  if (compare) {
+    FilmGrain<bitdepth>::ConstructNoiseImage(
+        &noise_stripes_, kFrameWidth, kFrameHeight, subsampling_x,
+        subsampling_y, overlap_flag << (1 - subsampling_y),
+        &base_noise_image_);
+    if (overlap_flag == 1) {
+      base_construct_noise_image_overlap_func_(
+          &noise_stripes_, kFrameWidth, kFrameHeight, subsampling_x,
+          subsampling_y, &base_noise_image_);
+    }
+    constexpr int kCompareWidth = 72;
+    constexpr int kCompareHeight = 72;
+    EXPECT_TRUE(test_utils::CompareBlocks(
+        noise_image_[0], base_noise_image_[0], kCompareWidth, kCompareHeight,
+        image_stride, image_stride, /*check_padding=*/false,
+        /*print_diff=*/false));
+  } else {
+    printf("BD%d \"%s\",\n", bitdepth,
+           test_utils::GetMd5Sum(noise_image_[0], image_width, image_height,
+                                 image_stride)
+               .c_str());
+    test_utils::CheckMd5Digest(
+        "FilmGrain",
+        absl::StrFormat("ConstructNoiseImage overlap=%d, sub_x=%d, sub_y=%d",
+                        overlap_flag, subsampling_x, subsampling_y)
+            .c_str(),
+        GetConstructImageTestDigest(bitdepth, overlap_flag, subsampling_x,
+                                    subsampling_y),
+        noise_image_[0], image_width, image_height, image_stride,
+        elapsed_time);
+  }
+}
subsampling_x, + subsampling_y), + noise_image_[0], image_width, image_height, image_stride, elapsed_time); + } +} + +using ConstructImageTest8bpp = ConstructImageTest<8>; + +TEST_P(ConstructImageTest8bpp, RandomValues) { + ConstructNoiseTestParam test_params(GetParam()); + TestConstructNoiseImage(test_params.overlap_flag, test_params.subsampling_x, + test_params.subsampling_y, /*num_runs=*/1, + /*saturate=*/false, /*compare=*/false); +} + +TEST_P(ConstructImageTest8bpp, SaturatedValues) { + ConstructNoiseTestParam test_params(GetParam()); + TestConstructNoiseImage(test_params.overlap_flag, test_params.subsampling_x, + test_params.subsampling_y, /*num_runs=*/1, + /*saturate=*/true, /*compare=*/true); +} + +TEST_P(ConstructImageTest8bpp, DISABLED_Speed) { + ConstructNoiseTestParam test_params(GetParam()); + TestConstructNoiseImage(test_params.overlap_flag, test_params.subsampling_x, + test_params.subsampling_y, /*num_runs=*/500, + /*saturate=*/false, /*compare=*/false); +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using ConstructImageTest10bpp = ConstructImageTest<10>; + +TEST_P(ConstructImageTest10bpp, RandomValues) { + ConstructNoiseTestParam test_params(GetParam()); + TestConstructNoiseImage(test_params.overlap_flag, test_params.subsampling_x, + test_params.subsampling_y, /*num_runs=*/1, + /*saturate=*/false, /*compare=*/false); +} + +TEST_P(ConstructImageTest10bpp, SaturatedValues) { + ConstructNoiseTestParam test_params(GetParam()); + TestConstructNoiseImage(test_params.overlap_flag, test_params.subsampling_x, + test_params.subsampling_y, /*num_runs=*/1, + /*saturate=*/true, /*compare=*/true); +} + +TEST_P(ConstructImageTest10bpp, DISABLED_Speed) { + ConstructNoiseTestParam test_params(GetParam()); + TestConstructNoiseImage(test_params.overlap_flag, test_params.subsampling_x, + test_params.subsampling_y, /*num_runs=*/500, + /*saturate=*/false, /*compare=*/false); +} +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +INSTANTIATE_TEST_SUITE_P(C, ConstructImageTest8bpp, + testing::Combine(testing::Range(0, 2), + testing::Range(0, 3))); + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, ConstructImageTest8bpp, + testing::Combine(testing::Range(0, 2), + testing::Range(0, 3))); +#endif // LIBGAV1_ENABLE_NEON + +#if LIBGAV1_MAX_BITDEPTH >= 10 +INSTANTIATE_TEST_SUITE_P(C, ConstructImageTest10bpp, + testing::Combine(testing::Range(0, 2), + testing::Range(0, 3))); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +template +class ScalingLookupTableTest : public testing::TestWithParam { + public: + ScalingLookupTableTest() { + test_utils::ResetDspTable(bitdepth); + FilmGrainInit_C(); + const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth); + + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const char* const test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "NEON/")) { +#if LIBGAV1_ENABLE_NEON + FilmGrainInit_NEON(); +#endif + } + initialize_func_ = dsp->film_grain.initialize_scaling_lut; + } + ~ScalingLookupTableTest() override = default; + + protected: + void TestSpeed(int num_runs); + void ZeroPoints(); + + private: + static constexpr int kScalingLutBufferLength = + (kScalingLookupTableSize + kScalingLookupTablePadding) << (bitdepth - 8); + dsp::InitializeScalingLutFunc initialize_func_; + int16_t scaling_lut_[kScalingLutBufferLength]; +}; + +template +void ScalingLookupTableTest::TestSpeed(int num_runs) { + if (initialize_func_ == nullptr) return; + const int param_index = GetParam(); + const FilmGrainParams& params = 
kFilmGrainParams[param_index]; + const absl::Time start = absl::Now(); + Memset(scaling_lut_, 0, kScalingLutBufferLength); + for (int i = 0; i < num_runs; ++i) { + initialize_func_(params.num_y_points, params.point_y_value, + params.point_y_scaling, scaling_lut_, + kScalingLutBufferLength); + } + const absl::Duration elapsed_time = absl::Now() - start; + if (num_runs > 1) { + printf("InitializeScalingLut: %d us\n", + static_cast(absl::ToInt64Microseconds(elapsed_time))); + return; + } + test_utils::CheckMd5Digest( + "FilmGrain", + absl::StrFormat("InitializeScalingLut for param set: %d", param_index) + .c_str(), + GetScalingInitTestDigest(param_index, bitdepth), scaling_lut_, + (sizeof(scaling_lut_[0]) * kScalingLookupTableSize) << (bitdepth - 8), + elapsed_time); +} + +template +void ScalingLookupTableTest::ZeroPoints() { + if (initialize_func_ == nullptr) return; + const int param_index = GetParam(); + const FilmGrainParams& params = kFilmGrainParams[param_index]; + initialize_func_(0, params.point_y_value, params.point_y_scaling, + scaling_lut_, kScalingLookupTableSize); + for (int i = 0; i < kScalingLookupTableSize; ++i) { + ASSERT_EQ(scaling_lut_[i], 0); + } +} + +using ScalingLookupTableTest8bpp = ScalingLookupTableTest<8>; + +TEST_P(ScalingLookupTableTest8bpp, ZeroPoints) { ZeroPoints(); } + +TEST_P(ScalingLookupTableTest8bpp, Correctness) { TestSpeed(/*num_runs=*/1); } + +TEST_P(ScalingLookupTableTest8bpp, DISABLED_Speed) { + TestSpeed(/*num_runs=*/1e5); +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using ScalingLookupTableTest10bpp = ScalingLookupTableTest<10>; + +TEST_P(ScalingLookupTableTest10bpp, ZeroPoints) { ZeroPoints(); } + +TEST_P(ScalingLookupTableTest10bpp, Correctness) { TestSpeed(/*num_runs=*/1); } + +TEST_P(ScalingLookupTableTest10bpp, DISABLED_Speed) { + TestSpeed(/*num_runs=*/1e5); +} +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +INSTANTIATE_TEST_SUITE_P(C, ScalingLookupTableTest8bpp, + testing::Range(0, kNumFilmGrainTestParams)); + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, ScalingLookupTableTest8bpp, + testing::Range(0, kNumFilmGrainTestParams)); +#endif + +#if LIBGAV1_MAX_BITDEPTH >= 10 +INSTANTIATE_TEST_SUITE_P(C, ScalingLookupTableTest10bpp, + testing::Range(0, kNumFilmGrainTestParams)); + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, ScalingLookupTableTest10bpp, + testing::Range(0, kNumFilmGrainTestParams)); +#endif +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +struct BlendNoiseTestParam { + explicit BlendNoiseTestParam(const std::tuple& in) + : chroma_scaling_from_luma(std::get<0>(in)) { + switch (std::get<1>(in)) { + case 0: + subsampling_x = 0; + subsampling_y = 0; + break; + case 1: + subsampling_x = 1; + subsampling_y = 0; + break; + default: + assert(std::get<1>(in) == 2); + subsampling_x = 1; + subsampling_y = 1; + } + } + const int chroma_scaling_from_luma; + int subsampling_x; + int subsampling_y; +}; + +template +class BlendNoiseTest : public testing::TestWithParam> { + public: + using GrainType = + typename std::conditional::type; + + BlendNoiseTest() { + test_utils::ResetDspTable(bitdepth); + FilmGrainInit_C(); + const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth); + + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const char* const test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "NEON/")) { +#if LIBGAV1_ENABLE_NEON + FilmGrainInit_NEON(); +#endif + } else if (absl::StartsWith(test_case, "SSE41/")) { + FilmGrainInit_SSE4_1(); + } + const BlendNoiseTestParam 
test_param(GetParam()); + chroma_scaling_from_luma_ = test_param.chroma_scaling_from_luma; + blend_luma_func_ = dsp->film_grain.blend_noise_luma; + blend_chroma_func_ = + dsp->film_grain.blend_noise_chroma[chroma_scaling_from_luma_]; + subsampling_x_ = test_param.subsampling_x; + subsampling_y_ = test_param.subsampling_y; + + uv_width_ = (width_ + subsampling_x_) >> subsampling_x_; + uv_height_ = (height_ + subsampling_y_) >> subsampling_y_; + uv_stride_ = uv_width_ * sizeof(Pixel); + y_stride_ = width_ * sizeof(Pixel); + const size_t buffer_size = + sizeof(Pixel) * (width_ * height_ + 2 * uv_width_ * uv_height_ + + 3 * kBorderPixelsFilmGrain); + source_buffer_.reset(new (std::nothrow) uint8_t[buffer_size]); + memset(source_buffer_.get(), 0, sizeof(source_buffer_[0]) * buffer_size); + dest_buffer_.reset(new (std::nothrow) uint8_t[buffer_size]); + memset(dest_buffer_.get(), 0, sizeof(dest_buffer_[0]) * buffer_size); + source_plane_y_ = source_buffer_.get(); + source_plane_u_ = + source_plane_y_ + y_stride_ * height_ + kBorderPixelsFilmGrain; + source_plane_v_ = + source_plane_u_ + uv_stride_ * uv_height_ + kBorderPixelsFilmGrain; + dest_plane_y_ = dest_buffer_.get(); + dest_plane_u_ = + dest_plane_y_ + y_stride_ * height_ + kBorderPixelsFilmGrain; + dest_plane_v_ = + dest_plane_u_ + uv_stride_ * uv_height_ + kBorderPixelsFilmGrain; + } + ~BlendNoiseTest() override = default; + + protected: + void TestSpeed(int num_runs); + + private: + static constexpr int kScalingLutBufferLength = + (kScalingLookupTableSize + kScalingLookupTablePadding) << 2; + + void ConvertScalingLut10bpp(int16_t* scaling_lut_10bpp, + const int16_t* src_scaling_lut); + dsp::BlendNoiseWithImageLumaFunc blend_luma_func_; + dsp::BlendNoiseWithImageChromaFunc blend_chroma_func_; + + const int width_ = 1921; + const int height_ = 1081; + int chroma_scaling_from_luma_ = 0; + int subsampling_x_ = 0; + int subsampling_y_ = 0; + int uv_width_ = 0; + int uv_height_ = 0; + int uv_stride_ = 0; + int y_stride_ = 0; + // This holds the data that |source_plane_y_|, |source_plane_u_|, and + // |source_plane_v_| point to. + std::unique_ptr source_buffer_; + // This holds the data that |dest_plane_y_|, |dest_plane_u_|, and + // |dest_plane_v_| point to. 
+ std::unique_ptr dest_buffer_; + uint8_t* source_plane_y_ = nullptr; + uint8_t* source_plane_u_ = nullptr; + uint8_t* source_plane_v_ = nullptr; + uint8_t* dest_plane_y_ = nullptr; + uint8_t* dest_plane_u_ = nullptr; + uint8_t* dest_plane_v_ = nullptr; + Array2D noise_image_[kMaxPlanes]; + int16_t scaling_lut_10bpp_y_[kScalingLutBufferLength]; + int16_t scaling_lut_10bpp_u_[kScalingLutBufferLength]; + int16_t scaling_lut_10bpp_v_[kScalingLutBufferLength]; +}; + +template +void BlendNoiseTest::ConvertScalingLut10bpp( + int16_t* scaling_lut_10bpp, const int16_t* src_scaling_lut) { + for (int i = 0; i < kScalingLookupTableSize - 1; ++i) { + const int x_base = i << 2; + const int start = src_scaling_lut[i]; + const int end_index = std::min(i + 1, kScalingLookupTableSize - 1); + const int end = src_scaling_lut[end_index]; + const int delta = end - start; + scaling_lut_10bpp[x_base] = start; + scaling_lut_10bpp[x_base + 1] = start + RightShiftWithRounding(delta, 2); + scaling_lut_10bpp[x_base + 2] = + start + RightShiftWithRounding(2 * delta, 2); + scaling_lut_10bpp[x_base + 3] = + start + RightShiftWithRounding(3 * delta, 2); + } +} + +template +void BlendNoiseTest::TestSpeed(const int num_runs) { + if (blend_chroma_func_ == nullptr || blend_luma_func_ == nullptr) return; + ASSERT_TRUE(noise_image_[kPlaneY].Reset(height_, + width_ + kBorderPixelsFilmGrain, + /*zero_initialize=*/false)); + ASSERT_TRUE(noise_image_[kPlaneU].Reset(uv_height_, + uv_width_ + kBorderPixelsFilmGrain, + /*zero_initialize=*/false)); + ASSERT_TRUE(noise_image_[kPlaneV].Reset(uv_height_, + uv_width_ + kBorderPixelsFilmGrain, + /*zero_initialize=*/false)); + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + // Allow any valid grain values. + const int grain_max = GetGrainMax(); + const int grain_min = GetGrainMin(); + const int random_range = grain_max - grain_min + 1; + auto* src_y = reinterpret_cast(source_plane_y_); + auto* src_u = reinterpret_cast(source_plane_u_); + auto* src_v = reinterpret_cast(source_plane_v_); + for (int y = 0; y < height_; ++y) { + for (int x = 0; x < width_; ++x) { + const int random_source_y = rnd(random_range); + // Populating the luma source ensures the lookup table is tested. Chroma + // planes are given identical values. Giving them different values would + // artificially differentiate the outputs. It's important that the test + // expect that different outputs are caused by the different scaling + // lookup tables, rather than by different inputs. 
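+      // Editor's illustration (assumption, not original source): with 4:2:0
+      // subsampling (subsampling_x_ = subsampling_y_ = 1), the four luma
+      // positions (2k, 2m), (2k+1, 2m), (2k, 2m+1) and (2k+1, 2m+1) all map
+      // to chroma position (k, m), so each chroma sample is simply
+      // overwritten with the luma value of the latest such iteration.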
+ const int uv_y_pos = y >> subsampling_y_; + const int uv_x_pos = x >> subsampling_x_; + src_y[y * width_ + x] = random_source_y; + src_u[uv_y_pos * uv_width_ + uv_x_pos] = random_source_y; + src_v[uv_y_pos * uv_width_ + uv_x_pos] = random_source_y; + const int random_y = rnd(random_range); + noise_image_[kPlaneY][y][x] = random_y + grain_min; + const int random_u = rnd(random_range); + noise_image_[kPlaneU][uv_y_pos][uv_x_pos] = random_u + grain_min; + const int random_v = rnd(random_range); + noise_image_[kPlaneV][uv_y_pos][uv_x_pos] = random_v + grain_min; + } + } + static constexpr int16_t kTestScalingLutY[kScalingLookupTableSize] = { + 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 73, + 75, 76, 77, 79, 80, 81, 83, 84, 86, 87, 88, 90, 91, 92, 92, + 93, 93, 94, 95, 95, 96, 97, 97, 98, 98, 99, 99, 99, 99, 98, + 98, 98, 98, 98, 98, 98, 97, 97, 97, 97, 97, 97, 97, 97, 97, + 97, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, + 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, + 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, + 101, 101, 101, 101, 101, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, + }; + static constexpr int16_t kTestScalingLutU[kScalingLookupTableSize] = { + 30, 42, 53, 65, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, + 75, 76, 78, 79, 81, 82, 83, 85, 86, 88, 89, 91, 92, 93, 93, + 94, 94, 95, 95, 96, 96, 97, 97, 98, 98, 99, 99, 99, 99, 99, + 99, 99, 99, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, + 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, + 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 100, 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, + 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, + 98, 98, 98, 98, 98, 98, 98, 97, 97, 97, 97, 97, 97, 97, 97, + 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, + 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 95, + 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, + 95, 95, + }; + static constexpr int16_t kTestScalingLutV[kScalingLookupTableSize] = { + 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 74, 74, 74, + 75, 75, 78, 79, 81, 82, 83, 85, 86, 88, 89, 91, 92, 93, 93, + 94, 94, 95, 95, 96, 96, 97, 97, 98, 98, 99, 99, 99, 99, 98, + 98, 98, 98, 98, 98, 98, 97, 97, 97, 97, 97, 97, 97, 97, 97, + 97, 97, 97, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, + 98, 98, 98, 98, 98, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 100, 100, 100, 
100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, 101, 101, 101, 101, 101, 101, 101, + 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, + 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, + 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, + 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, + 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, + 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, + 150, 150, 150, 150, 150, 150, 150, 150, 150, 150, 150, 150, 150, 150, 150, + 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, + 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, + 255, 255, + }; + + if (bitdepth == 10) { + for (int i = 0; i < kScalingLutBufferLength; ++i) { + ConvertScalingLut10bpp(scaling_lut_10bpp_y_, kTestScalingLutY); + ConvertScalingLut10bpp(scaling_lut_10bpp_u_, kTestScalingLutU); + ConvertScalingLut10bpp(scaling_lut_10bpp_v_, kTestScalingLutV); + } + } + const FilmGrainParams& params = kFilmGrainParams[0]; + const int min_value = 16 << (bitdepth - 8); + const int max_value = 235 << (bitdepth - 8); + const absl::Time start = absl::Now(); + for (int i = 0; i < num_runs; ++i) { + if (chroma_scaling_from_luma_) { + blend_chroma_func_( + kPlaneU, params, noise_image_, min_value, max_value, width_, height_, + /*start_height=*/0, subsampling_x_, subsampling_y_, + (bitdepth == 10) ? scaling_lut_10bpp_y_ : kTestScalingLutY, + source_plane_y_, y_stride_, source_plane_u_, uv_stride_, + dest_plane_u_, uv_stride_); + blend_chroma_func_( + kPlaneV, params, noise_image_, min_value, max_value, width_, height_, + /*start_height=*/0, subsampling_x_, subsampling_y_, + (bitdepth == 10) ? scaling_lut_10bpp_y_ : kTestScalingLutY, + source_plane_y_, y_stride_, source_plane_v_, uv_stride_, + dest_plane_v_, uv_stride_); + } else { + blend_chroma_func_( + kPlaneU, params, noise_image_, min_value, max_value, width_, height_, + /*start_height=*/0, subsampling_x_, subsampling_y_, + (bitdepth == 10) ? scaling_lut_10bpp_u_ : kTestScalingLutU, + source_plane_y_, y_stride_, source_plane_u_, uv_stride_, + dest_plane_u_, uv_stride_); + blend_chroma_func_( + kPlaneV, params, noise_image_, min_value, max_value, width_, height_, + /*start_height=*/0, subsampling_x_, subsampling_y_, + (bitdepth == 10) ? scaling_lut_10bpp_v_ : kTestScalingLutV, + source_plane_y_, y_stride_, source_plane_v_, uv_stride_, + dest_plane_v_, uv_stride_); + } + blend_luma_func_(noise_image_, min_value, max_value, params.chroma_scaling, + width_, height_, /*start_height=*/0, + (bitdepth == 10) ? 
scaling_lut_10bpp_y_ : kTestScalingLutY, + source_plane_y_, y_stride_, dest_plane_y_, y_stride_); + } + const absl::Duration elapsed_time = absl::Now() - start; + const char* digest_luma = GetBlendLumaTestDigest(bitdepth); + printf("YBD%d \"%s\",\n", bitdepth, + test_utils::GetMd5Sum(dest_plane_y_, y_stride_ * height_).c_str()); + printf("UBD%d \"%s\",\n", bitdepth, + test_utils::GetMd5Sum(dest_plane_u_, uv_stride_ * uv_height_).c_str()); + printf("VBD%d \"%s\",\n", bitdepth, + test_utils::GetMd5Sum(dest_plane_v_, uv_stride_ * uv_height_).c_str()); + test_utils::CheckMd5Digest( + "BlendNoiseWithImage", + absl::StrFormat("Luma cfl=%d, sub_x=%d, sub_y=%d", + chroma_scaling_from_luma_, subsampling_x_, subsampling_y_) + .c_str(), + digest_luma, dest_plane_y_, y_stride_ * height_, elapsed_time); + const char* digest_chroma_u = GetBlendChromaUTestDigest( + bitdepth, chroma_scaling_from_luma_, subsampling_x_, subsampling_y_); + test_utils::CheckMd5Digest( + "BlendNoiseWithImage", + absl::StrFormat("ChromaU cfl=%d, sub_x=%d, sub_y=%d", + chroma_scaling_from_luma_, subsampling_x_, subsampling_y_) + .c_str(), + digest_chroma_u, dest_plane_u_, uv_stride_ * uv_height_, elapsed_time); + const char* digest_chroma_v = GetBlendChromaVTestDigest( + bitdepth, chroma_scaling_from_luma_, subsampling_x_, subsampling_y_); + test_utils::CheckMd5Digest( + "BlendNoiseWithImage", + absl::StrFormat("ChromaV cfl=%d, sub_x=%d, sub_y=%d", + chroma_scaling_from_luma_, subsampling_x_, subsampling_y_) + .c_str(), + digest_chroma_v, dest_plane_v_, uv_stride_ * uv_height_, elapsed_time); +} + +using BlendNoiseTest8bpp = BlendNoiseTest<8, uint8_t>; + +TEST_P(BlendNoiseTest8bpp, MatchesOriginalOutput) { TestSpeed(1); } + +TEST_P(BlendNoiseTest8bpp, DISABLED_Speed) { TestSpeed(kNumSpeedTests); } + +INSTANTIATE_TEST_SUITE_P(C, BlendNoiseTest8bpp, + testing::Combine(testing::Range(0, 2), + testing::Range(0, 3))); +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, BlendNoiseTest8bpp, + testing::Combine(testing::Range(0, 2), + testing::Range(0, 3))); +#endif + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, BlendNoiseTest8bpp, + testing::Combine(testing::Range(0, 2), + testing::Range(0, 3))); +#endif + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using BlendNoiseTest10bpp = BlendNoiseTest<10, uint16_t>; + +TEST_P(BlendNoiseTest10bpp, MatchesOriginalOutput) { TestSpeed(1); } + +TEST_P(BlendNoiseTest10bpp, DISABLED_Speed) { TestSpeed(kNumSpeedTests); } + +INSTANTIATE_TEST_SUITE_P(C, BlendNoiseTest10bpp, + testing::Combine(testing::Range(0, 2), + testing::Range(0, 3))); +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, BlendNoiseTest10bpp, + testing::Combine(testing::Range(0, 2), + testing::Range(0, 3))); +#endif + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, BlendNoiseTest10bpp, + testing::Combine(testing::Range(0, 2), + testing::Range(0, 3))); +#endif +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +template +class FilmGrainSpeedTest : public testing::TestWithParam { + public: + FilmGrainSpeedTest() { + test_utils::ResetDspTable(bitdepth); + FilmGrainInit_C(); + + const testing::TestInfo* const test_info = + testing::UnitTest::GetInstance()->current_test_info(); + const char* const test_case = test_info->test_suite_name(); + if (absl::StartsWith(test_case, "NEON/")) { +#if LIBGAV1_ENABLE_NEON + FilmGrainInit_NEON(); +#endif + } else if (absl::StartsWith(test_case, "SSE41/")) { + FilmGrainInit_SSE4_1(); + } + uv_width_ = (width_ + subsampling_x_) >> subsampling_x_; + uv_height_ = (height_ + subsampling_y_) >> 
subsampling_y_;
+    uv_stride_ = uv_width_ * sizeof(Pixel);
+    y_stride_ = width_ * sizeof(Pixel);
+    const size_t buffer_size =
+        sizeof(Pixel) * (width_ * height_ + 2 * uv_width_ * uv_height_);
+    source_buffer_.reset(new (std::nothrow) uint8_t[buffer_size]);
+    memset(source_buffer_.get(), 0, sizeof(source_buffer_[0]) * buffer_size);
+    dest_buffer_.reset(new (std::nothrow) uint8_t[buffer_size]);
+    memset(dest_buffer_.get(), 0, sizeof(dest_buffer_[0]) * buffer_size);
+    source_plane_y_ = source_buffer_.get();
+    source_plane_u_ = source_plane_y_ + y_stride_ * height_;
+    source_plane_v_ = source_plane_u_ + uv_stride_ * uv_height_;
+    dest_plane_y_ = dest_buffer_.get();
+    dest_plane_u_ = dest_plane_y_ + y_stride_ * height_;
+    dest_plane_v_ = dest_plane_u_ + uv_stride_ * uv_height_;
+    const int num_threads = GetParam();
+    thread_pool_ = ThreadPool::Create(num_threads);
+  }
+  ~FilmGrainSpeedTest() override = default;
+
+ protected:
+  void TestSpeed(int num_runs);
+
+ private:
+  const int width_ = 1920;
+  const int height_ = 1080;
+  const int subsampling_x_ = 1;
+  const int subsampling_y_ = 1;
+  int uv_width_ = 0;
+  int uv_height_ = 0;
+  int uv_stride_ = 0;
+  int y_stride_ = 0;
+  std::unique_ptr<uint8_t[]> source_buffer_;
+  std::unique_ptr<uint8_t[]> dest_buffer_;
+  const uint8_t* source_plane_y_ = nullptr;
+  const uint8_t* source_plane_u_ = nullptr;
+  const uint8_t* source_plane_v_ = nullptr;
+  uint8_t* dest_plane_y_ = nullptr;
+  uint8_t* dest_plane_u_ = nullptr;
+  uint8_t* dest_plane_v_ = nullptr;
+  std::unique_ptr<ThreadPool> thread_pool_;
+};
+
+// Each run of the speed test adds film grain noise to 10 dummy frames. The
+// film grain parameters for the 10 frames were generated with aomenc.
+template <int bitdepth, typename Pixel>
+void FilmGrainSpeedTest<bitdepth, Pixel>::TestSpeed(const int num_runs) {
+  const dsp::Dsp* dsp = GetDspTable(bitdepth);
+  if (dsp->film_grain.blend_noise_chroma[0] == nullptr ||
+      dsp->film_grain.blend_noise_luma == nullptr) {
+    return;
+  }
+  for (int k = 0; k < kNumFilmGrainTestParams; ++k) {
+    const FilmGrainParams& params = kFilmGrainParams[k];
+    const absl::Time start = absl::Now();
+    for (int i = 0; i < num_runs; ++i) {
+      FilmGrain<bitdepth> film_grain(params, /*is_monochrome=*/false,
+                                     /*color_matrix_is_identity=*/false,
+                                     subsampling_x_, subsampling_y_, width_,
+                                     height_, thread_pool_.get());
+      EXPECT_TRUE(film_grain.AddNoise(
+          source_plane_y_, y_stride_, source_plane_u_, source_plane_v_,
+          uv_stride_, dest_plane_y_, y_stride_, dest_plane_u_, dest_plane_v_,
+          uv_stride_));
+    }
+    const absl::Duration elapsed_time = absl::Now() - start;
+    const char* digest_luma = GetTestDigestLuma(bitdepth, k);
+    test_utils::CheckMd5Digest(
+        "FilmGrainSynthesisLuma",
+        absl::StrFormat("kFilmGrainParams[%d]", k).c_str(), digest_luma,
+        dest_plane_y_, y_stride_ * height_, elapsed_time);
+    const char* digest_chroma_u = GetTestDigestChromaU(bitdepth, k);
+    test_utils::CheckMd5Digest(
+        "FilmGrainSynthesisChromaU",
+        absl::StrFormat("kFilmGrainParams[%d]", k).c_str(), digest_chroma_u,
+        dest_plane_u_, uv_stride_ * uv_height_, elapsed_time);
+    const char* digest_chroma_v = GetTestDigestChromaV(bitdepth, k);
+    test_utils::CheckMd5Digest(
+        "FilmGrainSynthesisChromaV",
+        absl::StrFormat("kFilmGrainParams[%d]", k).c_str(), digest_chroma_v,
+        dest_plane_v_, uv_stride_ * uv_height_, elapsed_time);
+  }
+}
+
+using FilmGrainSpeedTest8bpp = FilmGrainSpeedTest<8, uint8_t>;
+
+TEST_P(FilmGrainSpeedTest8bpp, MatchesOriginalOutput) { TestSpeed(1); }
+
+TEST_P(FilmGrainSpeedTest8bpp, DISABLED_Speed) { TestSpeed(kNumSpeedTests); }
+
+INSTANTIATE_TEST_SUITE_P(C, FilmGrainSpeedTest8bpp,
testing::Values(0, 3, 8)); + +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, FilmGrainSpeedTest8bpp, + testing::Values(0, 3, 8)); +#endif + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, FilmGrainSpeedTest8bpp, + testing::Values(0, 3, 8)); +#endif + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using FilmGrainSpeedTest10bpp = FilmGrainSpeedTest<10, uint16_t>; + +TEST_P(FilmGrainSpeedTest10bpp, MatchesOriginalOutput) { TestSpeed(1); } + +TEST_P(FilmGrainSpeedTest10bpp, DISABLED_Speed) { TestSpeed(kNumSpeedTests); } + +INSTANTIATE_TEST_SUITE_P(C, FilmGrainSpeedTest10bpp, testing::Values(0, 3, 8)); + +#if LIBGAV1_ENABLE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE41, FilmGrainSpeedTest10bpp, + testing::Values(0, 3, 8)); +#endif + +#if LIBGAV1_ENABLE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, FilmGrainSpeedTest10bpp, + testing::Values(0, 3, 8)); +#endif + +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace +} // namespace film_grain +} // namespace dsp +} // namespace libgav1 diff --git a/src/frame_buffer.cc b/src/frame_buffer.cc new file mode 100644 index 0000000..50c7756 --- /dev/null +++ b/src/frame_buffer.cc @@ -0,0 +1,151 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/gav1/frame_buffer.h" + +#include + +#include "src/frame_buffer_utils.h" +#include "src/utils/common.h" + +extern "C" { + +Libgav1StatusCode Libgav1ComputeFrameBufferInfo( + int bitdepth, Libgav1ImageFormat image_format, int width, int height, + int left_border, int right_border, int top_border, int bottom_border, + int stride_alignment, Libgav1FrameBufferInfo* info) { + switch (bitdepth) { + case 8: +#if LIBGAV1_MAX_BITDEPTH >= 10 + case 10: +#endif +#if LIBGAV1_MAX_BITDEPTH == 12 + case 12: +#endif + break; + default: + return kLibgav1StatusInvalidArgument; + } + switch (image_format) { + case kLibgav1ImageFormatYuv420: + case kLibgav1ImageFormatYuv422: + case kLibgav1ImageFormatYuv444: + case kLibgav1ImageFormatMonochrome400: + break; + default: + return kLibgav1StatusInvalidArgument; + } + // All int arguments must be nonnegative. Borders must be a multiple of 2. + // |stride_alignment| must be a power of 2. + if ((width | height | left_border | right_border | top_border | + bottom_border | stride_alignment) < 0 || + ((left_border | right_border | top_border | bottom_border) & 1) != 0 || + (stride_alignment & (stride_alignment - 1)) != 0 || info == nullptr) { + return kLibgav1StatusInvalidArgument; + } + + bool is_monochrome; + int8_t subsampling_x; + int8_t subsampling_y; + libgav1::DecomposeImageFormat(image_format, &is_monochrome, &subsampling_x, + &subsampling_y); + + // Calculate y_stride (in bytes). It is padded to a multiple of + // |stride_alignment| bytes. + int y_stride = width + left_border + right_border; +#if LIBGAV1_MAX_BITDEPTH >= 10 + if (bitdepth > 8) y_stride *= sizeof(uint16_t); +#endif + y_stride = libgav1::Align(y_stride, stride_alignment); + // Size of the Y buffer in bytes. 
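+  // Editor's worked example (illustrative, not part of the imported
+  // sources): for width = 1920, height = 1080, all borders 0, bitdepth = 8
+  // and stride_alignment = 16, y_stride = Align(1920, 16) = 1920 bytes and
+  // y_buffer_size = 1080 * 1920 + 15 bytes; the extra stride_alignment - 1
+  // bytes leave room for Libgav1SetFrameBuffer() to align the plane pointer
+  // with AlignAddr().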
+  const uint64_t y_buffer_size =
+      (height + top_border + bottom_border) * static_cast<uint64_t>(y_stride) +
+      (stride_alignment - 1);
+
+  const int uv_width =
+      is_monochrome ? 0 : libgav1::SubsampledValue(width, subsampling_x);
+  const int uv_height =
+      is_monochrome ? 0 : libgav1::SubsampledValue(height, subsampling_y);
+  const int uv_left_border = is_monochrome ? 0 : left_border >> subsampling_x;
+  const int uv_right_border =
+      is_monochrome ? 0 : right_border >> subsampling_x;
+  const int uv_top_border = is_monochrome ? 0 : top_border >> subsampling_y;
+  const int uv_bottom_border =
+      is_monochrome ? 0 : bottom_border >> subsampling_y;
+
+  // Calculate uv_stride (in bytes). It is padded to a multiple of
+  // |stride_alignment| bytes.
+  int uv_stride = uv_width + uv_left_border + uv_right_border;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  if (bitdepth > 8) uv_stride *= sizeof(uint16_t);
+#endif
+  uv_stride = libgav1::Align(uv_stride, stride_alignment);
+  // Size of the U or V buffer in bytes.
+  const uint64_t uv_buffer_size =
+      is_monochrome ? 0
+                    : (uv_height + uv_top_border + uv_bottom_border) *
+                              static_cast<uint64_t>(uv_stride) +
+                          (stride_alignment - 1);
+
+  // Check if it is safe to cast y_buffer_size and uv_buffer_size to size_t.
+  if (y_buffer_size > SIZE_MAX || uv_buffer_size > SIZE_MAX) {
+    return kLibgav1StatusInvalidArgument;
+  }
+
+  int left_border_bytes = left_border;
+  int uv_left_border_bytes = uv_left_border;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  if (bitdepth > 8) {
+    left_border_bytes *= sizeof(uint16_t);
+    uv_left_border_bytes *= sizeof(uint16_t);
+  }
+#endif
+
+  info->y_stride = y_stride;
+  info->uv_stride = uv_stride;
+  info->y_buffer_size = static_cast<size_t>(y_buffer_size);
+  info->uv_buffer_size = static_cast<size_t>(uv_buffer_size);
+  info->y_plane_offset = top_border * y_stride + left_border_bytes;
+  info->uv_plane_offset = uv_top_border * uv_stride + uv_left_border_bytes;
+  info->stride_alignment = stride_alignment;
+  return kLibgav1StatusOk;
+}
+
+Libgav1StatusCode Libgav1SetFrameBuffer(const Libgav1FrameBufferInfo* info,
+                                        uint8_t* y_buffer, uint8_t* u_buffer,
+                                        uint8_t* v_buffer,
+                                        void* buffer_private_data,
+                                        Libgav1FrameBuffer* frame_buffer) {
+  if (info == nullptr ||
+      (info->uv_buffer_size == 0 &&
+       (u_buffer != nullptr || v_buffer != nullptr)) ||
+      frame_buffer == nullptr) {
+    return kLibgav1StatusInvalidArgument;
+  }
+  if (y_buffer == nullptr || (info->uv_buffer_size != 0 &&
+                              (u_buffer == nullptr || v_buffer == nullptr))) {
+    return kLibgav1StatusOutOfMemory;
+  }
+  frame_buffer->plane[0] = libgav1::AlignAddr(y_buffer + info->y_plane_offset,
+                                              info->stride_alignment);
+  frame_buffer->plane[1] = libgav1::AlignAddr(u_buffer + info->uv_plane_offset,
+                                              info->stride_alignment);
+  frame_buffer->plane[2] = libgav1::AlignAddr(v_buffer + info->uv_plane_offset,
+                                              info->stride_alignment);
+  frame_buffer->stride[0] = info->y_stride;
+  frame_buffer->stride[1] = frame_buffer->stride[2] = info->uv_stride;
+  frame_buffer->private_data = buffer_private_data;
+  return kLibgav1StatusOk;
+}
+
+}  // extern "C"
diff --git a/src/frame_buffer_utils.h b/src/frame_buffer_utils.h
new file mode 100644
index 0000000..d41437e
--- /dev/null
+++ b/src/frame_buffer_utils.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_FRAME_BUFFER_UTILS_H_
+#define LIBGAV1_SRC_FRAME_BUFFER_UTILS_H_
+
+#include <cassert>
+#include <cstdint>
+
+#include "src/gav1/decoder_buffer.h"
+
+namespace libgav1 {
+
+// The following table is from Section 6.4.2 of the spec.
+//
+// subsampling_x  subsampling_y  mono_chrome  Description
+// -----------------------------------------------------------
+// 0              0              0            YUV 4:4:4
+// 1              0              0            YUV 4:2:2
+// 1              1              0            YUV 4:2:0
+// 1              1              1            Monochrome 4:0:0
+
+inline Libgav1ImageFormat ComposeImageFormat(bool is_monochrome,
+                                             int8_t subsampling_x,
+                                             int8_t subsampling_y) {
+  Libgav1ImageFormat image_format;
+  if (subsampling_x == 0) {
+    assert(subsampling_y == 0 && !is_monochrome);
+    image_format = kLibgav1ImageFormatYuv444;
+  } else if (subsampling_y == 0) {
+    assert(!is_monochrome);
+    image_format = kLibgav1ImageFormatYuv422;
+  } else if (!is_monochrome) {
+    image_format = kLibgav1ImageFormatYuv420;
+  } else {
+    image_format = kLibgav1ImageFormatMonochrome400;
+  }
+  return image_format;
+}
+
+inline void DecomposeImageFormat(Libgav1ImageFormat image_format,
+                                 bool* is_monochrome, int8_t* subsampling_x,
+                                 int8_t* subsampling_y) {
+  *is_monochrome = false;
+  *subsampling_x = 1;
+  *subsampling_y = 1;
+  switch (image_format) {
+    case kLibgav1ImageFormatYuv420:
+      break;
+    case kLibgav1ImageFormatYuv422:
+      *subsampling_y = 0;
+      break;
+    case kLibgav1ImageFormatYuv444:
+      *subsampling_x = *subsampling_y = 0;
+      break;
+    default:
+      assert(image_format == kLibgav1ImageFormatMonochrome400);
+      *is_monochrome = true;
+      break;
+  }
+}
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_FRAME_BUFFER_UTILS_H_
diff --git a/src/frame_scratch_buffer.h b/src/frame_scratch_buffer.h
new file mode 100644
index 0000000..1b0d2e0
--- /dev/null
+++ b/src/frame_scratch_buffer.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_FRAME_SCRATCH_BUFFER_H_
+#define LIBGAV1_SRC_FRAME_SCRATCH_BUFFER_H_
+
+#include <array>
+#include <condition_variable>  // NOLINT (unapproved c++11 header)
+#include <cstdint>
+#include <memory>
+#include <mutex>  // NOLINT (unapproved c++11 header)
+#include <new>
+#include <utility>
+
+#include "src/loop_restoration_info.h"
+#include "src/residual_buffer_pool.h"
+#include "src/symbol_decoder_context.h"
+#include "src/threading_strategy.h"
+#include "src/tile_scratch_buffer.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/dynamic_buffer.h"
+#include "src/utils/memory.h"
+#include "src/utils/stack.h"
+#include "src/utils/types.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+
+// Buffer used to store the unfiltered pixels that are necessary for decoding
+// the next superblock row (for the intra prediction process).
+using IntraPredictionBuffer =
+    std::array<AlignedUniquePtr<uint8_t>, kMaxPlanes>;
+
+// Buffer to facilitate decoding a frame. This struct is used only within
+// DecoderImpl::DecodeTiles().
+// The alignment requirement is due to the SymbolDecoderContext member
+// symbol_decoder_context and the TileScratchBufferPool member
+// tile_scratch_buffer_pool.
+struct FrameScratchBuffer : public MaxAlignedAllocable {
+  LoopRestorationInfo loop_restoration_info;
+  Array2D<int16_t> cdef_index;
+  // Encodes the block skip information as a bitmask for the entire frame
+  // which will be used by the cdef process.
+  //
+  // * The size of this array is rows4x4 / 2 * column4x4 / 16.
+  // * Each row of the bitmasks array (cdef_skip) stores the bitmask for 2
+  //   rows of 4x4 blocks.
+  // * Each entry in the row will store the skip information for 16 4x4
+  //   blocks (8 bits).
+  // * If any of the four 4x4 blocks in the 8x8 block is not a skip block,
+  //   then the corresponding bit (as described below) will be set to 1.
+  // * For the 4x4 block at column4x4 the bit index is (column4x4 >> 1).
+  Array2D<uint8_t> cdef_skip;
+  Array2D<TransformSize> inter_transform_sizes;
+  BlockParametersHolder block_parameters_holder;
+  TemporalMotionField motion_field;
+  SymbolDecoderContext symbol_decoder_context;
+  std::unique_ptr<ResidualBufferPool> residual_buffer_pool;
+  // Buffer used to store the cdef borders. This buffer will store 4 rows for
+  // every 64x64 block (4 rows for every 32x32 for chroma with subsampling).
+  // The indices of the rows that are stored are specified in
+  // |kCdefBorderRows|.
+  YuvBuffer cdef_border;
+  AlignedDynamicBuffer<uint8_t, 16> superres_coefficients[kNumPlaneTypes];
+  // Buffer used to temporarily store the input row for applying SuperRes.
+  YuvBuffer superres_line_buffer;
+  // Buffer used to store the loop restoration borders. This buffer will
+  // store 4 rows for every 64x64 block (4 rows for every 32x32 for chroma
+  // with subsampling). The indices of the rows that are stored are specified
+  // in |kLoopRestorationBorderRows|.
+  YuvBuffer loop_restoration_border;
+  // The size of this dynamic buffer is |tile_rows|.
+  DynamicBuffer<IntraPredictionBuffer> intra_prediction_buffers;
+  TileScratchBufferPool tile_scratch_buffer_pool;
+  ThreadingStrategy threading_strategy;
+  std::mutex superblock_row_mutex;
+  // The size of this buffer is the number of superblock rows.
+  // |superblock_row_progress[i]| is incremented whenever a tile finishes
+  // decoding superblock row at index i. If the count reaches tile_columns,
+  // then |superblock_row_progress_condvar[i]| is notified.
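+  // Editor's illustration (assumed workflow, not original source): with
+  // tile_columns = 4, superblock row i is ready for post filtering only
+  // once all four tile columns have incremented its counter to 4; the
+  // waiter blocked on the matching condition variable below is then
+  // notified.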
+  DynamicBuffer<int> superblock_row_progress
+      LIBGAV1_GUARDED_BY(superblock_row_mutex);
+  // The size of this buffer is the number of superblock rows. Used to wait
+  // for |superblock_row_progress[i]| to reach tile_columns.
+  DynamicBuffer<std::condition_variable> superblock_row_progress_condvar;
+  // Used to signal tile decoding failure in the combined multithreading
+  // mode.
+  bool tile_decoding_failed LIBGAV1_GUARDED_BY(superblock_row_mutex);
+};
+
+class FrameScratchBufferPool {
+ public:
+  std::unique_ptr<FrameScratchBuffer> Get() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    if (!buffers_.Empty()) {
+      return buffers_.Pop();
+    }
+    lock.unlock();
+    std::unique_ptr<FrameScratchBuffer> scratch_buffer(
+        new (std::nothrow) FrameScratchBuffer);
+    return scratch_buffer;
+  }
+
+  void Release(std::unique_ptr<FrameScratchBuffer> scratch_buffer) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    buffers_.Push(std::move(scratch_buffer));
+  }
+
+ private:
+  std::mutex mutex_;
+  Stack<std::unique_ptr<FrameScratchBuffer>, kMaxThreads> buffers_
+      LIBGAV1_GUARDED_BY(mutex_);
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_FRAME_SCRATCH_BUFFER_H_
diff --git a/src/gav1/decoder.h b/src/gav1/decoder.h
new file mode 100644
index 0000000..da08da9
--- /dev/null
+++ b/src/gav1/decoder.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_DECODER_H_
+#define LIBGAV1_SRC_GAV1_DECODER_H_
+
+#if defined(__cplusplus)
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#else
+#include <stddef.h>
+#include <stdint.h>
+#endif  // defined(__cplusplus)
+
+// IWYU pragma: begin_exports
+#include "gav1/decoder_buffer.h"
+#include "gav1/decoder_settings.h"
+#include "gav1/frame_buffer.h"
+#include "gav1/status_code.h"
+#include "gav1/symbol_visibility.h"
+#include "gav1/version.h"
+// IWYU pragma: end_exports
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+struct Libgav1Decoder;
+typedef struct Libgav1Decoder Libgav1Decoder;
+
+LIBGAV1_PUBLIC Libgav1StatusCode Libgav1DecoderCreate(
+    const Libgav1DecoderSettings* settings, Libgav1Decoder** decoder_out);
+
+LIBGAV1_PUBLIC void Libgav1DecoderDestroy(Libgav1Decoder* decoder);
+
+LIBGAV1_PUBLIC Libgav1StatusCode Libgav1DecoderEnqueueFrame(
+    Libgav1Decoder* decoder, const uint8_t* data, size_t size,
+    int64_t user_private_data, void* buffer_private_data);
+
+LIBGAV1_PUBLIC Libgav1StatusCode Libgav1DecoderDequeueFrame(
+    Libgav1Decoder* decoder, const Libgav1DecoderBuffer** out_ptr);
+
+LIBGAV1_PUBLIC Libgav1StatusCode
+Libgav1DecoderSignalEOS(Libgav1Decoder* decoder);
+
+LIBGAV1_PUBLIC int Libgav1DecoderGetMaxBitdepth(void);
+
+#if defined(__cplusplus)
+}  // extern "C"
+
+namespace libgav1 {
+
+// Forward declaration.
+class DecoderImpl;
+
+class LIBGAV1_PUBLIC Decoder {
+ public:
+  Decoder();
+  ~Decoder();
+
+  // Init must be called exactly once per instance. Subsequent calls will do
+  // nothing. If |settings| is nullptr, the decoder will be initialized with
+  // default settings. Returns kStatusOk on success, an error status
+  // otherwise.
+  StatusCode Init(const DecoderSettings* settings);
+
+  // Enqueues a compressed frame to be decoded.
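+  //
+  // A typical decode loop (editor's sketch, not part of the imported
+  // sources; HaveInput(), GetInput() and ConsumeFrame() are hypothetical
+  // application helpers):
+  //
+  //   libgav1::Decoder decoder;
+  //   libgav1::DecoderSettings settings;  // Defaults are fine here.
+  //   if (decoder.Init(&settings) != libgav1::kStatusOk) return;
+  //   while (HaveInput()) {
+  //     const Input* input = GetInput();
+  //     decoder.EnqueueFrame(input->data, input->size,
+  //                          /*user_private_data=*/0,
+  //                          /*buffer_private_data=*/nullptr);
+  //     const libgav1::DecoderBuffer* buffer;
+  //     if (decoder.DequeueFrame(&buffer) == libgav1::kStatusOk &&
+  //         buffer != nullptr) {
+  //       ConsumeFrame(*buffer);
+  //     }
+  //   }
+  //   decoder.SignalEOS();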
+ // + // This function returns: + // * kStatusOk on success + // * kStatusTryAgain if the decoder queue is full + // * an error status otherwise. + // + // |user_private_data| may be used to associate application specific private + // data with the compressed frame. It will be copied to the user_private_data + // field of the DecoderBuffer returned by the corresponding |DequeueFrame()| + // call. + // + // NOTE: |EnqueueFrame()| does not copy the data. Therefore, after a + // successful |EnqueueFrame()| call, the caller must keep the |data| buffer + // alive until: + // 1) If |settings_.release_input_buffer| is not nullptr, then |data| buffer + // must be kept alive until release_input_buffer is called with the + // |buffer_private_data| passed into this EnqueueFrame call. + // 2) If |settings_.release_input_buffer| is nullptr, then |data| buffer must + // be kept alive until the corresponding DequeueFrame() call is completed. + // + // If the call to |EnqueueFrame()| is not successful, then libgav1 will not + // hold any references to the |data| buffer. |settings_.release_input_buffer| + // callback will not be called in that case. + StatusCode EnqueueFrame(const uint8_t* data, size_t size, + int64_t user_private_data, void* buffer_private_data); + + // Dequeues a decompressed frame. If there are enqueued compressed frames, + // decodes one and sets |*out_ptr| to the last displayable frame in the + // compressed frame. If there are no displayable frames available, sets + // |*out_ptr| to nullptr. + // + // Returns kStatusOk on success. Returns kStatusNothingToDequeue if there are + // no enqueued frames (in this case out_ptr will always be set to nullptr). + // Returns one of the other error statuses if there is an error. + // + // If |settings_.blocking_dequeue| is false and the decoder is operating in + // frame parallel mode (|settings_.frame_parallel| is true and the video + // stream passes the decoder's heuristics for enabling frame parallel mode), + // then this call will return kStatusTryAgain if an enqueued frame is not yet + // decoded (it is a non blocking call in this case). In all other cases, this + // call will block until an enqueued frame has been decoded. + StatusCode DequeueFrame(const DecoderBuffer** out_ptr); + + // Signals the end of stream. + // + // In non-frame-parallel mode, this function will release all the frames held + // by the decoder. If the frame buffers were allocated by libgav1, then the + // pointer obtained by the prior DequeueFrame call will no longer be valid. If + // the frame buffers were allocated by the application, then any references + // that libgav1 is holding on to will be released. + // + // Once this function returns successfully, the decoder state will be reset + // and the decoder is ready to start decoding a new coded video sequence. + StatusCode SignalEOS(); + + // Returns the maximum bitdepth that is supported by this decoder. + static int GetMaxBitdepth(); + + private: + DecoderSettings settings_; + // The object is initialized if and only if impl_ != nullptr. 
+  std::unique_ptr<DecoderImpl> impl_;
+};
+
+}  // namespace libgav1
+#endif  // defined(__cplusplus)
+
+#endif  // LIBGAV1_SRC_GAV1_DECODER_H_
diff --git a/src/gav1/decoder_buffer.h b/src/gav1/decoder_buffer.h
new file mode 100644
index 0000000..880c320
--- /dev/null
+++ b/src/gav1/decoder_buffer.h
@@ -0,0 +1,272 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_DECODER_BUFFER_H_
+#define LIBGAV1_SRC_GAV1_DECODER_BUFFER_H_
+
+#if defined(__cplusplus)
+#include <cstdint>
+#else
+#include <stdint.h>
+#endif  // defined(__cplusplus)
+
+#include "gav1/symbol_visibility.h"
+
+// All the declarations in this file are part of the public ABI.
+
+// The documentation for the enum values in this file can be found in Section
+// 6.4.2 of the AV1 spec.
+
+typedef enum Libgav1ChromaSamplePosition {
+  kLibgav1ChromaSamplePositionUnknown,
+  kLibgav1ChromaSamplePositionVertical,
+  kLibgav1ChromaSamplePositionColocated,
+  kLibgav1ChromaSamplePositionReserved
+} Libgav1ChromaSamplePosition;
+
+typedef enum Libgav1ImageFormat {
+  kLibgav1ImageFormatYuv420,
+  kLibgav1ImageFormatYuv422,
+  kLibgav1ImageFormatYuv444,
+  kLibgav1ImageFormatMonochrome400
+} Libgav1ImageFormat;
+
+typedef enum Libgav1ColorPrimary {
+  // 0 is reserved.
+  kLibgav1ColorPrimaryBt709 = 1,
+  kLibgav1ColorPrimaryUnspecified,
+  // 3 is reserved.
+  kLibgav1ColorPrimaryBt470M = 4,
+  kLibgav1ColorPrimaryBt470Bg,
+  kLibgav1ColorPrimaryBt601,
+  kLibgav1ColorPrimarySmpte240,
+  kLibgav1ColorPrimaryGenericFilm,
+  kLibgav1ColorPrimaryBt2020,
+  kLibgav1ColorPrimaryXyz,
+  kLibgav1ColorPrimarySmpte431,
+  kLibgav1ColorPrimarySmpte432,
+  // 13-21 are reserved.
+  kLibgav1ColorPrimaryEbu3213 = 22,
+  // 23-254 are reserved.
+  kLibgav1MaxColorPrimaries = 255
+} Libgav1ColorPrimary;
+
+typedef enum Libgav1TransferCharacteristics {
+  // 0 is reserved.
+  kLibgav1TransferCharacteristicsBt709 = 1,
+  kLibgav1TransferCharacteristicsUnspecified,
+  // 3 is reserved.
+  kLibgav1TransferCharacteristicsBt470M = 4,
+  kLibgav1TransferCharacteristicsBt470Bg,
+  kLibgav1TransferCharacteristicsBt601,
+  kLibgav1TransferCharacteristicsSmpte240,
+  kLibgav1TransferCharacteristicsLinear,
+  kLibgav1TransferCharacteristicsLog100,
+  kLibgav1TransferCharacteristicsLog100Sqrt10,
+  kLibgav1TransferCharacteristicsIec61966,
+  kLibgav1TransferCharacteristicsBt1361,
+  kLibgav1TransferCharacteristicsSrgb,
+  kLibgav1TransferCharacteristicsBt2020TenBit,
+  kLibgav1TransferCharacteristicsBt2020TwelveBit,
+  kLibgav1TransferCharacteristicsSmpte2084,
+  kLibgav1TransferCharacteristicsSmpte428,
+  kLibgav1TransferCharacteristicsHlg,
+  // 19-254 are reserved.
+  kLibgav1MaxTransferCharacteristics = 255
+} Libgav1TransferCharacteristics;
+
+typedef enum Libgav1MatrixCoefficients {
+  kLibgav1MatrixCoefficientsIdentity,
+  kLibgav1MatrixCoefficientsBt709,
+  kLibgav1MatrixCoefficientsUnspecified,
+  // 3 is reserved.
+ kLibgav1MatrixCoefficientsFcc = 4, + kLibgav1MatrixCoefficientsBt470BG, + kLibgav1MatrixCoefficientsBt601, + kLibgav1MatrixCoefficientsSmpte240, + kLibgav1MatrixCoefficientsSmpteYcgco, + kLibgav1MatrixCoefficientsBt2020Ncl, + kLibgav1MatrixCoefficientsBt2020Cl, + kLibgav1MatrixCoefficientsSmpte2085, + kLibgav1MatrixCoefficientsChromatNcl, + kLibgav1MatrixCoefficientsChromatCl, + kLibgav1MatrixCoefficientsIctcp, + // 15-254 are reserved. + kLibgav1MaxMatrixCoefficients = 255 +} Libgav1MatrixCoefficients; + +typedef enum Libgav1ColorRange { + // The color ranges are scaled by value << (bitdepth - 8) for 10 and 12bit + // streams. + kLibgav1ColorRangeStudio, // Y [16..235], UV [16..240] + kLibgav1ColorRangeFull // YUV/RGB [0..255] +} Libgav1ColorRange; + +typedef struct Libgav1DecoderBuffer { +#if defined(__cplusplus) + LIBGAV1_PUBLIC int NumPlanes() const { + return (image_format == kLibgav1ImageFormatMonochrome400) ? 1 : 3; + } +#endif // defined(__cplusplus) + + Libgav1ChromaSamplePosition chroma_sample_position; + Libgav1ImageFormat image_format; + Libgav1ColorRange color_range; + Libgav1ColorPrimary color_primary; + Libgav1TransferCharacteristics transfer_characteristics; + Libgav1MatrixCoefficients matrix_coefficients; + + int bitdepth; // Stored image bitdepth. + + // Image display dimensions in Y/U/V order. + int displayed_width[3]; // Displayed image width. + int displayed_height[3]; // Displayed image height. + + // Values are given in Y/U/V order. + int stride[3]; // The width in bytes of one row of the |plane| buffer. + // This may include padding bytes for alignment or + // internal use by the decoder. + uint8_t* plane[3]; // The reconstructed image plane(s). + + // Spatial id of this frame. + int spatial_id; + // Temporal id of this frame. + int temporal_id; + + // The |user_private_data| argument passed to Decoder::EnqueueFrame(). + int64_t user_private_data; + // The |private_data| field of FrameBuffer. Set by the get frame buffer + // callback when it allocates a frame buffer. 
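+  // Editor's illustration (assumed usage, not original source): a pool
+  // based allocator can record a pointer to its own bookkeeping entry in
+  // the frame buffer's |private_data| when the get frame buffer callback
+  // runs; that pointer then reappears here, and again in the release frame
+  // buffer callback, so the application can return the buffer to its pool.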
+ void* buffer_private_data; +} Libgav1DecoderBuffer; + +#if defined(__cplusplus) +namespace libgav1 { + +using ChromaSamplePosition = Libgav1ChromaSamplePosition; +constexpr ChromaSamplePosition kChromaSamplePositionUnknown = + kLibgav1ChromaSamplePositionUnknown; +constexpr ChromaSamplePosition kChromaSamplePositionVertical = + kLibgav1ChromaSamplePositionVertical; +constexpr ChromaSamplePosition kChromaSamplePositionColocated = + kLibgav1ChromaSamplePositionColocated; +constexpr ChromaSamplePosition kChromaSamplePositionReserved = + kLibgav1ChromaSamplePositionReserved; + +using ImageFormat = Libgav1ImageFormat; +constexpr ImageFormat kImageFormatYuv420 = kLibgav1ImageFormatYuv420; +constexpr ImageFormat kImageFormatYuv422 = kLibgav1ImageFormatYuv422; +constexpr ImageFormat kImageFormatYuv444 = kLibgav1ImageFormatYuv444; +constexpr ImageFormat kImageFormatMonochrome400 = + kLibgav1ImageFormatMonochrome400; + +using ColorPrimary = Libgav1ColorPrimary; +constexpr ColorPrimary kColorPrimaryBt709 = kLibgav1ColorPrimaryBt709; +constexpr ColorPrimary kColorPrimaryUnspecified = + kLibgav1ColorPrimaryUnspecified; +constexpr ColorPrimary kColorPrimaryBt470M = kLibgav1ColorPrimaryBt470M; +constexpr ColorPrimary kColorPrimaryBt470Bg = kLibgav1ColorPrimaryBt470Bg; +constexpr ColorPrimary kColorPrimaryBt601 = kLibgav1ColorPrimaryBt601; +constexpr ColorPrimary kColorPrimarySmpte240 = kLibgav1ColorPrimarySmpte240; +constexpr ColorPrimary kColorPrimaryGenericFilm = + kLibgav1ColorPrimaryGenericFilm; +constexpr ColorPrimary kColorPrimaryBt2020 = kLibgav1ColorPrimaryBt2020; +constexpr ColorPrimary kColorPrimaryXyz = kLibgav1ColorPrimaryXyz; +constexpr ColorPrimary kColorPrimarySmpte431 = kLibgav1ColorPrimarySmpte431; +constexpr ColorPrimary kColorPrimarySmpte432 = kLibgav1ColorPrimarySmpte432; +constexpr ColorPrimary kColorPrimaryEbu3213 = kLibgav1ColorPrimaryEbu3213; +constexpr ColorPrimary kMaxColorPrimaries = kLibgav1MaxColorPrimaries; + +using TransferCharacteristics = Libgav1TransferCharacteristics; +constexpr TransferCharacteristics kTransferCharacteristicsBt709 = + kLibgav1TransferCharacteristicsBt709; +constexpr TransferCharacteristics kTransferCharacteristicsUnspecified = + kLibgav1TransferCharacteristicsUnspecified; +constexpr TransferCharacteristics kTransferCharacteristicsBt470M = + kLibgav1TransferCharacteristicsBt470M; +constexpr TransferCharacteristics kTransferCharacteristicsBt470Bg = + kLibgav1TransferCharacteristicsBt470Bg; +constexpr TransferCharacteristics kTransferCharacteristicsBt601 = + kLibgav1TransferCharacteristicsBt601; +constexpr TransferCharacteristics kTransferCharacteristicsSmpte240 = + kLibgav1TransferCharacteristicsSmpte240; +constexpr TransferCharacteristics kTransferCharacteristicsLinear = + kLibgav1TransferCharacteristicsLinear; +constexpr TransferCharacteristics kTransferCharacteristicsLog100 = + kLibgav1TransferCharacteristicsLog100; +constexpr TransferCharacteristics kTransferCharacteristicsLog100Sqrt10 = + kLibgav1TransferCharacteristicsLog100Sqrt10; +constexpr TransferCharacteristics kTransferCharacteristicsIec61966 = + kLibgav1TransferCharacteristicsIec61966; +constexpr TransferCharacteristics kTransferCharacteristicsBt1361 = + kLibgav1TransferCharacteristicsBt1361; +constexpr TransferCharacteristics kTransferCharacteristicsSrgb = + kLibgav1TransferCharacteristicsSrgb; +constexpr TransferCharacteristics kTransferCharacteristicsBt2020TenBit = + kLibgav1TransferCharacteristicsBt2020TenBit; +constexpr TransferCharacteristics kTransferCharacteristicsBt2020TwelveBit = 
+ kLibgav1TransferCharacteristicsBt2020TwelveBit; +constexpr TransferCharacteristics kTransferCharacteristicsSmpte2084 = + kLibgav1TransferCharacteristicsSmpte2084; +constexpr TransferCharacteristics kTransferCharacteristicsSmpte428 = + kLibgav1TransferCharacteristicsSmpte428; +constexpr TransferCharacteristics kTransferCharacteristicsHlg = + kLibgav1TransferCharacteristicsHlg; +constexpr TransferCharacteristics kMaxTransferCharacteristics = + kLibgav1MaxTransferCharacteristics; + +using MatrixCoefficients = Libgav1MatrixCoefficients; +constexpr MatrixCoefficients kMatrixCoefficientsIdentity = + kLibgav1MatrixCoefficientsIdentity; +constexpr MatrixCoefficients kMatrixCoefficientsBt709 = + kLibgav1MatrixCoefficientsBt709; +constexpr MatrixCoefficients kMatrixCoefficientsUnspecified = + kLibgav1MatrixCoefficientsUnspecified; +constexpr MatrixCoefficients kMatrixCoefficientsFcc = + kLibgav1MatrixCoefficientsFcc; +constexpr MatrixCoefficients kMatrixCoefficientsBt470BG = + kLibgav1MatrixCoefficientsBt470BG; +constexpr MatrixCoefficients kMatrixCoefficientsBt601 = + kLibgav1MatrixCoefficientsBt601; +constexpr MatrixCoefficients kMatrixCoefficientsSmpte240 = + kLibgav1MatrixCoefficientsSmpte240; +constexpr MatrixCoefficients kMatrixCoefficientsSmpteYcgco = + kLibgav1MatrixCoefficientsSmpteYcgco; +constexpr MatrixCoefficients kMatrixCoefficientsBt2020Ncl = + kLibgav1MatrixCoefficientsBt2020Ncl; +constexpr MatrixCoefficients kMatrixCoefficientsBt2020Cl = + kLibgav1MatrixCoefficientsBt2020Cl; +constexpr MatrixCoefficients kMatrixCoefficientsSmpte2085 = + kLibgav1MatrixCoefficientsSmpte2085; +constexpr MatrixCoefficients kMatrixCoefficientsChromatNcl = + kLibgav1MatrixCoefficientsChromatNcl; +constexpr MatrixCoefficients kMatrixCoefficientsChromatCl = + kLibgav1MatrixCoefficientsChromatCl; +constexpr MatrixCoefficients kMatrixCoefficientsIctcp = + kLibgav1MatrixCoefficientsIctcp; +constexpr MatrixCoefficients kMaxMatrixCoefficients = + kLibgav1MaxMatrixCoefficients; + +using ColorRange = Libgav1ColorRange; +constexpr ColorRange kColorRangeStudio = kLibgav1ColorRangeStudio; +constexpr ColorRange kColorRangeFull = kLibgav1ColorRangeFull; + +using DecoderBuffer = Libgav1DecoderBuffer; + +} // namespace libgav1 +#endif // defined(__cplusplus) + +#endif // LIBGAV1_SRC_GAV1_DECODER_BUFFER_H_ diff --git a/src/gav1/decoder_settings.h b/src/gav1/decoder_settings.h new file mode 100644 index 0000000..7ee487f --- /dev/null +++ b/src/gav1/decoder_settings.h @@ -0,0 +1,146 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_GAV1_DECODER_SETTINGS_H_ +#define LIBGAV1_SRC_GAV1_DECODER_SETTINGS_H_ + +#if defined(__cplusplus) +#include +#else +#include +#endif // defined(__cplusplus) + +#include "gav1/frame_buffer.h" +#include "gav1/symbol_visibility.h" + +// All the declarations in this file are part of the public ABI. 
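+// Editor's sketch of typical C-API initialization (illustrative, not part
+// of the imported sources):
+//
+//   Libgav1DecoderSettings settings;
+//   Libgav1DecoderSettingsInitDefault(&settings);
+//   settings.threads = 4;  // Allow up to 4 worker threads.
+//   Libgav1Decoder* decoder = NULL;
+//   const Libgav1StatusCode status =
+//       Libgav1DecoderCreate(&settings, &decoder);
+//   if (status != kLibgav1StatusOk) { /* handle the error */ }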
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+// This callback is invoked by the decoder when it is done using an input frame
+// buffer. When frame_parallel is set to true, this callback must not be
+// nullptr. Otherwise, this callback is optional.
+//
+// |buffer_private_data| is the value passed in the EnqueueFrame() call.
+typedef void (*Libgav1ReleaseInputBufferCallback)(void* callback_private_data,
+                                                  void* buffer_private_data);
+
+typedef struct Libgav1DecoderSettings {
+  // Number of threads to use when decoding. Must be greater than 0. The
+  // library will create at most |threads| new threads. Defaults to 1 (no new
+  // threads will be created).
+  int threads;
+  // A boolean. Indicates to the decoder that frame parallel decoding is
+  // allowed. Note that this is just a request and the decoder will decide the
+  // number of frames to be decoded in parallel based on the video stream
+  // being decoded.
+  int frame_parallel;
+  // A boolean. In frame parallel mode, whether Libgav1DecoderDequeueFrame
+  // should wait until an enqueued frame is available for dequeueing.
+  //
+  // If frame_parallel is 0, this setting is ignored.
+  int blocking_dequeue;
+  // Called when the first sequence header or a sequence header with different
+  // frame characteristics (bitdepth, monochrome, subsampling_x, subsampling_y,
+  // maximum frame width, or maximum frame height) is received.
+  Libgav1FrameBufferSizeChangedCallback on_frame_buffer_size_changed;
+  // Get frame buffer callback.
+  Libgav1GetFrameBufferCallback get_frame_buffer;
+  // Release frame buffer callback.
+  Libgav1ReleaseFrameBufferCallback release_frame_buffer;
+  // Release input frame buffer callback. This callback must be set when
+  // |frame_parallel| is true.
+  Libgav1ReleaseInputBufferCallback release_input_buffer;
+  // Passed as the private_data argument to the callbacks.
+  void* callback_private_data;
+  // A boolean. If set to 1, the decoder will output all the spatial and
+  // temporal layers.
+  int output_all_layers;
+  // Index of the operating point to decode.
+  int operating_point;
+  // Mask indicating the post-processing filters that need to be applied to the
+  // reconstructed frame. Note this is an advanced setting and does not
+  // typically need to be changed.
+  // From LSB:
+  //   Bit 0: Loop filter (deblocking filter).
+  //   Bit 1: Cdef.
+  //   Bit 2: SuperRes.
+  //   Bit 3: Loop restoration.
+  //   Bit 4: Film grain synthesis.
+  //   All the bits other than the last 5 are ignored.
+  uint8_t post_filter_mask;
+} Libgav1DecoderSettings;
+
+// Initializes |settings| with the default values.
+LIBGAV1_PUBLIC void Libgav1DecoderSettingsInitDefault(
+    Libgav1DecoderSettings* settings);
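+
+// The sketch below is illustrative only and is not part of the ABI: a C
+// client would typically initialize the settings to their defaults and then
+// override individual fields before creating a decoder. The field values
+// shown are hypothetical.
+//
+//   Libgav1DecoderSettings settings;
+//   Libgav1DecoderSettingsInitDefault(&settings);
+//   settings.threads = 4;            // Hypothetical: decode with 4 threads.
+//   settings.output_all_layers = 1;  // Output every spatial/temporal layer.
+//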
+
+#if defined(__cplusplus)
+}  // extern "C"
+
+namespace libgav1 {
+
+using ReleaseInputBufferCallback = Libgav1ReleaseInputBufferCallback;
+
+// Applications may populate this structure to override the default settings
+// before creating a decoder instance.
+struct DecoderSettings {
+  // Number of threads to use when decoding. Must be greater than 0. The
+  // library will create at most |threads| new threads. Defaults to 1 (no new
+  // threads will be created).
+  int threads = 1;
+  // Indicates to the decoder that frame parallel decoding is allowed. Note
+  // that this is just a request and the decoder will decide the number of
+  // frames to be decoded in parallel based on the video stream being decoded.
+  bool frame_parallel = false;
+  // In frame parallel mode, whether DequeueFrame should wait until an
+  // enqueued frame is available for dequeueing.
+  //
+  // If frame_parallel is false, this setting is ignored.
+  bool blocking_dequeue = false;
+  // Called when the first sequence header or a sequence header with different
+  // frame characteristics (bitdepth, monochrome, subsampling_x, subsampling_y,
+  // maximum frame width, or maximum frame height) is received.
+  FrameBufferSizeChangedCallback on_frame_buffer_size_changed = nullptr;
+  // Get frame buffer callback.
+  GetFrameBufferCallback get_frame_buffer = nullptr;
+  // Release frame buffer callback.
+  ReleaseFrameBufferCallback release_frame_buffer = nullptr;
+  // Release input frame buffer callback. This callback must be set when
+  // |frame_parallel| is true.
+  ReleaseInputBufferCallback release_input_buffer = nullptr;
+  // Passed as the private_data argument to the callbacks.
+  void* callback_private_data = nullptr;
+  // If set to true, the decoder will output all the spatial and temporal
+  // layers.
+  bool output_all_layers = false;
+  // Index of the operating point to decode.
+  int operating_point = 0;
+  // Mask indicating the post-processing filters that need to be applied to the
+  // reconstructed frame. Note this is an advanced setting and does not
+  // typically need to be changed.
+  // From LSB:
+  //   Bit 0: Loop filter (deblocking filter).
+  //   Bit 1: Cdef.
+  //   Bit 2: SuperRes.
+  //   Bit 3: Loop restoration.
+  //   Bit 4: Film grain synthesis.
+  //   All the bits other than the last 5 are ignored.
+  uint8_t post_filter_mask = 0x1f;
+};
+
+}  // namespace libgav1
+#endif  // defined(__cplusplus)
+#endif  // LIBGAV1_SRC_GAV1_DECODER_SETTINGS_H_
diff --git a/src/gav1/frame_buffer.h b/src/gav1/frame_buffer.h
new file mode 100644
index 0000000..8132b61
--- /dev/null
+++ b/src/gav1/frame_buffer.h
@@ -0,0 +1,177 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_FRAME_BUFFER_H_
+#define LIBGAV1_SRC_GAV1_FRAME_BUFFER_H_
+
+// All the declarations in this file are part of the public ABI. This file may
+// be included by both C and C++ files.
+
+#if defined(__cplusplus)
+#include <cstddef>
+#include <cstdint>
+#else
+#include <stddef.h>
+#include <stdint.h>
+#endif  // defined(__cplusplus)
+
+#include "gav1/decoder_buffer.h"
+#include "gav1/status_code.h"
+#include "gav1/symbol_visibility.h"
+
+// The callback functions use the C linkage conventions.
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+// This structure represents an allocated frame buffer.
+typedef struct Libgav1FrameBuffer {
+  // In the |plane| and |stride| arrays, the elements at indexes 0, 1, and 2
+  // are for the Y, U, and V planes, respectively.
+  uint8_t* plane[3];   // Pointers to the frame (excluding the borders) in the
+                       // data buffers.
+  int stride[3];       // Row strides in bytes.
+  void* private_data;  // Frame buffer's private data. Available for use by the
+                       // release frame buffer callback. Also copied to the
+                       // |buffer_private_data| field of DecoderBuffer for use
+                       // by the consumer of a DecoderBuffer.
+} Libgav1FrameBuffer;
+
+// This callback is invoked by the decoder to provide information on the
+// subsequent frames in the video, until the next invocation of this callback
+// or the end of the video.
+//
+// |width| and |height| are the maximum frame width and height in pixels.
+// |left_border|, |right_border|, |top_border|, and |bottom_border| are the
+// maximum left, right, top, and bottom border sizes in pixels.
+// |stride_alignment| specifies the alignment of the row stride in bytes.
+//
+// Returns kLibgav1StatusOk on success, an error status on failure.
+//
+// NOTE: This callback may be omitted if the information is not useful to the
+// application.
+typedef Libgav1StatusCode (*Libgav1FrameBufferSizeChangedCallback)(
+    void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format,
+    int width, int height, int left_border, int right_border, int top_border,
+    int bottom_border, int stride_alignment);
+
+// This callback is invoked by the decoder to allocate a frame buffer, which
+// consists of three data buffers, for the Y, U, and V planes, respectively.
+//
+// The callback must set |frame_buffer->plane[i]| to point to the data buffers
+// of the planes, and set |frame_buffer->stride[i]| to the row strides of the
+// planes. If |image_format| is kLibgav1ImageFormatMonochrome400, the callback
+// should set |frame_buffer->plane[1]| and |frame_buffer->plane[2]| to a null
+// pointer and set |frame_buffer->stride[1]| and |frame_buffer->stride[2]| to
+// 0. The callback may set |frame_buffer->private_data| to a value that will
+// be useful to the release frame buffer callback and the consumer of a
+// DecoderBuffer.
+//
+// Returns kLibgav1StatusOk on success, an error status on failure.
+//
+// |width| and |height| are the frame width and height in pixels.
+// |left_border|, |right_border|, |top_border|, and |bottom_border| are the
+// left, right, top, and bottom border sizes in pixels. |stride_alignment|
+// specifies the alignment of the row stride in bytes.
+typedef Libgav1StatusCode (*Libgav1GetFrameBufferCallback)(
+    void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format,
+    int width, int height, int left_border, int right_border, int top_border,
+    int bottom_border, int stride_alignment, Libgav1FrameBuffer* frame_buffer);
+
+// After a frame buffer is allocated, the decoder starts to write decoded video
+// to the frame buffer. When the frame buffer is ready for consumption, it is
+// made available to the application in a Decoder::DequeueFrame() call.
+// Afterwards, the decoder may continue to use the frame buffer in read-only
+// mode. When the decoder is finished using the frame buffer, it notifies the
+// application by calling the Libgav1ReleaseFrameBufferCallback.
+
+// This callback is invoked by the decoder to release a frame buffer.
+typedef void (*Libgav1ReleaseFrameBufferCallback)(void* callback_private_data,
+                                                  void* buffer_private_data);
+
+// Libgav1ComputeFrameBufferInfo() and Libgav1SetFrameBuffer() are intended to
+// help clients implement frame buffer callbacks using memory buffers. First,
+// call Libgav1ComputeFrameBufferInfo(). If it succeeds, allocate y_buffer of
+// size info.y_buffer_size and allocate u_buffer and v_buffer, both of size
+// info.uv_buffer_size. Finally, pass y_buffer, u_buffer, v_buffer, and
+// buffer_private_data to Libgav1SetFrameBuffer().
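+
+// The following sketch is illustrative only: a minimal allocator built on
+// plain malloc(), following the three steps described above. A real
+// implementation would likely want aligned allocations and may need to treat
+// the monochrome case (where |uv_buffer_size| can be 0) specially.
+//
+//   Libgav1FrameBufferInfo info;
+//   Libgav1StatusCode status = Libgav1ComputeFrameBufferInfo(
+//       bitdepth, image_format, width, height, left_border, right_border,
+//       top_border, bottom_border, stride_alignment, &info);
+//   if (status != kLibgav1StatusOk) return status;
+//   uint8_t* const y_buffer = (uint8_t*)malloc(info.y_buffer_size);
+//   uint8_t* const u_buffer = (uint8_t*)malloc(info.uv_buffer_size);
+//   uint8_t* const v_buffer = (uint8_t*)malloc(info.uv_buffer_size);
+//   if (y_buffer == NULL || u_buffer == NULL || v_buffer == NULL) {
+//     return kLibgav1StatusOutOfMemory;
+//   }
+//   return Libgav1SetFrameBuffer(&info, y_buffer, u_buffer, v_buffer,
+//                                buffer_private_data, frame_buffer);
+//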
+
+// This structure contains information useful for allocating memory for a
+// frame buffer.
+typedef struct Libgav1FrameBufferInfo {
+  size_t y_buffer_size;   // Size in bytes of the Y buffer.
+  size_t uv_buffer_size;  // Size in bytes of the U or V buffer.
+
+  // The following fields are consumed by Libgav1SetFrameBuffer(). Do not use
+  // them directly.
+  int y_stride;            // Row stride in bytes of the Y buffer.
+  int uv_stride;           // Row stride in bytes of the U or V buffer.
+  size_t y_plane_offset;   // Offset in bytes of the frame (excluding the
+                           // borders) in the Y buffer.
+  size_t uv_plane_offset;  // Offset in bytes of the frame (excluding the
+                           // borders) in the U or V buffer.
+  int stride_alignment;    // The stride_alignment argument passed to
+                           // Libgav1ComputeFrameBufferInfo().
+} Libgav1FrameBufferInfo;
+
+// Computes the information useful for allocating memory for a frame buffer.
+// On success, stores the output in |info|.
+LIBGAV1_PUBLIC Libgav1StatusCode Libgav1ComputeFrameBufferInfo(
+    int bitdepth, Libgav1ImageFormat image_format, int width, int height,
+    int left_border, int right_border, int top_border, int bottom_border,
+    int stride_alignment, Libgav1FrameBufferInfo* info);
+
+// Sets the |frame_buffer| struct.
+LIBGAV1_PUBLIC Libgav1StatusCode Libgav1SetFrameBuffer(
+    const Libgav1FrameBufferInfo* info, uint8_t* y_buffer, uint8_t* u_buffer,
+    uint8_t* v_buffer, void* buffer_private_data,
+    Libgav1FrameBuffer* frame_buffer);
+
+#if defined(__cplusplus)
+}  // extern "C"
+
+// Declare type aliases for C++.
+namespace libgav1 {
+
+using FrameBuffer = Libgav1FrameBuffer;
+using FrameBufferSizeChangedCallback = Libgav1FrameBufferSizeChangedCallback;
+using GetFrameBufferCallback = Libgav1GetFrameBufferCallback;
+using ReleaseFrameBufferCallback = Libgav1ReleaseFrameBufferCallback;
+using FrameBufferInfo = Libgav1FrameBufferInfo;
+
+inline StatusCode ComputeFrameBufferInfo(int bitdepth,
+                                         ImageFormat image_format, int width,
+                                         int height, int left_border,
+                                         int right_border, int top_border,
+                                         int bottom_border,
+                                         int stride_alignment,
+                                         FrameBufferInfo* info) {
+  return Libgav1ComputeFrameBufferInfo(bitdepth, image_format, width, height,
+                                       left_border, right_border, top_border,
+                                       bottom_border, stride_alignment, info);
+}
+
+inline StatusCode SetFrameBuffer(const FrameBufferInfo* info,
+                                 uint8_t* y_buffer, uint8_t* u_buffer,
+                                 uint8_t* v_buffer, void* buffer_private_data,
+                                 FrameBuffer* frame_buffer) {
+  return Libgav1SetFrameBuffer(info, y_buffer, u_buffer, v_buffer,
+                               buffer_private_data, frame_buffer);
+}
+
+}  // namespace libgav1
+#endif  // defined(__cplusplus)
+
+#endif  // LIBGAV1_SRC_GAV1_FRAME_BUFFER_H_
diff --git a/src/gav1/status_code.h b/src/gav1/status_code.h
new file mode 100644
index 0000000..d7476ca
--- /dev/null
+++ b/src/gav1/status_code.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_STATUS_CODE_H_
+#define LIBGAV1_SRC_GAV1_STATUS_CODE_H_
+
+#include "gav1/symbol_visibility.h"
+
+// All the declarations in this file are part of the public ABI. This file may
+// be included by both C and C++ files.
+
+// The Libgav1StatusCode enum type: A libgav1 function may return
+// Libgav1StatusCode to indicate success or the reason for failure.
+typedef enum {
+  // Success.
+  kLibgav1StatusOk = 0,
+
+  // An unknown error. Used as the default error status if error detail is not
+  // available.
+  kLibgav1StatusUnknownError = -1,
+
+  // An invalid function argument.
+  kLibgav1StatusInvalidArgument = -2,
+
+  // Memory allocation failure.
+  kLibgav1StatusOutOfMemory = -3,
+
+  // Ran out of a resource (other than memory).
+  kLibgav1StatusResourceExhausted = -4,
+
+  // The object is not initialized.
+  kLibgav1StatusNotInitialized = -5,
+
+  // An operation that can only be performed once has already been performed.
+  kLibgav1StatusAlready = -6,
+
+  // Not implemented, or not supported.
+  kLibgav1StatusUnimplemented = -7,
+
+  // An internal error in libgav1. Usually this indicates a programming error.
+  kLibgav1StatusInternalError = -8,
+
+  // The bitstream is not encoded correctly or violates a bitstream conformance
+  // requirement.
+  kLibgav1StatusBitstreamError = -9,
+
+  // The operation is not allowed at the moment. This is not a fatal error. Try
+  // again later.
+  kLibgav1StatusTryAgain = -10,
+
+  // Used only by DequeueFrame(). There are no enqueued frames, so there is
+  // nothing to dequeue. This is not a fatal error. Try enqueuing a frame before
+  // trying to dequeue again.
+  kLibgav1StatusNothingToDequeue = -11,
+
+  // An extra enumerator to prevent people from writing code that fails to
+  // compile when a new status code is added.
+  //
+  // Do not reference this enumerator. In particular, if you write code that
+  // switches on Libgav1StatusCode, add a default: case instead of a case that
+  // mentions this enumerator.
+  //
+  // Do not depend on the value (currently -1000) listed here. It may change in
+  // the future.
+  kLibgav1StatusReservedForFutureExpansionUseDefaultInSwitchInstead_ = -1000
+} Libgav1StatusCode;
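+
+// As an illustration of the advice above (example code, not a declaration in
+// this header), a switch over Libgav1StatusCode should always carry a
+// default: case so that status codes added in future versions are still
+// handled:
+//
+//   switch (status) {
+//     case kLibgav1StatusOk:
+//       break;
+//     case kLibgav1StatusNothingToDequeue:
+//       // Not fatal: enqueue another frame before dequeuing again.
+//       break;
+//     default:
+//       // HandleFailure() is a hypothetical application-side handler.
+//       HandleFailure(status);
+//       break;
+//   }
+//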
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+// Returns a human-readable error string in en-US for the status code
+// |status|. Always returns a valid (non-NULL) string.
+LIBGAV1_PUBLIC const char* Libgav1GetErrorString(Libgav1StatusCode status);
+
+#if defined(__cplusplus)
+}  // extern "C"
+
+namespace libgav1 {
+
+// Declare type aliases for C++.
+using StatusCode = Libgav1StatusCode;
+constexpr StatusCode kStatusOk = kLibgav1StatusOk;
+constexpr StatusCode kStatusUnknownError = kLibgav1StatusUnknownError;
+constexpr StatusCode kStatusInvalidArgument = kLibgav1StatusInvalidArgument;
+constexpr StatusCode kStatusOutOfMemory = kLibgav1StatusOutOfMemory;
+constexpr StatusCode kStatusResourceExhausted = kLibgav1StatusResourceExhausted;
+constexpr StatusCode kStatusNotInitialized = kLibgav1StatusNotInitialized;
+constexpr StatusCode kStatusAlready = kLibgav1StatusAlready;
+constexpr StatusCode kStatusUnimplemented = kLibgav1StatusUnimplemented;
+constexpr StatusCode kStatusInternalError = kLibgav1StatusInternalError;
+constexpr StatusCode kStatusBitstreamError = kLibgav1StatusBitstreamError;
+constexpr StatusCode kStatusTryAgain = kLibgav1StatusTryAgain;
+constexpr StatusCode kStatusNothingToDequeue = kLibgav1StatusNothingToDequeue;
+
+// Returns a human-readable error string in en-US for the status code
+// |status|. Always returns a valid (non-NULL) string.
+inline const char* GetErrorString(StatusCode status) {
+  return Libgav1GetErrorString(status);
+}
+
+}  // namespace libgav1
+#endif  // defined(__cplusplus)
+
+#endif  // LIBGAV1_SRC_GAV1_STATUS_CODE_H_
diff --git a/src/gav1/symbol_visibility.h b/src/gav1/symbol_visibility.h
new file mode 100644
index 0000000..116a514
--- /dev/null
+++ b/src/gav1/symbol_visibility.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_SYMBOL_VISIBILITY_H_
+#define LIBGAV1_SRC_GAV1_SYMBOL_VISIBILITY_H_
+
+// This module defines the LIBGAV1_PUBLIC macro. LIBGAV1_PUBLIC, when combined
+// with the flags -fvisibility=hidden and -fvisibility-inlines-hidden,
+// restricts symbol availability when users use the shared object form of
+// libgav1. The intent is to prevent exposure of libgav1 internals to users of
+// the library, and to avoid ABI compatibility problems that changes to
+// libgav1 internals would cause for users of the libgav1 shared object.
+//
+// Examples:
+//
+// This form makes a class and all of its members part of the public API:
+//
+// class LIBGAV1_PUBLIC A {
+//  public:
+//   A();
+//   ~A();
+//   void Foo();
+//   int Bar();
+// };
+//
+// A::A(), A::~A(), A::Foo(), and A::Bar() are all available to code linking to
+// the shared object when this form is used.
+//
+// This form exposes a single class method as part of the public API:
+//
+// class B {
+//  public:
+//   B();
+//   ~B();
+//   LIBGAV1_PUBLIC int Foo();
+// };
+//
+// In this example, only B::Foo() is available to the user of the shared
+// object.
+//
+// Non-class member functions can also be exposed individually:
+//
+// LIBGAV1_PUBLIC void Bar();
+//
+// In this example, Bar() would be available to users of the shared object.
+//
+// Much of the above information and more can be found at
+// https://gcc.gnu.org/wiki/Visibility
+//
+// NOTE: A third-party build system for libgav1 can add -DLIBGAV1_PUBLIC= to
+// the compiler command line to override the definition of LIBGAV1_PUBLIC in
+// this header. This can be used to create a libgav1 static library that will
+// not export any symbols when it is linked into a shared library.
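+//
+// To make the NOTE above concrete, a hypothetical compiler invocation for
+// building such a static library could look like this (the flags are the ones
+// named at the top of this comment):
+//
+//   c++ -c -fvisibility=hidden -fvisibility-inlines-hidden \
+//       -DLIBGAV1_PUBLIC= src/decoder.cc
+//
+// so that nothing is annotated for export when the resulting objects are
+// linked into a larger shared library.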
+
+#if !defined(LIBGAV1_PUBLIC)
+#if defined(_WIN32)
+#if defined(LIBGAV1_BUILDING_DLL) && LIBGAV1_BUILDING_DLL
+#if defined(__GNUC__)
+#define LIBGAV1_PUBLIC __attribute__((dllexport))
+#else
+#define LIBGAV1_PUBLIC __declspec(dllexport)
+#endif  // defined(__GNUC__)
+#elif defined(LIBGAV1_BUILDING_DLL)
+#ifdef __GNUC__
+#define LIBGAV1_PUBLIC __attribute__((dllimport))
+#else
+#define LIBGAV1_PUBLIC __declspec(dllimport)
+#endif  // defined(__GNUC__)
+#else
+#define LIBGAV1_PUBLIC
+#endif  // defined(LIBGAV1_BUILDING_DLL) && LIBGAV1_BUILDING_DLL
+#else   // !defined(_WIN32)
+#if defined(__GNUC__) && __GNUC__ >= 4
+#define LIBGAV1_PUBLIC __attribute__((visibility("default")))
+#else
+#define LIBGAV1_PUBLIC
+#endif
+#endif  // defined(_WIN32)
+#endif  // defined(LIBGAV1_PUBLIC)
+
+#endif  // LIBGAV1_SRC_GAV1_SYMBOL_VISIBILITY_H_
diff --git a/src/gav1/version.h b/src/gav1/version.h
new file mode 100644
index 0000000..9bdc630
--- /dev/null
+++ b/src/gav1/version.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_VERSION_H_
+#define LIBGAV1_SRC_GAV1_VERSION_H_
+
+#include "gav1/symbol_visibility.h"
+
+// This library follows the principles described by Semantic Versioning
+// (https://semver.org).
+
+#define LIBGAV1_MAJOR_VERSION 0
+#define LIBGAV1_MINOR_VERSION 17
+#define LIBGAV1_PATCH_VERSION 0
+
+#define LIBGAV1_VERSION                                           \
+  ((LIBGAV1_MAJOR_VERSION << 16) | (LIBGAV1_MINOR_VERSION << 8) | \
+   LIBGAV1_PATCH_VERSION)
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+// Returns the library's version number, packed in an int using 8 bits for
+// each of major/minor/patch. e.g., 1.2.3 is 0x010203.
+LIBGAV1_PUBLIC int Libgav1GetVersion(void);
+
+// Returns the library's version number as a string in the format
+// 'MAJOR.MINOR.PATCH'. Always returns a valid (non-NULL) string.
+LIBGAV1_PUBLIC const char* Libgav1GetVersionString(void);
+
+// Returns the build configuration used to produce the library. Always returns
+// a valid (non-NULL) string.
+LIBGAV1_PUBLIC const char* Libgav1GetBuildConfiguration(void);
+
+#if defined(__cplusplus)
+}  // extern "C"
+
+namespace libgav1 {
+
+// Returns the library's version number, packed in an int using 8 bits for
+// each of major/minor/patch. e.g., 1.2.3 is 0x010203.
+inline int GetVersion() { return Libgav1GetVersion(); }
+
+// Returns the library's version number as a string in the format
+// 'MAJOR.MINOR.PATCH'. Always returns a valid (non-NULL) string.
+inline const char* GetVersionString() { return Libgav1GetVersionString(); }
+
+// Returns the build configuration used to produce the library. Always returns
+// a valid (non-NULL) string.
+inline const char* GetBuildConfiguration() { + return Libgav1GetBuildConfiguration(); +} + +} // namespace libgav1 +#endif // defined(__cplusplus) + +#endif // LIBGAV1_SRC_GAV1_VERSION_H_ diff --git a/src/inter_intra_masks.inc b/src/inter_intra_masks.inc new file mode 100644 index 0000000..2c15f9c --- /dev/null +++ b/src/inter_intra_masks.inc @@ -0,0 +1,581 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This file is just a convenience to separate out all the inter intra masks +// from the code where it is used. + +// The tables in this file are computed based on section 7.11.3.13 in the spec. + +constexpr uint8_t kInterIntraMaskDc[] = { + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32}; + +constexpr uint8_t kInterIntraMaskVertical4x4[] = { + 60, 60, 60, 60, 19, 19, 19, 19, 6, 6, 6, 6, 2, 2, 2, 2}; +constexpr uint8_t kInterIntraMaskVertical4x8[] = { + 60, 60, 60, 60, 34, 34, 34, 34, 19, 19, 19, 19, 11, 11, 11, 11, + 6, 6, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 1, 1, 1, 1}; +constexpr uint8_t kInterIntraMaskVertical8x4[] = { + 60, 60, 60, 60, 60, 60, 60, 60, 34, 34, 34, 34, 34, 34, 34, 34, + 19, 19, 19, 19, 19, 19, 19, 19, 11, 11, 11, 11, 11, 11, 11, 11}; +constexpr uint8_t kInterIntraMaskVertical8x8[] = { + 60, 60, 60, 60, 60, 60, 60, 60, 34, 34, 34, 34, 34, 34, 34, 34, + 19, 19, 19, 19, 19, 19, 19, 19, 11, 11, 11, 11, 11, 11, 11, 11, + 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, + 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1}; +constexpr uint8_t kInterIntraMaskVertical8x16[] = { + 60, 60, 60, 60, 60, 60, 60, 60, 45, 45, 45, 45, 45, 45, 45, 45, 34, 34, 34, + 34, 34, 34, 34, 34, 26, 26, 26, 26, 26, 26, 26, 26, 19, 19, 19, 19, 19, 19, + 19, 19, 15, 15, 15, 15, 15, 15, 15, 15, 11, 11, 11, 11, 11, 11, 11, 11, 8, + 8, 8, 8, 8, 8, 8, 8, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, + 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, + 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; +constexpr uint8_t kInterIntraMaskVertical16x8[] = { + 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 
15, 15, 15, + 15, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; +constexpr uint8_t kInterIntraMaskVertical16x16[] = { + 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1}; +constexpr uint8_t kInterIntraMaskVertical16x32[] = { + 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 39, 39, 39, 39, 39, 39, 39, 39, 39, + 39, 39, 39, 39, 39, 39, 39, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; +constexpr uint8_t kInterIntraMaskVertical32x16[] = { + 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, + 45, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, + 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 34, 34, 
34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7}; +constexpr uint8_t kInterIntraMaskVertical32x32[] = { + 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, + 45, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, + 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + +constexpr uint8_t kInterIntraMaskHorizontal4x4[] = {60, 19, 6, 2, 60, 19, 6, 2, + 60, 19, 6, 2, 60, 19, 6, 2}; +constexpr uint8_t kInterIntraMaskHorizontal4x8[] = { + 60, 34, 19, 11, 60, 34, 19, 11, 60, 34, 19, 11, 60, 34, 19, 11, + 60, 34, 19, 11, 60, 34, 19, 11, 60, 34, 19, 11, 60, 34, 19, 11}; +constexpr uint8_t kInterIntraMaskHorizontal8x4[] = { + 60, 34, 19, 11, 6, 4, 2, 1, 60, 34, 19, 11, 6, 4, 2, 1, + 60, 34, 19, 11, 6, 4, 2, 1, 60, 34, 19, 11, 6, 4, 2, 1}; +constexpr uint8_t kInterIntraMaskHorizontal8x8[] = { + 60, 34, 19, 11, 6, 4, 2, 1, 60, 34, 19, 11, 6, 4, 2, 1, + 60, 34, 19, 11, 6, 4, 2, 1, 60, 34, 19, 11, 6, 4, 2, 1, + 60, 34, 19, 11, 6, 4, 2, 1, 60, 34, 19, 11, 6, 4, 2, 1, + 60, 34, 19, 11, 6, 4, 2, 1, 60, 34, 19, 11, 6, 4, 2, 1}; +constexpr uint8_t kInterIntraMaskHorizontal8x16[] = { + 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, + 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, + 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, + 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, + 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, + 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, + 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8}; +constexpr uint8_t kInterIntraMaskHorizontal16x8[] = { + 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, + 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, + 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, + 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, + 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, + 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, + 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1}; +constexpr uint8_t kInterIntraMaskHorizontal16x16[] = { + 60, 45, 34, 26, 19, 15, 11, 
8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, + 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, + 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, + 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, + 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, + 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, + 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, + 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, + 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, + 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, + 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, + 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, + 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, + 8, 6, 5, 4, 3, 2, 2, 1, 1}; +constexpr uint8_t kInterIntraMaskHorizontal16x32[] = { + 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, + 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, + 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, + 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, + 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, + 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, + 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, + 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, + 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, + 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, + 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, + 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, + 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, + 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, + 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, + 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, + 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, + 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, + 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, + 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, + 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, + 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, + 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, + 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, + 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, + 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, + 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7}; +constexpr uint8_t kInterIntraMaskHorizontal32x16[] = { + 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, + 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, + 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, + 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, + 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, + 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, + 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, + 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 
5, 4, 4, 3, 3, 2, + 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, + 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, + 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, + 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, + 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, + 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, + 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, + 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, + 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, + 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, + 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, + 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, + 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, + 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, + 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, + 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, + 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, + 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1}; +constexpr uint8_t kInterIntraMaskHorizontal32x32[] = { + 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, + 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, + 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, + 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, + 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, + 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, + 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, + 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, + 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, + 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, + 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, + 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, + 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, + 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, + 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, + 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, + 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, + 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, + 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, + 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, + 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, + 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, + 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, + 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, + 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, + 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, + 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, + 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, + 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, + 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, + 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, + 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 
10, 8, 7, 6, 6, 5, + 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, + 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, + 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, + 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, + 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, + 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, + 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, + 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, + 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, + 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, + 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, + 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, + 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, + 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, + 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, + 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, + 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, + 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, + 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, + 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, + 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1}; + +constexpr uint8_t kInterIntraMaskSmooth4x4[] = {60, 60, 60, 60, 60, 19, 19, 19, + 60, 19, 6, 6, 60, 19, 6, 2}; +constexpr uint8_t kInterIntraMaskSmooth4x8[] = { + 60, 60, 60, 60, 60, 34, 34, 34, 60, 34, 19, 19, 60, 34, 19, 11, + 60, 34, 19, 11, 60, 34, 19, 11, 60, 34, 19, 11, 60, 34, 19, 11}; +constexpr uint8_t kInterIntraMaskSmooth8x4[] = { + 60, 60, 60, 60, 60, 60, 60, 60, 60, 34, 34, 34, 34, 34, 34, 34, + 60, 34, 19, 19, 19, 19, 19, 19, 60, 34, 19, 11, 11, 11, 11, 11}; +constexpr uint8_t kInterIntraMaskSmooth8x8[] = { + 60, 60, 60, 60, 60, 60, 60, 60, 60, 34, 34, 34, 34, 34, 34, 34, + 60, 34, 19, 19, 19, 19, 19, 19, 60, 34, 19, 11, 11, 11, 11, 11, + 60, 34, 19, 11, 6, 6, 6, 6, 60, 34, 19, 11, 6, 4, 4, 4, + 60, 34, 19, 11, 6, 4, 2, 2, 60, 34, 19, 11, 6, 4, 2, 1}; +constexpr uint8_t kInterIntraMaskSmooth8x16[] = { + 60, 60, 60, 60, 60, 60, 60, 60, 60, 45, 45, 45, 45, 45, 45, 45, 60, 45, 34, + 34, 34, 34, 34, 34, 60, 45, 34, 26, 26, 26, 26, 26, 60, 45, 34, 26, 19, 19, + 19, 19, 60, 45, 34, 26, 19, 15, 15, 15, 60, 45, 34, 26, 19, 15, 11, 11, 60, + 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, + 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, + 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, + 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8}; +constexpr uint8_t kInterIntraMaskSmooth16x8[] = { + 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 60, 45, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 60, 45, 34, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 60, 45, 34, 26, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 60, 45, 34, 26, 19, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 60, 45, 34, 26, 19, 15, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 60, 45, + 34, 26, 19, 15, 11, 8, 8, 8, 8, 8, 8, 8, 8, 8}; +constexpr uint8_t kInterIntraMaskSmooth16x16[] = { + 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 60, 45, 34, 34, 34, 34, + 34, 
34, 34, 34, 34, 34, 34, 34, 34, 34, 60, 45, 34, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 60, 45, 34, 26, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 60, 45, 34, 26, 19, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 60, 45, 34, 26, 19, 15, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 60, 45, + 34, 26, 19, 15, 11, 8, 8, 8, 8, 8, 8, 8, 8, 8, 60, 45, 34, 26, 19, + 15, 11, 8, 6, 6, 6, 6, 6, 6, 6, 6, 60, 45, 34, 26, 19, 15, 11, 8, + 6, 5, 5, 5, 5, 5, 5, 5, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, + 4, 4, 4, 4, 4, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 3, 3, + 3, 3, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 2, 2, 60, + 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 2, 2, 60, 45, 34, 26, + 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, + 8, 6, 5, 4, 3, 2, 2, 1, 1}; +constexpr uint8_t kInterIntraMaskSmooth16x32[] = { + 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 60, 52, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 60, 52, 45, 39, 39, 39, 39, 39, 39, + 39, 39, 39, 39, 39, 39, 39, 60, 52, 45, 39, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 60, 52, 45, 39, 34, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 60, 52, 45, 39, 34, 30, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 60, 52, + 45, 39, 34, 30, 26, 22, 22, 22, 22, 22, 22, 22, 22, 22, 60, 52, 45, 39, 34, + 30, 26, 22, 19, 19, 19, 19, 19, 19, 19, 19, 60, 52, 45, 39, 34, 30, 26, 22, + 19, 17, 17, 17, 17, 17, 17, 17, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, + 15, 15, 15, 15, 15, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 13, 13, + 13, 13, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 11, 11, 11, 60, + 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 10, 10, 60, 52, 45, 39, + 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 8, 60, 52, 45, 39, 34, 30, 26, + 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, + 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, + 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, + 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, + 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, + 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, + 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, + 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, + 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, + 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, + 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, + 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, + 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, + 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7}; +constexpr uint8_t kInterIntraMaskSmooth32x16[] = { + 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 60, 52, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, + 45, 60, 52, 45, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, + 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 60, 52, 45, 39, 34, + 34, 34, 34, 34, 34, 34, 
34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 60, 52, 45, 39, 34, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 60, 52, 45, 39, 34, 30, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 60, 52, 45, 39, + 34, 30, 26, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 60, 52, 45, 39, 34, 30, 26, 22, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 60, 52, 45, + 39, 34, 30, 26, 22, 19, 17, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 60, 52, 45, 39, 34, 30, 26, 22, 19, + 17, 15, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 60, 52, + 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 60, 52, 45, 39, 34, 30, 26, 22, + 19, 17, 15, 13, 11, 10, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, + 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7}; +constexpr uint8_t kInterIntraMaskSmooth32x32[] = { + 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 60, 52, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, + 45, 60, 52, 45, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, + 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 60, 52, 45, 39, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 60, 52, 45, 39, 34, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 60, 52, 45, 39, 34, 30, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 60, 52, 45, 39, + 34, 30, 26, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 60, 52, 45, 39, 34, 30, 26, 22, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 60, 52, 45, + 39, 34, 30, 26, 22, 19, 17, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 60, 52, 45, 39, 34, 30, 26, 22, 19, + 17, 15, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 60, 52, + 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 60, 52, 45, 39, 34, 30, 26, 22, + 19, 17, 15, 13, 11, 10, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 
10, + 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 60, + 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 60, 52, 45, 39, 34, 30, 26, + 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, + 10, 8, 7, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 60, 52, 45, 39, 34, 30, + 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, + 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, + 5, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 60, 52, 45, 39, 34, + 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, + 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, + 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 60, 52, 45, 39, + 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, + 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, + 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, + 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, + 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, + 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, + 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, + 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, + 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1}; + +// For each 2D array within this array, the indices are mapped as follows: 0, 1, +// 2 and 3 in each dimension maps to prediction dimension 4, 8, 16 and 32 +// respectively. For example, the entry in [1][2] corresponds to a prediction +// size of 8x16 (width == 8 and height == 16). +const uint8_t* kInterIntraMasks[kNumInterIntraModes][4][4] = { + // kInterIntraModeDc. This is a special case where all the non-nullptr + // entries point to kInterIntraMaskDc (all entries of the array are 32). The + // width can be set according to the prediction size to achieve the desired + // result. 
+ {{kInterIntraMaskDc, kInterIntraMaskDc, nullptr, nullptr}, + {kInterIntraMaskDc, kInterIntraMaskDc, kInterIntraMaskDc, nullptr}, + {nullptr, kInterIntraMaskDc, kInterIntraMaskDc, kInterIntraMaskDc}, + {nullptr, nullptr, kInterIntraMaskDc, kInterIntraMaskDc}}, + // kInterIntraModeVertical + {{kInterIntraMaskVertical4x4, kInterIntraMaskVertical4x8, nullptr, nullptr}, + {kInterIntraMaskVertical8x4, kInterIntraMaskVertical8x8, + kInterIntraMaskVertical8x16, nullptr}, + {nullptr, kInterIntraMaskVertical16x8, kInterIntraMaskVertical16x16, + kInterIntraMaskVertical16x32}, + {nullptr, nullptr, kInterIntraMaskVertical32x16, + kInterIntraMaskVertical32x32}}, + // kInterIntraModeHorizontal + {{kInterIntraMaskHorizontal4x4, kInterIntraMaskHorizontal4x8, nullptr, + nullptr}, + {kInterIntraMaskHorizontal8x4, kInterIntraMaskHorizontal8x8, + kInterIntraMaskHorizontal8x16, nullptr}, + {nullptr, kInterIntraMaskHorizontal16x8, kInterIntraMaskHorizontal16x16, + kInterIntraMaskHorizontal16x32}, + {nullptr, nullptr, kInterIntraMaskHorizontal32x16, + kInterIntraMaskHorizontal32x32}}, + // kInterIntraModeSmooth + {{kInterIntraMaskSmooth4x4, kInterIntraMaskSmooth4x8, nullptr, nullptr}, + {kInterIntraMaskSmooth8x4, kInterIntraMaskSmooth8x8, + kInterIntraMaskSmooth8x16, nullptr}, + {nullptr, kInterIntraMaskSmooth16x8, kInterIntraMaskSmooth16x16, + kInterIntraMaskSmooth16x32}, + {nullptr, nullptr, kInterIntraMaskSmooth32x16, + kInterIntraMaskSmooth32x32}}}; diff --git a/src/internal_frame_buffer_list.cc b/src/internal_frame_buffer_list.cc new file mode 100644 index 0000000..e2d2273 --- /dev/null +++ b/src/internal_frame_buffer_list.cc @@ -0,0 +1,122 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
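A minimal sketch of how a consumer of the kInterIntraMasks table above maps a prediction size to an entry (GetInterIntraMask is a hypothetical helper, and FloorLog2 is assumed to behave like the helper of the same name in src/utils/common.h):

  // 4 -> 0, 8 -> 1, 16 -> 2, 32 -> 3 in each dimension; the first index is
  // derived from the width and the second from the height, so
  // kInterIntraMasks[mode][1][2] is the 8x16 (width x height) mask.
  const uint8_t* GetInterIntraMask(int mode, int width, int height) {
    const int width_index = FloorLog2(width) - 2;
    const int height_index = FloorLog2(height) - 2;
    return kInterIntraMasks[mode][width_index][height_index];
  }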
+ +#include "src/internal_frame_buffer_list.h" + +#include <cstdint> +#include <cstdlib> +#include <memory> +#include <new> +#include <utility> + +#include "src/utils/common.h" + +namespace libgav1 { +extern "C" { + +Libgav1StatusCode OnInternalFrameBufferSizeChanged( + void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format, + int width, int height, int left_border, int right_border, int top_border, + int bottom_border, int stride_alignment) { + auto* buffer_list = + static_cast<InternalFrameBufferList*>(callback_private_data); + return buffer_list->OnFrameBufferSizeChanged( + bitdepth, image_format, width, height, left_border, right_border, + top_border, bottom_border, stride_alignment); +} + +Libgav1StatusCode GetInternalFrameBuffer( + void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format, + int width, int height, int left_border, int right_border, int top_border, + int bottom_border, int stride_alignment, Libgav1FrameBuffer* frame_buffer) { + auto* buffer_list = + static_cast<InternalFrameBufferList*>(callback_private_data); + return buffer_list->GetFrameBuffer( + bitdepth, image_format, width, height, left_border, right_border, + top_border, bottom_border, stride_alignment, frame_buffer); +} + +void ReleaseInternalFrameBuffer(void* callback_private_data, + void* buffer_private_data) { + auto* buffer_list = + static_cast<InternalFrameBufferList*>(callback_private_data); + buffer_list->ReleaseFrameBuffer(buffer_private_data); +} + +} // extern "C" + +StatusCode InternalFrameBufferList::OnFrameBufferSizeChanged( + int /*bitdepth*/, Libgav1ImageFormat /*image_format*/, int /*width*/, + int /*height*/, int /*left_border*/, int /*right_border*/, + int /*top_border*/, int /*bottom_border*/, int /*stride_alignment*/) { + return kStatusOk; +} + +StatusCode InternalFrameBufferList::GetFrameBuffer( + int bitdepth, Libgav1ImageFormat image_format, int width, int height, + int left_border, int right_border, int top_border, int bottom_border, + int stride_alignment, Libgav1FrameBuffer* frame_buffer) { + FrameBufferInfo info; + StatusCode status = ComputeFrameBufferInfo( + bitdepth, image_format, width, height, left_border, right_border, + top_border, bottom_border, stride_alignment, &info); + if (status != kStatusOk) return status; + + if (info.uv_buffer_size > SIZE_MAX / 2 || + info.y_buffer_size > SIZE_MAX - 2 * info.uv_buffer_size) { + return kStatusInvalidArgument; + } + const size_t min_size = info.y_buffer_size + 2 * info.uv_buffer_size; + + Buffer* buffer = nullptr; + for (auto& buffer_ptr : buffers_) { + if (!buffer_ptr->in_use) { + buffer = buffer_ptr.get(); + break; + } + } + if (buffer == nullptr) { + std::unique_ptr<Buffer> new_buffer(new (std::nothrow) Buffer); + if (new_buffer == nullptr || !buffers_.push_back(std::move(new_buffer))) { + return kStatusOutOfMemory; + } + buffer = buffers_.back().get(); + } + + if (buffer->size < min_size) { + std::unique_ptr<uint8_t[], MallocDeleter> new_data( + static_cast<uint8_t*>(malloc(min_size))); + if (new_data == nullptr) return kStatusOutOfMemory; + buffer->data = std::move(new_data); + buffer->size = min_size; + } + + uint8_t* const y_buffer = buffer->data.get(); + uint8_t* const u_buffer = + (info.uv_buffer_size == 0) ? nullptr : y_buffer + info.y_buffer_size; + uint8_t* const v_buffer = + (info.uv_buffer_size == 0) ?
nullptr : u_buffer + info.uv_buffer_size; + status = Libgav1SetFrameBuffer(&info, y_buffer, u_buffer, v_buffer, buffer, + frame_buffer); + if (status != kStatusOk) return status; + buffer->in_use = true; + return kStatusOk; +} + +void InternalFrameBufferList::ReleaseFrameBuffer(void* buffer_private_data) { + auto* const buffer = static_cast<Buffer*>(buffer_private_data); + buffer->in_use = false; +} + +} // namespace libgav1 diff --git a/src/internal_frame_buffer_list.h b/src/internal_frame_buffer_list.h new file mode 100644 index 0000000..1c50b48 --- /dev/null +++ b/src/internal_frame_buffer_list.h @@ -0,0 +1,81 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_INTERNAL_FRAME_BUFFER_LIST_H_ +#define LIBGAV1_SRC_INTERNAL_FRAME_BUFFER_LIST_H_ + +#include <cstddef> +#include <cstdint> +#include <memory> + +#include "src/gav1/frame_buffer.h" +#include "src/utils/memory.h" +#include "src/utils/vector.h" + +namespace libgav1 { + +extern "C" Libgav1StatusCode OnInternalFrameBufferSizeChanged( + void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format, + int width, int height, int left_border, int right_border, int top_border, + int bottom_border, int stride_alignment); + +extern "C" Libgav1StatusCode GetInternalFrameBuffer( + void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format, + int width, int height, int left_border, int right_border, int top_border, + int bottom_border, int stride_alignment, Libgav1FrameBuffer* frame_buffer); + +extern "C" void ReleaseInternalFrameBuffer(void* callback_private_data, + void* buffer_private_data); +
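A minimal sketch of how these three callbacks are handed to a decoder, assuming the DecoderSettings field names shown here (on_frame_buffer_size_changed, get_frame_buffer, release_frame_buffer, callback_private_data) match src/gav1/decoder_settings.h; the buffer list must outlive the decoder that uses it:

  libgav1::InternalFrameBufferList buffer_list;
  libgav1::DecoderSettings settings;
  settings.on_frame_buffer_size_changed = libgav1::OnInternalFrameBufferSizeChanged;
  settings.get_frame_buffer = libgav1::GetInternalFrameBuffer;
  settings.release_frame_buffer = libgav1::ReleaseInternalFrameBuffer;
  // Handed back as |callback_private_data| on every callback invocation.
  settings.callback_private_data = &buffer_list;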
+class InternalFrameBufferList : public Allocable { + public: + InternalFrameBufferList() = default; + + // Not copyable or movable. + InternalFrameBufferList(const InternalFrameBufferList&) = delete; + InternalFrameBufferList& operator=(const InternalFrameBufferList&) = delete; + + ~InternalFrameBufferList() = default; + + Libgav1StatusCode OnFrameBufferSizeChanged(int bitdepth, + Libgav1ImageFormat image_format, + int width, int height, + int left_border, int right_border, + int top_border, int bottom_border, + int stride_alignment); + + Libgav1StatusCode GetFrameBuffer(int bitdepth, + Libgav1ImageFormat image_format, int width, + int height, int left_border, + int right_border, int top_border, + int bottom_border, int stride_alignment, + Libgav1FrameBuffer* frame_buffer); + + void ReleaseFrameBuffer(void* buffer_private_data); + + private: + struct Buffer : public Allocable { + std::unique_ptr<uint8_t[], MallocDeleter> data; + size_t size = 0; + bool in_use = false; + }; + + Vector<std::unique_ptr<Buffer>> buffers_; +}; + +} // namespace libgav1 + +#endif // LIBGAV1_SRC_INTERNAL_FRAME_BUFFER_LIST_H_ diff --git a/src/internal_frame_buffer_list_test.cc b/src/internal_frame_buffer_list_test.cc new file mode 100644 index 0000000..21f1162 --- /dev/null +++ b/src/internal_frame_buffer_list_test.cc @@ -0,0 +1,158 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/internal_frame_buffer_list.h" + +#include <cstdint> + +#include "gtest/gtest.h" +#include "src/gav1/decoder_buffer.h" +#include "src/gav1/frame_buffer.h" + +namespace libgav1 { +namespace { + +class InternalFrameBufferListTest : public testing::Test { + protected: + static constexpr int kBufferListSize = 10; + + InternalFrameBufferListTest() { + on_frame_buffer_size_changed_ = OnInternalFrameBufferSizeChanged; + get_frame_buffer_ = GetInternalFrameBuffer; + release_frame_buffer_ = ReleaseInternalFrameBuffer; + callback_private_data_ = &buffer_list_; + } + + // Frame buffer callbacks. + FrameBufferSizeChangedCallback on_frame_buffer_size_changed_; + GetFrameBufferCallback get_frame_buffer_; + ReleaseFrameBufferCallback release_frame_buffer_; + // Private data associated with the frame buffer callbacks.
+ void* callback_private_data_; + + private: + InternalFrameBufferList buffer_list_; +}; + +TEST_F(InternalFrameBufferListTest, ReleaseInRandomOrder) { + const int bitdepth = 8; + const Libgav1ImageFormat image_format = kLibgav1ImageFormatYuv420; + const int width = 100; + const int height = 50; + const int left_border = 0; + const int right_border = 0; + const int top_border = 0; + const int bottom_border = 0; + const int stride_alignment = 16; + + EXPECT_EQ(on_frame_buffer_size_changed_(callback_private_data_, bitdepth, + image_format, width, height, + left_border, right_border, top_border, + bottom_border, stride_alignment), + 0); + + FrameBuffer frame_buffers[kBufferListSize]; + for (auto& frame_buffer : frame_buffers) { + EXPECT_EQ( + get_frame_buffer_(callback_private_data_, bitdepth, image_format, width, + height, left_border, right_border, top_border, + bottom_border, stride_alignment, &frame_buffer), + 0); + EXPECT_NE(frame_buffer.plane[0], nullptr); + EXPECT_GE(frame_buffer.stride[0], 112); + EXPECT_NE(frame_buffer.plane[1], nullptr); + EXPECT_GE(frame_buffer.stride[1], 64); + EXPECT_NE(frame_buffer.plane[2], nullptr); + EXPECT_GE(frame_buffer.stride[2], 64); + } + + // Release and get a few buffers at indexes <= 5 in random order. + static_assert(5 < kBufferListSize, ""); + static constexpr int indexes[] = {1, 4, 5, 5, 4, 3, 2, 3, 5, 0}; + for (int index : indexes) { + release_frame_buffer_(callback_private_data_, + frame_buffers[index].private_data); + + EXPECT_EQ(get_frame_buffer_(callback_private_data_, bitdepth, image_format, + width, height, left_border, right_border, + top_border, bottom_border, stride_alignment, + &frame_buffers[index]), + 0); + EXPECT_NE(frame_buffers[index].plane[0], nullptr); + EXPECT_GE(frame_buffers[index].stride[0], 112); + EXPECT_NE(frame_buffers[index].plane[1], nullptr); + EXPECT_GE(frame_buffers[index].stride[1], 64); + EXPECT_NE(frame_buffers[index].plane[2], nullptr); + EXPECT_GE(frame_buffers[index].stride[2], 64); + } + + for (auto& frame_buffer : frame_buffers) { + release_frame_buffer_(callback_private_data_, frame_buffer.private_data); + } +} + +TEST_F(InternalFrameBufferListTest, VaryingBufferSizes) { + const int bitdepth = 8; + const Libgav1ImageFormat image_format = kLibgav1ImageFormatYuv420; + const int width = 64; + const int height = 48; + const int left_border = 16; + const int right_border = 16; + const int top_border = 16; + const int bottom_border = 16; + const int stride_alignment = 16; + + EXPECT_EQ(on_frame_buffer_size_changed_(callback_private_data_, bitdepth, + image_format, 16 * width, 16 * height, + left_border, right_border, top_border, + bottom_border, stride_alignment), + 0); + + FrameBuffer frame_buffer; + for (int i = 1; i <= 16; ++i) { + EXPECT_EQ(get_frame_buffer_(callback_private_data_, bitdepth, image_format, + i * width, i * height, left_border, + right_border, top_border, bottom_border, + stride_alignment, &frame_buffer), + 0); + EXPECT_NE(frame_buffer.plane[0], nullptr); + EXPECT_GE(frame_buffer.stride[0], i * width + left_border + right_border); + EXPECT_NE(frame_buffer.plane[1], nullptr); + EXPECT_GE(frame_buffer.stride[1], + (i * width + left_border + right_border) >> 1); + EXPECT_NE(frame_buffer.plane[2], nullptr); + EXPECT_GE(frame_buffer.stride[2], + (i * width + left_border + right_border) >> 1); + release_frame_buffer_(callback_private_data_, frame_buffer.private_data); + } + for (int i = 16; i >= 1; --i) { + EXPECT_EQ(get_frame_buffer_(callback_private_data_, bitdepth, image_format, + i * width, i * 
height, left_border, + right_border, top_border, bottom_border, + stride_alignment, &frame_buffer), + 0); + EXPECT_NE(frame_buffer.plane[0], nullptr); + EXPECT_GE(frame_buffer.stride[0], i * width + left_border + right_border); + EXPECT_NE(frame_buffer.plane[1], nullptr); + EXPECT_GE(frame_buffer.stride[1], + (i * width + left_border + right_border) >> 1); + EXPECT_NE(frame_buffer.plane[2], nullptr); + EXPECT_GE(frame_buffer.stride[2], + (i * width + left_border + right_border) >> 1); + release_frame_buffer_(callback_private_data_, frame_buffer.private_data); + } +} + +} // namespace +} // namespace libgav1 diff --git a/src/libgav1_decoder.cmake b/src/libgav1_decoder.cmake new file mode 100644 index 0000000..b97d09d --- /dev/null +++ b/src/libgav1_decoder.cmake @@ -0,0 +1,157 @@ +# Copyright 2019 The libgav1 Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(LIBGAV1_SRC_LIBGAV1_DECODER_CMAKE_) + return() +endif() # LIBGAV1_SRC_LIBGAV1_DECODER_CMAKE_ +set(LIBGAV1_SRC_LIBGAV1_DECODER_CMAKE_ 1) + +list(APPEND libgav1_decoder_sources + "${libgav1_source}/buffer_pool.cc" + "${libgav1_source}/buffer_pool.h" + "${libgav1_source}/decoder_impl.cc" + "${libgav1_source}/decoder_impl.h" + "${libgav1_source}/decoder_state.h" + "${libgav1_source}/tile_scratch_buffer.cc" + "${libgav1_source}/tile_scratch_buffer.h" + "${libgav1_source}/film_grain.cc" + "${libgav1_source}/film_grain.h" + "${libgav1_source}/frame_buffer.cc" + "${libgav1_source}/frame_buffer_utils.h" + "${libgav1_source}/frame_scratch_buffer.h" + "${libgav1_source}/inter_intra_masks.inc" + "${libgav1_source}/internal_frame_buffer_list.cc" + "${libgav1_source}/internal_frame_buffer_list.h" + "${libgav1_source}/loop_restoration_info.cc" + "${libgav1_source}/loop_restoration_info.h" + "${libgav1_source}/motion_vector.cc" + "${libgav1_source}/motion_vector.h" + "${libgav1_source}/obu_parser.cc" + "${libgav1_source}/obu_parser.h" + "${libgav1_source}/post_filter/cdef.cc" + "${libgav1_source}/post_filter/deblock.cc" + "${libgav1_source}/post_filter/deblock_thresholds.inc" + "${libgav1_source}/post_filter/loop_restoration.cc" + "${libgav1_source}/post_filter/post_filter.cc" + "${libgav1_source}/post_filter/super_res.cc" + "${libgav1_source}/post_filter.h" + "${libgav1_source}/prediction_mask.cc" + "${libgav1_source}/prediction_mask.h" + "${libgav1_source}/quantizer.cc" + "${libgav1_source}/quantizer.h" + "${libgav1_source}/quantizer_tables.inc" + "${libgav1_source}/reconstruction.cc" + "${libgav1_source}/reconstruction.h" + "${libgav1_source}/residual_buffer_pool.cc" + "${libgav1_source}/residual_buffer_pool.h" + "${libgav1_source}/scan_tables.inc" + "${libgav1_source}/symbol_decoder_context.cc" + "${libgav1_source}/symbol_decoder_context.h" + "${libgav1_source}/symbol_decoder_context_cdfs.inc" + "${libgav1_source}/threading_strategy.cc" + "${libgav1_source}/threading_strategy.h" + "${libgav1_source}/tile.h" + "${libgav1_source}/tile/bitstream/mode_info.cc" + "${libgav1_source}/tile/bitstream/palette.cc" + 
"${libgav1_source}/tile/bitstream/partition.cc" + "${libgav1_source}/tile/bitstream/transform_size.cc" + "${libgav1_source}/tile/prediction.cc" + "${libgav1_source}/tile/tile.cc" + "${libgav1_source}/warp_prediction.cc" + "${libgav1_source}/warp_prediction.h" + "${libgav1_source}/yuv_buffer.cc" + "${libgav1_source}/yuv_buffer.h") + +list(APPEND libgav1_api_includes "${libgav1_source}/gav1/decoder.h" + "${libgav1_source}/gav1/decoder_buffer.h" + "${libgav1_source}/gav1/decoder_settings.h" + "${libgav1_source}/gav1/frame_buffer.h" + "${libgav1_source}/gav1/status_code.h" + "${libgav1_source}/gav1/symbol_visibility.h" + "${libgav1_source}/gav1/version.h") + +list(APPEND libgav1_api_sources "${libgav1_source}/decoder.cc" + "${libgav1_source}/decoder_settings.cc" + "${libgav1_source}/status_code.cc" + "${libgav1_source}/version.cc" + ${libgav1_api_includes}) + +macro(libgav1_add_decoder_targets) + if(BUILD_SHARED_LIBS) + if(MSVC OR WIN32) + # In order to produce a DLL and import library the Windows tools require + # that the exported symbols are part of the DLL target. The unfortunate + # side effect of this is that a single configuration cannot output both + # the static library and the DLL: This results in an either/or situation. + # Windows users of the libgav1 build can have a DLL and an import library, + # or they can have a static library; they cannot have both from a single + # configuration of the build. + list(APPEND libgav1_shared_lib_sources ${libgav1_api_sources}) + list(APPEND libgav1_static_lib_sources ${libgav1_api_includes}) + else() + list(APPEND libgav1_shared_lib_sources ${libgav1_api_includes}) + list(APPEND libgav1_static_lib_sources ${libgav1_api_sources}) + endif() + else() + list(APPEND libgav1_static_lib_sources ${libgav1_api_sources}) + endif() + + if(NOT ANDROID) + list(APPEND libgav1_absl_deps absl::base absl::synchronization) + endif() + + libgav1_add_library(NAME libgav1_decoder TYPE OBJECT SOURCES + ${libgav1_decoder_sources} DEFINES ${libgav1_defines} + INCLUDES ${libgav1_include_paths}) + + libgav1_add_library(NAME + libgav1_static + OUTPUT_NAME + libgav1 + TYPE + STATIC + SOURCES + ${libgav1_static_lib_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_include_paths} + LIB_DEPS + ${libgav1_absl_deps} + OBJLIB_DEPS + libgav1_dsp + libgav1_decoder + libgav1_utils + PUBLIC_INCLUDES + ${libgav1_source}) + + if(BUILD_SHARED_LIBS) + libgav1_add_library(NAME + libgav1_shared + OUTPUT_NAME + libgav1 + TYPE + SHARED + SOURCES + ${libgav1_shared_lib_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_include_paths} + LIB_DEPS + libgav1_static + PUBLIC_INCLUDES + ${libgav1_source}) + endif() +endmacro() diff --git a/src/loop_restoration_info.cc b/src/loop_restoration_info.cc new file mode 100644 index 0000000..8c17711 --- /dev/null +++ b/src/loop_restoration_info.cc @@ -0,0 +1,240 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "src/loop_restoration_info.h" + +#include <algorithm> +#include <array> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <memory> +#include <new> + +#include "src/utils/common.h" +#include "src/utils/logging.h" + +namespace libgav1 { +namespace { + +// Controls how self guided deltas are read. +constexpr int kSgrProjReadControl = 4; +// Maps the restoration type encoded in the compressed headers (restoration_type +// element in the spec) of the bitstream to LoopRestorationType. This is used +// only when the restoration type in the frame header is +// LoopRestorationTypeSwitchable. +constexpr LoopRestorationType kBitstreamRestorationTypeMap[] = { + kLoopRestorationTypeNone, kLoopRestorationTypeWiener, + kLoopRestorationTypeSgrProj}; + +inline int CountLeadingZeroCoefficients(const int16_t* const filter) { + int number_zero_coefficients = 0; + if (filter[0] == 0) { + number_zero_coefficients++; + if (filter[1] == 0) { + number_zero_coefficients++; + if (filter[2] == 0) { + number_zero_coefficients++; + } + } + } + return number_zero_coefficients; +} + +} // namespace + +bool LoopRestorationInfo::Reset(const LoopRestoration* const loop_restoration, + uint32_t width, uint32_t height, + int8_t subsampling_x, int8_t subsampling_y, + bool is_monochrome) { + loop_restoration_ = loop_restoration; + subsampling_x_ = subsampling_x; + subsampling_y_ = subsampling_y; + + const int num_planes = is_monochrome ? kMaxPlanesMonochrome : kMaxPlanes; + int total_num_units = 0; + for (int plane = kPlaneY; plane < num_planes; ++plane) { + if (loop_restoration_->type[plane] == kLoopRestorationTypeNone) { + plane_needs_filtering_[plane] = false; + continue; + } + plane_needs_filtering_[plane] = true; + const int plane_width = + (plane == kPlaneY) ? width : SubsampledValue(width, subsampling_x_); + const int plane_height = + (plane == kPlaneY) ? height : SubsampledValue(height, subsampling_y_); + num_horizontal_units_[plane] = + std::max(1, RightShiftWithRounding( + plane_width, loop_restoration_->unit_size_log2[plane])); + num_vertical_units_[plane] = std::max( + 1, RightShiftWithRounding(plane_height, + loop_restoration_->unit_size_log2[plane])); + num_units_[plane] = + num_horizontal_units_[plane] * num_vertical_units_[plane]; + total_num_units += num_units_[plane]; + } + // Allocate the RestorationUnitInfo arrays for all planes in a single heap + // allocation and divide up the buffer into arrays of the right sizes. + if (!loop_restoration_info_buffer_.Resize(total_num_units)) { + return false; + } + RestorationUnitInfo* loop_restoration_info = + loop_restoration_info_buffer_.get(); + for (int plane = kPlaneY; plane < num_planes; ++plane) { + if (loop_restoration_->type[plane] == kLoopRestorationTypeNone) { + continue; + } + loop_restoration_info_[plane] = loop_restoration_info; + loop_restoration_info += num_units_[plane]; + } + return true; +} + +bool LoopRestorationInfo::PopulateUnitInfoForSuperBlock( + Plane plane, BlockSize block_size, bool is_superres_scaled, + uint8_t superres_scale_denominator, int row4x4, int column4x4, + LoopRestorationUnitInfo* const unit_info) const { + assert(unit_info != nullptr); + if (!plane_needs_filtering_[plane]) return false; + const int numerator_column = + is_superres_scaled ?
superres_scale_denominator : 1; + const int pixel_column_start = + RowOrColumn4x4ToPixel(column4x4, plane, subsampling_x_); + const int pixel_column_end = RowOrColumn4x4ToPixel( + column4x4 + kNum4x4BlocksWide[block_size], plane, subsampling_x_); + const int unit_row_log2 = loop_restoration_->unit_size_log2[plane]; + const int denominator_column_log2 = + unit_row_log2 + (is_superres_scaled ? 3 : 0); + const int pixel_row_start = + RowOrColumn4x4ToPixel(row4x4, plane, subsampling_y_); + const int pixel_row_end = RowOrColumn4x4ToPixel( + row4x4 + kNum4x4BlocksHigh[block_size], plane, subsampling_y_); + unit_info->column_start = RightShiftWithCeiling( + pixel_column_start * numerator_column, denominator_column_log2); + unit_info->column_end = RightShiftWithCeiling( + pixel_column_end * numerator_column, denominator_column_log2); + unit_info->row_start = RightShiftWithCeiling(pixel_row_start, unit_row_log2); + unit_info->row_end = RightShiftWithCeiling(pixel_row_end, unit_row_log2); + unit_info->column_end = + std::min(unit_info->column_end, num_horizontal_units_[plane]); + unit_info->row_end = std::min(unit_info->row_end, num_vertical_units_[plane]); + return true; +} + +void LoopRestorationInfo::ReadUnitCoefficients( + EntropyDecoder* const reader, + SymbolDecoderContext* const symbol_decoder_context, Plane plane, + int unit_id, + std::array<RestorationUnitInfo, kMaxPlanes>* const reference_unit_info) { + LoopRestorationType unit_restoration_type = kLoopRestorationTypeNone; + if (loop_restoration_->type[plane] == kLoopRestorationTypeSwitchable) { + unit_restoration_type = kBitstreamRestorationTypeMap + [reader->ReadSymbol<kRestorationTypeSymbolCount>( + symbol_decoder_context->restoration_type_cdf)]; + } else if (loop_restoration_->type[plane] == kLoopRestorationTypeWiener) { + const bool use_wiener = + reader->ReadSymbol(symbol_decoder_context->use_wiener_cdf); + if (use_wiener) unit_restoration_type = kLoopRestorationTypeWiener; + } else if (loop_restoration_->type[plane] == kLoopRestorationTypeSgrProj) { + const bool use_sgrproj = + reader->ReadSymbol(symbol_decoder_context->use_sgrproj_cdf); + if (use_sgrproj) unit_restoration_type = kLoopRestorationTypeSgrProj; + } + loop_restoration_info_[plane][unit_id].type = unit_restoration_type; + + if (unit_restoration_type == kLoopRestorationTypeWiener) { + ReadWienerInfo(reader, plane, unit_id, reference_unit_info); + } else if (unit_restoration_type == kLoopRestorationTypeSgrProj) { + ReadSgrProjInfo(reader, plane, unit_id, reference_unit_info); + } +} + +void LoopRestorationInfo::ReadWienerInfo( + EntropyDecoder* const reader, Plane plane, int unit_id, + std::array<RestorationUnitInfo, kMaxPlanes>* const reference_unit_info) { + for (int i = WienerInfo::kVertical; i <= WienerInfo::kHorizontal; ++i) { + if (plane != kPlaneY) { + loop_restoration_info_[plane][unit_id].wiener_info.filter[i][0] = 0; + } + int sum = 0; + for (int j = static_cast<int>(plane != kPlaneY); j < kNumWienerCoefficients; + ++j) { + const int8_t wiener_min = kWienerTapsMin[j]; + const int8_t wiener_max = kWienerTapsMax[j]; + const int control = j + 1; + int value; + if (!reader->DecodeSignedSubexpWithReference( + wiener_min, wiener_max + 1, + (*reference_unit_info)[plane].wiener_info.filter[i][j], control, + &value)) { + LIBGAV1_DLOG( + ERROR, + "Error decoding Wiener filter coefficients: plane %d, unit_id %d", + static_cast<int>(plane), unit_id); + return; + } + loop_restoration_info_[plane][unit_id].wiener_info.filter[i][j] = value; + (*reference_unit_info)[plane].wiener_info.filter[i][j] = value; + sum += value; + } +
loop_restoration_info_[plane][unit_id].wiener_info.filter[i][3] = + 128 - 2 * sum; + loop_restoration_info_[plane][unit_id] + .wiener_info.number_leading_zero_coefficients[i] = + CountLeadingZeroCoefficients( + loop_restoration_info_[plane][unit_id].wiener_info.filter[i]); + } +} + +void LoopRestorationInfo::ReadSgrProjInfo( + EntropyDecoder* const reader, Plane plane, int unit_id, + std::array<RestorationUnitInfo, kMaxPlanes>* const reference_unit_info) { + const int sgr_proj_index = + static_cast<int>(reader->ReadLiteral(kSgrProjParamsBits)); + loop_restoration_info_[plane][unit_id].sgr_proj_info.index = sgr_proj_index; + for (int i = 0; i < 2; ++i) { + const uint8_t radius = kSgrProjParams[sgr_proj_index][i * 2]; + const int8_t multiplier_min = kSgrProjMultiplierMin[i]; + const int8_t multiplier_max = kSgrProjMultiplierMax[i]; + int multiplier; + if (radius != 0) { + if (!reader->DecodeSignedSubexpWithReference( + multiplier_min, multiplier_max + 1, + (*reference_unit_info)[plane].sgr_proj_info.multiplier[i], + kSgrProjReadControl, &multiplier)) { + LIBGAV1_DLOG(ERROR, + "Error decoding Self-guided filter coefficients: plane " + "%d, unit_id %d", + static_cast<int>(plane), unit_id); + return; + } + } else { + // The range of (*reference_unit_info)[plane].sgr_proj_info.multiplier[0] + // from DecodeSignedSubexpWithReference() is [-96, 31], the default is + // -32, making Clip3(128 - 31, -32, 95) unnecessary. + static constexpr int kMultiplier[2] = {0, 95}; + multiplier = kMultiplier[i]; + assert( + i == 0 || + Clip3((1 << kSgrProjPrecisionBits) - + (*reference_unit_info)[plane].sgr_proj_info.multiplier[0], + multiplier_min, multiplier_max) == kMultiplier[1]); + } + loop_restoration_info_[plane][unit_id].sgr_proj_info.multiplier[i] = + multiplier; + (*reference_unit_info)[plane].sgr_proj_info.multiplier[i] = multiplier; + } +} + +} // namespace libgav1 diff --git a/src/loop_restoration_info.h b/src/loop_restoration_info.h new file mode 100644 index 0000000..bff6746 --- /dev/null +++ b/src/loop_restoration_info.h @@ -0,0 +1,102 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_LOOP_RESTORATION_INFO_H_ +#define LIBGAV1_SRC_LOOP_RESTORATION_INFO_H_ + +#include <array> +#include <cstdint> + +#include "src/dsp/common.h" +#include "src/symbol_decoder_context.h" +#include "src/utils/constants.h" +#include "src/utils/dynamic_buffer.h" +#include "src/utils/entropy_decoder.h" +#include "src/utils/types.h" + +namespace libgav1 { + +struct LoopRestorationUnitInfo { + int row_start; + int row_end; + int column_start; + int column_end; +};
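A worked sketch of the unit-count arithmetic performed by LoopRestorationInfo::Reset() (declared below): NumUnits is a hypothetical stand-in for the std::max/RightShiftWithRounding combination from src/utils/common.h, and unit_size_log2 == 8 corresponds to 256-pixel restoration units:

  #include <algorithm>

  int NumUnits(int plane_width, int unit_size_log2) {
    // RightShiftWithRounding(x, n) == (x + (1 << (n - 1))) >> n for x >= 0.
    const int rounded =
        (plane_width + (1 << (unit_size_log2 - 1))) >> unit_size_log2;
    return std::max(1, rounded);  // every filtered plane gets at least one unit
  }
  // NumUnits(1920, 8) == 8 and NumUnits(1080, 8) == 4: a 1080p luma plane is
  // covered by an 8x4 grid of 256x256 restoration units.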
+class LoopRestorationInfo { + public: + LoopRestorationInfo() = default; + + // Not copyable or movable. + LoopRestorationInfo(const LoopRestorationInfo&) = delete; + LoopRestorationInfo& operator=(const LoopRestorationInfo&) = delete; + LoopRestorationInfo(LoopRestorationInfo&&) = delete; + LoopRestorationInfo& operator=(LoopRestorationInfo&&) = delete; + + bool Reset(const LoopRestoration* loop_restoration, uint32_t width, + uint32_t height, int8_t subsampling_x, int8_t subsampling_y, + bool is_monochrome); + // Populates the |unit_info| for the super block at |row4x4|, |column4x4|. + // Returns true on success, false otherwise. + bool PopulateUnitInfoForSuperBlock(Plane plane, BlockSize block_size, + bool is_superres_scaled, + uint8_t superres_scale_denominator, + int row4x4, int column4x4, + LoopRestorationUnitInfo* unit_info) const; + void ReadUnitCoefficients(EntropyDecoder* reader, + SymbolDecoderContext* symbol_decoder_context, + Plane plane, int unit_id, + std::array<RestorationUnitInfo, kMaxPlanes>* + reference_unit_info); // 5.11.58. + void ReadWienerInfo( + EntropyDecoder* reader, Plane plane, int unit_id, + std::array<RestorationUnitInfo, kMaxPlanes>* reference_unit_info); + void ReadSgrProjInfo( + EntropyDecoder* reader, Plane plane, int unit_id, + std::array<RestorationUnitInfo, kMaxPlanes>* reference_unit_info); + + // Getters. + const RestorationUnitInfo* loop_restoration_info(Plane plane, + int unit_id) const { + return &loop_restoration_info_[plane][unit_id]; + } + + int num_horizontal_units(Plane plane) const { + return num_horizontal_units_[plane]; + } + int num_vertical_units(Plane plane) const { + return num_vertical_units_[plane]; + } + int num_units(Plane plane) const { return num_units_[plane]; } + + private: + // If plane_needs_filtering_[plane] is true, loop_restoration_info_[plane] + // points to an array of num_units_[plane] elements. + RestorationUnitInfo* loop_restoration_info_[kMaxPlanes]; + // Owns the memory that loop_restoration_info_[plane] points to. + DynamicBuffer<RestorationUnitInfo> loop_restoration_info_buffer_; + bool plane_needs_filtering_[kMaxPlanes]; + const LoopRestoration* loop_restoration_; + int8_t subsampling_x_; + int8_t subsampling_y_; + int num_horizontal_units_[kMaxPlanes]; + int num_vertical_units_[kMaxPlanes]; + int num_units_[kMaxPlanes]; +}; + +} // namespace libgav1 + +#endif // LIBGAV1_SRC_LOOP_RESTORATION_INFO_H_ diff --git a/src/motion_vector.cc b/src/motion_vector.cc new file mode 100644 index 0000000..36018ab --- /dev/null +++ b/src/motion_vector.cc @@ -0,0 +1,1000 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/motion_vector.h" + +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <cstdlib> + +#include "src/dsp/dsp.h" +#include "src/utils/bit_mask_set.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/logging.h" + +namespace libgav1 { +namespace { + +// Entry at index i is computed as: +// Clip3(std::max(kBlockWidthPixels[i], kBlockHeightPixels[i]), 16, 112). +constexpr int kWarpValidThreshold[kMaxBlockSizes] = { + 16, 16, 16, 16, 16, 16, 32, 16, 16, 16, 32, + 64, 32, 32, 32, 64, 64, 64, 64, 112, 112, 112}; +
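A quick sketch that recomputes kWarpValidThreshold entries from the formula in the comment above (WarpValidThreshold is a hypothetical helper; arguments are block pixel dimensions):

  #include <algorithm>

  int WarpValidThreshold(int width, int height) {
    return std::min(std::max(std::max(width, height), 16), 112);
  }
  // WarpValidThreshold(8, 32) == 32 matches the kBlock8x32 entry and
  // WarpValidThreshold(128, 128) == 112 matches the kBlock128x128 entry.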
+// 7.10.2.10. +void LowerMvPrecision(const ObuFrameHeader& frame_header, + MotionVector* const mvs) { + if (frame_header.allow_high_precision_mv) return; + if (frame_header.force_integer_mv != 0) { + for (auto& mv : mvs->mv) { + // The next line is equivalent to: + // const int value = (std::abs(static_cast<int>(mv)) + 3) & ~7; + // const int sign = mv >> 15; + // mv = ApplySign(value, sign); + mv = (mv + 3 - (mv >> 15)) & ~7; + } + } else { + for (auto& mv : mvs->mv) { + // The next line is equivalent to: + // if ((mv & 1) != 0) mv += (mv > 0) ? -1 : 1; + mv = (mv - (mv >> 15)) & ~1; + } + } +} + +// 7.10.2.1. +void SetupGlobalMv(const Tile::Block& block, int index, + MotionVector* const mv) { + const BlockParameters& bp = *block.bp; + const ObuFrameHeader& frame_header = block.tile.frame_header(); + ReferenceFrameType reference_type = bp.reference_frame[index]; + const auto& gm = frame_header.global_motion[reference_type]; + if (reference_type == kReferenceFrameIntra || + gm.type == kGlobalMotionTransformationTypeIdentity) { + mv->mv32 = 0; + return; + } + if (gm.type == kGlobalMotionTransformationTypeTranslation) { + for (int i = 0; i < 2; ++i) { + mv->mv[i] = gm.params[i] >> (kWarpedModelPrecisionBits - 3); + } + LowerMvPrecision(frame_header, mv); + return; + } + const int x = MultiplyBy4(block.column4x4) + DivideBy2(block.width) - 1; + const int y = MultiplyBy4(block.row4x4) + DivideBy2(block.height) - 1; + const int xc = (gm.params[2] - (1 << kWarpedModelPrecisionBits)) * x + + gm.params[3] * y + gm.params[0]; + const int yc = gm.params[4] * x + + (gm.params[5] - (1 << kWarpedModelPrecisionBits)) * y + + gm.params[1]; + if (frame_header.allow_high_precision_mv) { + mv->mv[0] = RightShiftWithRoundingSigned(yc, kWarpedModelPrecisionBits - 3); + mv->mv[1] = RightShiftWithRoundingSigned(xc, kWarpedModelPrecisionBits - 3); + } else { + mv->mv[0] = MultiplyBy2( + RightShiftWithRoundingSigned(yc, kWarpedModelPrecisionBits - 2)); + mv->mv[1] = MultiplyBy2( + RightShiftWithRoundingSigned(xc, kWarpedModelPrecisionBits - 2)); + LowerMvPrecision(frame_header, mv); + } +} + +constexpr BitMaskSet kPredictionModeNewMvMask(kPredictionModeNewMv, + kPredictionModeNewNewMv, + kPredictionModeNearNewMv, + kPredictionModeNewNearMv, + kPredictionModeNearestNewMv, + kPredictionModeNewNearestMv); + +// 7.10.2.8. +void SearchStack(const Tile::Block& block, const BlockParameters& mv_bp, + int index, int weight, bool* const found_new_mv, + bool* const found_match, int* const num_mv_found) { + const BlockParameters& bp = *block.bp; + const std::array<GlobalMotion, kNumReferenceFrameTypes>& global_motion = + block.tile.frame_header().global_motion; + PredictionParameters& prediction_parameters = *bp.prediction_parameters; + MotionVector candidate_mv; + // LowerMvPrecision() is not necessary, since the values in + // |prediction_parameters.global_mv| and |mv_bp.mv| were generated by it.
+ const auto global_motion_type = global_motion[bp.reference_frame[0]].type; + if (IsGlobalMvBlock(mv_bp, global_motion_type)) { + candidate_mv = prediction_parameters.global_mv[0]; + } else { + candidate_mv = mv_bp.mv.mv[index]; + } + *found_new_mv |= kPredictionModeNewMvMask.Contains(mv_bp.y_mode); + *found_match = true; + MotionVector* const ref_mv_stack = prediction_parameters.ref_mv_stack; + const int num_found = *num_mv_found; + const auto result = std::find_if(ref_mv_stack, ref_mv_stack + num_found, + [&candidate_mv](const MotionVector& ref_mv) { + return ref_mv.mv32 == candidate_mv.mv32; + }); + if (result != ref_mv_stack + num_found) { + prediction_parameters.IncreaseWeight(std::distance(ref_mv_stack, result), + weight); + return; + } + if (num_found >= kMaxRefMvStackSize) return; + ref_mv_stack[num_found] = candidate_mv; + prediction_parameters.SetWeightIndexStackEntry(num_found, weight); + ++*num_mv_found; +} + +// 7.10.2.9. +void CompoundSearchStack(const Tile::Block& block, const BlockParameters& mv_bp, + int weight, bool* const found_new_mv, + bool* const found_match, int* const num_mv_found) { + const BlockParameters& bp = *block.bp; + const std::array<GlobalMotion, kNumReferenceFrameTypes>& global_motion = + block.tile.frame_header().global_motion; + PredictionParameters& prediction_parameters = *bp.prediction_parameters; + // LowerMvPrecision() is not necessary, since the values in + // |prediction_parameters.global_mv| and |mv_bp.mv| were generated by it. + CompoundMotionVector candidate_mv = mv_bp.mv; + for (int i = 0; i < 2; ++i) { + const auto global_motion_type = global_motion[bp.reference_frame[i]].type; + if (IsGlobalMvBlock(mv_bp, global_motion_type)) { + candidate_mv.mv[i] = prediction_parameters.global_mv[i]; + } + } + *found_new_mv |= kPredictionModeNewMvMask.Contains(mv_bp.y_mode); + *found_match = true; + CompoundMotionVector* const compound_ref_mv_stack = + prediction_parameters.compound_ref_mv_stack; + const int num_found = *num_mv_found; + const auto result = + std::find_if(compound_ref_mv_stack, compound_ref_mv_stack + num_found, + [&candidate_mv](const CompoundMotionVector& ref_mv) { + return ref_mv.mv64 == candidate_mv.mv64; + }); + if (result != compound_ref_mv_stack + num_found) { + prediction_parameters.IncreaseWeight( + std::distance(compound_ref_mv_stack, result), weight); + return; + } + if (num_found >= kMaxRefMvStackSize) return; + compound_ref_mv_stack[num_found].mv64 = candidate_mv.mv64; + prediction_parameters.SetWeightIndexStackEntry(num_found, weight); + ++*num_mv_found; +} + +// 7.10.2.7. +void AddReferenceMvCandidate(const Tile::Block& block, + const BlockParameters& mv_bp, bool is_compound, + int weight, bool* const found_new_mv, + bool* const found_match, int* const num_mv_found) { + if (!mv_bp.is_inter) return; + const BlockParameters& bp = *block.bp; + if (is_compound) { + if (mv_bp.reference_frame[0] == bp.reference_frame[0] && + mv_bp.reference_frame[1] == bp.reference_frame[1]) { + CompoundSearchStack(block, mv_bp, weight, found_new_mv, found_match, + num_mv_found); + } + return; + } + for (int i = 0; i < 2; ++i) { + if (mv_bp.reference_frame[i] == bp.reference_frame[0]) { + SearchStack(block, mv_bp, i, weight, found_new_mv, found_match, + num_mv_found); + } + } +} + +int GetMinimumStep(int block_width_or_height4x4, int delta_row_or_column) { + assert(delta_row_or_column < 0); + if (block_width_or_height4x4 >= 16) return 4; + if (delta_row_or_column < -1) return 2; + return 0; +} +
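The branchless rounding in LowerMvPrecision() above works because (mv >> 15) is -1 for a negative 16-bit motion vector component and 0 otherwise, so subtracting it adds one to negative values before the low three bits are cleared. A self-contained check of the force_integer_mv identity (a sketch, independent of the library types):

  #include <cassert>
  #include <cstdlib>

  int main() {
    for (int mv = -2048; mv <= 2048; ++mv) {
      const int branchless = (mv + 3 - (mv >> 15)) & ~7;
      const int value = (std::abs(mv) + 3) & ~7;        // round the magnitude
      const int reference = (mv < 0) ? -value : value;  // reapply the sign
      assert(branchless == reference);
    }
    return 0;
  }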
+// 7.10.2.2. +void ScanRow(const Tile::Block& block, int mv_column, int delta_row, + bool is_compound, bool* const found_new_mv, + bool* const found_match, int* const num_mv_found) { + const int mv_row = block.row4x4 + delta_row; + const Tile& tile = block.tile; + if (!tile.IsTopInside(mv_row + 1)) return; + const int width4x4 = block.width4x4; + const int min_step = GetMinimumStep(width4x4, delta_row); + BlockParameters** bps = tile.BlockParametersAddress(mv_row, mv_column); + BlockParameters** const end_bps = + bps + std::min({static_cast<int>(width4x4), + tile.frame_header().columns4x4 - block.column4x4, 16}); + do { + const BlockParameters& mv_bp = **bps; + const int step = std::max( + std::min(width4x4, static_cast<int>(kNum4x4BlocksWide[mv_bp.size])), + min_step); + AddReferenceMvCandidate(block, mv_bp, is_compound, MultiplyBy2(step), + found_new_mv, found_match, num_mv_found); + bps += step; + } while (bps < end_bps); +} + +// 7.10.2.3. +void ScanColumn(const Tile::Block& block, int mv_row, int delta_column, + bool is_compound, bool* const found_new_mv, + bool* const found_match, int* const num_mv_found) { + const int mv_column = block.column4x4 + delta_column; + const Tile& tile = block.tile; + if (!tile.IsLeftInside(mv_column + 1)) return; + const int height4x4 = block.height4x4; + const int min_step = GetMinimumStep(height4x4, delta_column); + const ptrdiff_t stride = tile.BlockParametersStride(); + BlockParameters** bps = tile.BlockParametersAddress(mv_row, mv_column); + BlockParameters** const end_bps = + bps + stride * std::min({static_cast<int>(height4x4), + tile.frame_header().rows4x4 - block.row4x4, 16}); + do { + const BlockParameters& mv_bp = **bps; + const int step = std::max( + std::min(height4x4, static_cast<int>(kNum4x4BlocksHigh[mv_bp.size])), + min_step); + AddReferenceMvCandidate(block, mv_bp, is_compound, MultiplyBy2(step), + found_new_mv, found_match, num_mv_found); + bps += step * stride; + } while (bps < end_bps); +} + +// 7.10.2.4. +void ScanPoint(const Tile::Block& block, int delta_row, int delta_column, + bool is_compound, bool* const found_new_mv, + bool* const found_match, int* const num_mv_found) { + const int mv_row = block.row4x4 + delta_row; + const int mv_column = block.column4x4 + delta_column; + const Tile& tile = block.tile; + if (!tile.IsInside(mv_row, mv_column) || + !tile.HasParameters(mv_row, mv_column)) { + return; + } + const BlockParameters& mv_bp = tile.Parameters(mv_row, mv_column); + if (mv_bp.reference_frame[0] == kReferenceFrameNone) return; + AddReferenceMvCandidate(block, mv_bp, is_compound, 4, found_new_mv, + found_match, num_mv_found); +} + +// 7.10.2.6. +void AddTemporalReferenceMvCandidate( + const ObuFrameHeader& frame_header, const int reference_offsets[2], + const MotionVector* const temporal_mvs, + const int8_t* const temporal_reference_offsets, int count, bool is_compound, + int* const zero_mv_context, int* const num_mv_found, + PredictionParameters* const prediction_parameters) { + const int mv_projection_function_index = + frame_header.allow_high_precision_mv ?
2 : frame_header.force_integer_mv; + const MotionVector* const global_mv = prediction_parameters->global_mv; + if (is_compound) { + alignas(kMaxAlignment) + CompoundMotionVector candidate_mvs[kMaxTemporalMvCandidatesWithPadding]; + const dsp::Dsp& dsp = *dsp::GetDspTable(8); + dsp.mv_projection_compound[mv_projection_function_index]( + temporal_mvs, temporal_reference_offsets, reference_offsets, count, + candidate_mvs); + if (*zero_mv_context == -1) { + int max_difference = + std::max(std::abs(candidate_mvs[0].mv[0].mv[0] - global_mv[0].mv[0]), + std::abs(candidate_mvs[0].mv[0].mv[1] - global_mv[0].mv[1])); + max_difference = + std::max(max_difference, + std::abs(candidate_mvs[0].mv[1].mv[0] - global_mv[1].mv[0])); + max_difference = + std::max(max_difference, + std::abs(candidate_mvs[0].mv[1].mv[1] - global_mv[1].mv[1])); + *zero_mv_context = static_cast<int>(max_difference >= 16); + } + CompoundMotionVector* const compound_ref_mv_stack = + prediction_parameters->compound_ref_mv_stack; + int num_found = *num_mv_found; + int index = 0; + do { + const CompoundMotionVector& candidate_mv = candidate_mvs[index]; + const auto result = + std::find_if(compound_ref_mv_stack, compound_ref_mv_stack + num_found, + [&candidate_mv](const CompoundMotionVector& ref_mv) { + return ref_mv.mv64 == candidate_mv.mv64; + }); + if (result != compound_ref_mv_stack + num_found) { + prediction_parameters->IncreaseWeight( + std::distance(compound_ref_mv_stack, result), 2); + continue; + } + if (num_found >= kMaxRefMvStackSize) continue; + compound_ref_mv_stack[num_found].mv64 = candidate_mv.mv64; + prediction_parameters->SetWeightIndexStackEntry(num_found, 2); + ++num_found; + } while (++index < count); + *num_mv_found = num_found; + return; + } + MotionVector* const ref_mv_stack = prediction_parameters->ref_mv_stack; + if (reference_offsets[0] == 0) { + if (*zero_mv_context == -1) { + const int max_difference = + std::max(std::abs(global_mv[0].mv[0]), std::abs(global_mv[0].mv[1])); + *zero_mv_context = static_cast<int>(max_difference >= 16); + } + const MotionVector candidate_mv = {}; + const int num_found = *num_mv_found; + const auto result = + std::find_if(ref_mv_stack, ref_mv_stack + num_found, + [&candidate_mv](const MotionVector& ref_mv) { + return ref_mv.mv32 == candidate_mv.mv32; + }); + if (result != ref_mv_stack + num_found) { + prediction_parameters->IncreaseWeight(std::distance(ref_mv_stack, result), + 2 * count); + return; + } + if (num_found >= kMaxRefMvStackSize) return; + ref_mv_stack[num_found] = candidate_mv; + prediction_parameters->SetWeightIndexStackEntry(num_found, 2 * count); + ++*num_mv_found; + return; + } + alignas(kMaxAlignment) + MotionVector candidate_mvs[kMaxTemporalMvCandidatesWithPadding]; + const dsp::Dsp& dsp = *dsp::GetDspTable(8); + dsp.mv_projection_single[mv_projection_function_index]( + temporal_mvs, temporal_reference_offsets, reference_offsets[0], count, + candidate_mvs); + if (*zero_mv_context == -1) { + const int max_difference = + std::max(std::abs(candidate_mvs[0].mv[0] - global_mv[0].mv[0]), + std::abs(candidate_mvs[0].mv[1] - global_mv[0].mv[1])); + *zero_mv_context = static_cast<int>(max_difference >= 16); + } + int num_found = *num_mv_found; + int index = 0; + do { + const MotionVector& candidate_mv = candidate_mvs[index]; + const auto result = + std::find_if(ref_mv_stack, ref_mv_stack + num_found, + [&candidate_mv](const MotionVector& ref_mv) { + return ref_mv.mv32 == candidate_mv.mv32; + }); + if (result != ref_mv_stack + num_found) {
prediction_parameters->IncreaseWeight(std::distance(ref_mv_stack, result), + 2); + continue; + } + if (num_found >= kMaxRefMvStackSize) continue; + ref_mv_stack[num_found] = candidate_mv; + prediction_parameters->SetWeightIndexStackEntry(num_found, 2); + ++num_found; + } while (++index < count); + *num_mv_found = num_found; +} + +// Part of 7.10.2.5. +bool IsWithinTheSame64x64Block(const Tile::Block& block, int delta_row, + int delta_column) { + const int row = (block.row4x4 & 15) + delta_row; + const int column = (block.column4x4 & 15) + delta_column; + // |block.height4x4| is at least 2 for all elements in |kTemporalScanMask|. + // So |row| is always non-negative. + assert(row >= 0); + return row < 16 && column >= 0 && column < 16; +} + +constexpr BitMaskSet kTemporalScanMask(kBlock8x8, kBlock8x16, kBlock8x32, + kBlock16x8, kBlock16x16, kBlock16x32, + kBlock32x8, kBlock32x16, kBlock32x32); + +// 7.10.2.5. +void TemporalScan(const Tile::Block& block, bool is_compound, + int* const zero_mv_context, int* const num_mv_found) { + const int step_w = (block.width4x4 >= 16) ? 4 : 2; + const int step_h = (block.height4x4 >= 16) ? 4 : 2; + const int row_start = block.row4x4 | 1; + const int column_start = block.column4x4 | 1; + const int row_end = + row_start + std::min(static_cast<int>(block.height4x4), 16); + const int column_end = + column_start + std::min(static_cast<int>(block.width4x4), 16); + const Tile& tile = block.tile; + const TemporalMotionField& motion_field = tile.motion_field(); + const int stride = motion_field.mv.columns(); + const MotionVector* motion_field_mv = motion_field.mv[0]; + const int8_t* motion_field_reference_offset = + motion_field.reference_offset[0]; + alignas(kMaxAlignment) + MotionVector temporal_mvs[kMaxTemporalMvCandidatesWithPadding]; + int8_t temporal_reference_offsets[kMaxTemporalMvCandidatesWithPadding]; + int count = 0; + int offset = stride * (row_start >> 1); + int mv_row = row_start; + do { + int mv_column = column_start; + do { + // Both horizontal and vertical offsets are positive. Only bottom and + // right boundaries need to be checked. + if (tile.IsBottomRightInside(mv_row, mv_column)) { + const int x8 = mv_column >> 1; + const MotionVector temporal_mv = motion_field_mv[offset + x8]; + if (temporal_mv.mv[0] == kInvalidMvValue) { + if (mv_row == row_start && mv_column == column_start) { + *zero_mv_context = 1; + } + } else { + temporal_mvs[count] = temporal_mv; + temporal_reference_offsets[count++] = + motion_field_reference_offset[offset + x8]; + } + } + mv_column += step_w; + } while (mv_column < column_end); + offset += stride * step_h >> 1; + mv_row += step_h; + } while (mv_row < row_end); + if (kTemporalScanMask.Contains(block.size)) { + const int temporal_sample_positions[3][2] = { + {block.height4x4, -2}, + {block.height4x4, block.width4x4}, + {block.height4x4 - 2, block.width4x4}}; + // Getting the address of an element in Array2D is slow. Precalculate the + // offsets.
+ int temporal_sample_offsets[3]; + temporal_sample_offsets[0] = stride * ((row_start + block.height4x4) >> 1) + + ((column_start - 2) >> 1); + temporal_sample_offsets[1] = + temporal_sample_offsets[0] + ((block.width4x4 + 2) >> 1); + temporal_sample_offsets[2] = temporal_sample_offsets[1] - stride; + for (int i = 0; i < 3; i++) { + const int row = temporal_sample_positions[i][0]; + const int column = temporal_sample_positions[i][1]; + if (!IsWithinTheSame64x64Block(block, row, column)) continue; + const int mv_row = row_start + row; + const int mv_column = column_start + column; + // IsWithinTheSame64x64Block() guarantees the reference block is inside + // the top and left boundary. + if (!tile.IsBottomRightInside(mv_row, mv_column)) continue; + const MotionVector temporal_mv = + motion_field_mv[temporal_sample_offsets[i]]; + if (temporal_mv.mv[0] != kInvalidMvValue) { + temporal_mvs[count] = temporal_mv; + temporal_reference_offsets[count++] = + motion_field_reference_offset[temporal_sample_offsets[i]]; + } + } + } + if (count != 0) { + BlockParameters* const bp = block.bp; + int reference_offsets[2]; + const int offset_0 = tile.current_frame() + .reference_info() + ->relative_distance_to[bp->reference_frame[0]]; + reference_offsets[0] = + Clip3(offset_0, -kMaxFrameDistance, kMaxFrameDistance); + if (is_compound) { + const int offset_1 = tile.current_frame() + .reference_info() + ->relative_distance_to[bp->reference_frame[1]]; + reference_offsets[1] = + Clip3(offset_1, -kMaxFrameDistance, kMaxFrameDistance); + // Pad so that SIMD implementations won't read uninitialized memory. + if ((count & 1) != 0) { + temporal_mvs[count].mv32 = 0; + temporal_reference_offsets[count] = 0; + } + } else { + // Pad so that SIMD implementations won't read uninitialized memory. + for (int i = count; i < ((count + 3) & ~3); ++i) { + temporal_mvs[i].mv32 = 0; + temporal_reference_offsets[i] = 0; + } + } + AddTemporalReferenceMvCandidate( + tile.frame_header(), reference_offsets, temporal_mvs, + temporal_reference_offsets, count, is_compound, zero_mv_context, + num_mv_found, &(*bp->prediction_parameters)); + } +} + +// Part of 7.10.2.13. +void AddExtraCompoundMvCandidate(const Tile::Block& block, int mv_row, + int mv_column, int* const ref_id_count, + MotionVector ref_id[2][2], + int* const ref_diff_count, + MotionVector ref_diff[2][2]) { + const auto& bp = block.tile.Parameters(mv_row, mv_column); + const std::array<bool, kNumReferenceFrameTypes>& reference_frame_sign_bias = + block.tile.reference_frame_sign_bias(); + for (int i = 0; i < 2; ++i) { + const ReferenceFrameType candidate_reference_frame = bp.reference_frame[i]; + if (candidate_reference_frame <= kReferenceFrameIntra) continue; + for (int j = 0; j < 2; ++j) { + MotionVector candidate_mv = bp.mv.mv[i]; + const ReferenceFrameType block_reference_frame = + block.bp->reference_frame[j]; + if (candidate_reference_frame == block_reference_frame && + ref_id_count[j] < 2) { + ref_id[j][ref_id_count[j]] = candidate_mv; + ++ref_id_count[j]; + } else if (ref_diff_count[j] < 2) { + if (reference_frame_sign_bias[candidate_reference_frame] != + reference_frame_sign_bias[block_reference_frame]) { + candidate_mv.mv[0] *= -1; + candidate_mv.mv[1] *= -1; + } + ref_diff[j][ref_diff_count[j]] = candidate_mv; + ++ref_diff_count[j]; + } + } + } +} +
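The sign-bias adjustment above (repeated in AddExtraSingleMvCandidate() below) reduces to: negate both components of a candidate motion vector when its reference frame lies on the opposite temporal side of the current frame from the block's reference frame. A minimal sketch with a stand-in type (Mv here is hypothetical; the real code operates on MotionVector):

  #include <cstdint>

  struct Mv { int16_t row; int16_t col; };  // stand-in for MotionVector

  Mv AlignForSignBias(Mv candidate, bool candidate_sign_bias,
                      bool block_sign_bias) {
    if (candidate_sign_bias != block_sign_bias) {
      // Mirror the vector through the origin.
      candidate.row = static_cast<int16_t>(-candidate.row);
      candidate.col = static_cast<int16_t>(-candidate.col);
    }
    return candidate;
  }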
+// Part of 7.10.2.13. +void AddExtraSingleMvCandidate(const Tile::Block& block, int mv_row, + int mv_column, int* const num_mv_found) { + const auto& bp = block.tile.Parameters(mv_row, mv_column); + const std::array<bool, kNumReferenceFrameTypes>& reference_frame_sign_bias = + block.tile.reference_frame_sign_bias(); + const ReferenceFrameType block_reference_frame = block.bp->reference_frame[0]; + PredictionParameters& prediction_parameters = + *block.bp->prediction_parameters; + MotionVector* const ref_mv_stack = prediction_parameters.ref_mv_stack; + int num_found = *num_mv_found; + for (int i = 0; i < 2; ++i) { + const ReferenceFrameType candidate_reference_frame = bp.reference_frame[i]; + if (candidate_reference_frame <= kReferenceFrameIntra) continue; + MotionVector candidate_mv = bp.mv.mv[i]; + if (reference_frame_sign_bias[candidate_reference_frame] != + reference_frame_sign_bias[block_reference_frame]) { + candidate_mv.mv[0] *= -1; + candidate_mv.mv[1] *= -1; + } + assert(num_found <= 2); + if ((num_found != 0 && ref_mv_stack[0].mv32 == candidate_mv.mv32) || + (num_found == 2 && ref_mv_stack[1].mv32 == candidate_mv.mv32)) { + continue; + } + ref_mv_stack[num_found] = candidate_mv; + prediction_parameters.SetWeightIndexStackEntry(num_found, 0); + ++num_found; + } + *num_mv_found = num_found; +} + +// 7.10.2.12. +void ExtraSearch(const Tile::Block& block, bool is_compound, + int* const num_mv_found) { + const Tile& tile = block.tile; + const int num4x4 = std::min({static_cast<int>(block.width4x4), + tile.frame_header().columns4x4 - block.column4x4, + static_cast<int>(block.height4x4), + tile.frame_header().rows4x4 - block.row4x4, 16}); + int ref_id_count[2] = {}; + MotionVector ref_id[2][2] = {}; + int ref_diff_count[2] = {}; + MotionVector ref_diff[2][2] = {}; + PredictionParameters& prediction_parameters = + *block.bp->prediction_parameters; + for (int pass = 0; pass < 2 && *num_mv_found < 2; ++pass) { + for (int i = 0; i < num4x4;) { + const int mv_row = block.row4x4 + ((pass == 0) ? -1 : i); + const int mv_column = block.column4x4 + ((pass == 0) ? i : -1); + if (!tile.IsTopLeftInside(mv_row + 1, mv_column + 1)) break; + if (is_compound) { + AddExtraCompoundMvCandidate(block, mv_row, mv_column, ref_id_count, + ref_id, ref_diff_count, ref_diff); + } else { + AddExtraSingleMvCandidate(block, mv_row, mv_column, num_mv_found); + if (*num_mv_found >= 2) break; + } + const auto& bp = tile.Parameters(mv_row, mv_column); + i += + (pass == 0) ? kNum4x4BlocksWide[bp.size] : kNum4x4BlocksHigh[bp.size]; + } + } + if (is_compound) { + // Merge compound mode extra search into mv stack.
+
+void DescendingOrderTwo(int* const a, int* const b) {
+  if (*a < *b) {
+    std::swap(*a, *b);
+  }
+}
+
+// Comparator used for sorting candidate motion vectors in descending order of
+// their weights (as specified in 7.10.2.11).
+bool CompareCandidateMotionVectors(const int16_t& lhs, const int16_t& rhs) {
+  return lhs > rhs;
+}
+
+void SortWeightIndexStack(const int size, const int sort_to_n,
+                          int16_t* const weight_index_stack) {
+  if (size <= 1) return;
+  if (size <= 3) {
+    // Specialize small sort sizes to speed up.
+    int weight_index_0 = weight_index_stack[0];
+    int weight_index_1 = weight_index_stack[1];
+    DescendingOrderTwo(&weight_index_0, &weight_index_1);
+    if (size == 3) {
+      int weight_index_2 = weight_index_stack[2];
+      DescendingOrderTwo(&weight_index_1, &weight_index_2);
+      DescendingOrderTwo(&weight_index_0, &weight_index_1);
+      weight_index_stack[2] = weight_index_2;
+    }
+    weight_index_stack[0] = weight_index_0;
+    weight_index_stack[1] = weight_index_1;
+    return;
+  }
+  if (sort_to_n == 1) {
+    // std::max_element() is not efficient. Find the max element in a loop.
+    int16_t max_element = weight_index_stack[0];
+    int i = 1;
+    do {
+      max_element = std::max(max_element, weight_index_stack[i]);
+    } while (++i < size);
+    weight_index_stack[0] = max_element;
+    return;
+  }
+  std::partial_sort(&weight_index_stack[0], &weight_index_stack[sort_to_n],
+                    &weight_index_stack[size], CompareCandidateMotionVectors);
+}
+
+// 7.10.2.14 (part 2).
+void ComputeContexts(bool found_new_mv, int nearest_matches, int total_matches,
+                     int* new_mv_context, int* reference_mv_context) {
+  switch (nearest_matches) {
+    case 0:
+      *new_mv_context = std::min(total_matches, 1);
+      *reference_mv_context = total_matches;
+      break;
+    case 1:
+      *new_mv_context = 3 - static_cast<int>(found_new_mv);
+      *reference_mv_context = 2 + total_matches;
+      break;
+    default:
+      *new_mv_context = 5 - static_cast<int>(found_new_mv);
+      *reference_mv_context = 5;
+      break;
+  }
+}
+
+// 7.10.4.2.
+void AddSample(const Tile::Block& block, int delta_row, int delta_column, + int* const num_warp_samples, int* const num_samples_scanned, + int candidates[kMaxLeastSquaresSamples][4]) { + if (*num_samples_scanned >= kMaxLeastSquaresSamples) return; + const int mv_row = block.row4x4 + delta_row; + const int mv_column = block.column4x4 + delta_column; + const Tile& tile = block.tile; + if (!tile.IsInside(mv_row, mv_column) || + !tile.HasParameters(mv_row, mv_column)) { + return; + } + const BlockParameters& bp = *block.bp; + const BlockParameters& mv_bp = tile.Parameters(mv_row, mv_column); + if (mv_bp.reference_frame[0] != bp.reference_frame[0] || + mv_bp.reference_frame[1] != kReferenceFrameNone) { + return; + } + ++*num_samples_scanned; + const int candidate_height4x4 = kNum4x4BlocksHigh[mv_bp.size]; + const int candidate_row = mv_row & ~(candidate_height4x4 - 1); + const int candidate_width4x4 = kNum4x4BlocksWide[mv_bp.size]; + const int candidate_column = mv_column & ~(candidate_width4x4 - 1); + const BlockParameters& candidate_bp = + tile.Parameters(candidate_row, candidate_column); + const int mv_diff_row = + std::abs(candidate_bp.mv.mv[0].mv[0] - bp.mv.mv[0].mv[0]); + const int mv_diff_column = + std::abs(candidate_bp.mv.mv[0].mv[1] - bp.mv.mv[0].mv[1]); + const bool is_valid = + mv_diff_row + mv_diff_column <= kWarpValidThreshold[block.size]; + if (!is_valid && *num_samples_scanned > 1) { + return; + } + const int mid_y = + MultiplyBy4(candidate_row) + MultiplyBy2(candidate_height4x4) - 1; + const int mid_x = + MultiplyBy4(candidate_column) + MultiplyBy2(candidate_width4x4) - 1; + candidates[*num_warp_samples][0] = MultiplyBy8(mid_y); + candidates[*num_warp_samples][1] = MultiplyBy8(mid_x); + candidates[*num_warp_samples][2] = + MultiplyBy8(mid_y) + candidate_bp.mv.mv[0].mv[0]; + candidates[*num_warp_samples][3] = + MultiplyBy8(mid_x) + candidate_bp.mv.mv[0].mv[1]; + if (is_valid) ++*num_warp_samples; +} + +// 7.9.2. +// In the spec, |dst_sign| is either 1 or -1. Here we set |dst_sign| to either 0 +// or -1 so that it can be XORed and subtracted directly in ApplySign() and +// corresponding SIMD implementations. 
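+// For example (illustration; the helper name below is ours, not the dsp
+// API): applying the sign via (value ^ dst_sign) - dst_sign leaves the value
+// unchanged when dst_sign == 0 and negates it when dst_sign == -1:
+constexpr int ApplySignSketch(int value, int sign) {
+  return (value ^ sign) - sign;
+}
+static_assert(ApplySignSketch(7, 0) == 7, "");
+static_assert(ApplySignSketch(7, -1) == -7, "");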
+bool MotionFieldProjection(
+    const ObuFrameHeader& frame_header,
+    const std::array<RefCountedBufferPtr, kNumReferenceFrameTypes>&
+        reference_frames,
+    ReferenceFrameType source, int reference_to_current_with_sign,
+    int dst_sign, int y8_start, int y8_end, int x8_start, int x8_end,
+    TemporalMotionField* const motion_field) {
+  const int source_index =
+      frame_header.reference_frame_index[source - kReferenceFrameLast];
+  auto* const source_frame = reference_frames[source_index].get();
+  assert(source_frame != nullptr);
+  assert(dst_sign == 0 || dst_sign == -1);
+  if (source_frame->rows4x4() != frame_header.rows4x4 ||
+      source_frame->columns4x4() != frame_header.columns4x4 ||
+      IsIntraFrame(source_frame->frame_type())) {
+    return false;
+  }
+  assert(reference_to_current_with_sign >= -kMaxFrameDistance);
+  if (reference_to_current_with_sign > kMaxFrameDistance) return true;
+  const ReferenceInfo& reference_info = *source_frame->reference_info();
+  const dsp::Dsp& dsp = *dsp::GetDspTable(8);
+  dsp.motion_field_projection_kernel(
+      reference_info, reference_to_current_with_sign, dst_sign, y8_start,
+      y8_end, x8_start, x8_end, motion_field);
+  return true;
+}
+
+}  // namespace
+
+void FindMvStack(const Tile::Block& block, bool is_compound,
+                 MvContexts* const contexts) {
+  PredictionParameters& prediction_parameters =
+      *block.bp->prediction_parameters;
+  SetupGlobalMv(block, 0, &prediction_parameters.global_mv[0]);
+  if (is_compound) {
+    SetupGlobalMv(block, 1, &prediction_parameters.global_mv[1]);
+  }
+  bool found_new_mv = false;
+  bool found_row_match = false;
+  int num_mv_found = 0;
+  ScanRow(block, block.column4x4, -1, is_compound, &found_new_mv,
+          &found_row_match, &num_mv_found);
+  bool found_column_match = false;
+  ScanColumn(block, block.row4x4, -1, is_compound, &found_new_mv,
+             &found_column_match, &num_mv_found);
+  if (std::max(block.width4x4, block.height4x4) <= 16) {
+    ScanPoint(block, -1, block.width4x4, is_compound, &found_new_mv,
+              &found_row_match, &num_mv_found);
+  }
+  const int nearest_matches =
+      static_cast<int>(found_row_match) + static_cast<int>(found_column_match);
+  prediction_parameters.nearest_mv_count = num_mv_found;
+  if (block.tile.frame_header().use_ref_frame_mvs) {
+    // Initialize to invalid value, and it will be set when temporal mv is
+    // zero.
+    contexts->zero_mv = -1;
+    TemporalScan(block, is_compound, &contexts->zero_mv, &num_mv_found);
+  } else {
+    contexts->zero_mv = 0;
+  }
+  bool dummy_bool = false;
+  ScanPoint(block, -1, -1, is_compound, &dummy_bool, &found_row_match,
+            &num_mv_found);
+  static constexpr int deltas[2] = {-3, -5};
+  for (int i = 0; i < 2; ++i) {
+    if (i == 0 || block.height4x4 > 1) {
+      ScanRow(block, block.column4x4 | 1, deltas[i] + (block.row4x4 & 1),
+              is_compound, &dummy_bool, &found_row_match, &num_mv_found);
+    }
+    if (i == 0 || block.width4x4 > 1) {
+      ScanColumn(block, block.row4x4 | 1, deltas[i] + (block.column4x4 & 1),
+                 is_compound, &dummy_bool, &found_column_match, &num_mv_found);
+    }
+  }
+  if (num_mv_found < 2) {
+    ExtraSearch(block, is_compound, &num_mv_found);
+  } else {
+    // The sort of |weight_index_stack| could be moved to Tile::AssignIntraMv()
+    // and Tile::AssignInterMv(), and only do a partial sort to the max index
+    // we need. However, the speed gain is trivial.
+    // For intra case, only the first 1 or 2 mvs in the stack will be used.
+    // For inter case, |prediction_parameters.ref_mv_index| is at most 3.
+    // We only need to do the partial sort up to the first 4 mvs.
+    SortWeightIndexStack(prediction_parameters.nearest_mv_count, 4,
+                         prediction_parameters.weight_index_stack);
+    // When there are 4 or more nearest mvs, the other mvs will not be used.
+    if (prediction_parameters.nearest_mv_count < 4) {
+      SortWeightIndexStack(
+          num_mv_found - prediction_parameters.nearest_mv_count,
+          4 - prediction_parameters.nearest_mv_count,
+          prediction_parameters.weight_index_stack +
+              prediction_parameters.nearest_mv_count);
+    }
+  }
+  prediction_parameters.ref_mv_count = num_mv_found;
+  const int total_matches =
+      static_cast<int>(found_row_match) + static_cast<int>(found_column_match);
+  ComputeContexts(found_new_mv, nearest_matches, total_matches,
+                  &contexts->new_mv, &contexts->reference_mv);
+  // The mv stack clamping process is in Tile::AssignIntraMv() and
+  // Tile::AssignInterMv(), and only up to two mvs are clamped.
+}
+
+void FindWarpSamples(const Tile::Block& block, int* const num_warp_samples,
+                     int* const num_samples_scanned,
+                     int candidates[kMaxLeastSquaresSamples][4]) {
+  const Tile& tile = block.tile;
+  bool top_left = true;
+  bool top_right = true;
+  int step = 1;
+  if (block.top_available[kPlaneY]) {
+    BlockSize source_size =
+        tile.Parameters(block.row4x4 - 1, block.column4x4).size;
+    const int source_width4x4 = kNum4x4BlocksWide[source_size];
+    if (block.width4x4 <= source_width4x4) {
+      // The & here is equivalent to % since source_width4x4 is a power of two.
+      const int column_offset = -(block.column4x4 & (source_width4x4 - 1));
+      if (column_offset < 0) top_left = false;
+      if (column_offset + source_width4x4 > block.width4x4) top_right = false;
+      AddSample(block, -1, 0, num_warp_samples, num_samples_scanned,
+                candidates);
+    } else {
+      for (int i = 0;
+           i < std::min(static_cast<int>(block.width4x4),
+                        tile.frame_header().columns4x4 - block.column4x4);
+           i += step) {
+        source_size =
+            tile.Parameters(block.row4x4 - 1, block.column4x4 + i).size;
+        step = std::min(static_cast<int>(block.width4x4),
+                        static_cast<int>(kNum4x4BlocksWide[source_size]));
+        AddSample(block, -1, i, num_warp_samples, num_samples_scanned,
+                  candidates);
+      }
+    }
+  }
+  if (block.left_available[kPlaneY]) {
+    BlockSize source_size =
+        tile.Parameters(block.row4x4, block.column4x4 - 1).size;
+    const int source_height4x4 = kNum4x4BlocksHigh[source_size];
+    if (block.height4x4 <= source_height4x4) {
+      const int row_offset = -(block.row4x4 & (source_height4x4 - 1));
+      if (row_offset < 0) top_left = false;
+      AddSample(block, 0, -1, num_warp_samples, num_samples_scanned,
+                candidates);
+    } else {
+      for (int i = 0;
+           i < std::min(static_cast<int>(block.height4x4),
+                        tile.frame_header().rows4x4 - block.row4x4);
+           i += step) {
+        source_size =
+            tile.Parameters(block.row4x4 + i, block.column4x4 - 1).size;
+        step = std::min(static_cast<int>(block.height4x4),
+                        static_cast<int>(kNum4x4BlocksHigh[source_size]));
+        AddSample(block, i, -1, num_warp_samples, num_samples_scanned,
+                  candidates);
+      }
+    }
+  }
+  if (top_left) {
+    AddSample(block, -1, -1, num_warp_samples, num_samples_scanned,
+              candidates);
+  }
+  if (top_right && block.size <= kBlock64x64) {
+    AddSample(block, -1, block.width4x4, num_warp_samples,
+              num_samples_scanned, candidates);
+  }
+  if (*num_warp_samples == 0 && *num_samples_scanned > 0) {
+    *num_warp_samples = 1;
+  }
+}
+
+void SetupMotionField(
+    const ObuFrameHeader& frame_header, const RefCountedBuffer& current_frame,
+    const std::array<RefCountedBufferPtr, kNumReferenceFrameTypes>&
+        reference_frames,
+    int row4x4_start, int row4x4_end, int column4x4_start, int column4x4_end,
+    TemporalMotionField* const motion_field) {
+  assert(frame_header.use_ref_frame_mvs);
+
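+  // Note (illustration, not from the source): the temporal motion field is
+  // tracked at 8x8 granularity, hence the DivideBy2() conversions below;
+  // e.g. a tile spanning rows4x4 [64, 96) maps to motion field rows
+  // y8 [32, 48).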
const int y8_start = DivideBy2(row4x4_start); + const int y8_end = DivideBy2(std::min(row4x4_end, frame_header.rows4x4)); + const int x8_start = DivideBy2(column4x4_start); + const int x8_end = + DivideBy2(std::min(column4x4_end, frame_header.columns4x4)); + const int last_index = frame_header.reference_frame_index[0]; + const ReferenceInfo& reference_info = *current_frame.reference_info(); + if (!IsIntraFrame(reference_frames[last_index]->frame_type())) { + const int last_alternate_order_hint = + reference_frames[last_index] + ->reference_info() + ->order_hint[kReferenceFrameAlternate]; + const int current_gold_order_hint = + reference_info.order_hint[kReferenceFrameGolden]; + if (last_alternate_order_hint != current_gold_order_hint) { + const int reference_offset_last = + -reference_info.relative_distance_from[kReferenceFrameLast]; + if (std::abs(reference_offset_last) <= kMaxFrameDistance) { + MotionFieldProjection(frame_header, reference_frames, + kReferenceFrameLast, reference_offset_last, -1, + y8_start, y8_end, x8_start, x8_end, motion_field); + } + } + } + int ref_stamp = 1; + const int reference_offset_backward = + reference_info.relative_distance_from[kReferenceFrameBackward]; + if (reference_offset_backward > 0 && + MotionFieldProjection(frame_header, reference_frames, + kReferenceFrameBackward, reference_offset_backward, + 0, y8_start, y8_end, x8_start, x8_end, + motion_field)) { + --ref_stamp; + } + const int reference_offset_alternate2 = + reference_info.relative_distance_from[kReferenceFrameAlternate2]; + if (reference_offset_alternate2 > 0 && + MotionFieldProjection(frame_header, reference_frames, + kReferenceFrameAlternate2, + reference_offset_alternate2, 0, y8_start, y8_end, + x8_start, x8_end, motion_field)) { + --ref_stamp; + } + if (ref_stamp >= 0) { + const int reference_offset_alternate = + reference_info.relative_distance_from[kReferenceFrameAlternate]; + if (reference_offset_alternate > 0 && + MotionFieldProjection(frame_header, reference_frames, + kReferenceFrameAlternate, + reference_offset_alternate, 0, y8_start, y8_end, + x8_start, x8_end, motion_field)) { + --ref_stamp; + } + } + if (ref_stamp >= 0) { + const int reference_offset_last2 = + -reference_info.relative_distance_from[kReferenceFrameLast2]; + if (std::abs(reference_offset_last2) <= kMaxFrameDistance) { + MotionFieldProjection(frame_header, reference_frames, + kReferenceFrameLast2, reference_offset_last2, -1, + y8_start, y8_end, x8_start, x8_end, motion_field); + } + } +} + +} // namespace libgav1 diff --git a/src/motion_vector.h b/src/motion_vector.h new file mode 100644 index 0000000..68d14fe --- /dev/null +++ b/src/motion_vector.h @@ -0,0 +1,61 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef LIBGAV1_SRC_MOTION_VECTOR_H_
+#define LIBGAV1_SRC_MOTION_VECTOR_H_
+
+#include <algorithm>
+#include <array>
+#include <cstdint>
+
+#include "src/buffer_pool.h"
+#include "src/obu_parser.h"
+#include "src/tile.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+constexpr bool IsGlobalMvBlock(const BlockParameters& bp,
+                               GlobalMotionTransformationType type) {
+  return (bp.y_mode == kPredictionModeGlobalMv ||
+          bp.y_mode == kPredictionModeGlobalGlobalMv) &&
+         !IsBlockDimension4(bp.size) &&
+         type > kGlobalMotionTransformationTypeTranslation;
+}
+
+// The |contexts| output parameter may be null. If the caller does not need
+// the |contexts| output, pass nullptr as the argument.
+void FindMvStack(const Tile::Block& block, bool is_compound,
+                 MvContexts* contexts);  // 7.10.2
+
+void FindWarpSamples(const Tile::Block& block, int* num_warp_samples,
+                     int* num_samples_scanned,
+                     int candidates[kMaxLeastSquaresSamples][4]);  // 7.10.4.
+
+// Section 7.9.1 in the spec. But this is done per tile instead of for the
+// whole frame.
+void SetupMotionField(
+    const ObuFrameHeader& frame_header, const RefCountedBuffer& current_frame,
+    const std::array<RefCountedBufferPtr, kNumReferenceFrameTypes>&
+        reference_frames,
+    int row4x4_start, int row4x4_end, int column4x4_start, int column4x4_end,
+    TemporalMotionField* motion_field);
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_MOTION_VECTOR_H_
diff --git a/src/obu_parser.cc b/src/obu_parser.cc
new file mode 100644
index 0000000..445450b
--- /dev/null
+++ b/src/obu_parser.cc
@@ -0,0 +1,2876 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/obu_parser.h"
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <climits>
+#include <cstddef>
+#include <cstring>
+#include <memory>
+
+#include "src/buffer_pool.h"
+#include "src/decoder_impl.h"
+#include "src/motion_vector.h"
+#include "src/utils/common.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+namespace {
+
+// 5.9.16.
+// Find the smallest value of k such that block_size << k is greater than or
+// equal to target.
+//
+// NOTE: TileLog2(block_size, target) is equal to
+//   CeilLog2(ceil((double)target / block_size))
+// where the division is a floating-point number division. (This equality
+// holds even when |target| is equal to 0.) In the special case of
+// block_size == 1, TileLog2(1, target) is equal to CeilLog2(target).
+int TileLog2(int block_size, int target) {
+  int k = 0;
+  for (; (block_size << k) < target; ++k) {
+  }
+  return k;
+}
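+
+// [Illustrative aside] A worked instance of the NOTE above, restated as a
+// constexpr sketch of the same loop (hypothetical helper, exposition only):
+//   TileLog2(64, 65) == CeilLog2(ceil(65 / 64.0)) == CeilLog2(2) == 1
+//   TileLog2(1, 5)   == CeilLog2(5) == 3
+constexpr int TileLog2Sketch(int block_size, int target, int k = 0) {
+  return (block_size << k) < target ? TileLog2Sketch(block_size, target, k + 1)
+                                    : k;
+}
+static_assert(TileLog2Sketch(64, 65) == 1, "");
+static_assert(TileLog2Sketch(1, 5) == 3, "");
+static_assert(TileLog2Sketch(64, 0) == 0, "");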
+
+void ParseBitStreamLevel(BitStreamLevel* const level, uint8_t level_bits) {
+  level->major = kMinimumMajorBitstreamLevel + (level_bits >> 2);
+  level->minor = level_bits & 3;
+}
+
+// This function assumes loop_filter is zero-initialized, so it only needs to
+// set the nonzero default values.
+void SetDefaultRefDeltas(LoopFilter* const loop_filter) {
+  loop_filter->ref_deltas[kReferenceFrameIntra] = 1;
+  loop_filter->ref_deltas[kReferenceFrameGolden] = -1;
+  loop_filter->ref_deltas[kReferenceFrameAlternate] = -1;
+  loop_filter->ref_deltas[kReferenceFrameAlternate2] = -1;
+}
+
+bool InTemporalLayer(int operating_point_idc, int temporal_id) {
+  return ((operating_point_idc >> temporal_id) & 1) != 0;
+}
+
+bool InSpatialLayer(int operating_point_idc, int spatial_id) {
+  return ((operating_point_idc >> (spatial_id + 8)) & 1) != 0;
+}
+
+// Returns the index of the last nonzero byte in the |data| buffer of |size|
+// bytes. If there is no nonzero byte in the |data| buffer, returns -1.
+int GetLastNonzeroByteIndex(const uint8_t* data, size_t size) {
+  // Scan backward for a nonzero byte.
+  if (size > INT_MAX) return -1;
+  int i = static_cast<int>(size) - 1;
+  while (i >= 0 && data[i] == 0) {
+    --i;
+  }
+  return i;
+}
+
+// A cleanup helper class that releases the frame buffer reference held in
+// |frame| in the destructor.
+class RefCountedBufferPtrCleanup {
+ public:
+  explicit RefCountedBufferPtrCleanup(RefCountedBufferPtr* frame)
+      : frame_(*frame) {}
+
+  // Not copyable or movable.
+  RefCountedBufferPtrCleanup(const RefCountedBufferPtrCleanup&) = delete;
+  RefCountedBufferPtrCleanup& operator=(const RefCountedBufferPtrCleanup&) =
+      delete;
+
+  ~RefCountedBufferPtrCleanup() { frame_ = nullptr; }
+
+ private:
+  RefCountedBufferPtr& frame_;
+};
+
+}  // namespace
+
+bool ObuSequenceHeader::ParametersChanged(const ObuSequenceHeader& old) const {
+  // Note that the operating_parameters field is not compared per Section 7.5:
+  //   Within a particular coded video sequence, the contents of
+  //   sequence_header_obu must be bit-identical each time the sequence header
+  //   appears except for the contents of operating_parameters_info.
+  return memcmp(this, &old,
+                offsetof(ObuSequenceHeader, operating_parameters)) != 0;
+}
+
+// Macros to avoid repeated error checks in the parser code.
+#define OBU_LOG_AND_RETURN_FALSE                                            \
+  do {                                                                      \
+    LIBGAV1_DLOG(ERROR, "%s:%d (%s): Not enough bits.", __FILE__, __LINE__, \
+                 __func__);                                                 \
+    return false;                                                           \
+  } while (false)
+#define OBU_PARSER_FAIL         \
+  do {                          \
+    if (scratch == -1) {        \
+      OBU_LOG_AND_RETURN_FALSE; \
+    }                           \
+  } while (false)
+#define OBU_READ_BIT_OR_FAIL        \
+  scratch = bit_reader_->ReadBit(); \
+  OBU_PARSER_FAIL
+#define OBU_READ_LITERAL_OR_FAIL(n)      \
+  scratch = bit_reader_->ReadLiteral(n); \
+  OBU_PARSER_FAIL
+#define OBU_READ_UVLC_OR_FAIL(x)        \
+  do {                                  \
+    if (!bit_reader_->ReadUvlc(&(x))) { \
+      OBU_LOG_AND_RETURN_FALSE;         \
+    }                                   \
+  } while (false)
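+
+// Example usage (for exposition only): a parse step such as
+//   OBU_READ_LITERAL_OR_FAIL(3);
+//   sequence_header.profile = static_cast<BitstreamProfile>(scratch);
+// expands to a ReadLiteral(3) call followed by the scratch == -1 error
+// check, which keeps each parse function linear with a single |scratch|
+// temporary.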
+
+bool ObuParser::ParseColorConfig(ObuSequenceHeader* sequence_header) {
+  int64_t scratch;
+  ColorConfig* const color_config = &sequence_header->color_config;
+  OBU_READ_BIT_OR_FAIL;
+  const bool high_bitdepth = scratch != 0;
+  if (sequence_header->profile == kProfile2 && high_bitdepth) {
+    OBU_READ_BIT_OR_FAIL;
+    const bool is_twelve_bit = scratch != 0;
+    color_config->bitdepth = is_twelve_bit ? 12 : 10;
+  } else {
+    color_config->bitdepth = high_bitdepth ? 10 : 8;
+  }
+  if (sequence_header->profile == kProfile1) {
+    color_config->is_monochrome = false;
+  } else {
+    OBU_READ_BIT_OR_FAIL;
+    color_config->is_monochrome = scratch != 0;
+  }
+  OBU_READ_BIT_OR_FAIL;
+  const bool color_description_present_flag = scratch != 0;
+  if (color_description_present_flag) {
+    OBU_READ_LITERAL_OR_FAIL(8);
+    color_config->color_primary = static_cast<ColorPrimary>(scratch);
+    OBU_READ_LITERAL_OR_FAIL(8);
+    color_config->transfer_characteristics =
+        static_cast<TransferCharacteristics>(scratch);
+    OBU_READ_LITERAL_OR_FAIL(8);
+    color_config->matrix_coefficients =
+        static_cast<MatrixCoefficients>(scratch);
+  } else {
+    color_config->color_primary = kColorPrimaryUnspecified;
+    color_config->transfer_characteristics =
+        kTransferCharacteristicsUnspecified;
+    color_config->matrix_coefficients = kMatrixCoefficientsUnspecified;
+  }
+  if (color_config->is_monochrome) {
+    OBU_READ_BIT_OR_FAIL;
+    color_config->color_range = static_cast<ColorRange>(scratch);
+    // Set subsampling_x and subsampling_y to 1 for monochrome. This makes it
+    // easy to allow monochrome to be supported in profile 0. Profile 0
+    // requires subsampling_x and subsampling_y to be 1.
+    color_config->subsampling_x = 1;
+    color_config->subsampling_y = 1;
+    color_config->chroma_sample_position = kChromaSamplePositionUnknown;
+  } else {
+    if (color_config->color_primary == kColorPrimaryBt709 &&
+        color_config->transfer_characteristics ==
+            kTransferCharacteristicsSrgb &&
+        color_config->matrix_coefficients == kMatrixCoefficientsIdentity) {
+      color_config->color_range = kColorRangeFull;
+      color_config->subsampling_x = 0;
+      color_config->subsampling_y = 0;
+      // YUV 4:4:4 is only allowed in profile 1, or profile 2 with bit depth
+      // 12. See the table at the beginning of Section 6.4.1.
+      if (sequence_header->profile != kProfile1 &&
+          (sequence_header->profile != kProfile2 ||
+           color_config->bitdepth != 12)) {
+        LIBGAV1_DLOG(ERROR,
+                     "YUV 4:4:4 is not allowed in profile %d for bitdepth %d.",
+                     sequence_header->profile, color_config->bitdepth);
+        return false;
+      }
+    } else {
+      OBU_READ_BIT_OR_FAIL;
+      color_config->color_range = static_cast<ColorRange>(scratch);
+      if (sequence_header->profile == kProfile0) {
+        color_config->subsampling_x = 1;
+        color_config->subsampling_y = 1;
+      } else if (sequence_header->profile == kProfile1) {
+        color_config->subsampling_x = 0;
+        color_config->subsampling_y = 0;
+      } else {
+        if (color_config->bitdepth == 12) {
+          OBU_READ_BIT_OR_FAIL;
+          color_config->subsampling_x = scratch;
+          if (color_config->subsampling_x == 1) {
+            OBU_READ_BIT_OR_FAIL;
+            color_config->subsampling_y = scratch;
+          } else {
+            color_config->subsampling_y = 0;
+          }
+        } else {
+          color_config->subsampling_x = 1;
+          color_config->subsampling_y = 0;
+        }
+      }
+      if (color_config->subsampling_x == 1 &&
+          color_config->subsampling_y == 1) {
+        OBU_READ_LITERAL_OR_FAIL(2);
+        color_config->chroma_sample_position =
+            static_cast<ChromaSamplePosition>(scratch);
+      }
+    }
+    OBU_READ_BIT_OR_FAIL;
+    color_config->separate_uv_delta_q = scratch != 0;
+  }
+  if (color_config->matrix_coefficients == kMatrixCoefficientsIdentity &&
+      (color_config->subsampling_x != 0 || color_config->subsampling_y != 0)) {
+    LIBGAV1_DLOG(ERROR,
+                 "matrix_coefficients is MC_IDENTITY, but subsampling_x (%d) "
+                 "and subsampling_y (%d) are not both 0.",
+                 color_config->subsampling_x, color_config->subsampling_y);
+    return false;
+  }
+  return true;
+}
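+
+// [Illustrative aside] subsampling_x/subsampling_y halve the chroma planes in
+// the corresponding dimension. A sketch of the usual plane-size arithmetic
+// (hypothetical helper; the decoder computes this elsewhere):
+constexpr int ChromaDimensionSketch(int luma_dimension, int subsampling) {
+  return (luma_dimension + subsampling) >> subsampling;
+}
+static_assert(ChromaDimensionSketch(1920, 1) == 960, "");   // 4:2:0 width.
+static_assert(ChromaDimensionSketch(1080, 0) == 1080, "");  // 4:4:4 height.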
+
+bool ObuParser::ParseTimingInfo(ObuSequenceHeader* sequence_header) {
+  int64_t scratch;
+  OBU_READ_BIT_OR_FAIL;
+  sequence_header->timing_info_present_flag = scratch != 0;
+  if (!sequence_header->timing_info_present_flag) return true;
+  TimingInfo* const info = &sequence_header->timing_info;
+  OBU_READ_LITERAL_OR_FAIL(32);
+  info->num_units_in_tick = static_cast<uint32_t>(scratch);
+  if (info->num_units_in_tick == 0) {
+    LIBGAV1_DLOG(ERROR, "num_units_in_tick is 0.");
+    return false;
+  }
+  OBU_READ_LITERAL_OR_FAIL(32);
+  info->time_scale = static_cast<uint32_t>(scratch);
+  if (info->time_scale == 0) {
+    LIBGAV1_DLOG(ERROR, "time_scale is 0.");
+    return false;
+  }
+  OBU_READ_BIT_OR_FAIL;
+  info->equal_picture_interval = scratch != 0;
+  if (info->equal_picture_interval) {
+    OBU_READ_UVLC_OR_FAIL(info->num_ticks_per_picture);
+    ++info->num_ticks_per_picture;
+  }
+  return true;
+}
+
+bool ObuParser::ParseDecoderModelInfo(ObuSequenceHeader* sequence_header) {
+  if (!sequence_header->timing_info_present_flag) return true;
+  int64_t scratch;
+  OBU_READ_BIT_OR_FAIL;
+  sequence_header->decoder_model_info_present_flag = scratch != 0;
+  if (!sequence_header->decoder_model_info_present_flag) return true;
+  DecoderModelInfo* const info = &sequence_header->decoder_model_info;
+  OBU_READ_LITERAL_OR_FAIL(5);
+  info->encoder_decoder_buffer_delay_length = 1 + scratch;
+  OBU_READ_LITERAL_OR_FAIL(32);
+  info->num_units_in_decoding_tick = static_cast<uint32_t>(scratch);
+  OBU_READ_LITERAL_OR_FAIL(5);
+  info->buffer_removal_time_length = 1 + scratch;
+  OBU_READ_LITERAL_OR_FAIL(5);
+  info->frame_presentation_time_length = 1 + scratch;
+  return true;
+}
+
+bool ObuParser::ParseOperatingParameters(ObuSequenceHeader* sequence_header,
+                                         int index) {
+  int64_t scratch;
+  OBU_READ_BIT_OR_FAIL;
+  sequence_header->decoder_model_present_for_operating_point[index] =
+      scratch != 0;
+  if (!sequence_header->decoder_model_present_for_operating_point[index]) {
+    return true;
+  }
+  OperatingParameters* const params = &sequence_header->operating_parameters;
+  OBU_READ_LITERAL_OR_FAIL(
+      sequence_header->decoder_model_info.encoder_decoder_buffer_delay_length);
+  params->decoder_buffer_delay[index] = static_cast<uint32_t>(scratch);
+  OBU_READ_LITERAL_OR_FAIL(
+      sequence_header->decoder_model_info.encoder_decoder_buffer_delay_length);
+  params->encoder_buffer_delay[index] = static_cast<uint32_t>(scratch);
+  OBU_READ_BIT_OR_FAIL;
+  params->low_delay_mode_flag[index] = scratch != 0;
+  return true;
+}
+
+bool ObuParser::ParseSequenceHeader(bool seen_frame_header) {
+  ObuSequenceHeader sequence_header = {};
+  int64_t scratch;
+  OBU_READ_LITERAL_OR_FAIL(3);
+  if (scratch >= kMaxProfiles) {
+    LIBGAV1_DLOG(ERROR, "Invalid profile: %d.", static_cast<int>(scratch));
+    return false;
+  }
+  sequence_header.profile = static_cast<BitstreamProfile>(scratch);
+  OBU_READ_BIT_OR_FAIL;
+  sequence_header.still_picture = scratch != 0;
+  OBU_READ_BIT_OR_FAIL;
+  sequence_header.reduced_still_picture_header = scratch != 0;
+  if (sequence_header.reduced_still_picture_header) {
+    if (!sequence_header.still_picture) {
+      LIBGAV1_DLOG(
+          ERROR, "reduced_still_picture_header is 1, but still_picture is 0.");
+      return false;
+    }
+    sequence_header.operating_points = 1;
+    sequence_header.operating_point_idc[0] = 0;
+    OBU_READ_LITERAL_OR_FAIL(5);
+    ParseBitStreamLevel(&sequence_header.level[0], scratch);
+  } else {
+    if (!ParseTimingInfo(&sequence_header) ||
+        !ParseDecoderModelInfo(&sequence_header)) {
+      return false;
+    }
+    OBU_READ_BIT_OR_FAIL;
+    const bool initial_display_delay_present_flag = scratch != 0;
+    OBU_READ_LITERAL_OR_FAIL(5);
+    sequence_header.operating_points = static_cast<int>(1 + scratch);
+    if (operating_point_ >= sequence_header.operating_points) {
+      LIBGAV1_DLOG(
+          ERROR,
+          "Invalid operating point: %d (valid range is [0,%d] inclusive).",
+          operating_point_, sequence_header.operating_points - 1);
+      return false;
+    }
+    for (int i = 0; i < sequence_header.operating_points; ++i) {
+      OBU_READ_LITERAL_OR_FAIL(12);
+      sequence_header.operating_point_idc[i] = static_cast<int>(scratch);
+      for (int j = 0; j < i; ++j) {
+        if (sequence_header.operating_point_idc[i] ==
+            sequence_header.operating_point_idc[j]) {
+          LIBGAV1_DLOG(ERROR,
+                       "operating_point_idc[%d] (%d) is equal to "
+                       "operating_point_idc[%d] (%d).",
+                       i, sequence_header.operating_point_idc[i], j,
+                       sequence_header.operating_point_idc[j]);
+          return false;
+        }
+      }
+      OBU_READ_LITERAL_OR_FAIL(5);
+      ParseBitStreamLevel(&sequence_header.level[i], scratch);
+      if (sequence_header.level[i].major > 3) {
+        OBU_READ_BIT_OR_FAIL;
+        sequence_header.tier[i] = scratch;
+      }
+      if (sequence_header.decoder_model_info_present_flag &&
+          !ParseOperatingParameters(&sequence_header, i)) {
+        return false;
+      }
+      if (initial_display_delay_present_flag) {
+        OBU_READ_BIT_OR_FAIL;
+        if (scratch != 0) {
+          OBU_READ_LITERAL_OR_FAIL(4);
+          sequence_header.initial_display_delay[i] = 1 + scratch;
+        }
+      }
+    }
+  }
+  OBU_READ_LITERAL_OR_FAIL(4);
+  sequence_header.frame_width_bits = 1 + scratch;
+  OBU_READ_LITERAL_OR_FAIL(4);
+  sequence_header.frame_height_bits = 1 + scratch;
+  OBU_READ_LITERAL_OR_FAIL(sequence_header.frame_width_bits);
+  sequence_header.max_frame_width = static_cast<int32_t>(1 + scratch);
+  OBU_READ_LITERAL_OR_FAIL(sequence_header.frame_height_bits);
+  sequence_header.max_frame_height = static_cast<int32_t>(1 + scratch);
+  if (!sequence_header.reduced_still_picture_header) {
+    OBU_READ_BIT_OR_FAIL;
+    sequence_header.frame_id_numbers_present = scratch != 0;
+  }
+  if (sequence_header.frame_id_numbers_present) {
+    OBU_READ_LITERAL_OR_FAIL(4);
+    sequence_header.delta_frame_id_length_bits = 2 + scratch;
+    OBU_READ_LITERAL_OR_FAIL(3);
+    sequence_header.frame_id_length_bits =
+        sequence_header.delta_frame_id_length_bits + 1 + scratch;
+    // Section 6.8.2: It is a requirement of bitstream conformance that the
+    // number of bits needed to read display_frame_id does not exceed 16. This
+    // is equivalent to the constraint that idLen <= 16.
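+    // For example, delta_frame_id_length_bits = 2 + 8 = 10 combined with a
+    // 3-bit increment of 7 gives idLen = 10 + 1 + 7 = 18 > 16, which the
+    // check below rejects.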
+ if (sequence_header.frame_id_length_bits > 16) { + LIBGAV1_DLOG(ERROR, "Invalid frame_id_length_bits: %d.", + sequence_header.frame_id_length_bits); + return false; + } + } + OBU_READ_BIT_OR_FAIL; + sequence_header.use_128x128_superblock = scratch != 0; + OBU_READ_BIT_OR_FAIL; + sequence_header.enable_filter_intra = scratch != 0; + OBU_READ_BIT_OR_FAIL; + sequence_header.enable_intra_edge_filter = scratch != 0; + if (sequence_header.reduced_still_picture_header) { + sequence_header.force_screen_content_tools = kSelectScreenContentTools; + sequence_header.force_integer_mv = kSelectIntegerMv; + } else { + OBU_READ_BIT_OR_FAIL; + sequence_header.enable_interintra_compound = scratch != 0; + OBU_READ_BIT_OR_FAIL; + sequence_header.enable_masked_compound = scratch != 0; + OBU_READ_BIT_OR_FAIL; + sequence_header.enable_warped_motion = scratch != 0; + OBU_READ_BIT_OR_FAIL; + sequence_header.enable_dual_filter = scratch != 0; + OBU_READ_BIT_OR_FAIL; + sequence_header.enable_order_hint = scratch != 0; + if (sequence_header.enable_order_hint) { + OBU_READ_BIT_OR_FAIL; + sequence_header.enable_jnt_comp = scratch != 0; + OBU_READ_BIT_OR_FAIL; + sequence_header.enable_ref_frame_mvs = scratch != 0; + } + OBU_READ_BIT_OR_FAIL; + sequence_header.choose_screen_content_tools = scratch != 0; + if (sequence_header.choose_screen_content_tools) { + sequence_header.force_screen_content_tools = kSelectScreenContentTools; + } else { + OBU_READ_BIT_OR_FAIL; + sequence_header.force_screen_content_tools = scratch; + } + if (sequence_header.force_screen_content_tools > 0) { + OBU_READ_BIT_OR_FAIL; + sequence_header.choose_integer_mv = scratch != 0; + if (sequence_header.choose_integer_mv) { + sequence_header.force_integer_mv = kSelectIntegerMv; + } else { + OBU_READ_BIT_OR_FAIL; + sequence_header.force_integer_mv = scratch; + } + } else { + sequence_header.force_integer_mv = kSelectIntegerMv; + } + if (sequence_header.enable_order_hint) { + OBU_READ_LITERAL_OR_FAIL(3); + sequence_header.order_hint_bits = 1 + scratch; + sequence_header.order_hint_shift_bits = + Mod32(32 - sequence_header.order_hint_bits); + } + } + OBU_READ_BIT_OR_FAIL; + sequence_header.enable_superres = scratch != 0; + OBU_READ_BIT_OR_FAIL; + sequence_header.enable_cdef = scratch != 0; + OBU_READ_BIT_OR_FAIL; + sequence_header.enable_restoration = scratch != 0; + if (!ParseColorConfig(&sequence_header)) return false; + OBU_READ_BIT_OR_FAIL; + sequence_header.film_grain_params_present = scratch != 0; + // Compare new sequence header with old sequence header. + if (has_sequence_header_ && + sequence_header.ParametersChanged(sequence_header_)) { + // Between the frame header OBU and the last tile group OBU of the frame, + // do not allow the sequence header to change. + if (seen_frame_header) { + LIBGAV1_DLOG(ERROR, "Sequence header changed in the middle of a frame."); + return false; + } + sequence_header_changed_ = true; + decoder_state_.ClearReferenceFrames(); + } + sequence_header_ = sequence_header; + if (!has_sequence_header_) { + sequence_header_changed_ = true; + } + has_sequence_header_ = true; + // Section 6.4.1: It is a requirement of bitstream conformance that if + // OperatingPointIdc is equal to 0, then obu_extension_flag is equal to 0 for + // all OBUs that follow this sequence header until the next sequence header. 
+  extension_disallowed_ =
+      (sequence_header_.operating_point_idc[operating_point_] == 0);
+  return true;
+}
+
+// Marks reference frames as invalid for referencing when they are too far in
+// the past to be referenced by the frame id mechanism.
+void ObuParser::MarkInvalidReferenceFrames() {
+  // The current lower bound of the frame ids for reference frames.
+  int lower_bound = decoder_state_.current_frame_id -
+                    (1 << sequence_header_.delta_frame_id_length_bits);
+  // True if lower_bound is smaller than current_frame_id. False if
+  // lower_bound wraps around (in modular arithmetic) to the other side of
+  // current_frame_id.
+  bool lower_bound_is_smaller = true;
+  if (lower_bound <= 0) {
+    lower_bound += 1 << sequence_header_.frame_id_length_bits;
+    lower_bound_is_smaller = false;
+  }
+  for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+    const uint16_t reference_frame_id = decoder_state_.reference_frame_id[i];
+    if (lower_bound_is_smaller) {
+      if (reference_frame_id > decoder_state_.current_frame_id ||
+          reference_frame_id < lower_bound) {
+        decoder_state_.reference_frame[i] = nullptr;
+      }
+    } else {
+      if (reference_frame_id > decoder_state_.current_frame_id &&
+          reference_frame_id < lower_bound) {
+        decoder_state_.reference_frame[i] = nullptr;
+      }
+    }
+  }
+}
+
+bool ObuParser::ParseFrameSizeAndRenderSize() {
+  int64_t scratch;
+  // Frame Size.
+  if (frame_header_.frame_size_override_flag) {
+    OBU_READ_LITERAL_OR_FAIL(sequence_header_.frame_width_bits);
+    frame_header_.width = static_cast<int32_t>(1 + scratch);
+    OBU_READ_LITERAL_OR_FAIL(sequence_header_.frame_height_bits);
+    frame_header_.height = static_cast<int32_t>(1 + scratch);
+    if (frame_header_.width > sequence_header_.max_frame_width ||
+        frame_header_.height > sequence_header_.max_frame_height) {
+      LIBGAV1_DLOG(ERROR,
+                   "Frame dimensions are larger than the maximum values");
+      return false;
+    }
+  } else {
+    frame_header_.width = sequence_header_.max_frame_width;
+    frame_header_.height = sequence_header_.max_frame_height;
+  }
+  if (!ParseSuperResParametersAndComputeImageSize()) return false;
+
+  // Render Size.
+  OBU_READ_BIT_OR_FAIL;
+  frame_header_.render_and_frame_size_different = scratch != 0;
+  if (frame_header_.render_and_frame_size_different) {
+    OBU_READ_LITERAL_OR_FAIL(16);
+    frame_header_.render_width = static_cast<int32_t>(1 + scratch);
+    OBU_READ_LITERAL_OR_FAIL(16);
+    frame_header_.render_height = static_cast<int32_t>(1 + scratch);
+  } else {
+    frame_header_.render_width = frame_header_.upscaled_width;
+    frame_header_.render_height = frame_header_.height;
+  }
+
+  return true;
+}
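+
+// [Illustrative aside] With kSuperResScaleNumerator == 8 and denominators in
+// [9, 16], super resolution narrows the coded frame to between 8/9 and 1/2 of
+// the upscaled width. A sketch of the rounding arithmetic used below (the
+// helper name is ours):
+constexpr int SuperResCodedWidthSketch(int upscaled_width, int denominator) {
+  return (upscaled_width * 8 + (denominator / 2)) / denominator;
+}
+static_assert(SuperResCodedWidthSketch(1920, 16) == 960, "");
+static_assert(SuperResCodedWidthSketch(1920, 9) == 1707, "");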
+
+bool ObuParser::ParseSuperResParametersAndComputeImageSize() {
+  int64_t scratch;
+  // SuperRes.
+  frame_header_.upscaled_width = frame_header_.width;
+  frame_header_.use_superres = false;
+  if (sequence_header_.enable_superres) {
+    OBU_READ_BIT_OR_FAIL;
+    frame_header_.use_superres = scratch != 0;
+  }
+  if (frame_header_.use_superres) {
+    OBU_READ_LITERAL_OR_FAIL(3);
+    // 9 is the smallest value for the denominator.
+    frame_header_.superres_scale_denominator = scratch + 9;
+    frame_header_.width =
+        (frame_header_.upscaled_width * kSuperResScaleNumerator +
+         (frame_header_.superres_scale_denominator / 2)) /
+        frame_header_.superres_scale_denominator;
+  } else {
+    frame_header_.superres_scale_denominator = kSuperResScaleNumerator;
+  }
+  assert(frame_header_.width != 0);
+  assert(frame_header_.height != 0);
+  // Check if multiplying upscaled_width by height would overflow.
+  assert(frame_header_.upscaled_width >= frame_header_.width);
+  if (frame_header_.upscaled_width > INT32_MAX / frame_header_.height) {
+    LIBGAV1_DLOG(ERROR, "Frame dimensions too big: width=%d height=%d.",
+                 frame_header_.width, frame_header_.height);
+    return false;
+  }
+  frame_header_.columns4x4 = ((frame_header_.width + 7) >> 3) << 1;
+  frame_header_.rows4x4 = ((frame_header_.height + 7) >> 3) << 1;
+  return true;
+}
+
+bool ObuParser::ValidateInterFrameSize() const {
+  for (int index : frame_header_.reference_frame_index) {
+    const RefCountedBuffer* reference_frame =
+        decoder_state_.reference_frame[index].get();
+    if (2 * frame_header_.width < reference_frame->upscaled_width() ||
+        2 * frame_header_.height < reference_frame->frame_height() ||
+        frame_header_.width > 16 * reference_frame->upscaled_width() ||
+        frame_header_.height > 16 * reference_frame->frame_height()) {
+      LIBGAV1_DLOG(ERROR,
+                   "Invalid inter frame size: width=%d, height=%d. Reference "
+                   "frame: index=%d, upscaled width=%d, height=%d.",
+                   frame_header_.width, frame_header_.height, index,
+                   reference_frame->upscaled_width(),
+                   reference_frame->frame_height());
+      return false;
+    }
+  }
+  return true;
+}
+
+bool ObuParser::ParseReferenceOrderHint() {
+  if (!frame_header_.error_resilient_mode ||
+      !sequence_header_.enable_order_hint) {
+    return true;
+  }
+  int64_t scratch;
+  for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+    OBU_READ_LITERAL_OR_FAIL(sequence_header_.order_hint_bits);
+    frame_header_.reference_order_hint[i] = scratch;
+    if (frame_header_.reference_order_hint[i] !=
+        decoder_state_.reference_order_hint[i]) {
+      decoder_state_.reference_frame[i] = nullptr;
+    }
+  }
+  return true;
+}
+
+// static
+int ObuParser::FindLatestBackwardReference(
+    const int current_frame_hint,
+    const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints,
+    const std::array<bool, kNumReferenceFrameTypes>& used_frame) {
+  int ref = -1;
+  int latest_order_hint = INT_MIN;
+  for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+    const int hint = shifted_order_hints[i];
+    if (!used_frame[i] && hint >= current_frame_hint &&
+        hint >= latest_order_hint) {
+      ref = i;
+      latest_order_hint = hint;
+    }
+  }
+  return ref;
+}
+
+// static
+int ObuParser::FindEarliestBackwardReference(
+    const int current_frame_hint,
+    const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints,
+    const std::array<bool, kNumReferenceFrameTypes>& used_frame) {
+  int ref = -1;
+  int earliest_order_hint = INT_MAX;
+  for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+    const int hint = shifted_order_hints[i];
+    if (!used_frame[i] && hint >= current_frame_hint &&
+        hint < earliest_order_hint) {
+      ref = i;
+      earliest_order_hint = hint;
+    }
+  }
+  return ref;
+}
+
+// static
+int ObuParser::FindLatestForwardReference(
+    const int current_frame_hint,
+    const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints,
+    const std::array<bool, kNumReferenceFrameTypes>& used_frame) {
+  int ref = -1;
+  int latest_order_hint = INT_MIN;
+  for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+    const int hint = shifted_order_hints[i];
+    if (!used_frame[i] && hint < current_frame_hint &&
+        hint >= latest_order_hint) {
+      ref = i;
+      latest_order_hint = hint;
+    }
+  }
+  return ref;
+}
+
+// static
+int ObuParser::FindReferenceWithSmallestOutputOrder(
+    const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints) {
+  int ref = -1;
+  int earliest_order_hint = INT_MAX;
+  for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+    const int hint = shifted_order_hints[i];
+    if (hint < earliest_order_hint) {
+      ref = i;
+      earliest_order_hint = hint;
+    }
+  }
+  return ref;
+}
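+
+// [Illustrative aside] The shifted order hints used above come from
+// GetRelativeDistance(), which interprets hint differences modulo
+// 1 << order_hint_bits so that hints may wrap around. A sketch of the
+// wrap-around arithmetic (hypothetical helper, same effect assumed):
+constexpr int RelativeDistanceSketch(int a, int b, int bits) {
+  // Wraps a - b into [-(1 << (bits - 1)), (1 << (bits - 1)) - 1].
+  return ((a - b + (1 << (bits - 1))) & ((1 << bits) - 1)) -
+         (1 << (bits - 1));
+}
+static_assert(RelativeDistanceSketch(1, 30, 5) == 3, "");  // 30, 31, 0, 1.
+static_assert(RelativeDistanceSketch(30, 1, 5) == -3, "");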
+
+// Computes the elements in the frame_header_.reference_frame_index array
+// based on:
+//   * the syntax elements last_frame_idx and gold_frame_idx, and
+//   * the values stored within the decoder_state_.reference_order_hint array
+//     (these values represent the least significant bits of the expected
+//     output order of the frames).
+//
+// Frame type: {
+//   libgav1_name               spec_name      int
+//   kReferenceFrameLast,       LAST_FRAME      1
+//   kReferenceFrameLast2,      LAST2_FRAME     2
+//   kReferenceFrameLast3,      LAST3_FRAME     3
+//   kReferenceFrameGolden,     GOLDEN_FRAME    4
+//   kReferenceFrameBackward,   BWDREF_FRAME    5
+//   kReferenceFrameAlternate2, ALTREF2_FRAME   6
+//   kReferenceFrameAlternate,  ALTREF_FRAME    7
+// }
+//
+// A typical case of a group of pictures (frames) in display order:
+// (However, more complex cases are possibly allowed in terms of
+// bitstream conformance.)
+//
+//   |     |     |     |       |         |     |     |
+//   |     |     |     |       |         |     |     |
+//   |     |     |     |       |         |     |     |
+//   |     |     |     |       |         |     |     |
+//
+//   4     3     2     1  current_frame  5     6     7
+//
+bool ObuParser::SetFrameReferences(const int8_t last_frame_idx,
+                                   const int8_t gold_frame_idx) {
+  // Set the ref_frame_idx entries for kReferenceFrameLast and
+  // kReferenceFrameGolden to last_frame_idx and gold_frame_idx. Initialize
+  // the other entries to -1.
+  for (int8_t& reference_frame_index : frame_header_.reference_frame_index) {
+    reference_frame_index = -1;
+  }
+  frame_header_
+      .reference_frame_index[kReferenceFrameLast - kReferenceFrameLast] =
+      last_frame_idx;
+  frame_header_
+      .reference_frame_index[kReferenceFrameGolden - kReferenceFrameLast] =
+      gold_frame_idx;
+
+  // used_frame records which reference frames have been used.
+  std::array<bool, kNumReferenceFrameTypes> used_frame;
+  used_frame.fill(false);
+  used_frame[last_frame_idx] = true;
+  used_frame[gold_frame_idx] = true;
+
+  assert(sequence_header_.order_hint_bits >= 1);
+  const int current_frame_hint = 1 << (sequence_header_.order_hint_bits - 1);
+  // shifted_order_hints contains the expected output order shifted such that
+  // the current frame has hint equal to current_frame_hint.
+  std::array<int, kNumReferenceFrameTypes> shifted_order_hints;
+  for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+    const int relative_distance = GetRelativeDistance(
+        decoder_state_.reference_order_hint[i], frame_header_.order_hint,
+        sequence_header_.order_hint_shift_bits);
+    shifted_order_hints[i] = current_frame_hint + relative_distance;
+  }
+
+  // The expected output orders for kReferenceFrameLast and
+  // kReferenceFrameGolden.
+  const int last_order_hint = shifted_order_hints[last_frame_idx];
+  const int gold_order_hint = shifted_order_hints[gold_frame_idx];
+
+  // Section 7.8: It is a requirement of bitstream conformance that
+  // lastOrderHint and goldOrderHint are strictly less than curFrameHint.
+  if (last_order_hint >= current_frame_hint ||
+      gold_order_hint >= current_frame_hint) {
+    return false;
+  }
+
+  // Find a backward reference to the frame with highest output order. If
+  // found, set the kReferenceFrameAlternate reference to that backward
+  // reference.
+  int ref = FindLatestBackwardReference(current_frame_hint,
+                                        shifted_order_hints, used_frame);
+  if (ref >= 0) {
+    frame_header_.reference_frame_index[kReferenceFrameAlternate -
+                                        kReferenceFrameLast] = ref;
+    used_frame[ref] = true;
+  }
+
+  // Find a backward reference to the closest frame. If found, set the
+  // kReferenceFrameBackward reference to that backward reference.
+ ref = FindEarliestBackwardReference(current_frame_hint, shifted_order_hints, + used_frame); + if (ref >= 0) { + frame_header_ + .reference_frame_index[kReferenceFrameBackward - kReferenceFrameLast] = + ref; + used_frame[ref] = true; + } + + // Set the kReferenceFrameAlternate2 reference to the next closest backward + // reference. + ref = FindEarliestBackwardReference(current_frame_hint, shifted_order_hints, + used_frame); + if (ref >= 0) { + frame_header_.reference_frame_index[kReferenceFrameAlternate2 - + kReferenceFrameLast] = ref; + used_frame[ref] = true; + } + + // The remaining references are set to be forward references in + // reverse chronological order. + static constexpr ReferenceFrameType + kRefFrameList[kNumInterReferenceFrameTypes - 2] = { + kReferenceFrameLast2, kReferenceFrameLast3, kReferenceFrameBackward, + kReferenceFrameAlternate2, kReferenceFrameAlternate}; + for (const ReferenceFrameType ref_frame : kRefFrameList) { + if (frame_header_.reference_frame_index[ref_frame - kReferenceFrameLast] < + 0) { + ref = FindLatestForwardReference(current_frame_hint, shifted_order_hints, + used_frame); + if (ref >= 0) { + frame_header_.reference_frame_index[ref_frame - kReferenceFrameLast] = + ref; + used_frame[ref] = true; + } + } + } + + // Finally, any remaining references are set to the reference frame with + // smallest output order. + ref = FindReferenceWithSmallestOutputOrder(shifted_order_hints); + assert(ref >= 0); + for (int8_t& reference_frame_index : frame_header_.reference_frame_index) { + if (reference_frame_index < 0) { + reference_frame_index = ref; + } + } + + return true; +} + +bool ObuParser::ParseLoopFilterParameters() { + LoopFilter* const loop_filter = &frame_header_.loop_filter; + if (frame_header_.coded_lossless || frame_header_.allow_intrabc) { + SetDefaultRefDeltas(loop_filter); + return true; + } + // IsIntraFrame implies kPrimaryReferenceNone. + assert(!IsIntraFrame(frame_header_.frame_type) || + frame_header_.primary_reference_frame == kPrimaryReferenceNone); + if (frame_header_.primary_reference_frame == kPrimaryReferenceNone) { + // Part of the setup_past_independence() function in the spec. It is not + // necessary to set loop_filter->delta_enabled to true. See + // https://crbug.com/aomedia/2305. + SetDefaultRefDeltas(loop_filter); + } else { + // Part of the load_previous() function in the spec. 
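+    // That is, instead of being reset, the deltas are inherited from the
+    // reference frame selected by primary_reference_frame, loaded below.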
+ const int prev_frame_index = + frame_header_ + .reference_frame_index[frame_header_.primary_reference_frame]; + const RefCountedBuffer* prev_frame = + decoder_state_.reference_frame[prev_frame_index].get(); + loop_filter->ref_deltas = prev_frame->loop_filter_ref_deltas(); + loop_filter->mode_deltas = prev_frame->loop_filter_mode_deltas(); + } + int64_t scratch; + for (int i = 0; i < 2; ++i) { + OBU_READ_LITERAL_OR_FAIL(6); + loop_filter->level[i] = scratch; + } + if (!sequence_header_.color_config.is_monochrome && + (loop_filter->level[0] != 0 || loop_filter->level[1] != 0)) { + for (int i = 2; i < 4; ++i) { + OBU_READ_LITERAL_OR_FAIL(6); + loop_filter->level[i] = scratch; + } + } + OBU_READ_LITERAL_OR_FAIL(3); + loop_filter->sharpness = scratch; + OBU_READ_BIT_OR_FAIL; + loop_filter->delta_enabled = scratch != 0; + if (loop_filter->delta_enabled) { + OBU_READ_BIT_OR_FAIL; + loop_filter->delta_update = scratch != 0; + if (loop_filter->delta_update) { + for (auto& ref_delta : loop_filter->ref_deltas) { + OBU_READ_BIT_OR_FAIL; + const bool update_ref_delta = scratch != 0; + if (update_ref_delta) { + int scratch_int; + if (!bit_reader_->ReadInverseSignedLiteral(6, &scratch_int)) { + LIBGAV1_DLOG(ERROR, "Not enough bits."); + return false; + } + ref_delta = scratch_int; + } + } + for (auto& mode_delta : loop_filter->mode_deltas) { + OBU_READ_BIT_OR_FAIL; + const bool update_mode_delta = scratch != 0; + if (update_mode_delta) { + int scratch_int; + if (!bit_reader_->ReadInverseSignedLiteral(6, &scratch_int)) { + LIBGAV1_DLOG(ERROR, "Not enough bits."); + return false; + } + mode_delta = scratch_int; + } + } + } + } else { + loop_filter->delta_update = false; + } + return true; +} + +bool ObuParser::ParseDeltaQuantizer(int8_t* const delta) { + int64_t scratch; + *delta = 0; + OBU_READ_BIT_OR_FAIL; + const bool delta_coded = scratch != 0; + if (delta_coded) { + int scratch_int; + if (!bit_reader_->ReadInverseSignedLiteral(6, &scratch_int)) { + LIBGAV1_DLOG(ERROR, "Not enough bits."); + return false; + } + *delta = scratch_int; + } + return true; +} + +bool ObuParser::ParseQuantizerParameters() { + int64_t scratch; + QuantizerParameters* const quantizer = &frame_header_.quantizer; + OBU_READ_LITERAL_OR_FAIL(8); + quantizer->base_index = scratch; + if (!ParseDeltaQuantizer(&quantizer->delta_dc[kPlaneY])) return false; + if (!sequence_header_.color_config.is_monochrome) { + bool diff_uv_delta = false; + if (sequence_header_.color_config.separate_uv_delta_q) { + OBU_READ_BIT_OR_FAIL; + diff_uv_delta = scratch != 0; + } + if (!ParseDeltaQuantizer(&quantizer->delta_dc[kPlaneU]) || + !ParseDeltaQuantizer(&quantizer->delta_ac[kPlaneU])) { + return false; + } + if (diff_uv_delta) { + if (!ParseDeltaQuantizer(&quantizer->delta_dc[kPlaneV]) || + !ParseDeltaQuantizer(&quantizer->delta_ac[kPlaneV])) { + return false; + } + } else { + quantizer->delta_dc[kPlaneV] = quantizer->delta_dc[kPlaneU]; + quantizer->delta_ac[kPlaneV] = quantizer->delta_ac[kPlaneU]; + } + } + OBU_READ_BIT_OR_FAIL; + quantizer->use_matrix = scratch != 0; + if (quantizer->use_matrix) { + OBU_READ_LITERAL_OR_FAIL(4); + quantizer->matrix_level[kPlaneY] = scratch; + OBU_READ_LITERAL_OR_FAIL(4); + quantizer->matrix_level[kPlaneU] = scratch; + if (sequence_header_.color_config.separate_uv_delta_q) { + OBU_READ_LITERAL_OR_FAIL(4); + quantizer->matrix_level[kPlaneV] = scratch; + } else { + quantizer->matrix_level[kPlaneV] = quantizer->matrix_level[kPlaneU]; + } + } + return true; +} + +// This method implements the following functions in the 
spec:
+// - segmentation_params()
+// - part of setup_past_independence(): Set the FeatureData and FeatureEnabled
+//   arrays to all 0.
+// - part of load_previous(): Call load_segmentation_params().
+//
+// A careful analysis of the spec shows the part of setup_past_independence()
+// can be optimized away and the part of load_previous() only needs to be
+// invoked under a specific condition. Although the logic looks different from
+// the spec, it is equivalent and more efficient.
+bool ObuParser::ParseSegmentationParameters() {
+  int64_t scratch;
+  Segmentation* const segmentation = &frame_header_.segmentation;
+  OBU_READ_BIT_OR_FAIL;
+  segmentation->enabled = scratch != 0;
+  if (!segmentation->enabled) return true;
+  if (frame_header_.primary_reference_frame == kPrimaryReferenceNone) {
+    segmentation->update_map = true;
+    segmentation->update_data = true;
+  } else {
+    OBU_READ_BIT_OR_FAIL;
+    segmentation->update_map = scratch != 0;
+    if (segmentation->update_map) {
+      OBU_READ_BIT_OR_FAIL;
+      segmentation->temporal_update = scratch != 0;
+    }
+    OBU_READ_BIT_OR_FAIL;
+    segmentation->update_data = scratch != 0;
+    if (!segmentation->update_data) {
+      // Part of the load_previous() function in the spec.
+      const int prev_frame_index =
+          frame_header_
+              .reference_frame_index[frame_header_.primary_reference_frame];
+      decoder_state_.reference_frame[prev_frame_index]
+          ->GetSegmentationParameters(segmentation);
+      return true;
+    }
+  }
+  for (int8_t i = 0; i < kMaxSegments; ++i) {
+    for (int8_t j = 0; j < kSegmentFeatureMax; ++j) {
+      OBU_READ_BIT_OR_FAIL;
+      segmentation->feature_enabled[i][j] = scratch != 0;
+      if (segmentation->feature_enabled[i][j]) {
+        if (Segmentation::FeatureSigned(static_cast<SegmentationFeature>(j))) {
+          int scratch_int;
+          if (!bit_reader_->ReadInverseSignedLiteral(
+                  kSegmentationFeatureBits[j], &scratch_int)) {
+            LIBGAV1_DLOG(ERROR, "Not enough bits.");
+            return false;
+          }
+          segmentation->feature_data[i][j] =
+              Clip3(scratch_int, -kSegmentationFeatureMaxValues[j],
+                    kSegmentationFeatureMaxValues[j]);
+        } else {
+          if (kSegmentationFeatureBits[j] > 0) {
+            OBU_READ_LITERAL_OR_FAIL(kSegmentationFeatureBits[j]);
+            segmentation->feature_data[i][j] =
+                Clip3(static_cast<int>(scratch), 0,
+                      kSegmentationFeatureMaxValues[j]);
+          } else {
+            segmentation->feature_data[i][j] = 0;
+          }
+        }
+        segmentation->last_active_segment_id = i;
+        if (j >= kSegmentFeatureReferenceFrame) {
+          segmentation->segment_id_pre_skip = true;
+        }
+      }
+    }
+  }
+  return true;
+}
+
+bool ObuParser::ParseQuantizerIndexDeltaParameters() {
+  int64_t scratch;
+  if (frame_header_.quantizer.base_index > 0) {
+    OBU_READ_BIT_OR_FAIL;
+    frame_header_.delta_q.present = scratch != 0;
+    if (frame_header_.delta_q.present) {
+      OBU_READ_LITERAL_OR_FAIL(2);
+      frame_header_.delta_q.scale = scratch;
+    }
+  }
+  return true;
+}
+
+bool ObuParser::ParseLoopFilterDeltaParameters() {
+  int64_t scratch;
+  if (frame_header_.delta_q.present) {
+    if (!frame_header_.allow_intrabc) {
+      OBU_READ_BIT_OR_FAIL;
+      frame_header_.delta_lf.present = scratch != 0;
+    }
+    if (frame_header_.delta_lf.present) {
+      OBU_READ_LITERAL_OR_FAIL(2);
+      frame_header_.delta_lf.scale = scratch;
+      OBU_READ_BIT_OR_FAIL;
+      frame_header_.delta_lf.multi = scratch != 0;
+    }
+  }
+  return true;
+}
+
+void ObuParser::ComputeSegmentLosslessAndQIndex() {
+  frame_header_.coded_lossless = true;
+  Segmentation* const segmentation = &frame_header_.segmentation;
+  const QuantizerParameters* const quantizer = &frame_header_.quantizer;
+  for (int i = 0; i < kMaxSegments; ++i) {
+    segmentation->qindex[i] =
GetQIndex(*segmentation, i, quantizer->base_index); + segmentation->lossless[i] = + segmentation->qindex[i] == 0 && quantizer->delta_dc[kPlaneY] == 0 && + quantizer->delta_dc[kPlaneU] == 0 && + quantizer->delta_ac[kPlaneU] == 0 && + quantizer->delta_dc[kPlaneV] == 0 && quantizer->delta_ac[kPlaneV] == 0; + if (!segmentation->lossless[i]) frame_header_.coded_lossless = false; + // The spec calls for setting up a two-dimensional SegQMLevel array here. + // We avoid the SegQMLevel array by using segmentation->lossless[i] and + // quantizer->matrix_level[plane] directly in the reconstruct process of + // Section 7.12.3. + } + frame_header_.upscaled_lossless = + frame_header_.coded_lossless && + frame_header_.width == frame_header_.upscaled_width; +} + +bool ObuParser::ParseCdefParameters() { + const int coeff_shift = sequence_header_.color_config.bitdepth - 8; + if (frame_header_.coded_lossless || frame_header_.allow_intrabc || + !sequence_header_.enable_cdef) { + frame_header_.cdef.damping = 3 + coeff_shift; + return true; + } + Cdef* const cdef = &frame_header_.cdef; + int64_t scratch; + OBU_READ_LITERAL_OR_FAIL(2); + cdef->damping = scratch + 3 + coeff_shift; + OBU_READ_LITERAL_OR_FAIL(2); + cdef->bits = scratch; + for (int i = 0; i < (1 << cdef->bits); ++i) { + OBU_READ_LITERAL_OR_FAIL(4); + cdef->y_primary_strength[i] = scratch << coeff_shift; + OBU_READ_LITERAL_OR_FAIL(2); + cdef->y_secondary_strength[i] = scratch; + if (cdef->y_secondary_strength[i] == 3) ++cdef->y_secondary_strength[i]; + cdef->y_secondary_strength[i] <<= coeff_shift; + if (sequence_header_.color_config.is_monochrome) continue; + OBU_READ_LITERAL_OR_FAIL(4); + cdef->uv_primary_strength[i] = scratch << coeff_shift; + OBU_READ_LITERAL_OR_FAIL(2); + cdef->uv_secondary_strength[i] = scratch; + if (cdef->uv_secondary_strength[i] == 3) ++cdef->uv_secondary_strength[i]; + cdef->uv_secondary_strength[i] <<= coeff_shift; + } + return true; +} + +bool ObuParser::ParseLoopRestorationParameters() { + if (frame_header_.upscaled_lossless || frame_header_.allow_intrabc || + !sequence_header_.enable_restoration) { + return true; + } + int64_t scratch; + bool uses_loop_restoration = false; + bool uses_chroma_loop_restoration = false; + LoopRestoration* const loop_restoration = &frame_header_.loop_restoration; + const int num_planes = sequence_header_.color_config.is_monochrome + ? 
kMaxPlanesMonochrome + : kMaxPlanes; + for (int i = 0; i < num_planes; ++i) { + OBU_READ_LITERAL_OR_FAIL(2); + loop_restoration->type[i] = static_cast<LoopRestorationType>(scratch); + if (loop_restoration->type[i] != kLoopRestorationTypeNone) { + uses_loop_restoration = true; + if (i > 0) uses_chroma_loop_restoration = true; + } + } + if (uses_loop_restoration) { + uint8_t unit_shift; + if (sequence_header_.use_128x128_superblock) { + OBU_READ_BIT_OR_FAIL; + unit_shift = scratch + 1; + } else { + OBU_READ_BIT_OR_FAIL; + unit_shift = scratch; + if (unit_shift != 0) { + OBU_READ_BIT_OR_FAIL; + const uint8_t unit_extra_shift = scratch; + unit_shift += unit_extra_shift; + } + } + loop_restoration->unit_size_log2[kPlaneY] = 6 + unit_shift; + uint8_t uv_shift = 0; + if (sequence_header_.color_config.subsampling_x != 0 && + sequence_header_.color_config.subsampling_y != 0 && + uses_chroma_loop_restoration) { + OBU_READ_BIT_OR_FAIL; + uv_shift = scratch; + } + loop_restoration->unit_size_log2[kPlaneU] = + loop_restoration->unit_size_log2[kPlaneV] = + loop_restoration->unit_size_log2[0] - uv_shift; + } + return true; +} + +bool ObuParser::ParseTxModeSyntax() { + if (frame_header_.coded_lossless) { + frame_header_.tx_mode = kTxModeOnly4x4; + return true; + } + int64_t scratch; + OBU_READ_BIT_OR_FAIL; + frame_header_.tx_mode = (scratch == 1) ? kTxModeSelect : kTxModeLargest; + return true; +} + +bool ObuParser::ParseFrameReferenceModeSyntax() { + int64_t scratch; + if (!IsIntraFrame(frame_header_.frame_type)) { + OBU_READ_BIT_OR_FAIL; + frame_header_.reference_mode_select = scratch != 0; + } + return true; +} + +bool ObuParser::IsSkipModeAllowed() { + if (IsIntraFrame(frame_header_.frame_type) || + !frame_header_.reference_mode_select || + !sequence_header_.enable_order_hint) { + return false; + } + // Identify the nearest forward and backward references. + int forward_index = -1; + int backward_index = -1; + int forward_hint = -1; + int backward_hint = -1; + for (int i = 0; i < kNumInterReferenceFrameTypes; ++i) { + const unsigned int reference_hint = + decoder_state_ + .reference_order_hint[frame_header_.reference_frame_index[i]]; + // TODO(linfengz): |relative_distance| equals + // current_frame_->reference_info()-> + // relative_distance_from[i + kReferenceFrameLast]; + // However, the unit test ObuParserTest.SkipModeParameters() would fail. + // Will figure out how to initialize |current_frame_.reference_info_| in the + // RefCountedBuffer later. + const int relative_distance = + GetRelativeDistance(reference_hint, frame_header_.order_hint, + sequence_header_.order_hint_shift_bits); + if (relative_distance < 0) { + if (forward_index < 0 || + GetRelativeDistance(reference_hint, forward_hint, + sequence_header_.order_hint_shift_bits) > 0) { + forward_index = i; + forward_hint = reference_hint; + } + } else if (relative_distance > 0) { + if (backward_index < 0 || + GetRelativeDistance(reference_hint, backward_hint, + sequence_header_.order_hint_shift_bits) < 0) { + backward_index = i; + backward_hint = reference_hint; + } + } + } + if (forward_index < 0) return false; + if (backward_index >= 0) { + // Bidirectional prediction. + frame_header_.skip_mode_frame[0] = static_cast<ReferenceFrameType>( + kReferenceFrameLast + std::min(forward_index, backward_index)); + frame_header_.skip_mode_frame[1] = static_cast<ReferenceFrameType>( + kReferenceFrameLast + std::max(forward_index, backward_index)); + return true; + } + // Forward prediction only. Identify the second nearest forward reference.
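+ // For example (illustrative values only): with a current order hint of 10
+ // and reference order hints of 8 and 3, both references are forward
+ // references. The nearest one (hint 8) was selected above, and the loop
+ // below picks the nearest of the remaining, strictly earlier hints (here 3)
+ // as the second forward reference.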
+ int second_forward_index = -1; + int second_forward_hint = -1; + for (int i = 0; i < kNumInterReferenceFrameTypes; ++i) { + const unsigned int reference_hint = + decoder_state_ + .reference_order_hint[frame_header_.reference_frame_index[i]]; + if (GetRelativeDistance(reference_hint, forward_hint, + sequence_header_.order_hint_shift_bits) < 0) { + if (second_forward_index < 0 || + GetRelativeDistance(reference_hint, second_forward_hint, + sequence_header_.order_hint_shift_bits) > 0) { + second_forward_index = i; + second_forward_hint = reference_hint; + } + } + } + if (second_forward_index < 0) return false; + frame_header_.skip_mode_frame[0] = static_cast<ReferenceFrameType>( + kReferenceFrameLast + std::min(forward_index, second_forward_index)); + frame_header_.skip_mode_frame[1] = static_cast<ReferenceFrameType>( + kReferenceFrameLast + std::max(forward_index, second_forward_index)); + return true; +} + +bool ObuParser::ParseSkipModeParameters() { + if (!IsSkipModeAllowed()) return true; + int64_t scratch; + OBU_READ_BIT_OR_FAIL; + frame_header_.skip_mode_present = scratch != 0; + return true; +} + +// Sets frame_header_.global_motion[ref].params[index]. +bool ObuParser::ParseGlobalParamSyntax( + int ref, int index, + const std::array<GlobalMotion, kNumReferenceFrameTypes>& + prev_global_motions) { + GlobalMotion* const global_motion = &frame_header_.global_motion[ref]; + const GlobalMotion* const prev_global_motion = &prev_global_motions[ref]; + int abs_bits = kGlobalMotionAlphaBits; + int precision_bits = kGlobalMotionAlphaPrecisionBits; + if (index < 2) { + if (global_motion->type == kGlobalMotionTransformationTypeTranslation) { + const auto high_precision_mv_factor = + static_cast<int>(!frame_header_.allow_high_precision_mv); + abs_bits = kGlobalMotionTranslationOnlyBits - high_precision_mv_factor; + precision_bits = + kGlobalMotionTranslationOnlyPrecisionBits - high_precision_mv_factor; + } else { + abs_bits = kGlobalMotionTranslationBits; + precision_bits = kGlobalMotionTranslationPrecisionBits; + } + } + const int precision_diff = kWarpedModelPrecisionBits - precision_bits; + const int round = (index % 3 == 2) ? 1 << kWarpedModelPrecisionBits : 0; + const int sub = (index % 3 == 2) ? 1 << precision_bits : 0; + const int mx = 1 << abs_bits; + const int reference = + (prev_global_motion->params[index] >> precision_diff) - sub; + int scratch; + if (!bit_reader_->DecodeSignedSubexpWithReference( + -mx, mx + 1, reference, kGlobalMotionReadControl, &scratch)) { + LIBGAV1_DLOG(ERROR, "Not enough bits."); + return false; + } + global_motion->params[index] = LeftShift(scratch, precision_diff) + round; + return true; +} + +bool ObuParser::ParseGlobalMotionParameters() { + for (int ref = kReferenceFrameLast; ref <= kReferenceFrameAlternate; ++ref) { + frame_header_.global_motion[ref].type = + kGlobalMotionTransformationTypeIdentity; + for (int i = 0; i < 6; ++i) { + frame_header_.global_motion[ref].params[i] = + (i % 3 == 2) ? 1 << kWarpedModelPrecisionBits : 0; + } + } + if (IsIntraFrame(frame_header_.frame_type)) return true; + const std::array<GlobalMotion, kNumReferenceFrameTypes>* prev_global_motions = + nullptr; + if (frame_header_.primary_reference_frame == kPrimaryReferenceNone) { + // Part of the setup_past_independence() function in the spec. The value + // that the spec says PrevGmParams[ref][i] should be set to is exactly + // the value frame_header_.global_motion[ref].params[i] is set to by the + // for loop above. Therefore prev_global_motions can simply point to + // frame_header_.global_motion.
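+ // Note: the initialization loop at the top of this function stored the
+ // identity model for every reference: params[2] and params[5] (the
+ // diagonal entries of the 2x2 matrix) hold 1 << kWarpedModelPrecisionBits,
+ // i.e. 1.0 in fixed point, and the remaining parameters are 0.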
+ prev_global_motions = &frame_header_.global_motion; + } else { + // Part of the load_previous() function in the spec. + const int prev_frame_index = + frame_header_ + .reference_frame_index[frame_header_.primary_reference_frame]; + prev_global_motions = + &decoder_state_.reference_frame[prev_frame_index]->GlobalMotions(); + } + for (int ref = kReferenceFrameLast; ref <= kReferenceFrameAlternate; ++ref) { + GlobalMotion* const global_motion = &frame_header_.global_motion[ref]; + int64_t scratch; + OBU_READ_BIT_OR_FAIL; + const bool is_global = scratch != 0; + if (is_global) { + OBU_READ_BIT_OR_FAIL; + const bool is_rot_zoom = scratch != 0; + if (is_rot_zoom) { + global_motion->type = kGlobalMotionTransformationTypeRotZoom; + } else { + OBU_READ_BIT_OR_FAIL; + const bool is_translation = scratch != 0; + global_motion->type = is_translation + ? kGlobalMotionTransformationTypeTranslation + : kGlobalMotionTransformationTypeAffine; + } + } else { + global_motion->type = kGlobalMotionTransformationTypeIdentity; + } + if (global_motion->type >= kGlobalMotionTransformationTypeRotZoom) { + if (!ParseGlobalParamSyntax(ref, 2, *prev_global_motions) || + !ParseGlobalParamSyntax(ref, 3, *prev_global_motions)) { + return false; + } + if (global_motion->type == kGlobalMotionTransformationTypeAffine) { + if (!ParseGlobalParamSyntax(ref, 4, *prev_global_motions) || + !ParseGlobalParamSyntax(ref, 5, *prev_global_motions)) { + return false; + } + } else { + global_motion->params[4] = -global_motion->params[3]; + global_motion->params[5] = global_motion->params[2]; + } + } + if (global_motion->type >= kGlobalMotionTransformationTypeTranslation) { + if (!ParseGlobalParamSyntax(ref, 0, *prev_global_motions) || + !ParseGlobalParamSyntax(ref, 1, *prev_global_motions)) { + return false; + } + } + } + return true; +} + +bool ObuParser::ParseFilmGrainParameters() { + if (!sequence_header_.film_grain_params_present || + (!frame_header_.show_frame && !frame_header_.showable_frame)) { + // frame_header_.film_grain_params is already zero-initialized. + return true; + } + + FilmGrainParams& film_grain_params = frame_header_.film_grain_params; + int64_t scratch; + OBU_READ_BIT_OR_FAIL; + film_grain_params.apply_grain = scratch != 0; + if (!film_grain_params.apply_grain) { + // film_grain_params is already zero-initialized. + return true; + } + + OBU_READ_LITERAL_OR_FAIL(16); + film_grain_params.grain_seed = static_cast<uint16_t>(scratch); + film_grain_params.update_grain = true; + if (frame_header_.frame_type == kFrameInter) { + OBU_READ_BIT_OR_FAIL; + film_grain_params.update_grain = scratch != 0; + } + if (!film_grain_params.update_grain) { + OBU_READ_LITERAL_OR_FAIL(3); + film_grain_params.reference_index = static_cast<int>(scratch); + bool found = false; + for (const auto index : frame_header_.reference_frame_index) { + if (film_grain_params.reference_index == index) { + found = true; + break; + } + } + if (!found) { + static_assert(sizeof(frame_header_.reference_frame_index) / + sizeof(frame_header_.reference_frame_index[0]) == + 7, + ""); + LIBGAV1_DLOG(ERROR, + "Invalid value for film_grain_params_ref_idx (%d).
" + "ref_frame_idx = {%d, %d, %d, %d, %d, %d, %d}", + film_grain_params.reference_index, + frame_header_.reference_frame_index[0], + frame_header_.reference_frame_index[1], + frame_header_.reference_frame_index[2], + frame_header_.reference_frame_index[3], + frame_header_.reference_frame_index[4], + frame_header_.reference_frame_index[5], + frame_header_.reference_frame_index[6]); + return false; + } + const RefCountedBuffer* grain_params_reference_frame = + decoder_state_.reference_frame[film_grain_params.reference_index].get(); + if (grain_params_reference_frame == nullptr) { + LIBGAV1_DLOG(ERROR, "Buffer %d does not contain a decoded frame", + film_grain_params.reference_index); + return false; + } + const int temp_grain_seed = film_grain_params.grain_seed; + const bool temp_update_grain = film_grain_params.update_grain; + const int temp_reference_index = film_grain_params.reference_index; + film_grain_params = grain_params_reference_frame->film_grain_params(); + film_grain_params.grain_seed = temp_grain_seed; + film_grain_params.update_grain = temp_update_grain; + film_grain_params.reference_index = temp_reference_index; + return true; + } + + OBU_READ_LITERAL_OR_FAIL(4); + film_grain_params.num_y_points = scratch; + if (film_grain_params.num_y_points > 14) { + LIBGAV1_DLOG(ERROR, "Invalid value for num_y_points (%d).", + film_grain_params.num_y_points); + return false; + } + for (int i = 0; i < film_grain_params.num_y_points; ++i) { + OBU_READ_LITERAL_OR_FAIL(8); + film_grain_params.point_y_value[i] = scratch; + if (i != 0 && film_grain_params.point_y_value[i - 1] >= + film_grain_params.point_y_value[i]) { + LIBGAV1_DLOG(ERROR, "point_y_value[%d] (%d) >= point_y_value[%d] (%d).", + i - 1, film_grain_params.point_y_value[i - 1], i, + film_grain_params.point_y_value[i]); + return false; + } + OBU_READ_LITERAL_OR_FAIL(8); + film_grain_params.point_y_scaling[i] = scratch; + } + if (sequence_header_.color_config.is_monochrome) { + film_grain_params.chroma_scaling_from_luma = false; + } else { + OBU_READ_BIT_OR_FAIL; + film_grain_params.chroma_scaling_from_luma = scratch != 0; + } + if (sequence_header_.color_config.is_monochrome || + film_grain_params.chroma_scaling_from_luma || + (sequence_header_.color_config.subsampling_x == 1 && + sequence_header_.color_config.subsampling_y == 1 && + film_grain_params.num_y_points == 0)) { + film_grain_params.num_u_points = 0; + film_grain_params.num_v_points = 0; + } else { + OBU_READ_LITERAL_OR_FAIL(4); + film_grain_params.num_u_points = scratch; + if (film_grain_params.num_u_points > 10) { + LIBGAV1_DLOG(ERROR, "Invalid value for num_u_points (%d).", + film_grain_params.num_u_points); + return false; + } + for (int i = 0; i < film_grain_params.num_u_points; ++i) { + OBU_READ_LITERAL_OR_FAIL(8); + film_grain_params.point_u_value[i] = scratch; + if (i != 0 && film_grain_params.point_u_value[i - 1] >= + film_grain_params.point_u_value[i]) { + LIBGAV1_DLOG(ERROR, "point_u_value[%d] (%d) >= point_u_value[%d] (%d).", + i - 1, film_grain_params.point_u_value[i - 1], i, + film_grain_params.point_u_value[i]); + return false; + } + OBU_READ_LITERAL_OR_FAIL(8); + film_grain_params.point_u_scaling[i] = scratch; + } + OBU_READ_LITERAL_OR_FAIL(4); + film_grain_params.num_v_points = scratch; + if (film_grain_params.num_v_points > 10) { + LIBGAV1_DLOG(ERROR, "Invalid value for num_v_points (%d).", + film_grain_params.num_v_points); + return false; + } + if (sequence_header_.color_config.subsampling_x == 1 && + sequence_header_.color_config.subsampling_y == 1 && + 
(film_grain_params.num_u_points == 0) != + (film_grain_params.num_v_points == 0)) { + LIBGAV1_DLOG(ERROR, + "Invalid values for num_u_points (%d) and num_v_points (%d) " + "for 4:2:0 chroma subsampling.", + film_grain_params.num_u_points, + film_grain_params.num_v_points); + return false; + } + for (int i = 0; i < film_grain_params.num_v_points; ++i) { + OBU_READ_LITERAL_OR_FAIL(8); + film_grain_params.point_v_value[i] = scratch; + if (i != 0 && film_grain_params.point_v_value[i - 1] >= + film_grain_params.point_v_value[i]) { + LIBGAV1_DLOG(ERROR, "point_v_value[%d] (%d) >= point_v_value[%d] (%d).", + i - 1, film_grain_params.point_v_value[i - 1], i, + film_grain_params.point_v_value[i]); + return false; + } + OBU_READ_LITERAL_OR_FAIL(8); + film_grain_params.point_v_scaling[i] = scratch; + } + } + OBU_READ_LITERAL_OR_FAIL(2); + film_grain_params.chroma_scaling = scratch + 8; + OBU_READ_LITERAL_OR_FAIL(2); + film_grain_params.auto_regression_coeff_lag = scratch; + + const int num_pos_y = + MultiplyBy2(film_grain_params.auto_regression_coeff_lag) * + (film_grain_params.auto_regression_coeff_lag + 1); + int num_pos_uv = num_pos_y; + if (film_grain_params.num_y_points > 0) { + ++num_pos_uv; + for (int i = 0; i < num_pos_y; ++i) { + OBU_READ_LITERAL_OR_FAIL(8); + film_grain_params.auto_regression_coeff_y[i] = + static_cast<int8_t>(scratch - 128); + } + } + if (film_grain_params.chroma_scaling_from_luma || + film_grain_params.num_u_points > 0) { + for (int i = 0; i < num_pos_uv; ++i) { + OBU_READ_LITERAL_OR_FAIL(8); + film_grain_params.auto_regression_coeff_u[i] = + static_cast<int8_t>(scratch - 128); + } + } + if (film_grain_params.chroma_scaling_from_luma || + film_grain_params.num_v_points > 0) { + for (int i = 0; i < num_pos_uv; ++i) { + OBU_READ_LITERAL_OR_FAIL(8); + film_grain_params.auto_regression_coeff_v[i] = + static_cast<int8_t>(scratch - 128); + } + } + OBU_READ_LITERAL_OR_FAIL(2); + film_grain_params.auto_regression_shift = static_cast<uint8_t>(scratch + 6); + OBU_READ_LITERAL_OR_FAIL(2); + film_grain_params.grain_scale_shift = static_cast<int>(scratch); + if (film_grain_params.num_u_points > 0) { + OBU_READ_LITERAL_OR_FAIL(8); + film_grain_params.u_multiplier = static_cast<int8_t>(scratch - 128); + OBU_READ_LITERAL_OR_FAIL(8); + film_grain_params.u_luma_multiplier = static_cast<int8_t>(scratch - 128); + OBU_READ_LITERAL_OR_FAIL(9); + film_grain_params.u_offset = static_cast<int16_t>(scratch - 256); + } + if (film_grain_params.num_v_points > 0) { + OBU_READ_LITERAL_OR_FAIL(8); + film_grain_params.v_multiplier = static_cast<int8_t>(scratch - 128); + OBU_READ_LITERAL_OR_FAIL(8); + film_grain_params.v_luma_multiplier = static_cast<int8_t>(scratch - 128); + OBU_READ_LITERAL_OR_FAIL(9); + film_grain_params.v_offset = static_cast<int16_t>(scratch - 256); + } + OBU_READ_BIT_OR_FAIL; + film_grain_params.overlap_flag = scratch != 0; + OBU_READ_BIT_OR_FAIL; + film_grain_params.clip_to_restricted_range = scratch != 0; + return true; +} + +bool ObuParser::ParseTileInfoSyntax() { + TileInfo* const tile_info = &frame_header_.tile_info; + const int sb_columns = sequence_header_.use_128x128_superblock + ? ((frame_header_.columns4x4 + 31) >> 5) + : ((frame_header_.columns4x4 + 15) >> 4); + const int sb_rows = sequence_header_.use_128x128_superblock + ? ((frame_header_.rows4x4 + 31) >> 5) + : ((frame_header_.rows4x4 + 15) >> 4); + tile_info->sb_columns = sb_columns; + tile_info->sb_rows = sb_rows; + const int sb_shift = sequence_header_.use_128x128_superblock ?
5 : 4; + const int sb_size = 2 + sb_shift; + const int sb_max_tile_width = kMaxTileWidth >> sb_size; + const int sb_max_tile_area = kMaxTileArea >> MultiplyBy2(sb_size); + const int minlog2_tile_columns = TileLog2(sb_max_tile_width, sb_columns); + const int maxlog2_tile_columns = + CeilLog2(std::min(sb_columns, static_cast<int>(kMaxTileColumns))); + const int maxlog2_tile_rows = + CeilLog2(std::min(sb_rows, static_cast<int>(kMaxTileRows))); + const int min_log2_tiles = std::max( + minlog2_tile_columns, TileLog2(sb_max_tile_area, sb_rows * sb_columns)); + int64_t scratch; + OBU_READ_BIT_OR_FAIL; + tile_info->uniform_spacing = scratch != 0; + if (tile_info->uniform_spacing) { + // Read tile columns. + tile_info->tile_columns_log2 = minlog2_tile_columns; + while (tile_info->tile_columns_log2 < maxlog2_tile_columns) { + OBU_READ_BIT_OR_FAIL; + if (scratch == 0) break; + ++tile_info->tile_columns_log2; + } + + // Compute tile column starts. + const int sb_tile_width = + (sb_columns + (1 << tile_info->tile_columns_log2) - 1) >> + tile_info->tile_columns_log2; + if (sb_tile_width <= 0) return false; + int i = 0; + for (int sb_start = 0; sb_start < sb_columns; sb_start += sb_tile_width) { + if (i >= kMaxTileColumns) { + LIBGAV1_DLOG(ERROR, + "tile_columns would be greater than kMaxTileColumns."); + return false; + } + tile_info->tile_column_start[i++] = sb_start << sb_shift; + } + tile_info->tile_column_start[i] = frame_header_.columns4x4; + tile_info->tile_columns = i; + + // Read tile rows. + const int minlog2_tile_rows = + std::max(min_log2_tiles - tile_info->tile_columns_log2, 0); + tile_info->tile_rows_log2 = minlog2_tile_rows; + while (tile_info->tile_rows_log2 < maxlog2_tile_rows) { + OBU_READ_BIT_OR_FAIL; + if (scratch == 0) break; + ++tile_info->tile_rows_log2; + } + + // Compute tile row starts.
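+ // (Same rounding as the column case; e.g. sb_rows = 17 with
+ // tile_rows_log2 = 1 gives sb_tile_height = (17 + 1) >> 1 = 9, so the tile
+ // rows start at superblocks 0 and 9.)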
+ const int sb_tile_height = + (sb_rows + (1 << tile_info->tile_rows_log2) - 1) >> + tile_info->tile_rows_log2; + if (sb_tile_height <= 0) return false; + i = 0; + for (int sb_start = 0; sb_start < sb_rows; sb_start += sb_tile_height) { + if (i >= kMaxTileRows) { + LIBGAV1_DLOG(ERROR, "tile_rows would be greater than kMaxTileRows."); + return false; + } + tile_info->tile_row_start[i++] = sb_start << sb_shift; + } + tile_info->tile_row_start[i] = frame_header_.rows4x4; + tile_info->tile_rows = i; + } else { + int widest_tile_sb = 1; + int i = 0; + for (int sb_start = 0; sb_start < sb_columns; ++i) { + if (i >= kMaxTileColumns) { + LIBGAV1_DLOG(ERROR, + "tile_columns would be greater than kMaxTileColumns."); + return false; + } + tile_info->tile_column_start[i] = sb_start << sb_shift; + const int max_width = + std::min(sb_columns - sb_start, static_cast<int>(sb_max_tile_width)); + if (!bit_reader_->DecodeUniform( + max_width, &tile_info->tile_column_width_in_superblocks[i])) { + LIBGAV1_DLOG(ERROR, "Not enough bits."); + return false; + } + ++tile_info->tile_column_width_in_superblocks[i]; + widest_tile_sb = std::max(tile_info->tile_column_width_in_superblocks[i], + widest_tile_sb); + sb_start += tile_info->tile_column_width_in_superblocks[i]; + } + tile_info->tile_column_start[i] = frame_header_.columns4x4; + tile_info->tile_columns = i; + tile_info->tile_columns_log2 = CeilLog2(tile_info->tile_columns); + + int max_tile_area_sb = sb_rows * sb_columns; + if (min_log2_tiles > 0) max_tile_area_sb >>= min_log2_tiles + 1; + const int max_tile_height_sb = + std::max(max_tile_area_sb / widest_tile_sb, 1); + + i = 0; + for (int sb_start = 0; sb_start < sb_rows; ++i) { + if (i >= kMaxTileRows) { + LIBGAV1_DLOG(ERROR, "tile_rows would be greater than kMaxTileRows."); + return false; + } + tile_info->tile_row_start[i] = sb_start << sb_shift; + const int max_height = std::min(sb_rows - sb_start, max_tile_height_sb); + if (!bit_reader_->DecodeUniform( + max_height, &tile_info->tile_row_height_in_superblocks[i])) { + LIBGAV1_DLOG(ERROR, "Not enough bits."); + return false; + } + ++tile_info->tile_row_height_in_superblocks[i]; + sb_start += tile_info->tile_row_height_in_superblocks[i]; + } + tile_info->tile_row_start[i] = frame_header_.rows4x4; + tile_info->tile_rows = i; + tile_info->tile_rows_log2 = CeilLog2(tile_info->tile_rows); + } + tile_info->tile_count = tile_info->tile_rows * tile_info->tile_columns; + if (!tile_buffers_.reserve(tile_info->tile_count)) { + LIBGAV1_DLOG(ERROR, "Unable to allocate memory for tile_buffers_."); + return false; + } + tile_info->context_update_id = 0; + const int tile_bits = + tile_info->tile_columns_log2 + tile_info->tile_rows_log2; + if (tile_bits != 0) { + OBU_READ_LITERAL_OR_FAIL(tile_bits); + tile_info->context_update_id = static_cast<int16_t>(scratch); + if (tile_info->context_update_id >= tile_info->tile_count) { + LIBGAV1_DLOG(ERROR, "Invalid context_update_tile_id (%d) >= %d.", + tile_info->context_update_id, tile_info->tile_count); + return false; + } + OBU_READ_LITERAL_OR_FAIL(2); + tile_info->tile_size_bytes = 1 + scratch; + } + return true; +} + +bool ObuParser::ReadAllowWarpedMotion() { + if (IsIntraFrame(frame_header_.frame_type) || + frame_header_.error_resilient_mode || + !sequence_header_.enable_warped_motion) { + return true; + } + int64_t scratch; + OBU_READ_BIT_OR_FAIL; + frame_header_.allow_warped_motion = scratch != 0; + return true; +} + +bool ObuParser::ParseFrameParameters() { + int64_t scratch; + if (sequence_header_.reduced_still_picture_header) { +
frame_header_.show_frame = true; + current_frame_ = buffer_pool_->GetFreeBuffer(); + if (current_frame_ == nullptr) { + LIBGAV1_DLOG(ERROR, "Could not get current_frame from the buffer pool."); + return false; + } + } else { + OBU_READ_BIT_OR_FAIL; + frame_header_.show_existing_frame = scratch != 0; + if (frame_header_.show_existing_frame) { + OBU_READ_LITERAL_OR_FAIL(3); + frame_header_.frame_to_show = scratch; + if (sequence_header_.decoder_model_info_present_flag && + !sequence_header_.timing_info.equal_picture_interval) { + OBU_READ_LITERAL_OR_FAIL( + sequence_header_.decoder_model_info.frame_presentation_time_length); + frame_header_.frame_presentation_time = static_cast<uint32_t>(scratch); + } + if (sequence_header_.frame_id_numbers_present) { + OBU_READ_LITERAL_OR_FAIL(sequence_header_.frame_id_length_bits); + frame_header_.display_frame_id = static_cast<uint16_t>(scratch); + // Section 6.8.2: It is a requirement of bitstream conformance that + // whenever display_frame_id is read, the value matches + // RefFrameId[ frame_to_show_map_idx ] ..., and that + // RefValid[ frame_to_show_map_idx ] is equal to 1. + // + // The current_frame_ == nullptr check below is equivalent to checking + // if RefValid[ frame_to_show_map_idx ] is equal to 1. + if (frame_header_.display_frame_id != + decoder_state_.reference_frame_id[frame_header_.frame_to_show]) { + LIBGAV1_DLOG(ERROR, + "Reference buffer %d has a frame id number mismatch.", + frame_header_.frame_to_show); + return false; + } + } + // Section 7.18.2. Note: This is also needed for Section 7.21 if + // frame_type is kFrameKey. + current_frame_ = + decoder_state_.reference_frame[frame_header_.frame_to_show]; + if (current_frame_ == nullptr) { + LIBGAV1_DLOG(ERROR, "Buffer %d does not contain a decoded frame", + frame_header_.frame_to_show); + return false; + } + // Section 6.8.2: It is a requirement of bitstream conformance that + // when show_existing_frame is used to show a previous frame, that the + // value of showable_frame for the previous frame was equal to 1. + if (!current_frame_->showable_frame()) { + LIBGAV1_DLOG(ERROR, "Buffer %d does not contain a showable frame", + frame_header_.frame_to_show); + return false; + } + if (current_frame_->frame_type() == kFrameKey) { + frame_header_.refresh_frame_flags = 0xff; + // Section 6.8.2: It is a requirement of bitstream conformance that + // when show_existing_frame is used to show a previous frame with + // RefFrameType[ frame_to_show_map_idx ] equal to KEY_FRAME, that + // the frame is output via the show_existing_frame mechanism at most + // once. + current_frame_->set_showable_frame(false); + + // Section 7.21. Note: decoder_state_.current_frame_id must be set + // only when frame_type is kFrameKey per the spec. Among all the + // variables set in Section 7.21, current_frame_id is the only one + // whose value lives across frames. (PrevFrameID is set equal to the + // current_frame_id value for the previous frame.)
+ decoder_state_.current_frame_id = + decoder_state_.reference_frame_id[frame_header_.frame_to_show]; + decoder_state_.order_hint = + decoder_state_.reference_order_hint[frame_header_.frame_to_show]; + } + return true; + } + current_frame_ = buffer_pool_->GetFreeBuffer(); + if (current_frame_ == nullptr) { + LIBGAV1_DLOG(ERROR, "Could not get current_frame from the buffer pool."); + return false; + } + OBU_READ_LITERAL_OR_FAIL(2); + frame_header_.frame_type = static_cast<FrameType>(scratch); + current_frame_->set_frame_type(frame_header_.frame_type); + OBU_READ_BIT_OR_FAIL; + frame_header_.show_frame = scratch != 0; + if (frame_header_.show_frame && + sequence_header_.decoder_model_info_present_flag && + !sequence_header_.timing_info.equal_picture_interval) { + OBU_READ_LITERAL_OR_FAIL( + sequence_header_.decoder_model_info.frame_presentation_time_length); + frame_header_.frame_presentation_time = static_cast<uint32_t>(scratch); + } + if (frame_header_.show_frame) { + frame_header_.showable_frame = (frame_header_.frame_type != kFrameKey); + } else { + OBU_READ_BIT_OR_FAIL; + frame_header_.showable_frame = scratch != 0; + } + current_frame_->set_showable_frame(frame_header_.showable_frame); + if (frame_header_.frame_type == kFrameSwitch || + (frame_header_.frame_type == kFrameKey && frame_header_.show_frame)) { + frame_header_.error_resilient_mode = true; + } else { + OBU_READ_BIT_OR_FAIL; + frame_header_.error_resilient_mode = scratch != 0; + } + } + if (frame_header_.frame_type == kFrameKey && frame_header_.show_frame) { + decoder_state_.reference_order_hint.fill(0); + decoder_state_.reference_frame.fill(nullptr); + } + OBU_READ_BIT_OR_FAIL; + frame_header_.enable_cdf_update = scratch == 0; + if (sequence_header_.force_screen_content_tools == + kSelectScreenContentTools) { + OBU_READ_BIT_OR_FAIL; + frame_header_.allow_screen_content_tools = scratch != 0; + } else { + frame_header_.allow_screen_content_tools = + sequence_header_.force_screen_content_tools != 0; + } + if (frame_header_.allow_screen_content_tools) { + if (sequence_header_.force_integer_mv == kSelectIntegerMv) { + OBU_READ_BIT_OR_FAIL; + frame_header_.force_integer_mv = scratch; + } else { + frame_header_.force_integer_mv = sequence_header_.force_integer_mv; + } + } else { + frame_header_.force_integer_mv = 0; + } + if (IsIntraFrame(frame_header_.frame_type)) { + frame_header_.force_integer_mv = 1; + } + if (sequence_header_.frame_id_numbers_present) { + OBU_READ_LITERAL_OR_FAIL(sequence_header_.frame_id_length_bits); + frame_header_.current_frame_id = static_cast<uint16_t>(scratch); + const int previous_frame_id = decoder_state_.current_frame_id; + decoder_state_.current_frame_id = frame_header_.current_frame_id; + if (frame_header_.frame_type != kFrameKey || !frame_header_.show_frame) { + if (previous_frame_id >= 0) { + // Section 6.8.2: ..., it is a requirement of bitstream conformance + // that all of the following conditions are true: + // * current_frame_id is not equal to PrevFrameID, + // * DiffFrameID is less than 1 << ( idLen - 1 ) + int diff_frame_id = decoder_state_.current_frame_id - previous_frame_id; + const int id_length_max_value = + 1 << sequence_header_.frame_id_length_bits; + if (diff_frame_id <= 0) { + diff_frame_id += id_length_max_value; + } + if (diff_frame_id >= DivideBy2(id_length_max_value)) { + LIBGAV1_DLOG(ERROR, + "current_frame_id (%d) equals or differs too much from " + "previous_frame_id (%d).", + decoder_state_.current_frame_id, previous_frame_id); + return false; + } + } + MarkInvalidReferenceFrames(); + } + } else { +
frame_header_.current_frame_id = 0; + decoder_state_.current_frame_id = frame_header_.current_frame_id; + } + if (frame_header_.frame_type == kFrameSwitch) { + frame_header_.frame_size_override_flag = true; + } else if (!sequence_header_.reduced_still_picture_header) { + OBU_READ_BIT_OR_FAIL; + frame_header_.frame_size_override_flag = scratch != 0; + } + if (sequence_header_.order_hint_bits > 0) { + OBU_READ_LITERAL_OR_FAIL(sequence_header_.order_hint_bits); + frame_header_.order_hint = scratch; + } + decoder_state_.order_hint = frame_header_.order_hint; + if (IsIntraFrame(frame_header_.frame_type) || + frame_header_.error_resilient_mode) { + frame_header_.primary_reference_frame = kPrimaryReferenceNone; + } else { + OBU_READ_LITERAL_OR_FAIL(3); + frame_header_.primary_reference_frame = scratch; + } + if (sequence_header_.decoder_model_info_present_flag) { + OBU_READ_BIT_OR_FAIL; + const bool buffer_removal_time_present = scratch != 0; + if (buffer_removal_time_present) { + for (int i = 0; i < sequence_header_.operating_points; ++i) { + if (!sequence_header_.decoder_model_present_for_operating_point[i]) { + continue; + } + const int index = sequence_header_.operating_point_idc[i]; + if (index == 0 || + (InTemporalLayer(index, obu_headers_.back().temporal_id) && + InSpatialLayer(index, obu_headers_.back().spatial_id))) { + OBU_READ_LITERAL_OR_FAIL( + sequence_header_.decoder_model_info.buffer_removal_time_length); + frame_header_.buffer_removal_time[i] = static_cast<uint32_t>(scratch); + } + } + } + } + if (frame_header_.frame_type == kFrameSwitch || + (frame_header_.frame_type == kFrameKey && frame_header_.show_frame)) { + frame_header_.refresh_frame_flags = 0xff; + } else { + OBU_READ_LITERAL_OR_FAIL(8); + frame_header_.refresh_frame_flags = scratch; + // Section 6.8.2: If frame_type is equal to INTRA_ONLY_FRAME, it is a + // requirement of bitstream conformance that refresh_frame_flags is not + // equal to 0xff. + if (frame_header_.frame_type == kFrameIntraOnly && + frame_header_.refresh_frame_flags == 0xff) { + LIBGAV1_DLOG(ERROR, "Intra only frames cannot have refresh flags 0xFF."); + return false; + } + } + if ((!IsIntraFrame(frame_header_.frame_type) || + frame_header_.refresh_frame_flags != 0xff) && + !ParseReferenceOrderHint()) { + return false; + } + if (IsIntraFrame(frame_header_.frame_type)) { + if (!ParseFrameSizeAndRenderSize()) return false; + if (frame_header_.allow_screen_content_tools && + frame_header_.width == frame_header_.upscaled_width) { + OBU_READ_BIT_OR_FAIL; + frame_header_.allow_intrabc = scratch != 0; + } + } else { + if (!sequence_header_.enable_order_hint) { + frame_header_.frame_refs_short_signaling = false; + } else { + OBU_READ_BIT_OR_FAIL; + frame_header_.frame_refs_short_signaling = scratch != 0; + if (frame_header_.frame_refs_short_signaling) { + OBU_READ_LITERAL_OR_FAIL(3); + const int8_t last_frame_idx = scratch; + OBU_READ_LITERAL_OR_FAIL(3); + const int8_t gold_frame_idx = scratch; + if (!SetFrameReferences(last_frame_idx, gold_frame_idx)) { + return false; + } + } + } + for (int i = 0; i < kNumInterReferenceFrameTypes; ++i) { + if (!frame_header_.frame_refs_short_signaling) { + OBU_READ_LITERAL_OR_FAIL(3); + frame_header_.reference_frame_index[i] = scratch; + } + const int reference_frame_index = frame_header_.reference_frame_index[i]; + assert(reference_frame_index >= 0); + // Section 6.8.2: It is a requirement of bitstream conformance that + // RefValid[ ref_frame_idx[ i ] ] is equal to 1 ...
+ // The remainder of the statement is handled by ParseSequenceHeader(). + // Note if support for Annex C: Error resilience behavior is added this + // check should be omitted per C.5 Decoder consequences of processable + // frames. + if (decoder_state_.reference_frame[reference_frame_index] == nullptr) { + LIBGAV1_DLOG(ERROR, "ref_frame_idx[%d] (%d) is not valid.", i, + reference_frame_index); + return false; + } + if (sequence_header_.frame_id_numbers_present) { + OBU_READ_LITERAL_OR_FAIL(sequence_header_.delta_frame_id_length_bits); + const int delta_frame_id = static_cast<int>(1 + scratch); + const int id_length_max_value = + 1 << sequence_header_.frame_id_length_bits; + frame_header_.expected_frame_id[i] = + (frame_header_.current_frame_id + id_length_max_value - + delta_frame_id) % + id_length_max_value; + // Section 6.8.2: It is a requirement of bitstream conformance that + // whenever expectedFrameId[ i ] is calculated, the value matches + // RefFrameId[ ref_frame_idx[ i ] ] ... + if (frame_header_.expected_frame_id[i] != + decoder_state_.reference_frame_id[reference_frame_index]) { + LIBGAV1_DLOG(ERROR, + "Reference buffer %d has a frame id number mismatch.", + reference_frame_index); + return false; + } + } + } + if (frame_header_.frame_size_override_flag && + !frame_header_.error_resilient_mode) { + // Section 5.9.7. + for (int index : frame_header_.reference_frame_index) { + OBU_READ_BIT_OR_FAIL; + frame_header_.found_reference = scratch != 0; + if (frame_header_.found_reference) { + const RefCountedBuffer* reference_frame = + decoder_state_.reference_frame[index].get(); + // frame_header_.upscaled_width will be set in the + // ParseSuperResParametersAndComputeImageSize() call below. + frame_header_.width = reference_frame->upscaled_width(); + frame_header_.height = reference_frame->frame_height(); + frame_header_.render_width = reference_frame->render_width(); + frame_header_.render_height = reference_frame->render_height(); + if (!ParseSuperResParametersAndComputeImageSize()) return false; + break; + } + } + if (!frame_header_.found_reference && !ParseFrameSizeAndRenderSize()) { + return false; + } + } else { + if (!ParseFrameSizeAndRenderSize()) return false; + } + if (!ValidateInterFrameSize()) return false; + if (frame_header_.force_integer_mv != 0) { + frame_header_.allow_high_precision_mv = false; + } else { + OBU_READ_BIT_OR_FAIL; + frame_header_.allow_high_precision_mv = scratch != 0; + } + OBU_READ_BIT_OR_FAIL; + const bool is_filter_switchable = scratch != 0; + if (is_filter_switchable) { + frame_header_.interpolation_filter = kInterpolationFilterSwitchable; + } else { + OBU_READ_LITERAL_OR_FAIL(2); + frame_header_.interpolation_filter = + static_cast<InterpolationFilter>(scratch); + } + OBU_READ_BIT_OR_FAIL; + frame_header_.is_motion_mode_switchable = scratch != 0; + if (frame_header_.error_resilient_mode || + !sequence_header_.enable_ref_frame_mvs) { + frame_header_.use_ref_frame_mvs = false; + } else { + OBU_READ_BIT_OR_FAIL; + frame_header_.use_ref_frame_mvs = scratch != 0; + } + } + // At this point, we have parsed the frame and render sizes and computed + // the image size, whether it's an intra or inter frame. So we can save + // the sizes in the current frame now.
+ if (!current_frame_->SetFrameDimensions(frame_header_)) { + LIBGAV1_DLOG(ERROR, "Setting current frame dimensions failed."); + return false; + } + if (!IsIntraFrame(frame_header_.frame_type)) { + // Initialize the kReferenceFrameIntra type reference frame information to + // simplify the frame type validation in motion field projection. + // Set the kReferenceFrameIntra type |order_hint_| to + // |frame_header_.order_hint|. This guarantees that in SIMD implementations, + // the other reference frame information of the kReferenceFrameIntra type + // could be correctly initialized using the following loop with + // |frame_header_.order_hint| being the |hint|. + ReferenceInfo* const reference_info = current_frame_->reference_info(); + reference_info->order_hint[kReferenceFrameIntra] = frame_header_.order_hint; + reference_info->relative_distance_from[kReferenceFrameIntra] = 0; + reference_info->relative_distance_to[kReferenceFrameIntra] = 0; + reference_info->skip_references[kReferenceFrameIntra] = true; + reference_info->projection_divisions[kReferenceFrameIntra] = 0; + + for (int i = kReferenceFrameLast; i <= kNumInterReferenceFrameTypes; ++i) { + const auto reference_frame = static_cast<ReferenceFrameType>(i); + const uint8_t hint = + decoder_state_.reference_order_hint + [frame_header_.reference_frame_index[i - kReferenceFrameLast]]; + reference_info->order_hint[reference_frame] = hint; + const int relative_distance_from = + GetRelativeDistance(hint, frame_header_.order_hint, + sequence_header_.order_hint_shift_bits); + const int relative_distance_to = + GetRelativeDistance(frame_header_.order_hint, hint, + sequence_header_.order_hint_shift_bits); + reference_info->relative_distance_from[reference_frame] = + relative_distance_from; + reference_info->relative_distance_to[reference_frame] = + relative_distance_to; + reference_info->skip_references[reference_frame] = + relative_distance_to > kMaxFrameDistance || relative_distance_to <= 0; + reference_info->projection_divisions[reference_frame] = + reference_info->skip_references[reference_frame] + ? 0 + : kProjectionMvDivisionLookup[relative_distance_to]; + decoder_state_.reference_frame_sign_bias[reference_frame] = + relative_distance_from > 0; + } + } + if (frame_header_.enable_cdf_update && + !sequence_header_.reduced_still_picture_header) { + OBU_READ_BIT_OR_FAIL; + frame_header_.enable_frame_end_update_cdf = scratch == 0; + } else { + frame_header_.enable_frame_end_update_cdf = false; + } + return true; +} + +bool ObuParser::ParseFrameHeader() { + // Section 6.8.1: It is a requirement of bitstream conformance that a + // sequence header OBU has been received before a frame header OBU. + if (!has_sequence_header_) return false; + if (!ParseFrameParameters()) return false; + if (frame_header_.show_existing_frame) return true; + assert(!obu_headers_.empty()); + current_frame_->set_spatial_id(obu_headers_.back().spatial_id); + current_frame_->set_temporal_id(obu_headers_.back().temporal_id); + bool status = ParseTileInfoSyntax() && ParseQuantizerParameters() && + ParseSegmentationParameters(); + if (!status) return false; + current_frame_->SetSegmentationParameters(frame_header_.segmentation); + status = + ParseQuantizerIndexDeltaParameters() && ParseLoopFilterDeltaParameters(); + if (!status) return false; + ComputeSegmentLosslessAndQIndex(); + // Section 6.8.2: It is a requirement of bitstream conformance that + // delta_q_present is equal to 0 when CodedLossless is equal to 1.
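+ // (A nonzero delta_q_present would let the quantizer index vary per block,
+ // which would contradict a fully lossless coded frame.)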
+ if (frame_header_.coded_lossless && frame_header_.delta_q.present) { + return false; + } + status = ParseLoopFilterParameters(); + if (!status) return false; + current_frame_->SetLoopFilterDeltas(frame_header_.loop_filter); + status = ParseCdefParameters() && ParseLoopRestorationParameters() && + ParseTxModeSyntax() && ParseFrameReferenceModeSyntax() && + ParseSkipModeParameters() && ReadAllowWarpedMotion(); + if (!status) return false; + int64_t scratch; + OBU_READ_BIT_OR_FAIL; + frame_header_.reduced_tx_set = scratch != 0; + status = ParseGlobalMotionParameters(); + if (!status) return false; + current_frame_->SetGlobalMotions(frame_header_.global_motion); + status = ParseFilmGrainParameters(); + if (!status) return false; + if (sequence_header_.film_grain_params_present) { + current_frame_->set_film_grain_params(frame_header_.film_grain_params); + } + return true; +} + +bool ObuParser::ParsePadding(const uint8_t* data, size_t size) { + // The spec allows a padding OBU to be header-only (i.e., |size| = 0). So + // check trailing bits only if |size| > 0. + if (size == 0) return true; + // The payload of a padding OBU is byte aligned. Therefore the first + // trailing byte should be 0x80. See https://crbug.com/aomedia/2393. + const int i = GetLastNonzeroByteIndex(data, size); + if (i < 0) { + LIBGAV1_DLOG(ERROR, "Trailing bit is missing."); + return false; + } + if (data[i] != 0x80) { + LIBGAV1_DLOG( + ERROR, + "The last nonzero byte of the payload data is 0x%x, should be 0x80.", + data[i]); + return false; + } + // Skip all bits before the trailing bit. + bit_reader_->SkipBytes(i); + return true; +} + +bool ObuParser::ParseMetadataScalability() { + int64_t scratch; + // scalability_mode_idc + OBU_READ_LITERAL_OR_FAIL(8); + const auto scalability_mode_idc = static_cast<int>(scratch); + if (scalability_mode_idc == kScalabilitySS) { + // Parse scalability_structure().
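+ // kScalabilitySS means the prediction structure is signaled explicitly by
+ // the scalability_structure() fields read below; for every other
+ // scalability_mode_idc value the structure is implied by the mode and
+ // there is nothing further to parse.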
+ // spatial_layers_cnt_minus_1 + OBU_READ_LITERAL_OR_FAIL(2); + const auto spatial_layers_count = static_cast<int>(scratch) + 1; + // spatial_layer_dimensions_present_flag + OBU_READ_BIT_OR_FAIL; + const auto spatial_layer_dimensions_present_flag = scratch != 0; + // spatial_layer_description_present_flag + OBU_READ_BIT_OR_FAIL; + const auto spatial_layer_description_present_flag = scratch != 0; + // temporal_group_description_present_flag + OBU_READ_BIT_OR_FAIL; + const auto temporal_group_description_present_flag = scratch != 0; + // scalability_structure_reserved_3bits + OBU_READ_LITERAL_OR_FAIL(3); + if (scratch != 0) { + LIBGAV1_DLOG(WARNING, + "scalability_structure_reserved_3bits is not zero."); + } + if (spatial_layer_dimensions_present_flag) { + for (int i = 0; i < spatial_layers_count; ++i) { + // spatial_layer_max_width[i] + OBU_READ_LITERAL_OR_FAIL(16); + // spatial_layer_max_height[i] + OBU_READ_LITERAL_OR_FAIL(16); + } + } + if (spatial_layer_description_present_flag) { + for (int i = 0; i < spatial_layers_count; ++i) { + // spatial_layer_ref_id[i] + OBU_READ_LITERAL_OR_FAIL(8); + } + } + if (temporal_group_description_present_flag) { + // temporal_group_size + OBU_READ_LITERAL_OR_FAIL(8); + const auto temporal_group_size = static_cast<int>(scratch); + for (int i = 0; i < temporal_group_size; ++i) { + // temporal_group_temporal_id[i] + OBU_READ_LITERAL_OR_FAIL(3); + // temporal_group_temporal_switching_up_point_flag[i] + OBU_READ_BIT_OR_FAIL; + // temporal_group_spatial_switching_up_point_flag[i] + OBU_READ_BIT_OR_FAIL; + // temporal_group_ref_cnt[i] + OBU_READ_LITERAL_OR_FAIL(3); + const auto temporal_group_ref_count = static_cast<int>(scratch); + for (int j = 0; j < temporal_group_ref_count; ++j) { + // temporal_group_ref_pic_diff[i][j] + OBU_READ_LITERAL_OR_FAIL(8); + } + } + } + } + return true; +} + +bool ObuParser::ParseMetadataTimecode() { + int64_t scratch; + // counting_type: should be the same for all pictures in the coded video + // sequence. 7..31 are reserved.
+ OBU_READ_LITERAL_OR_FAIL(5); + // full_timestamp_flag + OBU_READ_BIT_OR_FAIL; + const bool full_timestamp_flag = scratch != 0; + // discontinuity_flag + OBU_READ_BIT_OR_FAIL; + // cnt_dropped_flag + OBU_READ_BIT_OR_FAIL; + // n_frames + OBU_READ_LITERAL_OR_FAIL(9); + if (full_timestamp_flag) { + // seconds_value + OBU_READ_LITERAL_OR_FAIL(6); + const auto seconds_value = static_cast<int>(scratch); + if (seconds_value > 59) { + LIBGAV1_DLOG(ERROR, "Invalid seconds_value %d.", seconds_value); + return false; + } + // minutes_value + OBU_READ_LITERAL_OR_FAIL(6); + const auto minutes_value = static_cast<int>(scratch); + if (minutes_value > 59) { + LIBGAV1_DLOG(ERROR, "Invalid minutes_value %d.", minutes_value); + return false; + } + // hours_value + OBU_READ_LITERAL_OR_FAIL(5); + const auto hours_value = static_cast<int>(scratch); + if (hours_value > 23) { + LIBGAV1_DLOG(ERROR, "Invalid hours_value %d.", hours_value); + return false; + } + } else { + // seconds_flag + OBU_READ_BIT_OR_FAIL; + const bool seconds_flag = scratch != 0; + if (seconds_flag) { + // seconds_value + OBU_READ_LITERAL_OR_FAIL(6); + const auto seconds_value = static_cast<int>(scratch); + if (seconds_value > 59) { + LIBGAV1_DLOG(ERROR, "Invalid seconds_value %d.", seconds_value); + return false; + } + // minutes_flag + OBU_READ_BIT_OR_FAIL; + const bool minutes_flag = scratch != 0; + if (minutes_flag) { + // minutes_value + OBU_READ_LITERAL_OR_FAIL(6); + const auto minutes_value = static_cast<int>(scratch); + if (minutes_value > 59) { + LIBGAV1_DLOG(ERROR, "Invalid minutes_value %d.", minutes_value); + return false; + } + // hours_flag + OBU_READ_BIT_OR_FAIL; + const bool hours_flag = scratch != 0; + if (hours_flag) { + // hours_value + OBU_READ_LITERAL_OR_FAIL(5); + const auto hours_value = static_cast<int>(scratch); + if (hours_value > 23) { + LIBGAV1_DLOG(ERROR, "Invalid hours_value %d.", hours_value); + return false; + } + } + } + } + } + // time_offset_length: should be the same for all pictures in the coded + // video sequence. + OBU_READ_LITERAL_OR_FAIL(5); + const auto time_offset_length = static_cast<int>(scratch); + if (time_offset_length > 0) { + // time_offset_value + OBU_READ_LITERAL_OR_FAIL(time_offset_length); + } + // Compute clockTimestamp. Section 6.7.7: + // When timing_info_present_flag is equal to 1 and discontinuity_flag is + // equal to 0, the value of clockTimestamp shall be greater than or equal + // to the value of clockTimestamp for the previous set of clock timestamp + // syntax elements in output order.
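+ // The timecode fields read above are only validated, not retained: this
+ // parser does not compute or store clockTimestamp.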
+ return true; +} + +bool ObuParser::ParseMetadata(const uint8_t* data, size_t size) { + const size_t start_offset = bit_reader_->byte_offset(); + size_t metadata_type; + if (!bit_reader_->ReadUnsignedLeb128(&metadata_type)) { + LIBGAV1_DLOG(ERROR, "Could not read metadata_type."); + return false; + } + const size_t metadata_type_size = bit_reader_->byte_offset() - start_offset; + if (size < metadata_type_size) { + LIBGAV1_DLOG( + ERROR, "metadata_type is longer than metadata OBU payload %zu vs %zu.", + metadata_type_size, size); + return false; + } + data += metadata_type_size; + size -= metadata_type_size; + int64_t scratch; + switch (metadata_type) { + case kMetadataTypeHdrContentLightLevel: + OBU_READ_LITERAL_OR_FAIL(16); + metadata_.max_cll = scratch; + OBU_READ_LITERAL_OR_FAIL(16); + metadata_.max_fall = scratch; + break; + case kMetadataTypeHdrMasteringDisplayColorVolume: + for (int i = 0; i < 3; ++i) { + OBU_READ_LITERAL_OR_FAIL(16); + metadata_.primary_chromaticity_x[i] = scratch; + OBU_READ_LITERAL_OR_FAIL(16); + metadata_.primary_chromaticity_y[i] = scratch; + } + OBU_READ_LITERAL_OR_FAIL(16); + metadata_.white_point_chromaticity_x = scratch; + OBU_READ_LITERAL_OR_FAIL(16); + metadata_.white_point_chromaticity_y = scratch; + OBU_READ_LITERAL_OR_FAIL(32); + metadata_.luminance_max = static_cast<uint32_t>(scratch); + OBU_READ_LITERAL_OR_FAIL(32); + metadata_.luminance_min = static_cast<uint32_t>(scratch); + break; + case kMetadataTypeScalability: + if (!ParseMetadataScalability()) return false; + break; + case kMetadataTypeItutT35: { + OBU_READ_LITERAL_OR_FAIL(8); + metadata_.itu_t_t35_country_code = static_cast<uint8_t>(scratch); + ++data; + --size; + if (metadata_.itu_t_t35_country_code == 0xFF) { + OBU_READ_LITERAL_OR_FAIL(8); + metadata_.itu_t_t35_country_code_extension_byte = + static_cast<uint8_t>(scratch); + ++data; + --size; + } + // Read itu_t_t35_payload_bytes. Section 6.7.2 of the spec says: + // itu_t_t35_payload_bytes shall be bytes containing data registered as + // specified in Recommendation ITU-T T.35. + // Therefore itu_t_t35_payload_bytes is byte aligned and the first + // trailing byte should be 0x80. Since the exact syntax of + // itu_t_t35_payload_bytes is not defined in the AV1 spec, identify the + // end of itu_t_t35_payload_bytes by searching for the trailing bit. + const int i = GetLastNonzeroByteIndex(data, size); + if (i < 0) { + LIBGAV1_DLOG(ERROR, "Trailing bit is missing."); + return false; + } + if (data[i] != 0x80) { + LIBGAV1_DLOG( + ERROR, + "itu_t_t35_payload_bytes is not byte aligned. The last nonzero " + "byte of the payload data is 0x%x, should be 0x80.", + data[i]); + return false; + } + if (i != 0) { + // data[0]..data[i - 1] are itu_t_t35_payload_bytes. + metadata_.itu_t_t35_payload_bytes.reset(new (std::nothrow) uint8_t[i]); + if (metadata_.itu_t_t35_payload_bytes == nullptr) { + LIBGAV1_DLOG(ERROR, "Allocation of itu_t_t35_payload_bytes failed."); + return false; + } + memcpy(metadata_.itu_t_t35_payload_bytes.get(), data, i); + metadata_.itu_t_t35_payload_size = i; + } + // Skip all bits before the trailing bit. + bit_reader_->SkipBytes(i); + break; + } + case kMetadataTypeTimecode: + if (!ParseMetadataTimecode()) return false; + break; + default: { + // metadata_type is equal to a value reserved for future use or a user + // private value. + // + // The Note in Section 5.8.1 says "Decoders should ignore the entire OBU + // if they do not understand the metadata_type." Find the trailing bit + // and skip all bits before the trailing bit.
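+ // For example, if the remaining payload is {0x12, 0x80, 0x00}, the last
+ // nonzero byte is data[1] = 0x80, whose only set bit is the trailing bit;
+ // the reader then skips the 8 bits of data[0] and stops at the trailing
+ // bit.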
+ const int i = GetLastNonzeroByteIndex(data, size); + if (i >= 0) { + // The last 1 bit in the last nonzero byte is the trailing bit. Skip + // all bits before the trailing bit. + const int n = CountTrailingZeros(data[i]); + bit_reader_->SkipBits(i * 8 + 7 - n); + } + break; + } + } + return true; +} + +bool ObuParser::AddTileBuffers(int start, int end, size_t total_size, + size_t tg_header_size, + size_t bytes_consumed_so_far) { + // Validate that the tile group start and end are within the allowed range. + if (start != next_tile_group_start_ || start > end || + end >= frame_header_.tile_info.tile_count) { + LIBGAV1_DLOG(ERROR, + "Invalid tile group start %d or end %d: expected tile group " + "start %d, tile_count %d.", + start, end, next_tile_group_start_, + frame_header_.tile_info.tile_count); + return false; + } + next_tile_group_start_ = end + 1; + + if (total_size < tg_header_size) { + LIBGAV1_DLOG(ERROR, "total_size (%zu) is less than tg_header_size (%zu).", + total_size, tg_header_size); + return false; + } + size_t bytes_left = total_size - tg_header_size; + const uint8_t* data = data_ + bytes_consumed_so_far + tg_header_size; + for (int tile_number = start; tile_number <= end; ++tile_number) { + size_t tile_size = 0; + if (tile_number != end) { + RawBitReader bit_reader(data, bytes_left); + if (!bit_reader.ReadLittleEndian(frame_header_.tile_info.tile_size_bytes, + &tile_size)) { + LIBGAV1_DLOG(ERROR, "Could not read tile size for tile #%d", + tile_number); + return false; + } + ++tile_size; + data += frame_header_.tile_info.tile_size_bytes; + bytes_left -= frame_header_.tile_info.tile_size_bytes; + if (tile_size > bytes_left) { + LIBGAV1_DLOG(ERROR, "Invalid tile size %zu for tile #%d", tile_size, + tile_number); + return false; + } + } else { + tile_size = bytes_left; + if (tile_size == 0) { + LIBGAV1_DLOG(ERROR, "Invalid tile size %zu for tile #%d", tile_size, + tile_number); + return false; + } + } + // The memory for this has been allocated in ParseTileInfoSyntax(). So it is + // safe to use push_back_unchecked here. + tile_buffers_.push_back_unchecked({data, tile_size}); + data += tile_size; + bytes_left -= tile_size; + } + bit_reader_->SkipBytes(total_size - tg_header_size); + return true; +} + +bool ObuParser::ParseTileGroup(size_t size, size_t bytes_consumed_so_far) { + const TileInfo* const tile_info = &frame_header_.tile_info; + const size_t start_offset = bit_reader_->byte_offset(); + const int tile_bits = + tile_info->tile_columns_log2 + tile_info->tile_rows_log2; + if (tile_bits == 0) { + return AddTileBuffers(0, 0, size, 0, bytes_consumed_so_far); + } + int64_t scratch; + OBU_READ_BIT_OR_FAIL; + const bool tile_start_and_end_present_flag = scratch != 0; + if (!tile_start_and_end_present_flag) { + if (!bit_reader_->AlignToNextByte()) { + LIBGAV1_DLOG(ERROR, "Byte alignment has non zero bits."); + return false; + } + return AddTileBuffers(0, tile_info->tile_count - 1, size, 1, + bytes_consumed_so_far); + } + if (obu_headers_.back().type == kObuFrame) { + // 6.10.1: If obu_type is equal to OBU_FRAME, it is a requirement of + // bitstream conformance that the value of tile_start_and_end_present_flag + // is equal to 0.
+ LIBGAV1_DLOG(ERROR, + "tile_start_and_end_present_flag must be 0 in Frame OBU"); + return false; + } + OBU_READ_LITERAL_OR_FAIL(tile_bits); + const int start = static_cast<int>(scratch); + OBU_READ_LITERAL_OR_FAIL(tile_bits); + const int end = static_cast<int>(scratch); + if (!bit_reader_->AlignToNextByte()) { + LIBGAV1_DLOG(ERROR, "Byte alignment has non zero bits."); + return false; + } + const size_t tg_header_size = bit_reader_->byte_offset() - start_offset; + return AddTileBuffers(start, end, size, tg_header_size, + bytes_consumed_so_far); +} + +bool ObuParser::ParseHeader() { + ObuHeader obu_header; + int64_t scratch = bit_reader_->ReadBit(); + if (scratch != 0) { + LIBGAV1_DLOG(ERROR, "forbidden_bit is not zero."); + return false; + } + OBU_READ_LITERAL_OR_FAIL(4); + obu_header.type = static_cast<ObuType>(scratch); + OBU_READ_BIT_OR_FAIL; + const bool extension_flag = scratch != 0; + OBU_READ_BIT_OR_FAIL; + obu_header.has_size_field = scratch != 0; + OBU_READ_BIT_OR_FAIL; // reserved. + if (scratch != 0) { + LIBGAV1_DLOG(WARNING, "obu_reserved_1bit is not zero."); + } + obu_header.has_extension = extension_flag; + if (extension_flag) { + if (extension_disallowed_) { + LIBGAV1_DLOG(ERROR, + "OperatingPointIdc is 0, but obu_extension_flag is 1."); + return false; + } + OBU_READ_LITERAL_OR_FAIL(3); + obu_header.temporal_id = scratch; + OBU_READ_LITERAL_OR_FAIL(2); + obu_header.spatial_id = scratch; + OBU_READ_LITERAL_OR_FAIL(3); // reserved. + if (scratch != 0) { + LIBGAV1_DLOG(WARNING, "extension_header_reserved_3bits is not zero."); + } + } else { + obu_header.temporal_id = 0; + obu_header.spatial_id = 0; + } + return obu_headers_.push_back(obu_header); +} + +#undef OBU_READ_UVLC_OR_FAIL +#undef OBU_READ_LITERAL_OR_FAIL +#undef OBU_READ_BIT_OR_FAIL +#undef OBU_PARSER_FAIL +#undef OBU_LOG_AND_RETURN_FALSE + +bool ObuParser::InitBitReader(const uint8_t* const data, size_t size) { + bit_reader_.reset(new (std::nothrow) RawBitReader(data, size)); + return bit_reader_ != nullptr; +} + +bool ObuParser::HasData() const { return size_ > 0; } + +StatusCode ObuParser::ParseOneFrame(RefCountedBufferPtr* const current_frame) { + if (data_ == nullptr || size_ == 0) return kStatusInvalidArgument; + + assert(current_frame_ == nullptr); + // This is used to release any references held in case of parsing failure. + RefCountedBufferPtrCleanup current_frame_cleanup(&current_frame_); + + const uint8_t* data = data_; + size_t size = size_; + + // Clear everything except the sequence header. + obu_headers_.clear(); + frame_header_ = {}; + metadata_ = {}; + tile_buffers_.clear(); + next_tile_group_start_ = 0; + sequence_header_changed_ = false; + + bool parsed_one_full_frame = false; + bool seen_frame_header = false; + const uint8_t* frame_header = nullptr; + size_t frame_header_size_in_bits = 0; + while (size > 0 && !parsed_one_full_frame) { + if (!InitBitReader(data, size)) { + LIBGAV1_DLOG(ERROR, "Failed to initialize bit reader."); + return kStatusOutOfMemory; + } + if (!ParseHeader()) { + LIBGAV1_DLOG(ERROR, "Failed to parse OBU Header."); + return kStatusBitstreamError; + } + const ObuHeader& obu_header = obu_headers_.back(); + if (!obu_header.has_size_field) { + LIBGAV1_DLOG( + ERROR, + "has_size_field is zero.
libgav1 does not support such streams."); + return kStatusUnimplemented; + } + const size_t obu_header_size = bit_reader_->byte_offset(); + size_t obu_size; + if (!bit_reader_->ReadUnsignedLeb128(&obu_size)) { + LIBGAV1_DLOG(ERROR, "Could not read OBU size."); + return kStatusBitstreamError; + } + const size_t obu_length_size = bit_reader_->byte_offset() - obu_header_size; + if (size - bit_reader_->byte_offset() < obu_size) { + LIBGAV1_DLOG(ERROR, "Not enough bits left to parse OBU %zu vs %zu.", + size - bit_reader_->bit_offset(), obu_size); + return kStatusBitstreamError; + } + + const ObuType obu_type = obu_header.type; + if (obu_type != kObuSequenceHeader && obu_type != kObuTemporalDelimiter && + has_sequence_header_ && + sequence_header_.operating_point_idc[operating_point_] != 0 && + obu_header.has_extension && + (!InTemporalLayer( + sequence_header_.operating_point_idc[operating_point_], + obu_header.temporal_id) || + !InSpatialLayer(sequence_header_.operating_point_idc[operating_point_], + obu_header.spatial_id))) { + obu_headers_.pop_back(); + bit_reader_->SkipBytes(obu_size); + data += bit_reader_->byte_offset(); + size -= bit_reader_->byte_offset(); + continue; + } + + const size_t obu_start_position = bit_reader_->bit_offset(); + // The bit_reader_ is byte aligned after reading obu_header and obu_size. + // Therefore the byte offset can be computed as obu_start_position >> 3 + // below. + assert((obu_start_position & 7) == 0); + bool obu_skipped = false; + switch (obu_type) { + case kObuTemporalDelimiter: + break; + case kObuSequenceHeader: + if (!ParseSequenceHeader(seen_frame_header)) { + LIBGAV1_DLOG(ERROR, "Failed to parse SequenceHeader OBU."); + return kStatusBitstreamError; + } + if (sequence_header_.color_config.bitdepth > LIBGAV1_MAX_BITDEPTH) { + LIBGAV1_DLOG( + ERROR, + "Bitdepth %d is not supported. 
The maximum bitdepth is %d.", + sequence_header_.color_config.bitdepth, LIBGAV1_MAX_BITDEPTH); + return kStatusUnimplemented; + } + break; + case kObuFrameHeader: + if (seen_frame_header) { + LIBGAV1_DLOG(ERROR, + "Frame header found but frame header was already seen."); + return kStatusBitstreamError; + } + if (!ParseFrameHeader()) { + LIBGAV1_DLOG(ERROR, "Failed to parse FrameHeader OBU."); + return kStatusBitstreamError; + } + frame_header = &data[obu_start_position >> 3]; + frame_header_size_in_bits = + bit_reader_->bit_offset() - obu_start_position; + seen_frame_header = true; + parsed_one_full_frame = frame_header_.show_existing_frame; + break; + case kObuRedundantFrameHeader: { + if (!seen_frame_header) { + LIBGAV1_DLOG(ERROR, + "Redundant frame header found but frame header was not " + "yet seen."); + return kStatusBitstreamError; + } + const size_t fh_size = (frame_header_size_in_bits + 7) >> 3; + if (obu_size < fh_size || + memcmp(frame_header, &data[obu_start_position >> 3], fh_size) != + 0) { + LIBGAV1_DLOG(ERROR, + "Redundant frame header differs from frame header."); + return kStatusBitstreamError; + } + bit_reader_->SkipBits(frame_header_size_in_bits); + break; + } + case kObuFrame: { + const size_t fh_start_offset = bit_reader_->byte_offset(); + if (seen_frame_header) { + LIBGAV1_DLOG(ERROR, + "Frame header found but frame header was already seen."); + return kStatusBitstreamError; + } + if (!ParseFrameHeader()) { + LIBGAV1_DLOG(ERROR, "Failed to parse FrameHeader in Frame OBU."); + return kStatusBitstreamError; + } + // Section 6.8.2: If obu_type is equal to OBU_FRAME, it is a + // requirement of bitstream conformance that show_existing_frame is + // equal to 0. + if (frame_header_.show_existing_frame) { + LIBGAV1_DLOG(ERROR, "Frame OBU cannot set show_existing_frame to 1."); + return kStatusBitstreamError; + } + if (!bit_reader_->AlignToNextByte()) { + LIBGAV1_DLOG(ERROR, "Byte alignment has non zero bits."); + return kStatusBitstreamError; + } + const size_t fh_size = bit_reader_->byte_offset() - fh_start_offset; + if (fh_size >= obu_size) { + LIBGAV1_DLOG(ERROR, "Frame header size (%zu) >= obu_size (%zu).", + fh_size, obu_size); + return kStatusBitstreamError; + } + if (!ParseTileGroup(obu_size - fh_size, + size_ - size + bit_reader_->byte_offset())) { + LIBGAV1_DLOG(ERROR, "Failed to parse TileGroup in Frame OBU."); + return kStatusBitstreamError; + } + parsed_one_full_frame = true; + break; + } + case kObuTileGroup: + if (!ParseTileGroup(obu_size, + size_ - size + bit_reader_->byte_offset())) { + LIBGAV1_DLOG(ERROR, "Failed to parse TileGroup OBU."); + return kStatusBitstreamError; + } + parsed_one_full_frame = + (next_tile_group_start_ == frame_header_.tile_info.tile_count); + break; + case kObuTileList: + LIBGAV1_DLOG(ERROR, "Decoding of tile list OBUs is not supported."); + return kStatusUnimplemented; + case kObuPadding: + if (!ParsePadding(&data[obu_start_position >> 3], obu_size)) { + LIBGAV1_DLOG(ERROR, "Failed to parse Padding OBU."); + return kStatusBitstreamError; + } + break; + case kObuMetadata: + if (!ParseMetadata(&data[obu_start_position >> 3], obu_size)) { + LIBGAV1_DLOG(ERROR, "Failed to parse Metadata OBU."); + return kStatusBitstreamError; + } + break; + default: + // Skip reserved OBUs. Section 6.2.2: Reserved units are for future use + // and shall be ignored by AV1 decoder. 
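+        // (For reference, per the AV1 spec the reserved obu_type values are 0
+        // and 9..14; a conforming decoder skips them rather than failing,
+        // which is what the SkipBytes() call below does.)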
+        bit_reader_->SkipBytes(obu_size);
+        obu_skipped = true;
+        break;
+    }
+    if (obu_size > 0 && !obu_skipped && obu_type != kObuFrame &&
+        obu_type != kObuTileGroup) {
+      const size_t parsed_obu_size_in_bits =
+          bit_reader_->bit_offset() - obu_start_position;
+      if (obu_size * 8 < parsed_obu_size_in_bits) {
+        LIBGAV1_DLOG(
+            ERROR,
+            "Parsed OBU size (%zu bits) is greater than expected OBU size "
+            "(%zu bytes) obu_type: %d.",
+            parsed_obu_size_in_bits, obu_size, obu_type);
+        return kStatusBitstreamError;
+      }
+      if (!bit_reader_->VerifyAndSkipTrailingBits(obu_size * 8 -
+                                                  parsed_obu_size_in_bits)) {
+        LIBGAV1_DLOG(ERROR,
+                     "Error when verifying trailing bits for obu type: %d",
+                     obu_type);
+        return kStatusBitstreamError;
+      }
+    }
+    const size_t bytes_consumed = bit_reader_->byte_offset();
+    const size_t consumed_obu_size =
+        bytes_consumed - obu_length_size - obu_header_size;
+    if (consumed_obu_size != obu_size) {
+      LIBGAV1_DLOG(ERROR,
+                   "OBU size (%zu) and consumed size (%zu) does not match for "
+                   "obu_type: %d.",
+                   obu_size, consumed_obu_size, obu_type);
+      return kStatusBitstreamError;
+    }
+    data += bytes_consumed;
+    size -= bytes_consumed;
+  }
+  if (!parsed_one_full_frame && seen_frame_header) {
+    LIBGAV1_DLOG(ERROR, "The last tile group in the frame was not received.");
+    return kStatusBitstreamError;
+  }
+  data_ = data;
+  size_ = size;
+  *current_frame = std::move(current_frame_);
+  return kStatusOk;
+}
+
+}  // namespace libgav1
diff --git a/src/obu_parser.h b/src/obu_parser.h
new file mode 100644
index 0000000..3f452ef
--- /dev/null
+++ b/src/obu_parser.h
@@ -0,0 +1,413 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_OBU_PARSER_H_
+#define LIBGAV1_SRC_OBU_PARSER_H_
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <type_traits>
+#include <utility>
+
+#include "src/buffer_pool.h"
+#include "src/decoder_state.h"
+#include "src/dsp/common.h"
+#include "src/gav1/decoder_buffer.h"
+#include "src/gav1/status_code.h"
+#include "src/quantizer.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/raw_bit_reader.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/vector.h"
+
+namespace libgav1 {
+
+// structs and enums related to Open Bitstream Units (OBU).
+
+enum {
+  kMinimumMajorBitstreamLevel = 2,
+  kSelectScreenContentTools = 2,
+  kSelectIntegerMv = 2,
+  kLoopRestorationTileSizeMax = 256,
+  kGlobalMotionAlphaBits = 12,
+  kGlobalMotionTranslationBits = 12,
+  kGlobalMotionTranslationOnlyBits = 9,
+  kGlobalMotionAlphaPrecisionBits = 15,
+  kGlobalMotionTranslationPrecisionBits = 6,
+  kGlobalMotionTranslationOnlyPrecisionBits = 3,
+  kMaxTileWidth = 4096,
+  kMaxTileArea = 4096 * 2304,
+  kPrimaryReferenceNone = 7,
+  // A special value of the scalability_mode_idc syntax element that indicates
+  // the picture prediction structure is specified in scalability_structure().
+ kScalabilitySS = 14 +}; // anonymous enum + +struct ObuHeader { + ObuType type; + bool has_extension; + bool has_size_field; + int8_t temporal_id; + int8_t spatial_id; +}; + +enum BitstreamProfile : uint8_t { + kProfile0, + kProfile1, + kProfile2, + kMaxProfiles +}; + +// In the bitstream the level is encoded in five bits: the first three bits +// encode |major| - 2 and the last two bits encode |minor|. +// +// If the mapped level (major.minor) is in the tables in Annex A.3, there are +// bitstream conformance requirements on the maximum or minimum values of +// several variables. The encoded value of 31 (which corresponds to the mapped +// level 9.3) is the "maximum parameters" level and imposes no level-based +// constraints on the bitstream. +struct BitStreamLevel { + uint8_t major; // Range: 2-9. + uint8_t minor; // Range: 0-3. +}; + +struct ColorConfig { + int8_t bitdepth; + bool is_monochrome; + ColorPrimary color_primary; + TransferCharacteristics transfer_characteristics; + MatrixCoefficients matrix_coefficients; + // A binary value (0 or 1) that is associated with the VideoFullRangeFlag + // variable specified in ISO/IEC 23091-4/ITUT H.273. + // * 0: the studio swing representation. + // * 1: the full swing representation. + ColorRange color_range; + int8_t subsampling_x; + int8_t subsampling_y; + ChromaSamplePosition chroma_sample_position; + bool separate_uv_delta_q; +}; + +struct TimingInfo { + uint32_t num_units_in_tick; + uint32_t time_scale; + bool equal_picture_interval; + uint32_t num_ticks_per_picture; +}; + +struct DecoderModelInfo { + uint8_t encoder_decoder_buffer_delay_length; + uint32_t num_units_in_decoding_tick; + uint8_t buffer_removal_time_length; + uint8_t frame_presentation_time_length; +}; + +struct OperatingParameters { + uint32_t decoder_buffer_delay[kMaxOperatingPoints]; + uint32_t encoder_buffer_delay[kMaxOperatingPoints]; + bool low_delay_mode_flag[kMaxOperatingPoints]; +}; + +struct ObuSequenceHeader { + // Section 7.5: + // Within a particular coded video sequence, the contents of + // sequence_header_obu must be bit-identical each time the sequence header + // appears except for the contents of operating_parameters_info. A new + // coded video sequence is required if the sequence header parameters + // change. + // + // IMPORTANT: ParametersChanged() is implemented with a memcmp() call. For + // this to work, this object and the |old| object must be initialized with + // an empty brace-enclosed list, which initializes any padding to zero bits. + // See https://en.cppreference.com/w/cpp/language/zero_initialization. + bool ParametersChanged(const ObuSequenceHeader& old) const; + + BitstreamProfile profile; + bool still_picture; + bool reduced_still_picture_header; + int operating_points; + int operating_point_idc[kMaxOperatingPoints]; + BitStreamLevel level[kMaxOperatingPoints]; + int8_t tier[kMaxOperatingPoints]; + int8_t frame_width_bits; + int8_t frame_height_bits; + int32_t max_frame_width; + int32_t max_frame_height; + bool frame_id_numbers_present; + int8_t frame_id_length_bits; + int8_t delta_frame_id_length_bits; + bool use_128x128_superblock; + bool enable_filter_intra; + bool enable_intra_edge_filter; + bool enable_interintra_compound; + bool enable_masked_compound; + bool enable_warped_motion; + bool enable_dual_filter; + bool enable_order_hint; + // If enable_order_hint is true, order_hint_bits is in the range [1, 8]. + // If enable_order_hint is false, order_hint_bits is 0. 
+  int8_t order_hint_bits;
+  // order_hint_shift_bits equals (32 - order_hint_bits) % 32.
+  // This is used frequently in GetRelativeDistance().
+  uint8_t order_hint_shift_bits;
+  bool enable_jnt_comp;
+  bool enable_ref_frame_mvs;
+  bool choose_screen_content_tools;
+  int8_t force_screen_content_tools;
+  bool choose_integer_mv;
+  int8_t force_integer_mv;
+  bool enable_superres;
+  bool enable_cdef;
+  bool enable_restoration;
+  ColorConfig color_config;
+  bool timing_info_present_flag;
+  TimingInfo timing_info;
+  bool decoder_model_info_present_flag;
+  DecoderModelInfo decoder_model_info;
+  bool decoder_model_present_for_operating_point[kMaxOperatingPoints];
+  bool initial_display_delay_present_flag;
+  uint8_t initial_display_delay[kMaxOperatingPoints];
+  bool film_grain_params_present;
+
+  // IMPORTANT: the operating_parameters member must be at the end of the
+  // struct so that ParametersChanged() can be implemented with a memcmp()
+  // call.
+  OperatingParameters operating_parameters;
+};
+// Verify it is safe to use offsetof with ObuSequenceHeader and to use memcmp
+// to compare two ObuSequenceHeader objects.
+static_assert(std::is_standard_layout<ObuSequenceHeader>::value, "");
+// Verify operating_parameters is the last member of ObuSequenceHeader. The
+// second assertion assumes that ObuSequenceHeader has no padding after the
+// operating_parameters field. The first assertion is a sufficient condition
+// for ObuSequenceHeader to have no padding after the operating_parameters
+// field.
+static_assert(alignof(ObuSequenceHeader) == alignof(OperatingParameters), "");
+static_assert(sizeof(ObuSequenceHeader) ==
+                  offsetof(ObuSequenceHeader, operating_parameters) +
+                      sizeof(OperatingParameters),
+              "");
+
+struct TileBuffer {
+  const uint8_t* data;
+  size_t size;
+};
+
+enum MetadataType : uint8_t {
+  // 0 is reserved for AOM use.
+  kMetadataTypeHdrContentLightLevel = 1,
+  kMetadataTypeHdrMasteringDisplayColorVolume = 2,
+  kMetadataTypeScalability = 3,
+  kMetadataTypeItutT35 = 4,
+  kMetadataTypeTimecode = 5,
+  // 6-31 are unregistered user private.
+  // 32 and greater are reserved for AOM use.
+};
+
+struct ObuMetadata {
+  // Maximum content light level.
+  uint16_t max_cll;
+  // Maximum frame-average light level.
+  uint16_t max_fall;
+  uint16_t primary_chromaticity_x[3];
+  uint16_t primary_chromaticity_y[3];
+  uint16_t white_point_chromaticity_x;
+  uint16_t white_point_chromaticity_y;
+  uint32_t luminance_max;
+  uint32_t luminance_min;
+  // ITU-T T.35.
+  uint8_t itu_t_t35_country_code;
+  uint8_t itu_t_t35_country_code_extension_byte;  // Valid if
+                                                  // itu_t_t35_country_code is
+                                                  // 0xFF.
+  std::unique_ptr<uint8_t[]> itu_t_t35_payload_bytes;
+  size_t itu_t_t35_payload_size;
+};
+
+class ObuParser : public Allocable {
+ public:
+  ObuParser(const uint8_t* const data, size_t size, int operating_point,
+            BufferPool* const buffer_pool, DecoderState* const decoder_state)
+      : data_(data),
+        size_(size),
+        operating_point_(operating_point),
+        buffer_pool_(buffer_pool),
+        decoder_state_(*decoder_state) {}
+
+  // Not copyable or movable.
+  ObuParser(const ObuParser& rhs) = delete;
+  ObuParser& operator=(const ObuParser& rhs) = delete;
+
+  // Returns true if there is more data that needs to be parsed.
+  bool HasData() const;
+
+  // Parses a sequence of Open Bitstream Units until a decodable frame is found
+  // (or until the end of stream is reached). A decodable frame is considered
+  // to be found when one of the following happens:
+  //   * A kObuFrame is seen.
+  //   * The kObuTileGroup containing the last tile is seen.
+  //   * A kObuFrameHeader with show_existing_frame = true is seen.
+  //
+  // If the parsing is successful, relevant fields will be populated. The
+  // fields are valid only if the return value is kStatusOk. Returns kStatusOk
+  // on success, an error status otherwise. On success, |current_frame| will
+  // be populated with a valid frame buffer.
+  StatusCode ParseOneFrame(RefCountedBufferPtr* current_frame);
+
+  // Getters. Only valid if ParseOneFrame() completes successfully.
+  const Vector<ObuHeader>& obu_headers() const { return obu_headers_; }
+  const ObuSequenceHeader& sequence_header() const { return sequence_header_; }
+  const ObuFrameHeader& frame_header() const { return frame_header_; }
+  const Vector<TileBuffer>& tile_buffers() const { return tile_buffers_; }
+  const ObuMetadata& metadata() const { return metadata_; }
+  // Returns true if the last call to ParseOneFrame() encountered a sequence
+  // header change.
+  bool sequence_header_changed() const { return sequence_header_changed_; }
+
+  // Setters.
+  void set_sequence_header(const ObuSequenceHeader& sequence_header) {
+    sequence_header_ = sequence_header;
+    has_sequence_header_ = true;
+  }
+
+  // Moves |tile_buffers_| into |tile_buffers|.
+  void MoveTileBuffers(Vector<TileBuffer>* tile_buffers) {
+    *tile_buffers = std::move(tile_buffers_);
+  }
+
+ private:
+  // Initializes the bit reader. This is a function of its own to make unit
+  // testing of private functions simpler.
+  LIBGAV1_MUST_USE_RESULT bool InitBitReader(const uint8_t* data, size_t size);
+
+  // Parse helper functions.
+  bool ParseHeader();  // 5.3.2 and 5.3.3.
+  bool ParseColorConfig(ObuSequenceHeader* sequence_header);       // 5.5.2.
+  bool ParseTimingInfo(ObuSequenceHeader* sequence_header);        // 5.5.3.
+  bool ParseDecoderModelInfo(ObuSequenceHeader* sequence_header);  // 5.5.4.
+  bool ParseOperatingParameters(ObuSequenceHeader* sequence_header,
+                                int index);      // 5.5.5.
+  bool ParseSequenceHeader(bool seen_frame_header);  // 5.5.1.
+  bool ParseFrameParameters();         // 5.9.2, 5.9.7 and 5.9.10.
+  void MarkInvalidReferenceFrames();   // 5.9.4.
+  bool ParseFrameSizeAndRenderSize();  // 5.9.5 and 5.9.6.
+  bool ParseSuperResParametersAndComputeImageSize();  // 5.9.8 and 5.9.9.
+  // Checks the bitstream conformance requirement in Section 6.8.6.
+  bool ValidateInterFrameSize() const;
+  bool ParseReferenceOrderHint();
+  static int FindLatestBackwardReference(
+      const int current_frame_hint,
+      const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints,
+      const std::array<bool, kNumReferenceFrameTypes>& used_frame);
+  static int FindEarliestBackwardReference(
+      const int current_frame_hint,
+      const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints,
+      const std::array<bool, kNumReferenceFrameTypes>& used_frame);
+  static int FindLatestForwardReference(
+      const int current_frame_hint,
+      const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints,
+      const std::array<bool, kNumReferenceFrameTypes>& used_frame);
+  static int FindReferenceWithSmallestOutputOrder(
+      const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints);
+  bool SetFrameReferences(int8_t last_frame_idx,
+                          int8_t gold_frame_idx);  // 7.8.
+  bool ParseLoopFilterParameters();    // 5.9.11.
+  bool ParseDeltaQuantizer(int8_t* delta);  // 5.9.13.
+  bool ParseQuantizerParameters();     // 5.9.12.
+  bool ParseSegmentationParameters();  // 5.9.14.
+  bool ParseQuantizerIndexDeltaParameters();  // 5.9.17.
+  bool ParseLoopFilterDeltaParameters();      // 5.9.18.
+  void ComputeSegmentLosslessAndQIndex();
+  bool ParseCdefParameters();             // 5.9.19.
+  bool ParseLoopRestorationParameters();  // 5.9.20.
+  bool ParseTxModeSyntax();               // 5.9.21.
+  bool ParseFrameReferenceModeSyntax();   // 5.9.23.
+  // Returns whether skip mode is allowed. When it returns true, it also sets
+  // the frame_header_.skip_mode_frame array.
+  bool IsSkipModeAllowed();
+  bool ParseSkipModeParameters();  // 5.9.22.
+  bool ReadAllowWarpedMotion();
+  bool ParseGlobalParamSyntax(
+      int ref, int index,
+      const std::array<GlobalMotion, kNumReferenceFrameTypes>&
+          prev_global_motions);         // 5.9.25.
+  bool ParseGlobalMotionParameters();   // 5.9.24.
+  bool ParseFilmGrainParameters();      // 5.9.30.
+  bool ParseTileInfoSyntax();           // 5.9.15.
+  // |data| and |size| specify the payload data of the padding OBU.
+  // NOTE: Although the payload data is available in the bit_reader_ member,
+  // it is also passed to ParsePadding() as function parameters so that
+  // ParsePadding() can find the trailing bit of the OBU and skip over the
+  // payload data as an opaque chunk of data.
+  bool ParsePadding(const uint8_t* data, size_t size);  // 5.7.
+  bool ParseMetadataScalability();  // 5.8.5 and 5.8.6.
+  bool ParseMetadataTimecode();     // 5.8.7.
+  // |data| and |size| specify the payload data of the metadata OBU.
+  // NOTE: Although the payload data is available in the bit_reader_ member,
+  // it is also passed to ParseMetadata() as function parameters so that
+  // ParseMetadata() can find the trailing bit of the OBU and either extract
+  // or skip over the payload data as an opaque chunk of data.
+  bool ParseMetadata(const uint8_t* data, size_t size);  // 5.8.
+  // Adds and populates the TileBuffer for each tile in the tile group and
+  // updates |next_tile_group_start_|.
+  bool AddTileBuffers(int start, int end, size_t total_size,
+                      size_t tg_header_size, size_t bytes_consumed_so_far);
+  bool ParseTileGroup(size_t size, size_t bytes_consumed_so_far);  // 5.11.1.
+
+  // Parser elements.
+  std::unique_ptr<RawBitReader> bit_reader_;
+  const uint8_t* data_;
+  size_t size_;
+  const int operating_point_;
+
+  // OBU elements. Only valid if ParseOneFrame() completes successfully.
+  Vector<ObuHeader> obu_headers_;
+  ObuSequenceHeader sequence_header_ = {};
+  ObuFrameHeader frame_header_ = {};
+  Vector<TileBuffer> tile_buffers_;
+  ObuMetadata metadata_ = {};
+  // The expected starting tile number of the next Tile Group.
+  int next_tile_group_start_ = 0;
+  // If true, the sequence_header_ field is valid.
+  bool has_sequence_header_ = false;
+  // If true, it means that the last call to ParseOneFrame() encountered a
+  // sequence header change.
+  bool sequence_header_changed_ = false;
+  // If true, the obu_extension_flag syntax element in the OBU header must be
+  // 0. Set to true when parsing a sequence header if OperatingPointIdc is 0.
+  bool extension_disallowed_ = false;
+
+  BufferPool* const buffer_pool_;
+  DecoderState& decoder_state_;
+  // Used by ParseOneFrame() to populate the current frame that is being
+  // decoded. The invariant maintained is that this variable will be nullptr at
+  // the beginning and at the end of each call to ParseOneFrame(). This ensures
+  // that the ObuParser is not holding on to any references to the current
+  // frame once the ParseOneFrame() call is complete.
+  RefCountedBufferPtr current_frame_;
+
+  // For unit testing private functions.
+  friend class ObuParserTest;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_OBU_PARSER_H_
diff --git a/src/obu_parser_test.cc b/src/obu_parser_test.cc
new file mode 100644
index 0000000..6397ad0
--- /dev/null
+++ b/src/obu_parser_test.cc
@@ -0,0 +1,2675 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/obu_parser.h"
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <new>
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "src/buffer_pool.h"
+#include "src/decoder_impl.h"
+#include "src/decoder_state.h"
+#include "src/gav1/decoder_buffer.h"
+#include "src/gav1/status_code.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/types.h"
+#include "src/utils/vector.h"
+#include "tests/third_party/libvpx/acm_random.h"
+
+// Note the following test classes access private functions/members of
+// ObuParser. To be declared friends of ObuParser they must not have internal
+// linkage (they must be outside the anonymous namespace).
+namespace libgav1 {
+
+// Helper class to manipulate individual bits and generate a byte string.
+class BytesAndBits {
+ public:
+  // Append a bit to the end.
+  void AppendBit(uint8_t bit) { bits_.push_back(bit != 0); }
+
+  // Append a byte to the end.
+  void AppendByte(uint8_t byte) {
+    for (int i = 0; i < 8; ++i) {
+      AppendBit(GetNthBit(byte, i, 8));
+    }
+  }
+
+  // Append a literal of size |bits| to the end.
+  void AppendLiteral(int bits, int value) {
+    InsertLiteral(static_cast<int>(bits_.size()), bits, value);
+  }
+
+  // Append an inverse signed literal to the end. |bits + 1| bits are appended.
+  void AppendInverseSignedLiteral(int bits, int value) {
+    InsertInverseSignedLiteral(static_cast<int>(bits_.size()), bits, value);
+  }
+
+  // Append a sequence of bytes to the end.
+  void AppendBytes(const std::vector<uint8_t>& bytes) {
+    for (const auto& byte : bytes) {
+      AppendByte(byte);
+    }
+  }
+
+  // Insert |bit| in |offset|. Moves all other bits to the right by 1.
+  void InsertBit(int offset, uint8_t bit) {
+    auto iterator = bits_.begin();
+    bits_.insert(iterator + offset, bit != 0);
+  }
+
+  // Insert |value| of size |bits| at offset |offset|. Moves all other bits to
+  // the right by |bits|.
+  void InsertLiteral(int offset, int bits, int value) {
+    for (int i = 0; i < bits; ++i) {
+      InsertBit(i + offset, GetNthBit(value, i, bits));
+    }
+  }
+
+  // Insert |value| of size |bits| at offset |offset| as an inverse signed
+  // literal. Move all other bits to the right by |bits + 1|.
+  //
+  // Note: This is denoted su(1+bits) in the spec.
+  void InsertInverseSignedLiteral(int offset, int bits, int value) {
+    InsertBit(offset, (value >= 0) ? 0 : 1);
+    InsertLiteral(offset + 1, bits, value);
+  }
+
+  // Insert |value| at |offset| as an unsigned variable length code (uvlc).
+  // Return the number of bits inserted.
+  int InsertUvlc(int offset, int value) {
+    int leading_zeros = 1;
+    int shift_value = ++value;
+    while ((shift_value >>= 1) != 0) leading_zeros += 2;
+    int bits = 0;
+    InsertLiteral(offset, leading_zeros >> 1, 0);
+    bits += leading_zeros >> 1;
+    InsertLiteral(offset + bits, (leading_zeros + 1) >> 1, value);
+    bits += (leading_zeros + 1) >> 1;
+    return bits;
+  }
+
+  // Set the bit at |offset| to |bit|. The bit should already exist.
+  void SetBit(int offset, uint8_t bit) { bits_[offset] = bit != 0; }
+
+  // Set |bits| starting at |offset| to |value|. The bits should already exist.
+  void SetLiteral(int offset, int bits, int value) {
+    for (int i = 0; i < bits; ++i) {
+      SetBit(offset + i, GetNthBit(value, i, bits));
+    }
+  }
+
+  // Remove a bit in |offset|. Moves over all the following bits to the left by
+  // 1.
+  void RemoveBit(int offset) { RemoveLiteral(offset, 1); }
+
+  // Remove a literal of size |bits| from |offset|. Moves over all the
+  // following bits to the left by |bits|.
+  void RemoveLiteral(int offset, int bits) {
+    bits_.erase(bits_.begin() + offset, bits_.begin() + offset + bits);
+  }
+
+  // Remove all bits after offset.
+  void RemoveAllBitsAfter(int offset) {
+    RemoveLiteral(offset, static_cast<int>(bits_.size()) - offset);
+  }
+
+  // Clear all the bits stored.
+  void Clear() { bits_.clear(); }
+
+  // Generate the data vector from the bits. Pads 0 to the end of the last byte
+  // if necessary.
+  const std::vector<uint8_t>& GenerateData() {
+    data_.clear();
+    for (size_t i = 0; i < bits_.size(); i += 8) {
+      uint8_t byte = 0;
+      for (int j = 0; j < 8; ++j) {
+        const uint8_t bit =
+            ((i + j) < bits_.size()) ? static_cast<uint8_t>(bits_[i + j]) : 0;
+        byte |= bit << (7 - j);
+      }
+      data_.push_back(byte);
+    }
+    return data_;
+  }
+
+ private:
+  // Get the |n|th MSB from |value| with the assumption that |value| has |size|
+  // bits.
+  static uint8_t GetNthBit(int value, int n, int size) {
+    return (value >> (size - n - 1)) & 0x01;
+  }
+
+  std::vector<uint8_t> data_;
+  std::vector<bool> bits_;
+};
+
+class ObuParserTest : public testing::Test {
+ protected:
+  // Constants for unit tests.
+  static constexpr int kFrameWidthBits = 9;
+  static constexpr int kFrameHeightBits = 8;
+  static constexpr int kHeight = 240;
+  static constexpr int kWidth = 426;
+  static constexpr int kRows4x4 = 60;
+  static constexpr int kColumns4x4 = 108;
+  static constexpr int kFrameToShow = 2;
+  static constexpr int kDisplayFrameId = 10;
+  static constexpr int kFrameIdLengthBits = 15;
+  static constexpr int kDeltaFrameIdLengthBits = 14;
+
+  // Bit streams for testing. These may contain trailing bits and tests may
+  // have to remove some of the trailing bits to keep the boundary alignment.
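+  // (As a worked example, the two bytes of kDefaultTemporalDelimiter below
+  // decode as 0x12 = 0b00010010: obu_forbidden_bit = 0, obu_type = 0b0010 = 2
+  // (OBU_TEMPORAL_DELIMITER), obu_extension_flag = 0, obu_has_size_field = 1,
+  // obu_reserved_1bit = 0; the second byte is obu_size = 0.)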
+  const std::vector<uint8_t> kDefaultTemporalDelimiter = {0x12, 0x00};
+  // Bits  Syntax element                   Value
+  // 1     obu_forbidden_bit                0
+  // 4     obu_type                         2 (OBU_TEMPORAL_DELIMITER)
+  // 1     obu_extension_flag               1
+  // 1     obu_has_size_field               1
+  // 1     obu_reserved_1bit                0
+  // 3     temporal_id                      6
+  // 2     spatial_id                       2
+  // 3     extension_header_reserved_3bits  0
+  // 8     obu_size                         0
+  const std::vector<uint8_t> kDefaultTemporalDelimiterWithExtension = {
+      0x16, 0xd0, 0x00};
+  const std::vector<uint8_t> kDefaultHeaderWithoutSizeField = {0x10};
+  // Offset  Bits  Syntax element                      Value
+  // 0       3     seq_profile                         0
+  // 3       1     still_picture                       0
+  // 4       1     reduced_still_picture_header        0
+  // 5       1     timing_info_present_flag            0
+  // 6       1     initial_display_delay_present_flag  0
+  // 7       5     operating_points_cnt_minus_1        0
+  // 12      12    operating_point_idc[ 0 ]            0
+  // 24      5     seq_level_idx[ 0 ]                  0
+  // 29      4     frame_width_bits_minus_1            8
+  // 33      4     frame_height_bits_minus_1           7
+  // 37      9     max_frame_width_minus_1             425
+  // 46      8     max_frame_height_minus_1            239
+  // 54      1     frame_id_numbers_present_flag       0
+  // 55      1     use_128x128_superblock              1
+  // 56      1     enable_filter_intra                 1
+  // 57      1     enable_intra_edge_filter            1
+  // 58      1     enable_interintra_compound          1
+  // 59      1     enable_masked_compound              1
+  // 60      1     enable_warped_motion                0
+  // 61      1     enable_dual_filter                  1
+  // 62      1     enable_order_hint                   1
+  // 63      1     enable_jnt_comp                     1
+  // 64      1     enable_ref_frame_mvs                1
+  // 65      1     seq_choose_screen_content_tools     1
+  // 66      1     seq_choose_integer_mv               1
+  // 67      3     order_hint_bits_minus_1             6
+  // 70      1     enable_superres                     0
+  // 71      1     enable_cdef                         1
+  // 72      1     enable_restoration                  1
+  // ...
+  const std::vector<uint8_t> kDefaultSequenceHeader = {
+      0x00, 0x00, 0x00, 0x04, 0x3e, 0xa7, 0xbd, 0xf7, 0xf9, 0x80, 0x40};
+  const std::vector<uint8_t> kDefaultFrameHeaderKeyFrame = {0x10, 0x00};
+  // Bits  Syntax element            Value
+  // 1     show_existing_frame       0
+  // 2     frame_type                2 (kFrameIntraOnly)
+  // 1     show_frame                1
+  // 1     error_resilient_mode      0
+  // 1     disable_cdf_update        0
+  // 1     frame_size_override_flag  0
+  // 8     refresh_frame_flags       4
+  // ...
+  const std::vector<uint8_t> kDefaultFrameHeaderIntraOnlyFrame = {0x50, 0x08,
+                                                                  0x00};
+  // Bits  Syntax element            Value
+  // 1     show_existing_frame       0
+  // 2     frame_type                1 (kFrameInter)
+  // 1     show_frame                1
+  // 1     error_resilient_mode      0
+  // 1     disable_cdf_update        0
+  // 1     frame_size_override_flag  0
+  // 3     primary_ref_frame         1
+  // 8     refresh_frame_flags       4
+  // 3     ref_frame_idx[0]          0
+  // 3     ref_frame_idx[1]          1
+  // 3     ref_frame_idx[2]          2
+  // 3     ref_frame_idx[3]          3
+  // 3     ref_frame_idx[4]          4
+  // 3     ref_frame_idx[5]          5
+  // 3     ref_frame_idx[6]          6
+  // ...
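+  // (To see how the table above maps onto bytes: the first two bytes of
+  // kDefaultFrameHeaderInterFrame are 0x30 0x41 = 0b00110000 0b01000001, i.e.
+  // show_existing_frame = 0, frame_type = 0b01 = 1 (kFrameInter),
+  // show_frame = 1, three more 0 bits (error_resilient_mode,
+  // disable_cdf_update, frame_size_override_flag), and then
+  // primary_ref_frame = 0b001 = 1 straddling into the second byte.)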
+  const std::vector<uint8_t> kDefaultFrameHeaderInterFrame = {0x30, 0x41, 0x01,
+                                                              0x4e, 0x5c, 0x60};
+  const std::vector<uint8_t> kDefaultGlobalMotionParametersRotZoom = {
+      0xff, 0x50, 0x77, 0x7e, 0x1f, 0xcd};
+  const std::vector<uint8_t> kDefaultGlobalMotionParametersAffine = {
+      0x3f, 0x50, 0x77, 0x7b, 0xbf, 0xa8, 0x3e, 0x1f, 0xcd};
+
+  void SetUp() override {
+    buffer_pool_.reset(new (std::nothrow)
+                           BufferPool(nullptr, nullptr, nullptr, nullptr));
+    ASSERT_NE(buffer_pool_, nullptr);
+  }
+
+  bool Init() {
+    obu_.reset(new (std::nothrow) ObuParser(nullptr, 0, 0, buffer_pool_.get(),
+                                            &decoder_state_));
+    if (obu_ == nullptr) return false;
+    obu_headers_ = &obu_->obu_headers_;
+    obu_frame_header_ = &obu_->frame_header_;
+    obu_sequence_header_ = &obu_->sequence_header_;
+    return true;
+  }
+
+  bool Init(const std::vector<uint8_t>& data, bool init_bit_reader = true) {
+    obu_.reset(new (std::nothrow) ObuParser(
+        data.data(), data.size(), 0, buffer_pool_.get(), &decoder_state_));
+    if (obu_ == nullptr) return false;
+    obu_headers_ = &obu_->obu_headers_;
+    obu_frame_header_ = &obu_->frame_header_;
+    obu_sequence_header_ = &obu_->sequence_header_;
+    return init_bit_reader ? obu_->InitBitReader(data.data(), data.size())
+                           : true;
+  }
+
+  bool Parse(const std::string& input,
+             const ObuSequenceHeader* const sequence_header = nullptr) {
+    std::vector<uint8_t> data(input.begin(), input.end());
+    return Parse(data, sequence_header);
+  }
+
+  bool Parse(const std::vector<uint8_t>& data,
+             const ObuSequenceHeader* const sequence_header = nullptr) {
+    EXPECT_TRUE(Init(data, false));
+    if (sequence_header != nullptr) obu_->set_sequence_header(*sequence_header);
+    return obu_->ParseOneFrame(&current_frame_) == kStatusOk;
+  }
+
+  bool ParseSequenceHeader(const std::vector<uint8_t>& data) {
+    EXPECT_TRUE(Init(data));
+    return obu_->ParseSequenceHeader(/*seen_frame_header=*/false);
+  }
+
+  bool ParseFrameParameters(const std::vector<uint8_t>& data,
+                            bool id_bits_present = false,
+                            int force_screen_content_tools = 0,
+                            int force_integer_mv = 0,
+                            bool enable_superres = false) {
+    EXPECT_TRUE(Init(data));
+    if (id_bits_present) {
+      obu_->sequence_header_.frame_id_numbers_present = true;
+      obu_->sequence_header_.frame_id_length_bits = kFrameIdLengthBits;
+      obu_->sequence_header_.delta_frame_id_length_bits =
+          kDeltaFrameIdLengthBits;
+    }
+    obu_->sequence_header_.force_screen_content_tools =
+        force_screen_content_tools;
+    obu_->sequence_header_.force_integer_mv = force_integer_mv;
+    obu_->sequence_header_.enable_superres = enable_superres;
+    obu_->sequence_header_.frame_width_bits = kFrameWidthBits;
+    obu_->sequence_header_.frame_height_bits = kFrameHeightBits;
+    obu_->sequence_header_.max_frame_width = kWidth;
+    obu_->sequence_header_.max_frame_height = kHeight;
+    return obu_->ParseFrameParameters();
+  }
+
+  bool ParseSegmentationParameters(const std::vector<uint8_t>& data,
+                                   int primary_reference_frame,
+                                   int prev_frame_index) {
+    EXPECT_TRUE(Init(data));
+    obu_->frame_header_.primary_reference_frame = primary_reference_frame;
+    if (primary_reference_frame != kPrimaryReferenceNone) {
+      obu_->frame_header_.reference_frame_index[primary_reference_frame] =
+          prev_frame_index;
+    }
+    return obu_->ParseSegmentationParameters();
+  }
+
+  bool ParseFrameReferenceModeSyntax(const std::vector<uint8_t>& data,
+                                     FrameType frame_type) {
+    EXPECT_TRUE(Init(data));
+    obu_->frame_header_.frame_type = frame_type;
+    return obu_->ParseFrameReferenceModeSyntax();
+  }
+
+  bool ParseGlobalMotionParameters(const std::vector<uint8_t>& data,
+                                   FrameType frame_type) {
+    EXPECT_TRUE(Init(data));
+    obu_->frame_header_.frame_type = frame_type;
+    obu_->frame_header_.primary_reference_frame = kPrimaryReferenceNone;
+    return obu_->ParseGlobalMotionParameters();
+  }
+
+  bool ParseFilmGrainParameters(const std::vector<uint8_t>& data,
+                                const ObuSequenceHeader& sequence_header,
+                                const ObuFrameHeader& frame_header) {
+    EXPECT_TRUE(Init(data));
+    obu_->set_sequence_header(sequence_header);
+    obu_->frame_header_ = frame_header;
+    return obu_->ParseFilmGrainParameters();
+  }
+
+  bool ParseTileInfoSyntax(const std::vector<uint8_t>& data, int columns4x4,
+                           int rows4x4, bool use_128x128_superblock) {
+    EXPECT_TRUE(Init(data));
+    obu_->frame_header_.columns4x4 = columns4x4;
+    obu_->frame_header_.rows4x4 = rows4x4;
+    obu_->sequence_header_.use_128x128_superblock = use_128x128_superblock;
+    return obu_->ParseTileInfoSyntax();
+  }
+
+  bool ParseMetadata(const std::vector<uint8_t>& data) {
+    EXPECT_TRUE(Init(data));
+    return obu_->ParseMetadata(data.data(), data.size());
+  }
+
+  void DefaultSequenceHeader(ObuSequenceHeader* const gold) {
+    memset(gold, 0, sizeof(*gold));
+    gold->profile = kProfile0;
+    gold->level[0].major = kMinimumMajorBitstreamLevel;
+    gold->operating_points = 1;
+    gold->max_frame_width = kWidth;
+    gold->max_frame_height = kHeight;
+    gold->frame_width_bits = kFrameWidthBits;
+    gold->frame_height_bits = kFrameHeightBits;
+    gold->use_128x128_superblock = true;
+    gold->enable_filter_intra = true;
+    gold->enable_intra_edge_filter = true;
+    gold->enable_interintra_compound = true;
+    gold->enable_masked_compound = true;
+    gold->enable_dual_filter = true;
+    gold->enable_order_hint = true;
+    gold->enable_jnt_comp = true;
+    gold->enable_ref_frame_mvs = true;
+    gold->choose_screen_content_tools = true;
+    gold->force_screen_content_tools = 2;
+    gold->choose_integer_mv = true;
+    gold->force_integer_mv = 2;
+    gold->order_hint_bits = 7;
+    gold->enable_cdef = true;
+    gold->enable_restoration = true;
+    gold->color_config.bitdepth = 8;
+    gold->color_config.color_primary = kColorPrimaryUnspecified;
+    gold->color_config.transfer_characteristics =
+        kTransferCharacteristicsUnspecified;
+    gold->color_config.matrix_coefficients = kMatrixCoefficientsUnspecified;
+    gold->color_config.subsampling_x = 1;
+    gold->color_config.subsampling_y = 1;
+  }
+
+  void DefaultFrameHeader(ObuFrameHeader* const gold, FrameType frame_type) {
+    memset(gold, 0, sizeof(*gold));
+    gold->frame_type = frame_type;
+    gold->show_frame = true;
+    gold->showable_frame = (frame_type != kFrameKey);
+    gold->enable_cdf_update = true;
+    gold->width = kWidth;
+    gold->height = kHeight;
+    gold->render_width = kWidth;
+    gold->render_height = kHeight;
+    gold->upscaled_width = kWidth;
+    gold->primary_reference_frame = kPrimaryReferenceNone;
+    gold->enable_frame_end_update_cdf = true;
+    gold->rows4x4 = kRows4x4;
+    gold->columns4x4 = kColumns4x4;
+    if (frame_type == kFrameKey) {
+      gold->refresh_frame_flags = 0xff;
+      gold->error_resilient_mode = true;
+      gold->force_integer_mv = 1;
+    } else if (frame_type == kFrameIntraOnly) {
+      gold->refresh_frame_flags = 4;
+      gold->force_integer_mv = 1;
+    } else if (frame_type == kFrameInter) {
+      gold->refresh_frame_flags = 4;
+      gold->primary_reference_frame = 1;
+      for (int i = 0; i < kNumInterReferenceFrameTypes; ++i) {
+        gold->reference_frame_index[i] = i;
+      }
+      gold->is_motion_mode_switchable = true;
+    }
+  }
+
+  void OverrideFrameSize(BytesAndBits* const data, ObuFrameHeader* const gold,
+                         int flag_offset, int size_offset) {
+    data->SetBit(flag_offset, 1);  // frame_size_override_flag.
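+    // The spec codes the dimensions as frame_width_minus_1 and
+    // frame_height_minus_1, so inserting kWidth - 2 and kHeight - 2 below
+    // yields a frame of (kWidth - 1) x (kHeight - 1), matching the gold
+    // values set at the end of this helper.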
+ data->InsertLiteral(size_offset, kFrameWidthBits, + kWidth - 2); // frame_width_minus_1. + data->InsertLiteral(size_offset + kFrameWidthBits, kFrameHeightBits, + kHeight - 2); // frame_height_minus_1. + gold->frame_size_override_flag = true; + gold->width = kWidth - 1; + gold->height = kHeight - 1; + gold->render_width = gold->width; + gold->render_height = gold->height; + gold->upscaled_width = gold->width; + } + + void OverrideRenderSize(BytesAndBits* const data, ObuFrameHeader* const gold, + int flag_offset) { + data->SetBit(flag_offset, 1); // render_and_frame_size_different. + data->InsertLiteral(flag_offset + 1, 16, + kWidth - 10); // render_width_minus_1. + data->InsertLiteral(flag_offset + 17, 16, + kHeight - 10); // render_height_minus_1. + gold->render_width = kWidth - 9; + gold->render_height = kHeight - 9; + gold->render_and_frame_size_different = true; + } + + void OverrideSegmentation(BytesAndBits* const data, Segmentation* const gold, + int offset) { + gold->update_data = true; + data->SetBit(offset++, static_cast(gold->update_data)); + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + gold->segment_id_pre_skip = false; + gold->last_active_segment_id = 0; + for (int i = 0; i < kMaxSegments; ++i) { + for (int j = 0; j < kSegmentFeatureMax; ++j) { + gold->feature_enabled[i][j] = static_cast(rnd.Rand8() & 1); + data->InsertBit(offset++, + static_cast(gold->feature_enabled[i][j])); + if (gold->feature_enabled[i][j]) { + gold->feature_data[i][j] = rnd(1 << kSegmentationFeatureBits[j]); + if (Segmentation::FeatureSigned(static_cast(j))) { + if (static_cast(rnd.Rand8() & 1)) { + gold->feature_data[i][j] *= -1; + } + data->InsertInverseSignedLiteral( + offset, kSegmentationFeatureBits[j], gold->feature_data[i][j]); + offset += kSegmentationFeatureBits[j] + 1; + } else { + data->InsertLiteral(offset, kSegmentationFeatureBits[j], + gold->feature_data[i][j]); + offset += kSegmentationFeatureBits[j]; + } + gold->last_active_segment_id = i; + if (j >= kSegmentFeatureReferenceFrame) { + gold->segment_id_pre_skip = true; + } + } + } + } + } + + void VerifyObuHeader(bool extension) { + EXPECT_EQ(obu_->obu_headers().back().temporal_id, extension ? 6 : 0); + EXPECT_EQ(obu_->obu_headers().back().spatial_id, extension ? 
2 : 0); + } + +#define OBU_TEST_COMPARE(x) EXPECT_EQ(expected.x, actual.x) + void VerifyFrameParameters(const ObuFrameHeader& expected, + bool id_bits_present = false) { + const ObuFrameHeader& actual = obu_->frame_header(); + OBU_TEST_COMPARE(show_existing_frame); + if (actual.show_existing_frame) { + OBU_TEST_COMPARE(frame_to_show); + OBU_TEST_COMPARE(frame_presentation_time); + if (id_bits_present) { + OBU_TEST_COMPARE(display_frame_id); + } + return; + } + OBU_TEST_COMPARE(frame_type); + OBU_TEST_COMPARE(show_frame); + OBU_TEST_COMPARE(frame_presentation_time); + OBU_TEST_COMPARE(showable_frame); + OBU_TEST_COMPARE(error_resilient_mode); + OBU_TEST_COMPARE(enable_cdf_update); + OBU_TEST_COMPARE(current_frame_id); + OBU_TEST_COMPARE(frame_size_override_flag); + OBU_TEST_COMPARE(order_hint); + for (int i = 0; i < kNumReferenceFrameTypes; ++i) { + OBU_TEST_COMPARE(reference_order_hint[i]); + } + OBU_TEST_COMPARE(primary_reference_frame); + OBU_TEST_COMPARE(width); + OBU_TEST_COMPARE(height); + OBU_TEST_COMPARE(render_and_frame_size_different); + OBU_TEST_COMPARE(render_width); + OBU_TEST_COMPARE(render_height); + OBU_TEST_COMPARE(upscaled_width); + OBU_TEST_COMPARE(coded_lossless); + OBU_TEST_COMPARE(upscaled_lossless); + OBU_TEST_COMPARE(allow_screen_content_tools); + OBU_TEST_COMPARE(is_motion_mode_switchable); + OBU_TEST_COMPARE(refresh_frame_flags); + OBU_TEST_COMPARE(enable_frame_end_update_cdf); + OBU_TEST_COMPARE(force_integer_mv); + if (actual.frame_type == kFrameInter) { + for (int i = 0; i < kNumInterReferenceFrameTypes; ++i) { + OBU_TEST_COMPARE(reference_frame_index[i]); + } + } + OBU_TEST_COMPARE(use_superres); + OBU_TEST_COMPARE(rows4x4); + OBU_TEST_COMPARE(columns4x4); + } + + void VerifyLoopFilterParameters(const LoopFilter& expected) { + const LoopFilter& actual = obu_->frame_header().loop_filter; + for (int i = 0; i < 4; ++i) { + OBU_TEST_COMPARE(level[i]); + } + OBU_TEST_COMPARE(sharpness); + OBU_TEST_COMPARE(delta_enabled); + OBU_TEST_COMPARE(delta_update); + for (int i = 0; i < kNumReferenceFrameTypes; ++i) { + OBU_TEST_COMPARE(ref_deltas[i]); + } + for (int i = 0; i < kLoopFilterMaxModeDeltas; ++i) { + OBU_TEST_COMPARE(mode_deltas[i]); + } + } + + void VerifyQuantizerParameters(const QuantizerParameters& expected) { + const QuantizerParameters& actual = obu_->frame_header().quantizer; + OBU_TEST_COMPARE(base_index); + OBU_TEST_COMPARE(delta_dc[kPlaneY]); + OBU_TEST_COMPARE(delta_dc[kPlaneU]); + OBU_TEST_COMPARE(delta_dc[kPlaneV]); + EXPECT_EQ(0, actual.delta_ac[kPlaneY]); + OBU_TEST_COMPARE(delta_ac[kPlaneY]); + OBU_TEST_COMPARE(delta_ac[kPlaneU]); + OBU_TEST_COMPARE(delta_ac[kPlaneV]); + OBU_TEST_COMPARE(use_matrix); + OBU_TEST_COMPARE(matrix_level[kPlaneY]); + OBU_TEST_COMPARE(matrix_level[kPlaneU]); + OBU_TEST_COMPARE(matrix_level[kPlaneV]); + } + + void VerifySegmentationParameters(const Segmentation& expected) { + const Segmentation& actual = obu_->frame_header().segmentation; + OBU_TEST_COMPARE(enabled); + OBU_TEST_COMPARE(update_map); + OBU_TEST_COMPARE(update_data); + OBU_TEST_COMPARE(temporal_update); + OBU_TEST_COMPARE(segment_id_pre_skip); + OBU_TEST_COMPARE(last_active_segment_id); + for (int i = 0; i < kMaxSegments; ++i) { + for (int j = 0; j < kSegmentFeatureMax; ++j) { + OBU_TEST_COMPARE(feature_enabled[i][j]); + OBU_TEST_COMPARE(feature_data[i][j]); + } + } + } + + void VerifyDeltaParameters(const Delta& expected, const Delta& actual) { + OBU_TEST_COMPARE(present); + OBU_TEST_COMPARE(scale); + OBU_TEST_COMPARE(multi); + } + + void 
VerifyCdefParameters(const Cdef& expected) { + const Cdef& actual = obu_->frame_header().cdef; + OBU_TEST_COMPARE(damping); + OBU_TEST_COMPARE(bits); + for (int i = 0; i < (1 << actual.bits); ++i) { + OBU_TEST_COMPARE(y_primary_strength[i]); + OBU_TEST_COMPARE(y_secondary_strength[i]); + OBU_TEST_COMPARE(uv_primary_strength[i]); + OBU_TEST_COMPARE(uv_secondary_strength[i]); + } + } + + void VerifyLoopRestorationParameters(const LoopRestoration& expected) { + const LoopRestoration& actual = obu_->frame_header().loop_restoration; + for (int i = 0; i < kMaxPlanes; ++i) { + OBU_TEST_COMPARE(type[i]); + OBU_TEST_COMPARE(unit_size_log2[i]); + } + } + + void VerifyGlobalMotionParameters( + const std::array& gold) { + for (int i = kReferenceFrameLast; i <= kReferenceFrameAlternate; ++i) { + const GlobalMotion& expected = gold[i]; + const GlobalMotion& actual = obu_->frame_header().global_motion[i]; + OBU_TEST_COMPARE(type) << " i: " << i; + for (int j = 0; j < 6; ++j) { + OBU_TEST_COMPARE(params[j]) << " i: " << i << " j: " << j; + } + } + } + + void VerifyFilmGrainParameters(const FilmGrainParams& expected) { + const FilmGrainParams& actual = obu_->frame_header().film_grain_params; + OBU_TEST_COMPARE(apply_grain); + OBU_TEST_COMPARE(update_grain); + OBU_TEST_COMPARE(chroma_scaling_from_luma); + OBU_TEST_COMPARE(overlap_flag); + OBU_TEST_COMPARE(clip_to_restricted_range); + OBU_TEST_COMPARE(num_y_points); + OBU_TEST_COMPARE(num_u_points); + OBU_TEST_COMPARE(num_v_points); + for (int i = 0; i < 14; ++i) { + OBU_TEST_COMPARE(point_y_value[i]); + OBU_TEST_COMPARE(point_y_scaling[i]); + } + for (int i = 0; i < 10; ++i) { + OBU_TEST_COMPARE(point_u_value[i]); + OBU_TEST_COMPARE(point_u_scaling[i]); + } + for (int i = 0; i < 10; ++i) { + OBU_TEST_COMPARE(point_v_value[i]); + OBU_TEST_COMPARE(point_v_scaling[i]); + } + OBU_TEST_COMPARE(chroma_scaling); + OBU_TEST_COMPARE(auto_regression_coeff_lag); + for (int i = 0; i < 24; ++i) { + OBU_TEST_COMPARE(auto_regression_coeff_y[i]); + } + for (int i = 0; i < 25; ++i) { + OBU_TEST_COMPARE(auto_regression_coeff_u[i]); + } + for (int i = 0; i < 25; ++i) { + OBU_TEST_COMPARE(auto_regression_coeff_v[i]); + } + OBU_TEST_COMPARE(auto_regression_shift); + OBU_TEST_COMPARE(grain_seed); + OBU_TEST_COMPARE(reference_index); + OBU_TEST_COMPARE(grain_scale_shift); + OBU_TEST_COMPARE(u_multiplier); + OBU_TEST_COMPARE(u_luma_multiplier); + OBU_TEST_COMPARE(u_offset); + OBU_TEST_COMPARE(v_multiplier); + OBU_TEST_COMPARE(v_luma_multiplier); + OBU_TEST_COMPARE(v_offset); + } + + void VerifyTileInfoParameters(const TileInfo& expected) { + const TileInfo& actual = obu_->frame_header().tile_info; + OBU_TEST_COMPARE(uniform_spacing); + OBU_TEST_COMPARE(tile_columns_log2); + OBU_TEST_COMPARE(tile_columns); + for (int i = 0; i < kMaxTileColumns + 1; ++i) { + OBU_TEST_COMPARE(tile_column_start[i]) << "tile_column: " << i; + OBU_TEST_COMPARE(tile_column_width_in_superblocks[i]) + << "tile_column: " << i; + } + OBU_TEST_COMPARE(tile_rows_log2); + OBU_TEST_COMPARE(tile_rows); + for (int i = 0; i < kMaxTileRows + 1; ++i) { + OBU_TEST_COMPARE(tile_row_start[i]) << "tile_row: " << i; + OBU_TEST_COMPARE(tile_row_height_in_superblocks[i]) << "tile_rows: " << i; + } + OBU_TEST_COMPARE(tile_count); + OBU_TEST_COMPARE(context_update_id); + OBU_TEST_COMPARE(tile_size_bytes); + } + + void VerifySequenceHeader(const ObuSequenceHeader& expected) { + EXPECT_TRUE(obu_->sequence_header_changed()); + const ObuSequenceHeader& actual = obu_->sequence_header(); + OBU_TEST_COMPARE(profile); + 
OBU_TEST_COMPARE(still_picture); + OBU_TEST_COMPARE(reduced_still_picture_header); + OBU_TEST_COMPARE(operating_points); + for (int i = 0; i < actual.operating_points; ++i) { + OBU_TEST_COMPARE(operating_point_idc[i]) << "i: " << i; + OBU_TEST_COMPARE(level[i].major) << "i: " << i; + OBU_TEST_COMPARE(level[i].minor) << "i: " << i; + OBU_TEST_COMPARE(tier[i]) << "i: " << i; + } + OBU_TEST_COMPARE(frame_width_bits); + OBU_TEST_COMPARE(frame_height_bits); + OBU_TEST_COMPARE(max_frame_width); + OBU_TEST_COMPARE(max_frame_height); + OBU_TEST_COMPARE(frame_id_numbers_present); + if (actual.frame_id_numbers_present) { + OBU_TEST_COMPARE(frame_id_length_bits); + OBU_TEST_COMPARE(delta_frame_id_length_bits); + } + OBU_TEST_COMPARE(use_128x128_superblock); + OBU_TEST_COMPARE(enable_filter_intra); + OBU_TEST_COMPARE(enable_intra_edge_filter); + OBU_TEST_COMPARE(enable_interintra_compound); + OBU_TEST_COMPARE(enable_masked_compound); + OBU_TEST_COMPARE(enable_warped_motion); + OBU_TEST_COMPARE(enable_dual_filter); + OBU_TEST_COMPARE(enable_order_hint); + OBU_TEST_COMPARE(enable_jnt_comp); + OBU_TEST_COMPARE(enable_ref_frame_mvs); + OBU_TEST_COMPARE(choose_screen_content_tools); + OBU_TEST_COMPARE(force_screen_content_tools); + OBU_TEST_COMPARE(choose_integer_mv); + OBU_TEST_COMPARE(force_integer_mv); + OBU_TEST_COMPARE(order_hint_bits); + OBU_TEST_COMPARE(enable_superres); + OBU_TEST_COMPARE(enable_cdef); + OBU_TEST_COMPARE(enable_restoration); + OBU_TEST_COMPARE(color_config.bitdepth); + OBU_TEST_COMPARE(color_config.is_monochrome); + OBU_TEST_COMPARE(color_config.color_range); + OBU_TEST_COMPARE(color_config.subsampling_x); + OBU_TEST_COMPARE(color_config.subsampling_y); + OBU_TEST_COMPARE(color_config.chroma_sample_position); + OBU_TEST_COMPARE(timing_info_present_flag); + OBU_TEST_COMPARE(timing_info.num_units_in_tick); + OBU_TEST_COMPARE(timing_info.time_scale); + OBU_TEST_COMPARE(timing_info.equal_picture_interval); + OBU_TEST_COMPARE(timing_info.num_ticks_per_picture); + OBU_TEST_COMPARE(decoder_model_info_present_flag); + OBU_TEST_COMPARE(decoder_model_info.encoder_decoder_buffer_delay_length); + OBU_TEST_COMPARE(decoder_model_info.num_units_in_decoding_tick); + OBU_TEST_COMPARE(decoder_model_info.buffer_removal_time_length); + OBU_TEST_COMPARE(decoder_model_info.frame_presentation_time_length); + for (int i = 0; i < actual.operating_points; ++i) { + SCOPED_TRACE("i: " + std::to_string(i)); + OBU_TEST_COMPARE(operating_parameters.decoder_buffer_delay[i]); + OBU_TEST_COMPARE(operating_parameters.encoder_buffer_delay[i]); + OBU_TEST_COMPARE(operating_parameters.low_delay_mode_flag[i]); + OBU_TEST_COMPARE(initial_display_delay[i]); + } + OBU_TEST_COMPARE(film_grain_params_present); + } + + void VerifyMetadata(MetadataType type, const ObuMetadata& expected) { + const ObuMetadata& actual = obu_->metadata(); + switch (type) { + case kMetadataTypeHdrContentLightLevel: + OBU_TEST_COMPARE(max_cll); + OBU_TEST_COMPARE(max_fall); + break; + case kMetadataTypeHdrMasteringDisplayColorVolume: + for (int i = 0; i < 3; ++i) { + OBU_TEST_COMPARE(primary_chromaticity_x[i]); + OBU_TEST_COMPARE(primary_chromaticity_y[i]); + } + OBU_TEST_COMPARE(white_point_chromaticity_x); + OBU_TEST_COMPARE(white_point_chromaticity_y); + OBU_TEST_COMPARE(luminance_max); + OBU_TEST_COMPARE(luminance_min); + break; + case kMetadataTypeScalability: + break; + case kMetadataTypeItutT35: + OBU_TEST_COMPARE(itu_t_t35_country_code); + OBU_TEST_COMPARE(itu_t_t35_country_code_extension_byte); + ASSERT_EQ(expected.itu_t_t35_payload_size, 
+                  actual.itu_t_t35_payload_size);
+        if (actual.itu_t_t35_payload_size != 0) {
+          EXPECT_EQ(memcmp(expected.itu_t_t35_payload_bytes.get(),
+                           actual.itu_t_t35_payload_bytes.get(),
+                           actual.itu_t_t35_payload_size),
+                    0);
+        }
+        break;
+      case kMetadataTypeTimecode:
+        break;
+    }
+  }
+
+#undef OBU_TEST_COMPARE
+
+  // Accessors to private members of ObuParser. This avoids the need for a
+  // dependency on a googletest header in the main library for FRIEND_TEST()
+  // (or the need to duplicate the implementation).
+  bool ObuParseFrameParameters() { return obu_->ParseFrameParameters(); }
+  bool ObuParseLoopFilterParameters() {
+    return obu_->ParseLoopFilterParameters();
+  }
+  bool ObuParseLoopFilterDeltaParameters() {
+    return obu_->ParseLoopFilterDeltaParameters();
+  }
+  bool ObuParseQuantizerParameters() {
+    return obu_->ParseQuantizerParameters();
+  }
+  bool ObuParseQuantizerIndexDeltaParameters() {
+    return obu_->ParseQuantizerIndexDeltaParameters();
+  }
+  void ObuComputeSegmentLosslessAndQIndex() {
+    obu_->ComputeSegmentLosslessAndQIndex();
+  }
+  bool ObuParseCdefParameters() { return obu_->ParseCdefParameters(); }
+  bool ObuParseLoopRestorationParameters() {
+    return obu_->ParseLoopRestorationParameters();
+  }
+  bool ObuParseTxModeSyntax() { return obu_->ParseTxModeSyntax(); }
+  bool ObuIsSkipModeAllowed() { return obu_->IsSkipModeAllowed(); }
+  bool ObuParseSkipModeParameters() { return obu_->ParseSkipModeParameters(); }
+  bool ObuReadAllowWarpedMotion() { return obu_->ReadAllowWarpedMotion(); }
+  bool ObuSetFrameReferences(int8_t last_frame_idx, int8_t gold_frame_idx) {
+    return obu_->SetFrameReferences(last_frame_idx, gold_frame_idx);
+  }
+
+  std::unique_ptr<BufferPool> buffer_pool_;
+  DecoderState decoder_state_;
+  std::unique_ptr<ObuParser> obu_;
+  // The following members are reset with each Init().
+  Vector<ObuHeader>* obu_headers_;
+  ObuFrameHeader* obu_frame_header_;
+  ObuSequenceHeader* obu_sequence_header_;
+  RefCountedBufferPtr current_frame_;
+};
+
+TEST_F(ObuParserTest, InvalidInputs) {
+  obu_.reset(new (std::nothrow)
+                 ObuParser(nullptr, 0, 0, buffer_pool_.get(), &decoder_state_));
+  EXPECT_EQ(obu_->ParseOneFrame(&current_frame_), kStatusInvalidArgument);
+  obu_.reset(new (std::nothrow) ObuParser(nullptr, 10, 0, buffer_pool_.get(),
+                                          &decoder_state_));
+  EXPECT_EQ(obu_->ParseOneFrame(&current_frame_), kStatusInvalidArgument);
+  obu_.reset(new (std::nothrow)
+                 ObuParser(kDefaultTemporalDelimiter.data(), 0, 0,
+                           buffer_pool_.get(), &decoder_state_));
+  EXPECT_EQ(obu_->ParseOneFrame(&current_frame_), kStatusInvalidArgument);
+}
+
+TEST_F(ObuParserTest, TemporalDelimiter) {
+  BytesAndBits data;
+  data.AppendBytes(kDefaultTemporalDelimiter);
+
+  ASSERT_TRUE(Parse(data.GenerateData()));
+  EXPECT_EQ(obu_->obu_headers().size(), 1);
+  EXPECT_EQ(obu_->obu_headers().back().type, kObuTemporalDelimiter);
+  VerifyObuHeader(false);
+
+  // forbidden_bit is not zero.
+  data.SetBit(0, 1);
+  EXPECT_FALSE(Parse(data.GenerateData()));
+}
+
+TEST_F(ObuParserTest, HeaderExtensions) {
+  BytesAndBits data;
+  data.AppendBytes(kDefaultTemporalDelimiterWithExtension);
+
+  ASSERT_TRUE(Parse(data.GenerateData()));
+  EXPECT_EQ(obu_->obu_headers().size(), 1);
+  EXPECT_EQ(obu_->obu_headers().back().type, kObuTemporalDelimiter);
+  VerifyObuHeader(true);
+
+  // extension flag is set but no extensions found.
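+  // (Keeping only the first byte leaves obu_extension_flag = 1 with no
+  // temporal_id/spatial_id byte and no obu_size byte left to read, so the
+  // parse below should fail.)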
+ data.Clear(); + data.AppendByte(kDefaultTemporalDelimiterWithExtension[0]); + EXPECT_FALSE(Parse(data.GenerateData())); +} + +TEST_F(ObuParserTest, HeaderHasSizeFieldNotSet) { + BytesAndBits data; + data.AppendBytes(kDefaultHeaderWithoutSizeField); + + EXPECT_FALSE(Parse(data.GenerateData())); +} + +TEST_F(ObuParserTest, SequenceHeader) { + BytesAndBits data; + data.AppendBytes(kDefaultSequenceHeader); + ObuSequenceHeader gold; + DefaultSequenceHeader(&gold); + + ASSERT_TRUE(ParseSequenceHeader(data.GenerateData())); + VerifySequenceHeader(gold); +} + +TEST_F(ObuParserTest, SequenceHeaderLevel) { + BytesAndBits data; + data.AppendBytes(kDefaultSequenceHeader); + ObuSequenceHeader gold; + DefaultSequenceHeader(&gold); + + // Set level to 1. + gold.level[0].major = 2; + gold.level[0].minor = 1; + data.SetLiteral(24, 5, 1); // level. + + ASSERT_TRUE(ParseSequenceHeader(data.GenerateData())); + VerifySequenceHeader(gold); + + // Set operating_point_idc of operating point 1 to 0x101 (temporal layer 0 + // and spatial layer 0 should be decoded). Set level of operating point 1 to + // 8 (4.0) and tier to 1. + gold.operating_points = 2; + gold.operating_point_idc[1] = (1 << 0) | (1 << (0 + 8)); + gold.level[1].major = 4; + gold.level[1].minor = 0; + gold.tier[1] = 1; + data.SetLiteral(7, 5, gold.operating_points - 1); + data.InsertLiteral(29, 12, 0x101); // operating_point_idc. + data.InsertLiteral(41, 5, 8); // level. + data.InsertBit(46, gold.tier[1]); + + ASSERT_TRUE(ParseSequenceHeader(data.GenerateData())); + VerifySequenceHeader(gold); +} + +TEST_F(ObuParserTest, SequenceHeaderProfile) { + BytesAndBits data; + data.AppendBytes(kDefaultSequenceHeader); + ObuSequenceHeader gold; + DefaultSequenceHeader(&gold); + + gold.still_picture = true; + data.SetBit(3, static_cast(gold.still_picture)); + + ASSERT_TRUE(ParseSequenceHeader(data.GenerateData())); + VerifySequenceHeader(gold); + + // profile 2; bitdepth 8; + gold.profile = kProfile2; + gold.color_config.bitdepth = 8; + gold.color_config.subsampling_x = 1; + gold.color_config.subsampling_y = 0; + data.SetLiteral(0, 3, gold.profile); + + ASSERT_TRUE(ParseSequenceHeader(data.GenerateData())); + VerifySequenceHeader(gold); + + // profile 2; bitdepth 10; + gold.color_config.bitdepth = 10; + data.SetBit(73, 1); // high_bitdepth. + data.InsertBit(74, 0); // twelve_bit. + + ASSERT_TRUE(ParseSequenceHeader(data.GenerateData())); + VerifySequenceHeader(gold); + + // profile 2; bitdepth 12; + gold.color_config.bitdepth = 12; + gold.color_config.subsampling_y = 1; + data.SetBit(74, 1); // twelve_bit. + data.InsertBit(78, 1); // subsampling_x. + data.InsertBit(79, 1); // subsampling_y. + + ASSERT_TRUE(ParseSequenceHeader(data.GenerateData())); + VerifySequenceHeader(gold); +} + +TEST_F(ObuParserTest, SequenceHeaderIdLength) { + BytesAndBits data; + data.AppendBytes(kDefaultSequenceHeader); + ObuSequenceHeader gold; + DefaultSequenceHeader(&gold); + + gold.frame_id_numbers_present = true; + gold.delta_frame_id_length_bits = kDeltaFrameIdLengthBits; + gold.frame_id_length_bits = kFrameIdLengthBits; + data.SetBit(54, 1); // frame_id_numbers_present. + data.InsertLiteral(55, 4, kDeltaFrameIdLengthBits - 2); + data.InsertLiteral(59, 3, kFrameIdLengthBits - kDeltaFrameIdLengthBits - 1); + + ASSERT_TRUE(ParseSequenceHeader(data.GenerateData())); + VerifySequenceHeader(gold); +} + +// An idLen greater than 16 is invalid. 
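+// idLen here is frame_id_length_bits: the 3-bit field below is set to
+// 17 - kDeltaFrameIdLengthBits - 1, which decodes to an idLen of 17, and the
+// spec only allows frame ids of at most 16 bits.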
+TEST_F(ObuParserTest, SequenceHeaderIdLengthInvalid) { + BytesAndBits data; + data.AppendBytes(kDefaultSequenceHeader); + + data.SetBit(54, 1); // frame_id_numbers_present. + data.InsertLiteral(55, 4, kDeltaFrameIdLengthBits - 2); + data.InsertLiteral(59, 3, 17 - kDeltaFrameIdLengthBits - 1); // idLen = 17. + + ASSERT_FALSE(ParseSequenceHeader(data.GenerateData())); +} + +TEST_F(ObuParserTest, SequenceHeaderFlags) { + BytesAndBits data; + data.AppendBytes(kDefaultSequenceHeader); + ObuSequenceHeader gold; + DefaultSequenceHeader(&gold); + + gold.enable_warped_motion = true; + gold.enable_superres = true; + data.SetBit(60, 1); // enable_warped_motion. + data.SetBit(70, 1); // enable_superres. + + ASSERT_TRUE(ParseSequenceHeader(data.GenerateData())); + VerifySequenceHeader(gold); +} + +TEST_F(ObuParserTest, SequenceHeaderForceScreenContentToolsEqualTo0) { + BytesAndBits data; + data.AppendBytes(kDefaultSequenceHeader); + ObuSequenceHeader gold; + DefaultSequenceHeader(&gold); + + gold.choose_screen_content_tools = false; + gold.force_screen_content_tools = 0; + gold.choose_integer_mv = false; + gold.force_integer_mv = 2; + data.SetBit(65, 0); // choose_screen_content_tools. + data.SetBit(66, 0); // force_screen_content_tools. + + ASSERT_TRUE(ParseSequenceHeader(data.GenerateData())); + VerifySequenceHeader(gold); +} + +TEST_F(ObuParserTest, SequenceHeaderMonochrome) { + BytesAndBits data; + data.AppendBytes(kDefaultSequenceHeader); + ObuSequenceHeader gold; + DefaultSequenceHeader(&gold); + + gold.color_config.is_monochrome = true; + gold.color_config.color_range = kColorRangeFull; + data.SetBit(74, 1); // monochrome. + data.InsertBit(76, 1); // color_range. + + ASSERT_TRUE(ParseSequenceHeader(data.GenerateData())); + VerifySequenceHeader(gold); +} + +// This tests TimingInfo, DecoderModelInfo and OperatingParameters. The test is +// kind of long but it is the simplest way to test all three since they are +// dependent on one another. 
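+// (For reference, the uvlc expectation inside the test: for
+// num_ticks_per_picture - 1 = 6, InsertUvlc() writes value + 1 = 7 in binary
+// (0b111) preceded by two zero bits, i.e. 0b00111, hence the expected bit
+// count of 5.)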
+TEST_F(ObuParserTest, SequenceHeaderTimingInfo) {
+  BytesAndBits data;
+  data.AppendBytes(kDefaultSequenceHeader);
+  ObuSequenceHeader gold;
+  DefaultSequenceHeader(&gold);
+
+  gold.timing_info_present_flag = true;
+  gold.timing_info.num_units_in_tick = 100;
+  gold.timing_info.time_scale = 1000;
+  gold.timing_info.equal_picture_interval = false;
+  gold.decoder_model_info_present_flag = false;
+  data.SetBit(5, static_cast<uint8_t>(gold.timing_info_present_flag));
+  data.InsertLiteral(6, 32, gold.timing_info.num_units_in_tick);
+  data.InsertLiteral(38, 32, gold.timing_info.time_scale);
+  data.InsertBit(
+      70, static_cast<uint8_t>(gold.timing_info.equal_picture_interval));
+  data.InsertBit(
+      71, static_cast<uint8_t>(gold.decoder_model_info_present_flag));
+
+  ASSERT_TRUE(ParseSequenceHeader(data.GenerateData()));
+  VerifySequenceHeader(gold);
+
+  gold.timing_info.equal_picture_interval = true;
+  gold.timing_info.num_ticks_per_picture = 7;
+  data.SetBit(70,
+              static_cast<uint8_t>(gold.timing_info.equal_picture_interval));
+  EXPECT_EQ(data.InsertUvlc(71, gold.timing_info.num_ticks_per_picture - 1),
+            5);
+
+  ASSERT_TRUE(ParseSequenceHeader(data.GenerateData()));
+  VerifySequenceHeader(gold);
+
+  gold.decoder_model_info_present_flag = true;
+  gold.decoder_model_info.encoder_decoder_buffer_delay_length = 5;
+  gold.decoder_model_info.num_units_in_decoding_tick = 1000;
+  gold.decoder_model_info.buffer_removal_time_length = 18;
+  gold.decoder_model_info.frame_presentation_time_length = 20;
+
+  data.SetBit(76, static_cast<uint8_t>(gold.decoder_model_info_present_flag));
+  data.InsertLiteral(
+      77, 5, gold.decoder_model_info.encoder_decoder_buffer_delay_length - 1);
+  data.InsertLiteral(82, 32,
+                     gold.decoder_model_info.num_units_in_decoding_tick);
+  data.InsertLiteral(114, 5,
+                     gold.decoder_model_info.buffer_removal_time_length - 1);
+  data.InsertLiteral(
+      119, 5, gold.decoder_model_info.frame_presentation_time_length - 1);
+  data.InsertBit(147, 0);  // decoder_model_present_for_this_op.
+
+  ASSERT_TRUE(ParseSequenceHeader(data.GenerateData()));
+  VerifySequenceHeader(gold);
+
+  gold.operating_parameters.decoder_buffer_delay[0] = 10;
+  gold.operating_parameters.encoder_buffer_delay[0] = 20;
+  gold.operating_parameters.low_delay_mode_flag[0] = true;
+
+  data.SetBit(147, 1);  // decoder_model_present_for_this_op.
+  data.InsertLiteral(
+      148, gold.decoder_model_info.encoder_decoder_buffer_delay_length,
+      gold.operating_parameters.decoder_buffer_delay[0]);
+  data.InsertLiteral(
+      153, gold.decoder_model_info.encoder_decoder_buffer_delay_length,
+      gold.operating_parameters.encoder_buffer_delay[0]);
+  data.InsertBit(158, static_cast<uint8_t>(
+                          gold.operating_parameters.low_delay_mode_flag[0]));
+
+  ASSERT_TRUE(ParseSequenceHeader(data.GenerateData()));
+  VerifySequenceHeader(gold);
+}
+
+TEST_F(ObuParserTest, SequenceHeaderInitialDisplayDelay) {
+  BytesAndBits data;
+  data.AppendBytes(kDefaultSequenceHeader);
+  ObuSequenceHeader gold;
+  DefaultSequenceHeader(&gold);
+
+  gold.initial_display_delay[0] = 8;
+
+  data.SetBit(6, 1);  // initial_display_delay_present_flag.
+  data.InsertBit(29, 1);  // initial_display_delay_present_for_this_op.
+  data.InsertLiteral(30, 4, gold.initial_display_delay[0] - 1);
+
+  ASSERT_TRUE(ParseSequenceHeader(data.GenerateData()));
+  VerifySequenceHeader(gold);
+}
+
+// Parsing of a frame header should fail if no sequence header has been
+// received.
+TEST_F(ObuParserTest, FrameHeaderWithoutSequenceHeader) {
+  // The aom-test-data test vector av1-1-b8-01-size-16x16.ivf has two temporal
+  // units. The first temporal unit has a presentation timestamp of 0 and
+  // consists of three OBUs: a temporal delimiter OBU, a sequence header OBU,
+  // and a frame OBU.
+  const std::vector<uint8_t> kTemporalDelimiter = {0x12, 0x00};
+  const std::vector<uint8_t> kSequenceHeader = {
+      0x0a, 0x0a, 0x00, 0x00, 0x00, 0x01, 0x9f, 0xfb, 0xff, 0xf3, 0x00, 0x80};
+  const std::vector<uint8_t> kFrame = {
+      0x32, 0xa6, 0x01, 0x10, 0x00, 0x87, 0x80, 0x00, 0x03, 0x00, 0x00, 0x00,
+      0x40, 0x00, 0x9e, 0x86, 0x5b, 0xb2, 0x22, 0xb5, 0x58, 0x4d, 0x68, 0xe6,
+      0x37, 0x54, 0x42, 0x7b, 0x84, 0xce, 0xdf, 0x9f, 0xec, 0xab, 0x07, 0x4d,
+      0xf6, 0xe1, 0x5e, 0x9e, 0x27, 0xbf, 0x93, 0x2f, 0x47, 0x0d, 0x7b, 0x7c,
+      0x45, 0x8d, 0xcf, 0x26, 0xf7, 0x6c, 0x06, 0xd7, 0x8c, 0x2e, 0xf5, 0x2c,
+      0xb0, 0x8a, 0x31, 0xac, 0x69, 0xf5, 0xcd, 0xd8, 0x71, 0x5d, 0xaf, 0xf8,
+      0x96, 0x43, 0x8c, 0x9c, 0x23, 0x6f, 0xab, 0xd0, 0x35, 0x43, 0xdf, 0x81,
+      0x12, 0xe3, 0x7d, 0xec, 0x22, 0xb0, 0x30, 0x54, 0x32, 0x9f, 0x90, 0xc0,
+      0x5d, 0x64, 0x9b, 0x0f, 0x75, 0x31, 0x84, 0x3a, 0x57, 0xd7, 0x5f, 0x03,
+      0x6e, 0x7f, 0x43, 0x17, 0x6d, 0x08, 0xc3, 0x81, 0x8a, 0xae, 0x73, 0x1c,
+      0xa8, 0xa7, 0xe4, 0x9c, 0xa9, 0x5b, 0x3f, 0xd1, 0xeb, 0x75, 0x3a, 0x7f,
+      0x22, 0x77, 0x38, 0x64, 0x1c, 0x77, 0xdb, 0xcd, 0xef, 0xb7, 0x08, 0x45,
+      0x8e, 0x7f, 0xea, 0xa3, 0xd0, 0x81, 0xc9, 0xc1, 0xbc, 0x93, 0x9b, 0x41,
+      0xb1, 0xa1, 0x42, 0x17, 0x98, 0x3f, 0x1e, 0x95, 0xdf, 0x68, 0x7c, 0xb7,
+      0x98};
+
+  BytesAndBits data;
+  data.AppendBytes(kTemporalDelimiter);
+  // Skip the sequence header OBU.
+  data.AppendBytes(kFrame);
+  ASSERT_FALSE(Parse(data.GenerateData()));
+
+  // Now verify that all three OBUs are correct, by adding them to |data|
+  // successively.
+  data.Clear();
+  data.AppendBytes(kTemporalDelimiter);
+  ASSERT_TRUE(Parse(data.GenerateData()));
+  data.Clear();
+  data.AppendBytes(kTemporalDelimiter);
+  data.AppendBytes(kSequenceHeader);
+  ASSERT_TRUE(Parse(data.GenerateData()));
+  data.Clear();
+  data.AppendBytes(kTemporalDelimiter);
+  data.AppendBytes(kSequenceHeader);
+  data.AppendBytes(kFrame);
+  ASSERT_TRUE(Parse(data.GenerateData()));
+}
+
+TEST_F(ObuParserTest, FrameParameterShowExistingFrame) {
+  BytesAndBits data;
+  data.AppendBit(1);  // show_existing_frame.
+  data.AppendLiteral(3, kFrameToShow);  // frame_to_show.
+  ObuFrameHeader gold;
+  DefaultFrameHeader(&gold, kFrameKey);
+  gold.show_existing_frame = true;
+  gold.frame_to_show = kFrameToShow;
+
+  // kFrameToShow'th frame is not yet decoded.
+  ASSERT_FALSE(ParseFrameParameters(data.GenerateData()));
+
+  decoder_state_.reference_frame[kFrameToShow] = buffer_pool_->GetFreeBuffer();
+  // kFrameToShow'th frame is not a showable frame.
+  ASSERT_FALSE(ParseFrameParameters(data.GenerateData()));
+
+  decoder_state_.reference_frame[kFrameToShow]->set_showable_frame(true);
+  ASSERT_TRUE(ParseFrameParameters(data.GenerateData()));
+  VerifyFrameParameters(gold);
+}
+
+TEST_F(ObuParserTest, FrameParametersShowExistingFrameWithDisplayFrameId) {
+  BytesAndBits data;
+  data.AppendBit(1);  // show_existing_frame.
+  data.AppendLiteral(3, kFrameToShow);  // frame_to_show.
+  data.AppendLiteral(15, kDisplayFrameId);  // display_frame_id.
+  ObuFrameHeader gold;
+  DefaultFrameHeader(&gold, kFrameKey);
+  gold.show_existing_frame = true;
+  gold.frame_to_show = kFrameToShow;
+  gold.display_frame_id = kDisplayFrameId;
+
+  // kFrameToShow'th frame is not yet decoded.
+  ASSERT_FALSE(ParseFrameParameters(data.GenerateData(), true));
+
+  decoder_state_.reference_frame_id[kFrameToShow] = kDisplayFrameId;
+  decoder_state_.reference_frame[kFrameToShow] = buffer_pool_->GetFreeBuffer();
+  // kFrameToShow'th frame is not a showable frame.
+  ASSERT_FALSE(ParseFrameParameters(data.GenerateData(), true));
+
+  decoder_state_.reference_frame[kFrameToShow]->set_showable_frame(true);
+  ASSERT_TRUE(ParseFrameParameters(data.GenerateData(), true));
+  VerifyFrameParameters(gold, true);
+}
+
+TEST_F(ObuParserTest, FrameParameterShowExistingFrameTemporalPointInfo) {
+  BytesAndBits data;
+  data.AppendBit(1);  // show_existing_frame.
+  data.AppendLiteral(3, kFrameToShow);  // frame_to_show.
+  data.AppendLiteral(20, 38);  // frame_presentation_time.
+  ObuFrameHeader gold;
+  DefaultFrameHeader(&gold, kFrameKey);
+  gold.show_existing_frame = true;
+  gold.frame_to_show = kFrameToShow;
+  gold.frame_presentation_time = 38;
+
+  EXPECT_TRUE(Init(data.GenerateData()));
+  obu_sequence_header_->frame_width_bits = kFrameWidthBits;
+  obu_sequence_header_->frame_height_bits = kFrameHeightBits;
+  obu_sequence_header_->max_frame_width = kWidth;
+  obu_sequence_header_->max_frame_height = kHeight;
+
+  obu_sequence_header_->decoder_model_info_present_flag = true;
+  obu_sequence_header_->decoder_model_info.frame_presentation_time_length = 20;
+
+  decoder_state_.reference_frame[kFrameToShow] = buffer_pool_->GetFreeBuffer();
+  decoder_state_.reference_frame[kFrameToShow]->set_showable_frame(true);
+
+  ASSERT_TRUE(ObuParseFrameParameters());
+  VerifyFrameParameters(gold);
+}
+
+TEST_F(ObuParserTest, FrameParameterErrorResilientMode) {
+  BytesAndBits data;
+  data.AppendBytes(kDefaultFrameHeaderIntraOnlyFrame);
+  ObuFrameHeader gold;
+  DefaultFrameHeader(&gold, kFrameIntraOnly);
+
+  gold.error_resilient_mode = true;
+  data.SetBit(4, static_cast<uint8_t>(gold.error_resilient_mode));
+
+  ASSERT_TRUE(ParseFrameParameters(data.GenerateData()));
+  VerifyFrameParameters(gold);
+}
+
+TEST_F(ObuParserTest, FrameParameterKeyFrame) {
+  BytesAndBits data;
+  data.AppendBytes(kDefaultFrameHeaderKeyFrame);
+  ObuFrameHeader gold;
+  DefaultFrameHeader(&gold, kFrameKey);
+
+  ASSERT_TRUE(ParseFrameParameters(data.GenerateData()));
+  VerifyFrameParameters(gold);
+}
+
+TEST_F(ObuParserTest, FrameParameterKeyFrameTemporalPointInfo) {
+  BytesAndBits data;
+  data.AppendBytes(kDefaultFrameHeaderKeyFrame);
+  ObuFrameHeader gold;
+  DefaultFrameHeader(&gold, kFrameKey);
+
+  data.InsertLiteral(4, 20, 38);  // frame_presentation_time.
+  gold.frame_presentation_time = 38;
+
+  EXPECT_TRUE(Init(data.GenerateData()));
+  obu_sequence_header_->frame_width_bits = kFrameWidthBits;
+  obu_sequence_header_->frame_height_bits = kFrameHeightBits;
+  obu_sequence_header_->max_frame_width = kWidth;
+  obu_sequence_header_->max_frame_height = kHeight;
+
+  obu_sequence_header_->decoder_model_info_present_flag = true;
+  obu_sequence_header_->decoder_model_info.frame_presentation_time_length = 20;
+
+  ASSERT_TRUE(ObuParseFrameParameters());
+  VerifyFrameParameters(gold);
+}
+
+TEST_F(ObuParserTest, FrameParameterKeyFrameOverrideSize) {
+  BytesAndBits data;
+  data.AppendBytes(kDefaultFrameHeaderKeyFrame);
+  ObuFrameHeader gold;
+  DefaultFrameHeader(&gold, kFrameKey);
+
+  OverrideFrameSize(&data, &gold, 5, 6);
+
+  ASSERT_TRUE(ParseFrameParameters(data.GenerateData()));
+  VerifyFrameParameters(gold);
+
+  OverrideRenderSize(&data, &gold, 23);
+
+  ASSERT_TRUE(ParseFrameParameters(data.GenerateData()));
+  VerifyFrameParameters(gold);
+}
+
+TEST_F(ObuParserTest, FrameParameterKeyFrameSuperRes) {
+  BytesAndBits data;
+  data.AppendBytes(kDefaultFrameHeaderKeyFrame);
+  ObuFrameHeader gold;
+  DefaultFrameHeader(&gold, kFrameKey);
+  gold.use_superres = true;
+  gold.superres_scale_denominator = 15;
+  gold.width = kWidth * 8 / 15;
+  gold.columns4x4 = 58;
+
+  data.SetBit(6, static_cast<uint8_t>(gold.use_superres));
+  data.SetLiteral(7, 3, gold.superres_scale_denominator - 9);
+
+  ASSERT_TRUE(ParseFrameParameters(data.GenerateData(), false, 0, 0, true));
+  VerifyFrameParameters(gold);
+}
+
+TEST_F(ObuParserTest, FrameParameterKeyFrameAllowScreenContentTools) {
+  BytesAndBits data;
+  data.AppendBytes(kDefaultFrameHeaderKeyFrame);
+  ObuFrameHeader gold;
+  DefaultFrameHeader(&gold, kFrameKey);
+
+  data.InsertBit(5, 1);  // allow_screen_content_tools.
+  data.InsertBit(8, 1);  // allow_intrabc.
+  gold.allow_screen_content_tools = true;
+  gold.allow_intrabc = true;
+
+  ASSERT_TRUE(ParseFrameParameters(data.GenerateData(), false, 2));
+  VerifyFrameParameters(gold);
+
+  data.InsertBit(6, 1);  // force_integer_mv.
+  gold.force_integer_mv = 1;
+
+  ASSERT_TRUE(ParseFrameParameters(data.GenerateData(), false, 2, 2));
+  VerifyFrameParameters(gold);
+
+  data.SetBit(6, 0);  // force_integer_mv.
+
+  // |gold| need not be updated, because force_integer_mv is always 1 for
+  // key frames.
+  ASSERT_TRUE(ParseFrameParameters(data.GenerateData(), false, 2, 2));
+  VerifyFrameParameters(gold);
+}
+
+TEST_F(ObuParserTest, FrameParameterIntraOnlyFrame) {
+  BytesAndBits data;
+  data.AppendBytes(kDefaultFrameHeaderIntraOnlyFrame);
+  ObuFrameHeader gold;
+  DefaultFrameHeader(&gold, kFrameIntraOnly);
+
+  ASSERT_TRUE(ParseFrameParameters(data.GenerateData()));
+  VerifyFrameParameters(gold);
+}
+
+TEST_F(ObuParserTest, FrameParameterIntraOnlyFrameOverrideSize) {
+  BytesAndBits data;
+  data.AppendBytes(kDefaultFrameHeaderIntraOnlyFrame);
+  ObuFrameHeader gold;
+  DefaultFrameHeader(&gold, kFrameIntraOnly);
+
+  OverrideFrameSize(&data, &gold, 6, 15);
+
+  ASSERT_TRUE(ParseFrameParameters(data.GenerateData()));
+  VerifyFrameParameters(gold);
+
+  OverrideRenderSize(&data, &gold, 32);
+
+  ASSERT_TRUE(ParseFrameParameters(data.GenerateData()));
+  VerifyFrameParameters(gold);
+}
+
+// An INTRA_ONLY_FRAME cannot set refresh_frame_flags to 0xff.
+TEST_F(ObuParserTest, FrameParameterIntraOnlyFrameRefreshAllFrames) {
+  BytesAndBits data;
+  data.AppendBytes(kDefaultFrameHeaderIntraOnlyFrame);
+  data.SetLiteral(7, 8, 0xFF);  // refresh_frame_flags.
+ + ASSERT_FALSE(ParseFrameParameters(data.GenerateData())); +} + +TEST_F(ObuParserTest, FrameParameterInterFrame) { + BytesAndBits data; + data.AppendBytes(kDefaultFrameHeaderInterFrame); + ObuFrameHeader gold; + DefaultFrameHeader(&gold, kFrameInter); + ObuFrameHeader reference_frame_header; + reference_frame_header.width = kWidth; + reference_frame_header.height = kHeight; + reference_frame_header.render_width = kWidth; + reference_frame_header.render_height = kHeight; + reference_frame_header.upscaled_width = kWidth; + reference_frame_header.rows4x4 = kRows4x4; + reference_frame_header.columns4x4 = kColumns4x4; + reference_frame_header.refresh_frame_flags = 0; + for (auto& reference_frame : decoder_state_.reference_frame) { + reference_frame = buffer_pool_->GetFreeBuffer(); + EXPECT_TRUE(reference_frame->SetFrameDimensions(reference_frame_header)); + } + + ASSERT_TRUE(ParseFrameParameters(data.GenerateData())); + VerifyFrameParameters(gold); +} + +TEST_F(ObuParserTest, FrameParameterInterFrameOverrideSize) { + BytesAndBits data; + data.AppendBytes(kDefaultFrameHeaderInterFrame); + ObuFrameHeader gold; + DefaultFrameHeader(&gold, kFrameInter); + ObuFrameHeader reference_frame_header; + reference_frame_header.width = kWidth; + reference_frame_header.height = kHeight; + reference_frame_header.render_width = kWidth; + reference_frame_header.render_height = kHeight; + reference_frame_header.upscaled_width = kWidth; + reference_frame_header.rows4x4 = kRows4x4; + reference_frame_header.columns4x4 = kColumns4x4; + reference_frame_header.refresh_frame_flags = 0; + for (auto& reference_frame : decoder_state_.reference_frame) { + reference_frame = buffer_pool_->GetFreeBuffer(); + EXPECT_TRUE(reference_frame->SetFrameDimensions(reference_frame_header)); + } + + data.InsertLiteral(39, kNumInterReferenceFrameTypes, 0); // found_ref. + OverrideFrameSize(&data, &gold, 6, 46); + + ASSERT_TRUE(ParseFrameParameters(data.GenerateData())); + VerifyFrameParameters(gold); + + OverrideRenderSize(&data, &gold, 63); + + ASSERT_TRUE(ParseFrameParameters(data.GenerateData())); + VerifyFrameParameters(gold); +} + +// This test verifies we check the following requirement at the end of Section +// 6.8.4: +// If FrameIsIntra is equal to 0 (indicating that this frame may use inter +// prediction), the requirements described in the frame size with refs +// semantics of section 6.8.6 must also be satisfied. +TEST_F(ObuParserTest, FrameParameterInterFrameInvalidSize) { + BytesAndBits data; + data.AppendBytes(kDefaultFrameHeaderInterFrame); + ObuFrameHeader gold; + DefaultFrameHeader(&gold, kFrameInter); + ObuFrameHeader reference_frame_header; + reference_frame_header.width = kWidth; + reference_frame_header.height = 2 * kHeight + 8; + reference_frame_header.render_width = kWidth; + reference_frame_header.render_height = 2 * kHeight + 8; + reference_frame_header.upscaled_width = kWidth; + reference_frame_header.rows4x4 = 2 * kRows4x4 + 2; + reference_frame_header.columns4x4 = kColumns4x4; + reference_frame_header.refresh_frame_flags = 0; + for (auto& reference_frame : decoder_state_.reference_frame) { + reference_frame = buffer_pool_->GetFreeBuffer(); + EXPECT_TRUE(reference_frame->SetFrameDimensions(reference_frame_header)); + } + + EXPECT_FALSE(ParseFrameParameters(data.GenerateData())); +} + +// Tests the ObuParser::SetFrameReferences() method. 
+// +// This method uses the following data members as input: +// decoder_state_.reference_order_hint +// sequence_header_.enable_order_hint +// sequence_header_.order_hint_bits +// frame_header_.order_hint +// So we need to set up these data members before calling +// ObuParser::SetFrameReferences(). +// +// The output is in frame_header_.reference_frame_index. +TEST_F(ObuParserTest, SetFrameReferences) { + // All reference frames are forward references (because 9 < 17). + for (int i = 0; i < kNumReferenceFrameTypes; ++i) { + decoder_state_.reference_order_hint[i] = 9; + } + + ASSERT_TRUE(Init()); + obu_sequence_header_->enable_order_hint = true; + obu_sequence_header_->order_hint_bits = 5; + obu_sequence_header_->order_hint_shift_bits = + Mod32(32 - obu_sequence_header_->order_hint_bits); + obu_frame_header_->order_hint = 17; + + const int8_t last_frame_idx = 0; + const int8_t gold_frame_idx = 1; + + // Since all reference frames are forward references, we set the remaining + // five references in reverse chronological order. So Last2, Last3, Backward, + // Alternate2, and Alternate are set to 7, 6, 5, 4, and 3, respectively. + + EXPECT_TRUE(ObuSetFrameReferences(last_frame_idx, gold_frame_idx)); + + EXPECT_EQ( + obu_frame_header_ + ->reference_frame_index[kReferenceFrameLast - kReferenceFrameLast], + 0); + EXPECT_EQ( + obu_frame_header_ + ->reference_frame_index[kReferenceFrameLast2 - kReferenceFrameLast], + 7); + EXPECT_EQ( + obu_frame_header_ + ->reference_frame_index[kReferenceFrameLast3 - kReferenceFrameLast], + 6); + EXPECT_EQ( + obu_frame_header_ + ->reference_frame_index[kReferenceFrameGolden - kReferenceFrameLast], + 1); + EXPECT_EQ(obu_frame_header_->reference_frame_index[kReferenceFrameBackward - + kReferenceFrameLast], + 5); + EXPECT_EQ(obu_frame_header_->reference_frame_index[kReferenceFrameAlternate2 - + kReferenceFrameLast], + 4); + EXPECT_EQ(obu_frame_header_->reference_frame_index[kReferenceFrameAlternate - + kReferenceFrameLast], + 3); +} + +TEST_F(ObuParserTest, LoopFilterParameters) { + LoopFilter gold; + memset(&gold, 0, sizeof(gold)); + + BytesAndBits data; + data.AppendBit(0); // dummy. + + ASSERT_TRUE(Init(data.GenerateData())); + obu_frame_header_->primary_reference_frame = kPrimaryReferenceNone; + obu_frame_header_->coded_lossless = true; + gold.ref_deltas[kReferenceFrameIntra] = 1; + gold.ref_deltas[kReferenceFrameGolden] = -1; + gold.ref_deltas[kReferenceFrameAlternate] = -1; + gold.ref_deltas[kReferenceFrameAlternate2] = -1; + ASSERT_TRUE(ObuParseLoopFilterParameters()); + VerifyLoopFilterParameters(gold); + + ASSERT_TRUE(Init(data.GenerateData())); + obu_frame_header_->primary_reference_frame = kPrimaryReferenceNone; + obu_frame_header_->allow_intrabc = true; + ASSERT_TRUE(ObuParseLoopFilterParameters()); + VerifyLoopFilterParameters(gold); + + gold.level[0] = 32; + gold.level[3] = 48; + gold.sharpness = 4; + data.Clear(); + for (const auto& level : gold.level) { + data.AppendLiteral(6, level); + } + data.AppendLiteral(3, gold.sharpness); + data.AppendBit(0); // delta_enabled. + + ASSERT_TRUE(Init(data.GenerateData())); + obu_frame_header_->primary_reference_frame = kPrimaryReferenceNone; + ASSERT_TRUE(ObuParseLoopFilterParameters()); + VerifyLoopFilterParameters(gold); + + gold.delta_enabled = true; + gold.delta_update = true; + gold.ref_deltas[0] = 20; + gold.mode_deltas[0] = -20; + data.SetBit(27, 1); // delta_enabled. + data.AppendBit(1); // delta_update. 
+ for (int i = 0; i < kNumReferenceFrameTypes; ++i) { + if (i == 0) { + data.AppendBit(1); // update_ref_delta. + data.AppendInverseSignedLiteral(6, gold.ref_deltas[0]); // ref_delta. + } else { + data.AppendBit(0); // update_ref_delta. + } + } + for (int i = 0; i < kLoopFilterMaxModeDeltas; ++i) { + if (i == 0) { + data.AppendBit(1); // update_mode_delta. + data.AppendInverseSignedLiteral(6, gold.mode_deltas[0]); // mode_delta. + } else { + data.AppendBit(0); // update_mode_delta. + } + } + + ASSERT_TRUE(Init(data.GenerateData())); + obu_frame_header_->primary_reference_frame = kPrimaryReferenceNone; + ASSERT_TRUE(ObuParseLoopFilterParameters()); + VerifyLoopFilterParameters(gold); +} + +TEST_F(ObuParserTest, QuantizerParameters) { + QuantizerParameters gold = {}; + gold.base_index = 48; + + BytesAndBits data; + data.AppendLiteral(8, gold.base_index); + data.AppendLiteral(3, 0); // delta_coded. + data.AppendBit(0); // use_matrix. + + ASSERT_TRUE(Init(data.GenerateData())); + ASSERT_TRUE(ObuParseQuantizerParameters()); + VerifyQuantizerParameters(gold); +} + +TEST_F(ObuParserTest, QuantizerParametersMonochrome) { + QuantizerParameters gold = {}; + gold.base_index = 48; + + BytesAndBits data; + data.AppendLiteral(8, gold.base_index); + data.AppendBit(0); // delta_coded. + data.AppendBit(0); // use_matrix. + // The quantizer parameters end here. Add a 1 bit. It should not be parsed. + data.AppendBit(1); // Would be segmentation_enabled in a bitstream. + + ASSERT_TRUE(Init(data.GenerateData())); + obu_sequence_header_->color_config.is_monochrome = true; + ASSERT_TRUE(ObuParseQuantizerParameters()); + VerifyQuantizerParameters(gold); +} + +TEST_F(ObuParserTest, QuantizerParametersDeltaCoded) { + QuantizerParameters gold = {}; + gold.base_index = 48; + gold.delta_dc[kPlaneY] = -30; + + BytesAndBits data; + data.AppendLiteral(8, gold.base_index); + data.AppendBit(1); // delta_coded. + data.AppendInverseSignedLiteral(6, gold.delta_dc[kPlaneY]); + data.AppendLiteral(2, 0); // delta_coded u dc/ac. + data.AppendBit(0); // use_matrix. + + ASSERT_TRUE(Init(data.GenerateData())); + ASSERT_TRUE(ObuParseQuantizerParameters()); + VerifyQuantizerParameters(gold); + + gold.delta_dc[kPlaneU] = -40; + gold.delta_dc[kPlaneV] = gold.delta_dc[kPlaneU]; + data.SetBit(16, 1); // delta_coded. + data.InsertInverseSignedLiteral(17, 6, gold.delta_dc[kPlaneU]); + + ASSERT_TRUE(Init(data.GenerateData())); + ASSERT_TRUE(ObuParseQuantizerParameters()); + VerifyQuantizerParameters(gold); + + gold.delta_ac[kPlaneU] = 50; + gold.delta_ac[kPlaneV] = gold.delta_ac[kPlaneU]; + data.SetBit(24, 1); // delta_coded. + data.InsertInverseSignedLiteral(25, 6, gold.delta_ac[kPlaneU]); + + ASSERT_TRUE(Init(data.GenerateData())); + ASSERT_TRUE(ObuParseQuantizerParameters()); + VerifyQuantizerParameters(gold); + + gold.delta_dc[kPlaneV] = 60; + gold.delta_ac[kPlaneV] = 0; + data.InsertBit(16, 1); // diff_uv_delta. + data.InsertBit(33, 1); // delta_coded. + data.InsertInverseSignedLiteral(34, 6, gold.delta_dc[kPlaneV]); + data.InsertBit(41, 0); // delta_coded. + + ASSERT_TRUE(Init(data.GenerateData())); + obu_sequence_header_->color_config.separate_uv_delta_q = true; + ASSERT_TRUE(ObuParseQuantizerParameters()); + VerifyQuantizerParameters(gold); + + gold.delta_ac[kPlaneV] = -20; + data.SetBit(41, 1); // delta_coded. 
+  data.InsertInverseSignedLiteral(42, 6, gold.delta_ac[kPlaneV]);
+
+  ASSERT_TRUE(Init(data.GenerateData()));
+  obu_sequence_header_->color_config.separate_uv_delta_q = true;
+  ASSERT_TRUE(ObuParseQuantizerParameters());
+  VerifyQuantizerParameters(gold);
+}
+
+TEST_F(ObuParserTest, QuantizerParametersUseQmatrix) {
+  QuantizerParameters gold = {};
+  gold.base_index = 48;
+  gold.use_matrix = true;
+  gold.matrix_level[kPlaneY] = 3;
+  gold.matrix_level[kPlaneU] = 6;
+  gold.matrix_level[kPlaneV] = gold.matrix_level[kPlaneU];
+
+  // Test three cases.
+  // 1. separate_uv_delta_q = false (which implies diff_uv_delta = false).
+  BytesAndBits data;
+  data.AppendLiteral(8, gold.base_index);
+  data.AppendLiteral(3, 0);  // delta_coded.
+  data.AppendBit(static_cast<uint8_t>(gold.use_matrix));
+  data.AppendLiteral(4, gold.matrix_level[kPlaneY]);
+  data.AppendLiteral(4, gold.matrix_level[kPlaneU]);
+
+  ASSERT_TRUE(Init(data.GenerateData()));
+  ASSERT_TRUE(ObuParseQuantizerParameters());
+  VerifyQuantizerParameters(gold);
+
+  // 2. separate_uv_delta_q = true and diff_uv_delta = false.
+  gold.matrix_level[kPlaneV] = 5;
+  data.InsertBit(9, 0);  // diff_uv_delta.
+  data.AppendLiteral(4, gold.matrix_level[kPlaneV]);
+
+  ASSERT_TRUE(Init(data.GenerateData()));
+  obu_sequence_header_->color_config.separate_uv_delta_q = true;
+  ASSERT_TRUE(ObuParseQuantizerParameters());
+  VerifyQuantizerParameters(gold);
+
+  // 3. separate_uv_delta_q = true and diff_uv_delta = true.
+  data.SetBit(9, 1);  // diff_uv_delta.
+  data.InsertLiteral(12, 2, 0);  // delta_coded.
+  ASSERT_TRUE(Init(data.GenerateData()));
+  obu_sequence_header_->color_config.separate_uv_delta_q = true;
+  ASSERT_TRUE(ObuParseQuantizerParameters());
+  VerifyQuantizerParameters(gold);
+}
+
+TEST_F(ObuParserTest, SegmentationParameters) {
+  const int kPrimaryReferenceNotNone = 1;
+  const int kPrevFrameIndexNotNone = 2;
+
+  // Set up decoder_state_ with a previous frame containing saved segmentation
+  // parameters.
+  decoder_state_.reference_frame[kPrevFrameIndexNotNone] =
+      buffer_pool_->GetFreeBuffer();
+  ASSERT_NE(decoder_state_.reference_frame[kPrevFrameIndexNotNone], nullptr);
+  Segmentation prev_segmentation = {};
+  prev_segmentation.feature_enabled[2][0] = true;
+  prev_segmentation.feature_enabled[5][0] = true;
+  prev_segmentation.last_active_segment_id = 5;
+  decoder_state_.reference_frame[kPrevFrameIndexNotNone]
+      ->SetSegmentationParameters(prev_segmentation);
+
+  Segmentation gold;
+  memset(&gold, 0, sizeof(gold));
+
+  BytesAndBits data;
+  data.AppendBit(0);  // segmentation_enabled.
+
+  // Since segmentation_enabled is false, we expect the parameters to be all
+  // zero/false.
+  ASSERT_TRUE(ParseSegmentationParameters(
+      data.GenerateData(), kPrimaryReferenceNotNone, kPrevFrameIndexNotNone));
+  VerifySegmentationParameters(gold);
+
+  gold.enabled = true;
+  gold.update_map = true;
+  gold.temporal_update = true;
+  data.SetBit(0, static_cast<uint8_t>(gold.enabled));
+  data.AppendBit(static_cast<uint8_t>(gold.update_map));
+  data.AppendBit(static_cast<uint8_t>(gold.temporal_update));
+  data.AppendBit(static_cast<uint8_t>(gold.update_data));
+
+  // Since update_data is false, we expect the parameters to be loaded from the
+  // previous frame in |decoder_state_|. So change |gold| accordingly.
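+  // (Editor's note: when update_data is 0 and primary_ref_frame is not
+  // kPrimaryReferenceNone, the parser copies the Segmentation struct saved in
+  // the reference frame selected by prev_frame_index, so |gold| must mirror
+  // |prev_segmentation| here.)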
+ gold.feature_enabled[2][0] = true; + gold.feature_enabled[5][0] = true; + gold.last_active_segment_id = 5; + + ASSERT_TRUE(ParseSegmentationParameters( + data.GenerateData(), kPrimaryReferenceNotNone, kPrevFrameIndexNotNone)); + VerifySegmentationParameters(gold); + + OverrideSegmentation(&data, &gold, 3); + + ASSERT_TRUE(ParseSegmentationParameters( + data.GenerateData(), kPrimaryReferenceNotNone, kPrevFrameIndexNotNone)); + VerifySegmentationParameters(gold); + + // If primary_ref_frame is kPrimaryReferenceNone, these three fields are + // implied. + data.RemoveBit(1); // segmentation_update_map. + data.RemoveBit(1); // segmentation_temporal_update. + data.RemoveBit(1); // segmentation_update_data. + gold.update_map = true; + gold.temporal_update = false; + gold.update_data = true; + + // Since update_data is true, we expect the parameters to be read from + // |data|. + ASSERT_TRUE(ParseSegmentationParameters(data.GenerateData(), + kPrimaryReferenceNone, 0)); + VerifySegmentationParameters(gold); +} + +TEST_F(ObuParserTest, QuantizerIndexDeltaParameters) { + BytesAndBits data; + data.AppendBit(1); // delta_q_present. + data.AppendLiteral(2, 2); // delta_q_res. + + Delta gold; + memset(&gold, 0, sizeof(gold)); + + ASSERT_TRUE(Init(data.GenerateData())); + ASSERT_TRUE(ObuParseQuantizerIndexDeltaParameters()); + VerifyDeltaParameters(gold, obu_->frame_header().delta_q); + + gold.present = true; + gold.scale = 2; + ASSERT_TRUE(Init(data.GenerateData())); + obu_frame_header_->quantizer.base_index = 40; + ASSERT_TRUE(ObuParseQuantizerIndexDeltaParameters()); + VerifyDeltaParameters(gold, obu_->frame_header().delta_q); +} + +TEST_F(ObuParserTest, LoopFilterDeltaParameters) { + BytesAndBits data; + data.AppendBit(1); // delta_lf_present. + data.AppendLiteral(2, 2); // delta_lf_res. + data.AppendBit(1); // delta_lf_multi. + + Delta gold; + memset(&gold, 0, sizeof(gold)); + + // delta_q_present is false, so loop filter delta will not be read. + ASSERT_TRUE(Init(data.GenerateData())); + ASSERT_TRUE(ObuParseLoopFilterDeltaParameters()); + VerifyDeltaParameters(gold, obu_->frame_header().delta_lf); + + // allow_intrabc is true, so loop filter delta will not be read. + ASSERT_TRUE(Init(data.GenerateData())); + obu_frame_header_->delta_q.present = true; + obu_frame_header_->allow_intrabc = true; + ASSERT_TRUE(ObuParseLoopFilterDeltaParameters()); + VerifyDeltaParameters(gold, obu_->frame_header().delta_lf); + + gold.present = true; + gold.scale = 2; + gold.multi = true; + ASSERT_TRUE(Init(data.GenerateData())); + obu_frame_header_->delta_q.present = true; + ASSERT_TRUE(ObuParseLoopFilterDeltaParameters()); + VerifyDeltaParameters(gold, obu_->frame_header().delta_lf); +} + +TEST_F(ObuParserTest, ComputeSegmentLosslessAndQIndex) { + BytesAndBits data; + data.AppendBit(0); // dummy. + + ASSERT_TRUE(Init(data.GenerateData())); + + // Segmentation is disabled. All quantizers are 0. + ObuComputeSegmentLosslessAndQIndex(); + EXPECT_TRUE(obu_->frame_header().coded_lossless); + EXPECT_TRUE(obu_->frame_header().upscaled_lossless); + for (const auto& qindex : obu_->frame_header().segmentation.qindex) { + EXPECT_EQ(qindex, 0); + } + + // Segmentation is enabled. All quantizers are zero. + obu_frame_header_->segmentation.enabled = true; + ObuComputeSegmentLosslessAndQIndex(); + EXPECT_TRUE(obu_->frame_header().coded_lossless); + EXPECT_TRUE(obu_->frame_header().upscaled_lossless); + for (const auto& qindex : obu_->frame_header().segmentation.qindex) { + EXPECT_EQ(qindex, 0); + } + + // Segmentation is enabled. 
All quantizers are zero. upscaled_width != width.
+  obu_frame_header_->segmentation.enabled = true;
+  obu_frame_header_->upscaled_width = 100;
+  ObuComputeSegmentLosslessAndQIndex();
+  EXPECT_TRUE(obu_->frame_header().coded_lossless);
+  EXPECT_FALSE(obu_->frame_header().upscaled_lossless);
+  for (const auto& qindex : obu_->frame_header().segmentation.qindex) {
+    EXPECT_EQ(qindex, 0);
+  }
+
+  // Segmentation is disabled. Some quantizer deltas are nonzero.
+  obu_frame_header_->segmentation.enabled = false;
+  obu_frame_header_->quantizer.delta_dc[kPlaneY] = 40;
+  ObuComputeSegmentLosslessAndQIndex();
+  EXPECT_FALSE(obu_->frame_header().coded_lossless);
+  EXPECT_FALSE(obu_->frame_header().upscaled_lossless);
+  for (const auto& qindex : obu_->frame_header().segmentation.qindex) {
+    EXPECT_EQ(qindex, 0);
+  }
+
+  // Segmentation is enabled. Quantizer base index is nonzero.
+  obu_frame_header_->segmentation.enabled = true;
+  obu_frame_header_->quantizer.delta_dc[kPlaneY] = 0;
+  obu_frame_header_->quantizer.base_index = 40;
+  ObuComputeSegmentLosslessAndQIndex();
+  EXPECT_FALSE(obu_->frame_header().coded_lossless);
+  EXPECT_FALSE(obu_->frame_header().upscaled_lossless);
+  for (const auto& qindex : obu_->frame_header().segmentation.qindex) {
+    EXPECT_EQ(qindex, 40);
+  }
+}
+
+TEST_F(ObuParserTest, CdefParameters) {
+  Cdef gold;
+  memset(&gold, 0, sizeof(gold));
+  const int coeff_shift = 2;  // bitdepth - 8.
+  gold.damping = 3 + coeff_shift;
+
+  BytesAndBits data;
+  data.AppendBit(0);  // dummy.
+
+  ASSERT_TRUE(Init(data.GenerateData()));
+  obu_sequence_header_->color_config.bitdepth = 10;
+  ASSERT_TRUE(ObuParseCdefParameters());
+  // Cdef will be {0} except for damping because enable_cdef is false.
+  VerifyCdefParameters(gold);
+
+  ASSERT_TRUE(Init(data.GenerateData()));
+  obu_sequence_header_->enable_cdef = true;
+  obu_sequence_header_->color_config.bitdepth = 10;
+  obu_frame_header_->coded_lossless = true;
+  ASSERT_TRUE(ObuParseCdefParameters());
+  // Cdef will be {0} except for damping because coded_lossless is true.
+  VerifyCdefParameters(gold);
+
+  ASSERT_TRUE(Init(data.GenerateData()));
+  obu_sequence_header_->enable_cdef = true;
+  obu_sequence_header_->color_config.bitdepth = 10;
+  obu_frame_header_->allow_intrabc = true;
+  ASSERT_TRUE(ObuParseCdefParameters());
+  // Cdef will be {0} except for damping because allow_intrabc is true.
+  VerifyCdefParameters(gold);
+
+  gold.damping = 5;
+  gold.bits = 1;
+  data.Clear();
+  data.AppendLiteral(2, gold.damping - 3);  // cdef_damping_minus3.
+  gold.damping += coeff_shift;
+  data.AppendLiteral(2, gold.bits);  // cdef_bits.
+  for (int i = 0; i < 2; ++i) {
+    gold.y_primary_strength[i] = 10;
+    gold.y_secondary_strength[i] = (i == 0) ? 2 : 3;
+    gold.uv_primary_strength[i] = 12;
+    gold.uv_secondary_strength[i] = (i == 1) ? 2 : 3;
+    data.AppendLiteral(4, gold.y_primary_strength[i]);
+    data.AppendLiteral(2, gold.y_secondary_strength[i]);
+    data.AppendLiteral(4, gold.uv_primary_strength[i]);
+    data.AppendLiteral(2, gold.uv_secondary_strength[i]);
+    if (gold.y_secondary_strength[i] == 3) ++gold.y_secondary_strength[i];
+    if (gold.uv_secondary_strength[i] == 3) ++gold.uv_secondary_strength[i];
+    gold.y_primary_strength[i] <<= coeff_shift;
+    gold.uv_primary_strength[i] <<= coeff_shift;
+    gold.y_secondary_strength[i] <<= coeff_shift;
+    gold.uv_secondary_strength[i] <<= coeff_shift;
+  }
+
+  ASSERT_TRUE(Init(data.GenerateData()));
+  obu_sequence_header_->enable_cdef = true;
+  obu_sequence_header_->color_config.bitdepth = 10;
+  ASSERT_TRUE(ObuParseCdefParameters());
+  VerifyCdefParameters(gold);
+}
+
+TEST_F(ObuParserTest, LoopRestorationParameters) {
+  for (bool use_128x128_superblock : testing::Bool()) {
+    SCOPED_TRACE("use_128x128_superblock: " +
+                 std::to_string(use_128x128_superblock));
+    LoopRestoration gold;
+    memset(&gold, 0, sizeof(gold));
+
+    BytesAndBits data;
+    data.AppendBit(0);  // dummy.
+
+    // enable_restoration is false. Nothing will be read.
+    ASSERT_TRUE(Init(data.GenerateData()));
+    obu_frame_header_->allow_intrabc = true;
+    obu_frame_header_->coded_lossless = true;
+    ASSERT_TRUE(ObuParseLoopRestorationParameters());
+    VerifyLoopRestorationParameters(gold);
+
+    // allow_intrabc is true. Nothing will be read.
+    ASSERT_TRUE(Init(data.GenerateData()));
+    obu_frame_header_->allow_intrabc = true;
+    obu_sequence_header_->enable_restoration = true;
+    ASSERT_TRUE(ObuParseLoopRestorationParameters());
+    VerifyLoopRestorationParameters(gold);
+
+    // coded_lossless is true. Nothing will be read.
+    ASSERT_TRUE(Init(data.GenerateData()));
+    obu_frame_header_->coded_lossless = true;
+    obu_sequence_header_->enable_restoration = true;
+    ASSERT_TRUE(ObuParseLoopRestorationParameters());
+    VerifyLoopRestorationParameters(gold);
+
+    data.Clear();
+    for (int i = 0; i < kMaxPlanes; ++i) {
+      data.AppendLiteral(2, kLoopRestorationTypeNone);  // lr_type.
+    }
+
+    ASSERT_TRUE(Init(data.GenerateData()));
+    obu_sequence_header_->enable_restoration = true;
+    obu_sequence_header_->use_128x128_superblock = use_128x128_superblock;
+    ASSERT_TRUE(ObuParseLoopRestorationParameters());
+    VerifyLoopRestorationParameters(gold);
+
+    gold.type[0] = gold.type[1] = kLoopRestorationTypeWiener;
+    gold.unit_size_log2[0] = gold.unit_size_log2[1] = gold.unit_size_log2[2] =
+        use_128x128_superblock ? 8 : 7;
+    data.SetLiteral(0, 2, gold.type[0]);  // lr_type.
+    data.SetLiteral(2, 2, gold.type[0]);  // lr_type.
+    data.AppendBit(1);  // lr_unit_shift.
+    if (!use_128x128_superblock) {
+      data.AppendBit(0);  // lr_unit_extra_shift.
+    }
+
+    ASSERT_TRUE(Init(data.GenerateData()));
+    obu_sequence_header_->enable_restoration = true;
+    obu_sequence_header_->use_128x128_superblock = use_128x128_superblock;
+    ASSERT_TRUE(ObuParseLoopRestorationParameters());
+    VerifyLoopRestorationParameters(gold);
+
+    if (!use_128x128_superblock) {
+      gold.unit_size_log2[0] = gold.unit_size_log2[1] =
+          gold.unit_size_log2[2] = 8;
+      data.SetBit(7, 1);  // lr_unit_extra_shift.
+
+      ASSERT_TRUE(Init(data.GenerateData()));
+      obu_sequence_header_->enable_restoration = true;
+      obu_sequence_header_->use_128x128_superblock = use_128x128_superblock;
+      ASSERT_TRUE(ObuParseLoopRestorationParameters());
+      VerifyLoopRestorationParameters(gold);
+    }
+
+    gold.unit_size_log2[1] = gold.unit_size_log2[2] = 7;
+    data.AppendBit(1);  // lr_uv_shift.
+ + ASSERT_TRUE(Init(data.GenerateData())); + obu_sequence_header_->enable_restoration = true; + obu_sequence_header_->use_128x128_superblock = use_128x128_superblock; + obu_sequence_header_->color_config.subsampling_x = 1; + obu_sequence_header_->color_config.subsampling_y = 1; + ASSERT_TRUE(ObuParseLoopRestorationParameters()); + VerifyLoopRestorationParameters(gold); + } +} + +TEST_F(ObuParserTest, TxModeSyntax) { + BytesAndBits data; + data.AppendBit(1); // tx_mode_select. + + ASSERT_TRUE(Init(data.GenerateData())); + ASSERT_TRUE(ObuParseTxModeSyntax()); + EXPECT_EQ(kTxModeSelect, obu_->frame_header().tx_mode); + + data.SetBit(0, 0); // tx_mode_select. + + ASSERT_TRUE(Init(data.GenerateData())); + ASSERT_TRUE(ObuParseTxModeSyntax()); + EXPECT_EQ(kTxModeLargest, obu_->frame_header().tx_mode); + + ASSERT_TRUE(Init(data.GenerateData())); + obu_frame_header_->coded_lossless = true; + ASSERT_TRUE(ObuParseTxModeSyntax()); + EXPECT_EQ(kTxModeOnly4x4, obu_->frame_header().tx_mode); +} + +TEST_F(ObuParserTest, FrameReferenceModeSyntax) { + BytesAndBits data; + data.AppendBit(0); // dummy. + + ASSERT_TRUE(ParseFrameReferenceModeSyntax(data.GenerateData(), kFrameKey)); + EXPECT_FALSE(obu_->frame_header().reference_mode_select); + + data.SetBit(0, 1); // reference_mode_select. + + ASSERT_TRUE(ParseFrameReferenceModeSyntax(data.GenerateData(), kFrameInter)); + EXPECT_TRUE(obu_->frame_header().reference_mode_select); +} + +TEST_F(ObuParserTest, SkipModeParameters) { + BytesAndBits data; + data.AppendBit(1); // skip_mode_present. + + ASSERT_TRUE(Init(data.GenerateData())); + obu_frame_header_->frame_type = kFrameKey; + ASSERT_FALSE(ObuIsSkipModeAllowed()); + ASSERT_TRUE(ObuParseSkipModeParameters()); + EXPECT_FALSE(obu_->frame_header().skip_mode_present); + + ASSERT_TRUE(Init(data.GenerateData())); + obu_frame_header_->frame_type = kFrameInter; + obu_frame_header_->reference_mode_select = true; + ASSERT_FALSE(ObuIsSkipModeAllowed()); + ASSERT_TRUE(ObuParseSkipModeParameters()); + EXPECT_FALSE(obu_->frame_header().skip_mode_present); + + ASSERT_TRUE(Init(data.GenerateData())); + obu_frame_header_->frame_type = kFrameInter; + obu_frame_header_->reference_mode_select = true; + obu_sequence_header_->enable_order_hint = true; + obu_sequence_header_->order_hint_bits = 7; + obu_sequence_header_->order_hint_shift_bits = + Mod32(32 - obu_sequence_header_->order_hint_bits); + ASSERT_FALSE(ObuIsSkipModeAllowed()); + ASSERT_TRUE(ObuParseSkipModeParameters()); + EXPECT_FALSE(obu_->frame_header().skip_mode_present); + + ASSERT_TRUE(Init(data.GenerateData())); + obu_frame_header_->frame_type = kFrameInter; + obu_frame_header_->reference_mode_select = true; + obu_frame_header_->order_hint = 1; + decoder_state_.order_hint = 1; + obu_sequence_header_->enable_order_hint = true; + obu_sequence_header_->order_hint_bits = 7; + obu_sequence_header_->order_hint_shift_bits = + Mod32(32 - obu_sequence_header_->order_hint_bits); + ASSERT_FALSE(ObuIsSkipModeAllowed()); + ASSERT_TRUE(ObuParseSkipModeParameters()); + EXPECT_FALSE(obu_->frame_header().skip_mode_present); + + ASSERT_TRUE(Init(data.GenerateData())); + for (int i = 0; i < kNumInterReferenceFrameTypes; ++i) { + obu_frame_header_->reference_frame_index[i] = i; + decoder_state_.reference_order_hint[i] = i; + } + obu_frame_header_->frame_type = kFrameInter; + obu_frame_header_->reference_mode_select = true; + obu_frame_header_->order_hint = 1; + decoder_state_.order_hint = 1; + obu_sequence_header_->enable_order_hint = true; + obu_sequence_header_->order_hint_bits = 7; + 
obu_sequence_header_->order_hint_shift_bits =
+      Mod32(32 - obu_sequence_header_->order_hint_bits);
+  ASSERT_TRUE(ObuIsSkipModeAllowed());
+  ASSERT_TRUE(ObuParseSkipModeParameters());
+  EXPECT_TRUE(obu_->frame_header().skip_mode_present);
+}
+
+TEST_F(ObuParserTest, AllowWarpedMotion) {
+  BytesAndBits data;
+  data.AppendBit(0xff);  // dummy.
+
+  // IsIntraFrame is true, so nothing will be read.
+  ASSERT_TRUE(Init(data.GenerateData()));
+  obu_frame_header_->frame_type = kFrameKey;
+  obu_frame_header_->error_resilient_mode = false;
+  obu_sequence_header_->enable_warped_motion = true;
+  ASSERT_TRUE(ObuReadAllowWarpedMotion());
+  EXPECT_FALSE(obu_->frame_header().allow_warped_motion);
+
+  // error_resilient_mode is true, so nothing will be read.
+  ASSERT_TRUE(Init(data.GenerateData()));
+  obu_frame_header_->frame_type = kFrameInter;
+  obu_frame_header_->error_resilient_mode = true;
+  obu_sequence_header_->enable_warped_motion = true;
+  ASSERT_TRUE(ObuReadAllowWarpedMotion());
+  EXPECT_FALSE(obu_->frame_header().allow_warped_motion);
+
+  // enable_warped_motion is false, so nothing will be read.
+  ASSERT_TRUE(Init(data.GenerateData()));
+  obu_frame_header_->frame_type = kFrameInter;
+  obu_frame_header_->error_resilient_mode = false;
+  obu_sequence_header_->enable_warped_motion = false;
+  ASSERT_TRUE(ObuReadAllowWarpedMotion());
+  EXPECT_FALSE(obu_->frame_header().allow_warped_motion);
+
+  // allow_warped_motion will be read and equal to true.
+  ASSERT_TRUE(Init(data.GenerateData()));
+  obu_frame_header_->frame_type = kFrameInter;
+  obu_frame_header_->error_resilient_mode = false;
+  obu_sequence_header_->enable_warped_motion = true;
+  ASSERT_TRUE(ObuReadAllowWarpedMotion());
+  EXPECT_TRUE(obu_->frame_header().allow_warped_motion);
+}
+
+TEST_F(ObuParserTest, GlobalMotionParameters) {
+  BytesAndBits data;
+  data.AppendBit(0);  // dummy.
+  std::array<GlobalMotion, kNumReferenceFrameTypes> gold;
+  for (int i = kReferenceFrameLast; i <= kReferenceFrameAlternate; ++i) {
+    gold[i].type = kGlobalMotionTransformationTypeIdentity;
+    for (int j = 0; j < 6; ++j) {
+      gold[i].params[j] = (j % 3 == 2) ? 1 << kWarpedModelPrecisionBits : 0;
+    }
+  }
+
+  ASSERT_TRUE(ParseGlobalMotionParameters(data.GenerateData(), kFrameKey));
+  VerifyGlobalMotionParameters(gold);
+
+  data.Clear();
+  for (int i = kReferenceFrameLast; i <= kReferenceFrameAlternate; ++i) {
+    // is_global=1; is_rot_zoom=1; parameter_values;
+    data.AppendBytes(kDefaultGlobalMotionParametersRotZoom);
+
+    // Magic numbers based on kDefaultGlobalMotionParametersRotZoom.
+    gold[i].type = kGlobalMotionTransformationTypeRotZoom;
+    gold[i].params[0] = -73728;
+    gold[i].params[1] = -23552;
+    gold[i].params[2] = 65952;
+    gold[i].params[3] = -62;
+    gold[i].params[4] = 62;
+    gold[i].params[5] = 65952;
+  }
+
+  ASSERT_TRUE(ParseGlobalMotionParameters(data.GenerateData(), kFrameInter));
+  VerifyGlobalMotionParameters(gold);
+
+  data.Clear();
+  for (int i = kReferenceFrameLast; i <= kReferenceFrameAlternate; ++i) {
+    // This bit is not part of the hex string below, because including it
+    // would keep the whole string from aligning to 8 bits. Appending it
+    // separately lets the rest remain a byte-aligned magic hex string.
+    data.AppendBit(1);  // is_global.
+    // is_rot_zoom=0; is_translation=0; parameter_values;
+    data.AppendBytes(kDefaultGlobalMotionParametersAffine);
+
+    // Magic numbers based on kDefaultGlobalMotionParametersAffine.
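+    // (Editor's note: as we read the AV1 global motion syntax, a rot-zoom
+    // model derives params[4] = -params[3] and params[5] = params[2], while
+    // an affine model codes both explicitly. This affine payload reuses the
+    // rot-zoom coefficients above, so relative to the previous |gold| only
+    // the transformation type and params[4] change.)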
+ gold[i].type = kGlobalMotionTransformationTypeAffine; + gold[i].params[4] = -62; + } + + ASSERT_TRUE(ParseGlobalMotionParameters(data.GenerateData(), kFrameInter)); + VerifyGlobalMotionParameters(gold); +} + +TEST_F(ObuParserTest, FilmGrainParameters) { + BytesAndBits data; + data.AppendBit(0); // dummy. + + // Test film grain not present. + FilmGrainParams gold = {}; + ObuSequenceHeader sequence_header = {}; + sequence_header.film_grain_params_present = false; + ObuFrameHeader frame_header = {}; + ASSERT_TRUE(ParseFilmGrainParameters(data.GenerateData(), sequence_header, + frame_header)); + VerifyFilmGrainParameters(gold); + + // Test if show_frame = false and showable_frame = false. + data.Clear(); + gold = {}; + sequence_header.film_grain_params_present = true; + frame_header.show_frame = false; + frame_header.showable_frame = false; + ASSERT_TRUE(ParseFilmGrainParameters(data.GenerateData(), sequence_header, + frame_header)); + VerifyFilmGrainParameters(gold); + + // Test if apply_grain = false. + data.Clear(); + gold = {}; + sequence_header.film_grain_params_present = true; + frame_header.show_frame = true; + frame_header.showable_frame = true; + data.AppendBit(0); + ASSERT_TRUE(ParseFilmGrainParameters(data.GenerateData(), sequence_header, + frame_header)); + VerifyFilmGrainParameters(gold); + + // Test if update_grain = false. + data.Clear(); + gold = {}; + sequence_header.film_grain_params_present = true; + frame_header.show_frame = true; + frame_header.showable_frame = true; + frame_header.frame_type = kFrameInter; + for (auto& index : frame_header.reference_frame_index) { + index = 1; + } + data.AppendBit(1); + gold.apply_grain = true; + data.AppendLiteral(16, 8); + gold.grain_seed = 8; + data.AppendBit(0); + gold.update_grain = false; + data.AppendLiteral(3, 1); + gold.reference_index = 1; + // Set up decoder_state_ with a previous frame containing saved film grain + // parameters. 
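+  // (Editor's note: with update_grain = 0 the parser performs the equivalent
+  // of load_grain_params(reference_index) from the AV1 spec: it copies the
+  // FilmGrainParams saved on that reference frame but keeps the newly parsed
+  // grain_seed, which is why |gold| has grain_seed = 8 rather than the saved
+  // value of 11.)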
+ decoder_state_.reference_frame[1] = buffer_pool_->GetFreeBuffer(); + EXPECT_NE(decoder_state_.reference_frame[1], nullptr); + FilmGrainParams prev_grain_params = {}; + prev_grain_params.apply_grain = true; + prev_grain_params.grain_seed = 11; + prev_grain_params.update_grain = true; + decoder_state_.reference_frame[1]->set_film_grain_params(prev_grain_params); + ASSERT_TRUE(ParseFilmGrainParameters(data.GenerateData(), sequence_header, + frame_header)); + VerifyFilmGrainParameters(gold); + + // Test if update_grain = true, is_monochrome = true; + data.Clear(); + gold = {}; + frame_header.frame_type = kFrameKey; + for (auto& index : frame_header.reference_frame_index) { + index = 0; + } + data.AppendBit(1); + gold.apply_grain = true; + data.AppendLiteral(16, 8); + gold.grain_seed = 8; + gold.update_grain = true; + data.AppendLiteral(4, 10); + gold.num_y_points = 10; + for (int i = 0; i < gold.num_y_points; ++i) { + data.AppendLiteral(8, 2 * i); + gold.point_y_value[i] = 2 * i; + data.AppendLiteral(8, i); + gold.point_y_scaling[i] = i; + } + sequence_header.color_config.is_monochrome = true; + gold.chroma_scaling_from_luma = false; + gold.num_u_points = 0; + gold.num_v_points = 0; + data.AppendLiteral(2, 3); + gold.chroma_scaling = 11; + data.AppendLiteral(2, 1); + gold.auto_regression_coeff_lag = 1; + const int num_pos_luma = + 2 * gold.auto_regression_coeff_lag * (gold.auto_regression_coeff_lag + 1); + for (int i = 0; i < num_pos_luma; ++i) { + data.AppendLiteral(8, i + 128); + gold.auto_regression_coeff_y[i] = i; + } + data.AppendLiteral(2, 0); + gold.auto_regression_shift = 6; + data.AppendLiteral(2, 1); + gold.grain_scale_shift = 1; + data.AppendBit(1); + gold.overlap_flag = true; + data.AppendBit(0); + gold.clip_to_restricted_range = false; + ASSERT_TRUE(ParseFilmGrainParameters(data.GenerateData(), sequence_header, + frame_header)); + ASSERT_TRUE( + obu_->frame_header().frame_type == kFrameInter || + obu_->frame_header().film_grain_params.update_grain); // a implies b. 
+  VerifyFilmGrainParameters(gold);
+
+  // Test if update_grain = true, is_monochrome = false.
+  data.Clear();
+  gold = {};
+  frame_header.frame_type = kFrameKey;
+  data.AppendBit(1);
+  gold.apply_grain = true;
+  data.AppendLiteral(16, 8);
+  gold.grain_seed = 8;
+  gold.update_grain = true;
+  data.AppendLiteral(4, 10);
+  gold.num_y_points = 10;
+  for (int i = 0; i < gold.num_y_points; ++i) {
+    data.AppendLiteral(8, 2 * i);
+    gold.point_y_value[i] = 2 * i;
+    data.AppendLiteral(8, i);
+    gold.point_y_scaling[i] = i;
+  }
+  sequence_header.color_config.is_monochrome = false;
+  data.AppendBit(0);
+  gold.chroma_scaling_from_luma = false;
+  data.AppendLiteral(4, 5);
+  gold.num_u_points = 5;
+  for (int i = 0; i < gold.num_u_points; ++i) {
+    data.AppendLiteral(8, 2 * i + 1);
+    gold.point_u_value[i] = 2 * i + 1;
+    data.AppendLiteral(8, i);
+    gold.point_u_scaling[i] = i;
+  }
+  data.AppendLiteral(4, 3);
+  gold.num_v_points = 3;
+  for (int i = 0; i < gold.num_v_points; ++i) {
+    data.AppendLiteral(8, i);
+    gold.point_v_value[i] = i;
+    data.AppendLiteral(8, i + 1);
+    gold.point_v_scaling[i] = i + 1;
+  }
+  data.AppendLiteral(2, 3);
+  gold.chroma_scaling = 11;
+  data.AppendLiteral(2, 1);
+  gold.auto_regression_coeff_lag = 1;
+  const int num_pos_luma2 = 2 * gold.auto_regression_coeff_lag *
+                            (gold.auto_regression_coeff_lag + 1);
+  for (int i = 0; i < num_pos_luma2; ++i) {
+    data.AppendLiteral(8, i + 128);
+    gold.auto_regression_coeff_y[i] = i;
+  }
+  for (int i = 0; i < num_pos_luma2 + 1; ++i) {
+    data.AppendLiteral(8, i);
+    gold.auto_regression_coeff_u[i] = i - 128;
+  }
+  for (int i = 0; i < num_pos_luma2 + 1; ++i) {
+    data.AppendLiteral(8, i);
+    gold.auto_regression_coeff_v[i] = i - 128;
+  }
+  data.AppendLiteral(2, 0);
+  gold.auto_regression_shift = 6;
+  data.AppendLiteral(2, 1);
+  gold.grain_scale_shift = 1;
+  data.AppendLiteral(8, 2);
+  gold.u_multiplier = -126;
+  data.AppendLiteral(8, 1);
+  gold.u_luma_multiplier = -127;
+  data.AppendLiteral(9, 3);
+  gold.u_offset = -253;
+  data.AppendLiteral(8, 3);
+  gold.v_multiplier = -125;
+  data.AppendLiteral(8, 2);
+  gold.v_luma_multiplier = -126;
+  data.AppendLiteral(9, 1);
+  gold.v_offset = -255;
+  data.AppendBit(1);
+  gold.overlap_flag = true;
+  data.AppendBit(0);
+  gold.clip_to_restricted_range = false;
+  ASSERT_TRUE(ParseFilmGrainParameters(data.GenerateData(), sequence_header,
+                                       frame_header));
+  ASSERT_TRUE(
+      obu_->frame_header().frame_type == kFrameInter ||
+      obu_->frame_header().film_grain_params.update_grain);  // a implies b.
+  VerifyFilmGrainParameters(gold);
+}
+
+TEST_F(ObuParserTest, TileInfoSyntax) {
+  BytesAndBits data;
+  TileInfo gold;
+  memset(&gold, 0, sizeof(gold));
+
+  gold.uniform_spacing = true;
+  gold.tile_columns_log2 = 1;
+  gold.tile_columns = 2;
+  gold.tile_rows_log2 = 1;
+  gold.tile_rows = 2;
+  gold.tile_count = 4;
+  gold.tile_column_start[1] = 64;
+  gold.tile_column_start[2] = 88;
+  gold.tile_row_start[1] = 64;
+  gold.tile_row_start[2] = 72;
+  gold.context_update_id = 3;
+  gold.tile_size_bytes = 4;
+  data.AppendBit(static_cast<uint8_t>(gold.uniform_spacing));
+  data.AppendBit(1);  // increment_tile_cols_log2.
+  data.AppendBit(0);  // increment_tile_cols_log2.
+  data.AppendBit(1);  // increment_tile_rows_log2.
+  data.AppendBit(0);  // increment_tile_rows_log2.
+  data.AppendBit(1);  // context_update_id, tile_columns_log2 +
+  data.AppendBit(1);  // tile_rows_log2 bits in total.
+  data.AppendLiteral(2, gold.tile_size_bytes - 1);
+
+  ASSERT_TRUE(ParseTileInfoSyntax(data.GenerateData(), 88, 72, true));
+  VerifyTileInfoParameters(gold);
+
+  gold.uniform_spacing = false;
+  gold.tile_column_width_in_superblocks[0] = 2;
+  gold.tile_column_width_in_superblocks[1] = 1;
+  gold.tile_row_height_in_superblocks[0] = 2;
+  gold.tile_row_height_in_superblocks[1] = 1;
+
+  data.SetBit(0, static_cast<uint8_t>(gold.uniform_spacing));
+  // The next 4 bits remain the same except that they now represent f(w - 1)
+  // and extra_bit in DecodeUniform. All the subsequent bits are unchanged and
+  // represent the same thing as above.
+
+  ASSERT_TRUE(ParseTileInfoSyntax(data.GenerateData(), 88, 72, true));
+  VerifyTileInfoParameters(gold);
+
+  // No tiles.
+  memset(&gold, 0, sizeof(gold));
+  gold.uniform_spacing = true;
+  gold.tile_columns = 1;
+  gold.tile_rows = 1;
+  gold.tile_count = 1;
+  gold.tile_column_start[1] = 88;
+  gold.tile_row_start[1] = 72;
+  data.Clear();
+  data.AppendBit(static_cast<uint8_t>(gold.uniform_spacing));
+  data.AppendBit(0);  // tile_cols_log2.
+  data.AppendBit(0);  // tile_rows_log2.
+
+  ASSERT_TRUE(ParseTileInfoSyntax(data.GenerateData(), 88, 72, true));
+  VerifyTileInfoParameters(gold);
+
+  // 64x64 superblocks. No tiles.
+  gold.tile_column_start[1] = 640;
+  gold.tile_row_start[1] = 360;
+
+  ASSERT_TRUE(ParseTileInfoSyntax(data.GenerateData(), 640, 360, false));
+  VerifyTileInfoParameters(gold);
+}
+
+TEST_F(ObuParserTest, MetadataUnknownType) {
+  BytesAndBits data;
+  // The metadata_type 10 is a user private value (6-31).
+  data.AppendLiteral(8, 10);  // metadata_type.
+  // The Note in Section 5.8.1 says "Decoders should ignore the entire OBU if
+  // they do not understand the metadata_type."
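+  // (Editor's note: "ignore" means the parse succeeds and the unknown
+  // payload is simply skipped, hence the ASSERT_TRUE below rather than an
+  // expected parse failure.)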
+ ASSERT_TRUE(ParseMetadata(data.GenerateData())); +} + +TEST_F(ObuParserTest, MetadataCll) { + BytesAndBits data; + ObuMetadata gold; + gold.max_cll = 25; + gold.max_fall = 100; + + data.AppendLiteral(8, kMetadataTypeHdrContentLightLevel); + data.AppendLiteral(16, gold.max_cll); + data.AppendLiteral(16, gold.max_fall); + + ASSERT_TRUE(ParseMetadata(data.GenerateData())); + VerifyMetadata(kMetadataTypeHdrContentLightLevel, gold); +} + +TEST_F(ObuParserTest, MetadataMdcv) { + BytesAndBits data; + ObuMetadata gold; + for (int i = 0; i < 3; ++i) { + gold.primary_chromaticity_x[i] = 0; + gold.primary_chromaticity_y[i] = 0; + } + gold.white_point_chromaticity_x = 250; + gold.white_point_chromaticity_y = 2500; + gold.luminance_max = 6000; + gold.luminance_min = 3000; + + data.AppendLiteral(8, kMetadataTypeHdrMasteringDisplayColorVolume); + for (int i = 0; i < 3; ++i) { + data.AppendLiteral(16, gold.primary_chromaticity_x[i]); + data.AppendLiteral(16, gold.primary_chromaticity_y[i]); + } + data.AppendLiteral(16, gold.white_point_chromaticity_x); + data.AppendLiteral(16, gold.white_point_chromaticity_y); + data.AppendLiteral(32, gold.luminance_max); + data.AppendLiteral(32, gold.luminance_min); + + ASSERT_TRUE(ParseMetadata(data.GenerateData())); + VerifyMetadata(kMetadataTypeHdrMasteringDisplayColorVolume, gold); +} + +TEST_F(ObuParserTest, MetadataScalability) { + BytesAndBits data; + ObuMetadata gold; + + data.AppendLiteral(8, kMetadataTypeScalability); + data.AppendLiteral(8, 0); // scalability_mode_idc + + ASSERT_TRUE(ParseMetadata(data.GenerateData())); + VerifyMetadata(kMetadataTypeScalability, gold); +} + +TEST_F(ObuParserTest, MetadataItutT35) { + BytesAndBits data; + ObuMetadata gold; + gold.itu_t_t35_country_code = 0xA6; // 1 0 1 0 0 1 1 0 Switzerland + gold.itu_t_t35_country_code_extension_byte = 0; + gold.itu_t_t35_payload_bytes.reset(new (std::nothrow) uint8_t[10]); + ASSERT_NE(gold.itu_t_t35_payload_bytes, nullptr); + for (int i = 0; i < 10; ++i) { + gold.itu_t_t35_payload_bytes[i] = 9 - i; + } + gold.itu_t_t35_payload_size = 10; + + data.AppendLiteral(8, kMetadataTypeItutT35); + data.AppendLiteral(8, gold.itu_t_t35_country_code); + for (int i = 0; i < 10; ++i) { + data.AppendLiteral(8, 9 - i); + } + // For the kMetadataTypeItutT35 metadata type, we must include the trailing + // bit so that the end of the itu_t_t35_payload_bytes can be identified. 
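+  // (Editor's note: 0x80 is 1000 0000 in binary, i.e. a single trailing one
+  // bit followed by zero padding. Together with the two 0x00 bytes below, it
+  // marks where the T.35 payload ends, since the payload size is not coded
+  // explicitly.)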
+ data.AppendLiteral(8, 0x80); + data.AppendLiteral(8, 0x00); + data.AppendLiteral(8, 0x00); + + ASSERT_TRUE(ParseMetadata(data.GenerateData())); + VerifyMetadata(kMetadataTypeItutT35, gold); +} + +TEST_F(ObuParserTest, MetadataTimecode) { + BytesAndBits data; + ObuMetadata gold; + + data.AppendLiteral(8, kMetadataTypeTimecode); + data.AppendLiteral(5, 0); // counting_type + data.AppendBit(1); // full_timestamp_flag + data.AppendBit(0); // discontinuity_flag + data.AppendBit(0); // cnt_dropped_flag + data.AppendLiteral(9, 8); // n_frames + data.AppendLiteral(6, 59); // seconds_value + data.AppendLiteral(6, 59); // minutes_value + data.AppendLiteral(5, 23); // hours_value + data.AppendLiteral(5, 0); // time_offset_length + + ASSERT_TRUE(ParseMetadata(data.GenerateData())); + VerifyMetadata(kMetadataTypeTimecode, gold); +} + +TEST_F(ObuParserTest, MetadataTimecodeInvalidSecondsValue) { + BytesAndBits data; + ObuMetadata gold; + + data.AppendLiteral(8, kMetadataTypeTimecode); + data.AppendLiteral(5, 0); // counting_type + data.AppendBit(1); // full_timestamp_flag + data.AppendBit(0); // discontinuity_flag + data.AppendBit(0); // cnt_dropped_flag + data.AppendLiteral(9, 8); // n_frames + data.AppendLiteral(6, 60); // seconds_value + data.AppendLiteral(6, 59); // minutes_value + data.AppendLiteral(5, 23); // hours_value + data.AppendLiteral(5, 0); // time_offset_length + + EXPECT_FALSE(ParseMetadata(data.GenerateData())); +} + +TEST_F(ObuParserTest, MetadataTimecodeInvalidMinutesValue) { + BytesAndBits data; + ObuMetadata gold; + + data.AppendLiteral(8, kMetadataTypeTimecode); + data.AppendLiteral(5, 0); // counting_type + data.AppendBit(1); // full_timestamp_flag + data.AppendBit(0); // discontinuity_flag + data.AppendBit(0); // cnt_dropped_flag + data.AppendLiteral(9, 8); // n_frames + data.AppendLiteral(6, 59); // seconds_value + data.AppendLiteral(6, 60); // minutes_value + data.AppendLiteral(5, 23); // hours_value + data.AppendLiteral(5, 0); // time_offset_length + + EXPECT_FALSE(ParseMetadata(data.GenerateData())); +} + +TEST_F(ObuParserTest, MetadataTimecodeInvalidHoursValue) { + BytesAndBits data; + ObuMetadata gold; + + data.AppendLiteral(8, kMetadataTypeTimecode); + data.AppendLiteral(5, 0); // counting_type + data.AppendBit(1); // full_timestamp_flag + data.AppendBit(0); // discontinuity_flag + data.AppendBit(0); // cnt_dropped_flag + data.AppendLiteral(9, 8); // n_frames + data.AppendLiteral(6, 59); // seconds_value + data.AppendLiteral(6, 59); // minutes_value + data.AppendLiteral(5, 24); // hours_value + data.AppendLiteral(5, 0); // time_offset_length + + EXPECT_FALSE(ParseMetadata(data.GenerateData())); +} + +} // namespace libgav1 diff --git a/src/post_filter.h b/src/post_filter.h new file mode 100644 index 0000000..a247075 --- /dev/null +++ b/src/post_filter.h @@ -0,0 +1,552 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef LIBGAV1_SRC_POST_FILTER_H_
+#define LIBGAV1_SRC_POST_FILTER_H_
+
+#include <array>
+#include <atomic>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <type_traits>
+
+#include "src/dsp/common.h"
+#include "src/dsp/dsp.h"
+#include "src/frame_scratch_buffer.h"
+#include "src/loop_restoration_info.h"
+#include "src/obu_parser.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/threadpool.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+
+// This class applies in-loop filtering for each frame after it is
+// reconstructed. The in-loop filtering contains all post processing filtering
+// for the reconstructed frame, including the deblock filter, CDEF, superres
+// and loop restoration.
+// Historically, for example in libaom, "loop filter" refers to the deblock
+// filter. To avoid name conflicts, we call this class PostFilter (post
+// processing).
+// The in-loop post filtering order is:
+//   deblock --> CDEF --> super resolution --> loop restoration.
+// When CDEF and super resolution are not used, we can combine deblock and
+// restoration together and filter the frame buffer only once.
+class PostFilter {
+ public:
+  // This class does not take ownership of the masks/restoration_info, but it
+  // may change their values.
+  //
+  // The overall flow of data in this class (for both single and multi-threaded
+  // cases) is as follows:
+  //   -> Input: |frame_buffer_|.
+  //   -> Initialize |source_buffer_|, |cdef_buffer_|, |superres_buffer_| and
+  //      |loop_restoration_buffer_|.
+  //   -> Deblocking:
+  //      * Input: |source_buffer_|
+  //      * Output: |source_buffer_|
+  //   -> CDEF:
+  //      * Input: |source_buffer_|
+  //      * Output: |cdef_buffer_|
+  //   -> SuperRes:
+  //      * Input: |cdef_buffer_|
+  //      * Output: |superres_buffer_|
+  //   -> Loop Restoration:
+  //      * Input: |superres_buffer_|
+  //      * Output: |loop_restoration_buffer_|.
+  //   -> Now |frame_buffer_| contains the filtered frame.
+  PostFilter(const ObuFrameHeader& frame_header,
+             const ObuSequenceHeader& sequence_header,
+             FrameScratchBuffer* frame_scratch_buffer, YuvBuffer* frame_buffer,
+             const dsp::Dsp* dsp, int do_post_filter_mask);
+
+  // Non copyable/movable.
+  PostFilter(const PostFilter&) = delete;
+  PostFilter& operator=(const PostFilter&) = delete;
+  PostFilter(PostFilter&&) = delete;
+  PostFilter& operator=(PostFilter&&) = delete;
+
+  // The overall function that applies all post processing filtering with
+  // multiple threads.
+  // * The filtering order is:
+  //   deblock --> CDEF --> super resolution --> loop restoration.
+  // * The output of each filter is the input for the following filter. A
+  //   special case is that loop restoration needs a few rows of the deblocked
+  //   frame and the entire cdef filtered frame:
+  //   deblock --> CDEF --> super resolution --> loop restoration.
+  //      |                                             ^
+  //      |                                             |
+  //      -------------> super resolution ---------------
+  // * Any of these filters could be present or absent.
+  // * |frame_buffer_| points to the decoded frame buffer. When
+  //   ApplyFilteringThreaded() is called, |frame_buffer_| is modified by each
+  //   of the filters as described below.
+  // Filter behavior (multi-threaded):
+  // * Deblock: In-place filtering. The output is written to |source_buffer_|.
+  //            If cdef and loop restoration are both on, then 4 rows (as
+  //            specified by |kLoopRestorationBorderRows|) in every 64x64 block
+  //            are copied into |loop_restoration_border_|.
+  // * Cdef: In-place filtering. Uses the |source_buffer_| and |cdef_border_|
+  //         as the input and the output is written into |cdef_buffer_| (which
+  //         is the same as |source_buffer_|).
+  // * SuperRes: Near in-place filtering. Uses the |cdef_buffer_| and
+  //             |superres_line_buffer_| as the input and the output is written
+  //             into |superres_buffer_| (which is just |cdef_buffer_| with a
+  //             shift to the top).
+  // * Restoration: Near in-place filtering.
+  //                Uses the |superres_buffer_| and |loop_restoration_border_|
+  //                as the input and the output is written into
+  //                |loop_restoration_buffer_| (which is just
+  //                |superres_buffer_| with a shift to the left).
+  void ApplyFilteringThreaded();
+
+  // Does the overall post processing filter for one superblock row starting
+  // at |row4x4| with height 4*|sb4x4|. If |do_deblock| is false, the
+  // deblocking filter will not be applied.
+  //
+  // Filter behavior (single-threaded):
+  // * Deblock: In-place filtering. The output is written to |source_buffer_|.
+  //            If cdef and loop restoration are both on, then 4 rows (as
+  //            specified by |kLoopRestorationBorderRows|) in every 64x64 block
+  //            are copied into |loop_restoration_border_|.
+  // * Cdef: In-place filtering. The output is written into |cdef_buffer_|
+  //         (which is just |source_buffer_| with a shift to the top-left).
+  // * SuperRes: Near in-place filtering. Uses the |cdef_buffer_| as the input
+  //             and the output is written into |superres_buffer_| (which is
+  //             just |cdef_buffer_| with a shift to the top).
+  // * Restoration: Near in-place filtering.
+  //                Uses the |superres_buffer_| and |loop_restoration_border_|
+  //                as the input and the output is written into
+  //                |loop_restoration_buffer_| (which is just
+  //                |superres_buffer_| with a shift to the left or top-left).
+  // Returns the index of the last row whose post processing is complete and
+  // can be used for referencing.
+  int ApplyFilteringForOneSuperBlockRow(int row4x4, int sb4x4,
+                                        bool is_last_row, bool do_deblock);
+
+  // Applies the deblocking filter in one direction (specified by
+  // |loop_filter_type|) for the superblock row starting at |row4x4_start|,
+  // for columns starting from |column4x4_start| in increments of 16 (or 8 for
+  // chroma with subsampling) until the smallest multiple of 16 that is >=
+  // |column4x4_end| or until |frame_header_.columns4x4|, whichever is lower.
+  // This function must be called only if |DoDeblock()| returns true.
+  void ApplyDeblockFilter(LoopFilterType loop_filter_type, int row4x4_start,
+                          int column4x4_start, int column4x4_end, int sb4x4);
+
+  static bool DoCdef(const ObuFrameHeader& frame_header,
+                     int do_post_filter_mask) {
+    return (frame_header.cdef.bits > 0 ||
+            frame_header.cdef.y_primary_strength[0] > 0 ||
+            frame_header.cdef.y_secondary_strength[0] > 0 ||
+            frame_header.cdef.uv_primary_strength[0] > 0 ||
+            frame_header.cdef.uv_secondary_strength[0] > 0) &&
+           (do_post_filter_mask & 0x02) != 0;
+  }
+  bool DoCdef() const { return do_cdef_; }
+  // If the filter levels for the Y plane (0 for vertical, 1 for horizontal)
+  // are both zero, the deblock filter will not be applied.
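+  // In addition, bit 0 (0x01) of |do_post_filter_mask| must be set. The mask
+  // assigns one bit per filter: 0x01 deblocking, 0x02 CDEF, 0x04 SuperRes and
+  // 0x08 loop restoration (see the other Do* helpers in this class). As an
+  // illustration, a caller that wants every filter except CDEF could pass:
+  //   const int do_post_filter_mask = 0x01 | 0x04 | 0x08;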
+  static bool DoDeblock(const ObuFrameHeader& frame_header,
+                        uint8_t do_post_filter_mask) {
+    return (frame_header.loop_filter.level[0] > 0 ||
+            frame_header.loop_filter.level[1] > 0) &&
+           (do_post_filter_mask & 0x01) != 0;
+  }
+  bool DoDeblock() const { return do_deblock_; }
+
+  uint8_t GetZeroDeltaDeblockFilterLevel(int segment_id, int level_index,
+                                         ReferenceFrameType type,
+                                         int mode_id) const {
+    return deblock_filter_levels_[segment_id][level_index][type][mode_id];
+  }
+  // Computes the deblock filter levels using |delta_lf| and stores them in
+  // |deblock_filter_levels|.
+  void ComputeDeblockFilterLevels(
+      const int8_t delta_lf[kFrameLfCount],
+      uint8_t deblock_filter_levels[kMaxSegments][kFrameLfCount]
+                                   [kNumReferenceFrameTypes][2]) const;
+  // Returns true if loop restoration will be performed for the given
+  // parameters and mask.
+  static bool DoRestoration(const LoopRestoration& loop_restoration,
+                            uint8_t do_post_filter_mask, int num_planes) {
+    if (num_planes == kMaxPlanesMonochrome) {
+      return loop_restoration.type[kPlaneY] != kLoopRestorationTypeNone &&
+             (do_post_filter_mask & 0x08) != 0;
+    }
+    return (loop_restoration.type[kPlaneY] != kLoopRestorationTypeNone ||
+            loop_restoration.type[kPlaneU] != kLoopRestorationTypeNone ||
+            loop_restoration.type[kPlaneV] != kLoopRestorationTypeNone) &&
+           (do_post_filter_mask & 0x08) != 0;
+  }
+  bool DoRestoration() const { return do_restoration_; }
+
+  // Returns a pointer to the unfiltered buffer. This is used by the Tile class
+  // to determine where to write the output of the tile decoding process taking
+  // in-place filtering offsets into consideration.
+  uint8_t* GetUnfilteredBuffer(int plane) { return source_buffer_[plane]; }
+  const YuvBuffer& frame_buffer() const { return frame_buffer_; }
+
+  // Returns true if SuperRes will be performed for the given frame header and
+  // mask.
+  static bool DoSuperRes(const ObuFrameHeader& frame_header,
+                         uint8_t do_post_filter_mask) {
+    return frame_header.width != frame_header.upscaled_width &&
+           (do_post_filter_mask & 0x04) != 0;
+  }
+  bool DoSuperRes() const { return do_superres_; }
+  LoopRestorationInfo* restoration_info() const { return restoration_info_; }
+  uint8_t* GetBufferOffset(uint8_t* base_buffer, int stride, Plane plane,
+                           int row, int column) const {
+    return base_buffer + (row >> subsampling_y_[plane]) * stride +
+           ((column >> subsampling_x_[plane]) << pixel_size_log2_);
+  }
+  uint8_t* GetSourceBuffer(Plane plane, int row4x4, int column4x4) const {
+    return GetBufferOffset(source_buffer_[plane], frame_buffer_.stride(plane),
+                           plane, MultiplyBy4(row4x4), MultiplyBy4(column4x4));
+  }
+  uint8_t* GetCdefBuffer(Plane plane, int row4x4, int column4x4) const {
+    return GetBufferOffset(cdef_buffer_[plane], frame_buffer_.stride(plane),
+                           plane, MultiplyBy4(row4x4), MultiplyBy4(column4x4));
+  }
+  uint8_t* GetSuperResBuffer(Plane plane, int row4x4, int column4x4) const {
+    return GetBufferOffset(superres_buffer_[plane],
+                           frame_buffer_.stride(plane), plane,
+                           MultiplyBy4(row4x4), MultiplyBy4(column4x4));
+  }
+
+  template <typename Pixel>
+  static void ExtendFrame(Pixel* frame_start, int width, int height,
+                          ptrdiff_t stride, int left, int right, int top,
+                          int bottom);
+
+ private:
+  // The type of the HorizontalDeblockFilter and VerticalDeblockFilter member
+  // functions.
+  using DeblockFilter = void (PostFilter::*)(int row4x4_start, int row4x4_end,
+                                             int column4x4_start,
+                                             int column4x4_end);
+  // Functions common to all post filters.
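+  // For illustration only: a |DeblockFilter| pointer (declared above) is
+  // invoked with the pointer-to-member syntax, e.g. via the
+  // |deblock_filter_func_| lookup table declared further below:
+  //   const DeblockFilter filter = deblock_filter_func_[loop_filter_type];
+  //   (this->*filter)(row4x4_start, row4x4_end, column4x4_start,
+  //                   column4x4_end);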
+
+  // Extends the frame by setting the border pixel values to the ones from the
+  // closest frame boundary.
+  void ExtendFrameBoundary(uint8_t* frame_start, int width, int height,
+                           ptrdiff_t stride, int left, int right, int top,
+                           int bottom) const;
+  // Extends the frame boundary for referencing if the frame will be saved as
+  // a reference frame.
+  void ExtendBordersForReferenceFrame();
+  // Copies the deblocked pixels needed for loop restoration.
+  void CopyDeblockedPixels(Plane plane, int row4x4);
+  // Copies the border for one superblock row. If |for_loop_restoration| is
+  // true, then it assumes that the border extension is being performed for the
+  // input of the loop restoration process. If |for_loop_restoration| is false,
+  // then it assumes that the border extension is being performed for using the
+  // current frame as a reference frame. In this case, |progress_row_| is also
+  // updated.
+  void CopyBordersForOneSuperBlockRow(int row4x4, int sb4x4,
+                                      bool for_loop_restoration);
+  // Sets up the |loop_restoration_border_| for loop restoration.
+  // This is called when there is no CDEF filter. We copy rows from
+  // |superres_buffer_| and do the line extension.
+  void SetupLoopRestorationBorder(int row4x4_start);
+  // This is called when there is a CDEF filter. We copy rows from
+  // |source_buffer_|, apply superres and do the line extension.
+  void SetupLoopRestorationBorder(int row4x4_start, int sb4x4);
+  // Returns true if we can perform border extension in loop (i.e., without
+  // waiting until the entire frame is decoded). If intra_block_copy is true,
+  // we do in-loop border extension only if the upscaled_width is the same as
+  // 4 * columns4x4. Otherwise, we cannot do in-loop border extension since
+  // those pixels may be used by intra block copy.
+  bool DoBorderExtensionInLoop() const {
+    return !frame_header_.allow_intrabc ||
+           frame_header_.upscaled_width ==
+               MultiplyBy4(frame_header_.columns4x4);
+  }
+  template <typename Pixel>
+  void CopyPlane(const Pixel* src, ptrdiff_t src_stride, int width, int height,
+                 Pixel* dst, ptrdiff_t dst_stride) {
+    assert(height > 0);
+    do {
+      memcpy(dst, src, width * sizeof(Pixel));
+      src += src_stride;
+      dst += dst_stride;
+    } while (--height != 0);
+  }
+
+  // Worker function type used for the multi-threaded implementation of
+  // deblocking, CDEF and loop restoration.
+  using WorkerFunction = void (PostFilter::*)(std::atomic<int>* row4x4_atomic);
+  // Schedules |worker| jobs to the |thread_pool_|, runs them in the calling
+  // thread and returns once all the jobs are completed.
+  void RunJobs(WorkerFunction worker);
+
+  // Functions for the Deblocking filter.
+
+  bool GetHorizontalDeblockFilterEdgeInfo(int row4x4, int column4x4,
+                                          uint8_t* level, int* step,
+                                          int* filter_length) const;
+  void GetHorizontalDeblockFilterEdgeInfoUV(int row4x4, int column4x4,
+                                            uint8_t* level_u, uint8_t* level_v,
+                                            int* step,
+                                            int* filter_length) const;
+  bool GetVerticalDeblockFilterEdgeInfo(int row4x4, int column4x4,
+                                        BlockParameters* const* bp_ptr,
+                                        uint8_t* level, int* step,
+                                        int* filter_length) const;
+  void GetVerticalDeblockFilterEdgeInfoUV(int column4x4,
+                                          BlockParameters* const* bp_ptr,
+                                          uint8_t* level_u, uint8_t* level_v,
+                                          int* step, int* filter_length) const;
+  void HorizontalDeblockFilter(int row4x4_start, int row4x4_end,
+                               int column4x4_start, int column4x4_end);
+  void VerticalDeblockFilter(int row4x4_start, int row4x4_end,
+                             int column4x4_start, int column4x4_end);
+  // HorizontalDeblockFilter and VerticalDeblockFilter must have the correct
+  // signature.
+  static_assert(std::is_same<decltype(&PostFilter::HorizontalDeblockFilter),
+                             DeblockFilter>::value,
+                "");
+  static_assert(std::is_same<decltype(&PostFilter::VerticalDeblockFilter),
+                             DeblockFilter>::value,
+                "");
+  // Worker function used for multi-threaded deblocking.
+  template <LoopFilterType loop_filter_type>
+  void DeblockFilterWorker(std::atomic<int>* row4x4_atomic);
+  static_assert(
+      std::is_same<
+          decltype(&PostFilter::DeblockFilterWorker<kLoopFilterTypeVertical>),
+          WorkerFunction>::value,
+      "");
+  static_assert(
+      std::is_same<
+          decltype(&PostFilter::DeblockFilterWorker<kLoopFilterTypeHorizontal>),
+          WorkerFunction>::value,
+      "");
+
+  // Functions for the cdef filter.
+
+  // Copies the deblocked pixels necessary for use by the multi-threaded cdef
+  // implementation into |cdef_border_|.
+  void SetupCdefBorder(int row4x4);
+  // This function prepares the input source block for cdef filtering. The
+  // input source block contains a 12x12 block, with the inner 8x8 as the
+  // desired filter region. If the 12x12 block includes out of frame pixels,
+  // they are padded with a large value. This achieves the required behavior
+  // defined in section 5.11.52 of the spec.
+  template <typename Pixel>
+  void PrepareCdefBlock(int block_width4x4, int block_height4x4, int row4x4,
+                        int column4x4, uint16_t* cdef_source,
+                        ptrdiff_t cdef_stride, bool y_plane,
+                        const uint8_t border_columns[kMaxPlanes][256],
+                        bool use_border_columns);
+  // Applies cdef for one 64x64 block.
+  template <typename Pixel>
+  void ApplyCdefForOneUnit(uint16_t* cdef_block, int index, int block_width4x4,
+                           int block_height4x4, int row4x4_start,
+                           int column4x4_start,
+                           uint8_t border_columns[2][kMaxPlanes][256],
+                           bool use_border_columns[2][2]);
+  // Helper function used by ApplyCdefForOneSuperBlockRow to avoid some code
+  // duplication.
+  void ApplyCdefForOneSuperBlockRowHelper(
+      uint16_t* cdef_block, uint8_t border_columns[2][kMaxPlanes][256],
+      int row4x4, int block_height4x4);
+  // Applies CDEF filtering for the superblock row starting at |row4x4| with a
+  // height of 4*|sb4x4|.
+  void ApplyCdefForOneSuperBlockRow(int row4x4, int sb4x4, bool is_last_row);
+  // Worker function used for multi-threaded CDEF.
+  void ApplyCdefWorker(std::atomic<int>* row4x4_atomic);
+  static_assert(std::is_same<decltype(&PostFilter::ApplyCdefWorker),
+                             WorkerFunction>::value,
+                "");
+
+  // Functions for the SuperRes filter.
+
+  // Applies super resolution for the |src| for |rows[plane]| rows of each
+  // plane. If |line_buffer_row| is larger than or equal to 0, one more row
+  // will be processed, with the line buffer indicated by |line_buffer_row|
+  // used as the source. If |dst_is_loop_restoration_border| is true, then it
+  // means that the |dst| pointers come from |loop_restoration_border_| and
+  // the strides will be populated from that buffer.
+  void ApplySuperRes(
+      const std::array<uint8_t*, kMaxPlanes>& src,
+      const std::array<int, kMaxPlanes>& rows, int line_buffer_row,
+      const std::array<uint8_t*, kMaxPlanes>& dst,
+      bool dst_is_loop_restoration_border = false);  // Section 7.16.
+  // Applies SuperRes for the superblock row starting at |row4x4| with a
+  // height of 4*|sb4x4|.
+  void ApplySuperResForOneSuperBlockRow(int row4x4, int sb4x4,
+                                        bool is_last_row);
+  void ApplySuperResThreaded();
+
+  // Functions for the Loop Restoration filter.
+
+  // Notes about Loop Restoration:
+  // (1). The loop restoration processing unit size defaults to 64x64. Only
+  //      when the remaining filtering area is smaller than 64x64 is the
+  //      processing unit size the actual area size. For the U/V planes, it is
+  //      (64 >> subsampling_x) x (64 >> subsampling_y).
+  // (2). The loop restoration unit size can be 64x64, 128x128 or 256x256 for
+  //      the Y plane. The unit size for chroma can be the same or half,
+  //      depending on subsampling. If either subsampling_x or subsampling_y
+  //      is one, the unit size is halved on both the x and y sides.
+  //      All loop restoration units have the same size for one plane.
+  //      One loop restoration unit could contain multiple processing units,
+  //      but they share the same set of loop restoration parameters.
+  // (3). Loop restoration has a row offset, kRestorationUnitOffset = 8. The
+  //      size of the first row of loop restoration units and processing units
+  //      is shrunk by the offset.
+  // (4). Loop restoration units wrap around the bottom and the right of the
+  //      frame if the remaining area is small. The criterion is whether the
+  //      number of remaining rows/columns is smaller than half of the loop
+  //      restoration unit size.
+  //      For example, if the frame size is 140x140 and the loop restoration
+  //      unit size is 128x128, the size of the first loop restoration unit is
+  //      128 columns x (128 - 8) = 120 rows. Since 140 - 120 < 128 / 2, the
+  //      remaining 20 rows are folded into the loop restoration unit.
+  //      Similarly, the remaining 12 columns are also folded into the current
+  //      loop restoration unit. So even though the frame size is 140x140,
+  //      there is only one loop restoration unit. If the processing unit is
+  //      64x64, the sizes of the first row of processing units are 64x56,
+  //      64x56 and 12x56, respectively. The second row is 64x64, 64x64,
+  //      12x64. The third row is 64x20, 64x20, 12x20.
+
+  // |stride| is shared by |src_buffer| and |dst_buffer|.
+  template <typename Pixel>
+  void ApplyLoopRestorationForOneRow(const Pixel* src_buffer, ptrdiff_t stride,
+                                     Plane plane, int plane_height,
+                                     int plane_width, int y, int unit_row,
+                                     int current_process_unit_height,
+                                     int plane_unit_size, Pixel* dst_buffer);
+  // Applies loop restoration for the superblock row starting at
+  // |row4x4_start| with a height of 4*|sb4x4|.
+  template <typename Pixel>
+  void ApplyLoopRestorationForOneSuperBlockRow(int row4x4_start, int sb4x4);
+  // Helper function that calls the right variant of
+  // ApplyLoopRestorationForOneSuperBlockRow based on the bitdepth.
+  void ApplyLoopRestoration(int row4x4_start, int sb4x4);
+  // Worker function used for multithreaded Loop Restoration.
+  void ApplyLoopRestorationWorker(std::atomic<int>* row4x4_atomic);
+  static_assert(std::is_same<decltype(&PostFilter::ApplyLoopRestorationWorker),
+                             WorkerFunction>::value,
+                "");
+
+  // The lookup table for picking the deblock filter, according to deblock
+  // filter type.
+  const DeblockFilter deblock_filter_func_[2] = {
+      &PostFilter::VerticalDeblockFilter, &PostFilter::HorizontalDeblockFilter};
+  const ObuFrameHeader& frame_header_;
+  const LoopRestoration& loop_restoration_;
+  const dsp::Dsp& dsp_;
+  const int8_t bitdepth_;
+  const int8_t subsampling_x_[kMaxPlanes];
+  const int8_t subsampling_y_[kMaxPlanes];
+  const int8_t planes_;
+  const int pixel_size_log2_;
+  const uint8_t* const inner_thresh_;
+  const uint8_t* const outer_thresh_;
+  const bool needs_chroma_deblock_;
+  const bool do_cdef_;
+  const bool do_deblock_;
+  const bool do_restoration_;
+  const bool do_superres_;
+  // This stores the deblocking filter levels assuming that the delta is zero.
+  // This will be used by all superblocks whose delta is zero (without having
+  // to recompute them). The dimensions (in order) are: segment_id,
+  // level_index (based on plane and direction), reference_frame and mode_id.
+  uint8_t deblock_filter_levels_[kMaxSegments][kFrameLfCount]
+                                [kNumReferenceFrameTypes][2];
+  // Stores the SuperRes info for the frame.
+  struct {
+    int upscaled_width;
+    int initial_subpixel_x;
+    int step;
+  } super_res_info_[kMaxPlanes];
+  const Array2D<int8_t>& cdef_index_;
+  const Array2D<uint8_t>& cdef_skip_;
+  const Array2D<TransformSize>& inter_transform_sizes_;
+  LoopRestorationInfo* const restoration_info_;
+  uint8_t* const superres_coefficients_[kNumPlaneTypes];
+  // Line buffer used by multi-threaded ApplySuperRes().
+  // In the multi-threaded case, this buffer will store the last downscaled row
+  // input of each thread to avoid overwrites by the first upscaled row output
+  // of the thread below it.
+  YuvBuffer& superres_line_buffer_;
+  const BlockParametersHolder& block_parameters_;
+  // Frame buffer to hold the cdef filtered frame.
+  YuvBuffer cdef_filtered_buffer_;
+  // Input frame buffer.
+  YuvBuffer& frame_buffer_;
+  // A view into |frame_buffer_| that points to the input and output of the
+  // deblocking process.
+  uint8_t* source_buffer_[kMaxPlanes];
+  // A view into |frame_buffer_| that points to the output of the CDEF filtered
+  // planes (to facilitate in-place CDEF filtering).
+  uint8_t* cdef_buffer_[kMaxPlanes];
+  // A view into |frame_buffer_| that points to the planes after the SuperRes
+  // filter is applied (to facilitate in-place SuperRes).
+  uint8_t* superres_buffer_[kMaxPlanes];
+  // A view into |frame_buffer_| that points to the output of the Loop Restored
+  // planes (to facilitate in-place Loop Restoration).
+  uint8_t* loop_restoration_buffer_[kMaxPlanes];
+  YuvBuffer& cdef_border_;
+  // Buffer used to store the border pixels that are necessary for loop
+  // restoration. This buffer will store 4 rows for every 64x64 block (4 rows
+  // for every 32x32 for chroma with subsampling). The indices of the rows that
+  // are stored are specified in |kLoopRestorationBorderRows|. The first 4 rows
+  // of this buffer are never populated and never used.
+  // This buffer is used only when both of the following conditions are true:
+  //   (1). Loop Restoration is on.
+  //   (2). Cdef is on, or multi-threading is enabled for post filter.
+  YuvBuffer& loop_restoration_border_;
+  ThreadPool* const thread_pool_;
+
+  // Tracks the progress of the post filters.
+  int progress_row_ = -1;
+
+  // A block buffer to hold the input that is converted to uint16_t before
+  // cdef filtering. Only used in the single-threaded case. The Y plane is
+  // processed separately. The U and V planes are processed together, so it is
+  // sufficient to have this buffer accommodate 2 planes at a time.
+  uint16_t cdef_block_[kCdefUnitSizeWithBorders * kCdefUnitSizeWithBorders * 2];
+
+  template <int bitdepth, typename Pixel>
+  friend class PostFilterSuperResTest;
+
+  template <int bitdepth, typename Pixel>
+  friend class PostFilterHelperFuncTest;
+};
+
+extern template void PostFilter::ExtendFrame<uint8_t>(uint8_t* frame_start,
+                                                      int width, int height,
+                                                      ptrdiff_t stride,
+                                                      int left, int right,
+                                                      int top, int bottom);
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+extern template void PostFilter::ExtendFrame<uint16_t>(uint16_t* frame_start,
+                                                       int width, int height,
+                                                       ptrdiff_t stride,
+                                                       int left, int right,
+                                                       int top, int bottom);
+#endif
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_POST_FILTER_H_
diff --git a/src/post_filter/cdef.cc b/src/post_filter/cdef.cc
new file mode 100644
index 0000000..037fc17
--- /dev/null
+++ b/src/post_filter/cdef.cc
@@ -0,0 +1,674 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "src/post_filter.h"
+#include "src/utils/blocking_counter.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int kStep64x64 = 16;  // =64/4.
+constexpr int kCdefSkip = 8;
+
+constexpr uint8_t kCdefUvDirection[2][2][8] = {
+    {{0, 1, 2, 3, 4, 5, 6, 7}, {1, 2, 2, 2, 3, 4, 6, 0}},
+    {{7, 0, 2, 4, 5, 6, 6, 6}, {0, 1, 2, 3, 4, 5, 6, 7}}};
+
+constexpr int kCdefBorderRows[2][4] = {{0, 1, 62, 63}, {0, 1, 30, 31}};
+
+template <typename Pixel>
+void CopyRowForCdef(const Pixel* src, int block_width, int unit_width,
+                    bool is_frame_left, bool is_frame_right,
+                    uint16_t* const dst, const Pixel* left_border = nullptr) {
+  if (sizeof(src[0]) == sizeof(dst[0])) {
+    if (is_frame_left) {
+      Memset(dst - kCdefBorder, kCdefLargeValue, kCdefBorder);
+    } else if (left_border == nullptr) {
+      memcpy(dst - kCdefBorder, src - kCdefBorder,
+             kCdefBorder * sizeof(dst[0]));
+    } else {
+      memcpy(dst - kCdefBorder, left_border, kCdefBorder * sizeof(dst[0]));
+    }
+    memcpy(dst, src, block_width * sizeof(dst[0]));
+    if (is_frame_right) {
+      Memset(dst + block_width, kCdefLargeValue,
+             unit_width + kCdefBorder - block_width);
+    } else {
+      memcpy(dst + block_width, src + block_width,
+             (unit_width + kCdefBorder - block_width) * sizeof(dst[0]));
+    }
+    return;
+  }
+  if (is_frame_left) {
+    for (int x = -kCdefBorder; x < 0; ++x) {
+      dst[x] = static_cast<uint16_t>(kCdefLargeValue);
+    }
+  } else if (left_border == nullptr) {
+    for (int x = -kCdefBorder; x < 0; ++x) {
+      dst[x] = src[x];
+    }
+  } else {
+    for (int x = -kCdefBorder; x < 0; ++x) {
+      dst[x] = left_border[x + kCdefBorder];
+    }
+  }
+  for (int x = 0; x < block_width; ++x) {
+    dst[x] = src[x];
+  }
+  for (int x = block_width; x < unit_width + kCdefBorder; ++x) {
+    dst[x] = is_frame_right ? static_cast<uint16_t>(kCdefLargeValue) : src[x];
+  }
+}
+
+// For |height| rows, copy |width| pixels of size |pixel_size| from |src| to
+// |dst|.
+void CopyPixels(const uint8_t* src, int src_stride, uint8_t* dst,
+                int dst_stride, int width, int height, size_t pixel_size) {
+  int y = height;
+  do {
+    memcpy(dst, src, width * pixel_size);
+    src += src_stride;
+    dst += dst_stride;
+  } while (--y != 0);
+}
+
+}  // namespace
+
+void PostFilter::SetupCdefBorder(int row4x4) {
+  assert(row4x4 >= 0);
+  assert(DoCdef());
+  int plane = kPlaneY;
+  do {
+    const ptrdiff_t src_stride = frame_buffer_.stride(plane);
+    const ptrdiff_t dst_stride = cdef_border_.stride(plane);
+    const int row_offset = DivideBy4(row4x4);
+    const int num_pixels = SubsampledValue(
+        MultiplyBy4(frame_header_.columns4x4), subsampling_x_[plane]);
+    const int row_width = num_pixels << pixel_size_log2_;
+    const int plane_height = SubsampledValue(
+        MultiplyBy4(frame_header_.rows4x4), subsampling_y_[plane]);
+    for (int i = 0; i < 4; ++i) {
+      const int row = kCdefBorderRows[subsampling_y_[plane]][i];
+      const int absolute_row =
+          (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row;
+      if (absolute_row >= plane_height) break;
+      const uint8_t* src =
+          GetSourceBuffer(static_cast<Plane>(plane), row4x4, 0) +
+          row * src_stride;
+      uint8_t* dst = cdef_border_.data(plane) + dst_stride * (row_offset + i);
+      memcpy(dst, src, row_width);
+    }
+  } while (++plane < planes_);
+}
+
+template <typename Pixel>
+void PostFilter::PrepareCdefBlock(int block_width4x4, int block_height4x4,
+                                  int row4x4, int column4x4,
+                                  uint16_t* cdef_source, ptrdiff_t cdef_stride,
+                                  const bool y_plane,
+                                  const uint8_t border_columns[kMaxPlanes][256],
+                                  bool use_border_columns) {
+  assert(y_plane || planes_ == kMaxPlanes);
+  const int max_planes = y_plane ? 1 : kMaxPlanes;
+  const int8_t subsampling_x = y_plane ? 0 : subsampling_x_[kPlaneU];
+  const int8_t subsampling_y = y_plane ? 0 : subsampling_y_[kPlaneU];
+  const int start_x = MultiplyBy4(column4x4) >> subsampling_x;
+  const int start_y = MultiplyBy4(row4x4) >> subsampling_y;
+  const int plane_width = SubsampledValue(frame_header_.width, subsampling_x);
+  const int plane_height =
+      SubsampledValue(frame_header_.height, subsampling_y);
+  const int block_width = MultiplyBy4(block_width4x4) >> subsampling_x;
+  const int block_height = MultiplyBy4(block_height4x4) >> subsampling_y;
+  // unit_width, unit_height are the same as block_width, block_height unless
+  // it reaches the frame boundary, where block_width < 64 or
+  // block_height < 64. unit_width, unit_height guarantee we build blocks on
+  // a multiple of 8.
+  const int unit_width = Align(block_width, 8 >> subsampling_x);
+  const int unit_height = Align(block_height, 8 >> subsampling_y);
+  const bool is_frame_left = column4x4 == 0;
+  const bool is_frame_right = start_x + block_width >= plane_width;
+  const bool is_frame_top = row4x4 == 0;
+  const bool is_frame_bottom = start_y + block_height >= plane_height;
+  const int y_offset = is_frame_top ? 0 : kCdefBorder;
+  const int cdef_border_row_offset =
+      DivideBy4(row4x4) - (is_frame_top ? 0 : 2);
+
+  for (int plane = y_plane ? kPlaneY : kPlaneU; plane < max_planes; ++plane) {
+    uint16_t* cdef_src = cdef_source + static_cast<int>(plane == kPlaneV) *
+                                           kCdefUnitSizeWithBorders *
+                                           kCdefUnitSizeWithBorders;
+    const int src_stride = frame_buffer_.stride(plane) / sizeof(Pixel);
+    const Pixel* src_buffer =
+        reinterpret_cast<const Pixel*>(source_buffer_[plane]) +
+        (start_y - y_offset) * src_stride + start_x;
+    const int cdef_border_stride = cdef_border_.stride(plane) / sizeof(Pixel);
+    const Pixel* cdef_border =
+        (thread_pool_ == nullptr)
+            ? nullptr
+            : reinterpret_cast<const Pixel*>(cdef_border_.data(plane)) +
+                  cdef_border_row_offset * cdef_border_stride + start_x;
+
+    // All the copying code will use negative indices for populating the left
+    // border. So the starting point is set to kCdefBorder.
+    cdef_src += kCdefBorder;
+
+    // Copy the top 2 rows as follows:
+    // If is_frame_top is true, both rows are set to kCdefLargeValue.
+    // Otherwise:
+    //   If multi-threaded filtering is off, the rows are copied from
+    //   |src_buffer|.
+    //   Otherwise, the rows are copied from |cdef_border|.
+    if (is_frame_top) {
+      for (int y = 0; y < kCdefBorder; ++y) {
+        Memset(cdef_src - kCdefBorder, kCdefLargeValue,
+               unit_width + 2 * kCdefBorder);
+        cdef_src += cdef_stride;
+      }
+    } else {
+      const Pixel* top_border =
+          (thread_pool_ == nullptr) ? src_buffer : cdef_border;
+      const int top_border_stride =
+          (thread_pool_ == nullptr) ? src_stride : cdef_border_stride;
+      for (int y = 0; y < kCdefBorder; ++y) {
+        CopyRowForCdef(top_border, block_width, unit_width, is_frame_left,
+                       is_frame_right, cdef_src);
+        top_border += top_border_stride;
+        cdef_src += cdef_stride;
+        // We need to increment |src_buffer| and |cdef_border| in this loop to
+        // set them up for the subsequent loops below.
+        src_buffer += src_stride;
+        cdef_border += cdef_border_stride;
+      }
+    }
+
+    // Copy the body as follows:
+    // If multi-threaded filtering is off or if is_frame_bottom is true, all
+    // the rows are copied from |src_buffer|.
+    // Otherwise, the first |block_height| - kCdefBorder rows are copied from
+    // |src_buffer| and the last kCdefBorder rows are copied from
+    // |cdef_border|.
+    int y = block_height;
+    const int y_threshold =
+        (thread_pool_ == nullptr || is_frame_bottom) ? 0 : kCdefBorder;
+    const Pixel* left_border =
+        (thread_pool_ == nullptr || !use_border_columns)
+            ? nullptr
+            : reinterpret_cast<const Pixel*>(border_columns[plane]);
+    do {
+      CopyRowForCdef(src_buffer, block_width, unit_width, is_frame_left,
+                     is_frame_right, cdef_src, left_border);
+      cdef_src += cdef_stride;
+      src_buffer += src_stride;
+      if (left_border != nullptr) left_border += kCdefBorder;
+    } while (--y != y_threshold);
+
+    if (y > 0) {
+      assert(y == kCdefBorder);
+      // |cdef_border| now points to the top 2 rows of the current block. For
+      // the next loop, we need it to point to the bottom 2 rows of the
+      // current block. So increment it by 2 rows.
+      cdef_border += MultiplyBy2(cdef_border_stride);
+      for (int i = 0; i < kCdefBorder; ++i) {
+        CopyRowForCdef(cdef_border, block_width, unit_width, is_frame_left,
+                       is_frame_right, cdef_src);
+        cdef_src += cdef_stride;
+        cdef_border += cdef_border_stride;
+      }
+    }
+
+    // Copy the bottom 2 rows as follows:
+    // If is_frame_bottom is true, both rows are set to kCdefLargeValue.
+    // Otherwise:
+    //   If multi-threaded filtering is off, the rows are copied from
+    //   |src_buffer|.
+    //   Otherwise, the rows are copied from |cdef_border|.
+    y = 0;
+    if (is_frame_bottom) {
+      do {
+        Memset(cdef_src - kCdefBorder, kCdefLargeValue,
+               unit_width + 2 * kCdefBorder);
+        cdef_src += cdef_stride;
+      } while (++y < kCdefBorder + unit_height - block_height);
+    } else {
+      const Pixel* bottom_border =
+          (thread_pool_ == nullptr) ? src_buffer : cdef_border;
+      const int bottom_border_stride =
+          (thread_pool_ == nullptr) ? src_stride : cdef_border_stride;
+      do {
+        CopyRowForCdef(bottom_border, block_width, unit_width, is_frame_left,
+                       is_frame_right, cdef_src);
+        bottom_border += bottom_border_stride;
+        cdef_src += cdef_stride;
+      } while (++y < kCdefBorder + unit_height - block_height);
+    }
+  }
+}
+
+template <typename Pixel>
+void PostFilter::ApplyCdefForOneUnit(uint16_t* cdef_block, const int index,
+                                     const int block_width4x4,
+                                     const int block_height4x4,
+                                     const int row4x4_start,
+                                     const int column4x4_start,
+                                     uint8_t border_columns[2][kMaxPlanes][256],
+                                     bool use_border_columns[2][2]) {
+  // Cdef operates in 8x8 blocks (4x4 for chroma with subsampling).
+  static constexpr int kStep = 8;
+  static constexpr int kStep4x4 = 2;
+
+  int cdef_buffer_row_base_stride[kMaxPlanes];
+  uint8_t* cdef_buffer_row_base[kMaxPlanes];
+  int src_buffer_row_base_stride[kMaxPlanes];
+  const uint8_t* src_buffer_row_base[kMaxPlanes];
+  const uint16_t* cdef_src_row_base[kMaxPlanes];
+  int cdef_src_row_base_stride[kMaxPlanes];
+  int column_step[kMaxPlanes];
+  assert(planes_ == kMaxPlanesMonochrome || planes_ == kMaxPlanes);
+  int plane = kPlaneY;
+  do {
+    cdef_buffer_row_base[plane] = GetCdefBuffer(static_cast<Plane>(plane),
+                                                row4x4_start, column4x4_start);
+    cdef_buffer_row_base_stride[plane] =
+        frame_buffer_.stride(plane) * (kStep >> subsampling_y_[plane]);
+    src_buffer_row_base[plane] = GetSourceBuffer(static_cast<Plane>(plane),
+                                                 row4x4_start, column4x4_start);
+    src_buffer_row_base_stride[plane] =
+        frame_buffer_.stride(plane) * (kStep >> subsampling_y_[plane]);
+    cdef_src_row_base[plane] =
+        cdef_block +
+        static_cast<int>(plane == kPlaneV) * kCdefUnitSizeWithBorders *
+            kCdefUnitSizeWithBorders +
+        kCdefBorder * kCdefUnitSizeWithBorders + kCdefBorder;
+    cdef_src_row_base_stride[plane] =
+        kCdefUnitSizeWithBorders * (kStep >> subsampling_y_[plane]);
+    column_step[plane] = (kStep >> subsampling_x_[plane]) * sizeof(Pixel);
+  } while (++plane < planes_);
+
+  // |border_columns| contains two buffers. In each call to this function, we
+  // will use one of them as the "destination" for the current call. And the
+  // other one as the "source" for the current call (which would have been the
+  // "destination" of the previous call). We will use the src_index to populate
+  // the borders which were backed up in the previous call. We will use the
+  // dst_index to populate the borders to be used in the next call.
+  const int border_columns_src_index = DivideBy16(column4x4_start) & 1;
+  const int border_columns_dst_index = border_columns_src_index ^ 1;
+
+  if (index == -1) {
+    if (thread_pool_ == nullptr) {
+      int plane = kPlaneY;
+      do {
+        CopyPixels(src_buffer_row_base[plane], frame_buffer_.stride(plane),
+                   cdef_buffer_row_base[plane], frame_buffer_.stride(plane),
+                   MultiplyBy4(block_width4x4) >> subsampling_x_[plane],
+                   MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
+                   sizeof(Pixel));
+      } while (++plane < planes_);
+    }
+    use_border_columns[border_columns_dst_index][0] = false;
+    use_border_columns[border_columns_dst_index][1] = false;
+    return;
+  }
+
+  const bool is_frame_right =
+      MultiplyBy4(column4x4_start + block_width4x4) >= frame_header_.width;
+  if (!is_frame_right && thread_pool_ != nullptr) {
+    // Backup the last 2 columns for use in the next iteration.
+    use_border_columns[border_columns_dst_index][0] = true;
+    const uint8_t* src_line =
+        GetSourceBuffer(kPlaneY, row4x4_start,
+                        column4x4_start + block_width4x4) -
+        kCdefBorder * sizeof(Pixel);
+    CopyPixels(src_line, frame_buffer_.stride(kPlaneY),
+               border_columns[border_columns_dst_index][kPlaneY],
+               kCdefBorder * sizeof(Pixel), kCdefBorder,
+               MultiplyBy4(block_height4x4), sizeof(Pixel));
+  }
+
+  PrepareCdefBlock<Pixel>(
+      block_width4x4, block_height4x4, row4x4_start, column4x4_start,
+      cdef_block, kCdefUnitSizeWithBorders, true,
+      (border_columns != nullptr) ? border_columns[border_columns_src_index]
+                                  : nullptr,
+      use_border_columns[border_columns_src_index][0]);
+
+  // Stored direction used during the u/v pass. If bit 3 is set, then the
+  // block is a skip.
+  uint8_t direction_y[8 * 8];
+  int y_index = 0;
+
+  const uint8_t y_primary_strength =
+      frame_header_.cdef.y_primary_strength[index];
+  const uint8_t y_secondary_strength =
+      frame_header_.cdef.y_secondary_strength[index];
+  // y_strength_index is 0 for both primary and secondary strengths being
+  // non-zero, 1 for primary only, 2 for secondary only. This will be updated
+  // with y_primary_strength after variance is applied.
+  int y_strength_index = static_cast<int>(y_secondary_strength == 0);
+
+  const bool compute_direction_and_variance =
+      (y_primary_strength | frame_header_.cdef.uv_primary_strength[index]) !=
+      0;
+  const uint8_t* skip_row =
+      &cdef_skip_[row4x4_start >> 1][column4x4_start >> 4];
+  const int skip_stride = cdef_skip_.columns();
+  int row4x4 = row4x4_start;
+  do {
+    uint8_t* cdef_buffer_base = cdef_buffer_row_base[kPlaneY];
+    const uint8_t* src_buffer_base = src_buffer_row_base[kPlaneY];
+    const uint16_t* cdef_src_base = cdef_src_row_base[kPlaneY];
+    int column4x4 = column4x4_start;
+
+    if (*skip_row == 0) {
+      for (int i = 0; i < DivideBy2(block_width4x4); ++i, ++y_index) {
+        direction_y[y_index] = kCdefSkip;
+      }
+      if (thread_pool_ == nullptr) {
+        CopyPixels(src_buffer_base, frame_buffer_.stride(kPlaneY),
+                   cdef_buffer_base, frame_buffer_.stride(kPlaneY), 64, kStep,
+                   sizeof(Pixel));
+      }
+    } else {
+      do {
+        const int block_width = kStep;
+        const int block_height = kStep;
+        const int cdef_stride = frame_buffer_.stride(kPlaneY);
+        uint8_t* const cdef_buffer = cdef_buffer_base;
+        const uint16_t* const cdef_src = cdef_src_base;
+        const int src_stride = frame_buffer_.stride(kPlaneY);
+        const uint8_t* const src_buffer = src_buffer_base;
+
+        const uint8_t skip_shift = (column4x4 >> 1) & 0x7;
+        const bool skip = ((*skip_row >> skip_shift) & 1) == 0;
+        if (skip) {  // No cdef filtering.
+          direction_y[y_index] = kCdefSkip;
+          if (thread_pool_ == nullptr) {
+            CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
+                       block_width, block_height, sizeof(Pixel));
+          }
+        } else {
+          // Zero out residual skip flag.
+          direction_y[y_index] = 0;
+
+          int variance = 0;
+          if (compute_direction_and_variance) {
+            if (thread_pool_ == nullptr ||
+                row4x4 + kStep4x4 < row4x4_start + block_height4x4) {
+              dsp_.cdef_direction(src_buffer, src_stride,
+                                  &direction_y[y_index], &variance);
+            } else if (sizeof(Pixel) == 2) {
+              dsp_.cdef_direction(cdef_src, kCdefUnitSizeWithBorders * 2,
+                                  &direction_y[y_index], &variance);
+            } else {
+              // If we are in the last row4x4 for this unit, then the last two
+              // input rows have to come from |cdef_border_|. Since we already
+              // have |cdef_src| populated correctly, use that as the input
+              // for the direction process.
+              uint8_t direction_src[8][8];
+              const uint16_t* cdef_src_line = cdef_src;
+              for (auto& direction_src_line : direction_src) {
+                for (int i = 0; i < 8; ++i) {
+                  direction_src_line[i] = cdef_src_line[i];
+                }
+                cdef_src_line += kCdefUnitSizeWithBorders;
+              }
+              dsp_.cdef_direction(direction_src, 8, &direction_y[y_index],
+                                  &variance);
+            }
+          }
+          const int direction =
+              (y_primary_strength == 0) ? 0 : direction_y[y_index];
+          const int variance_strength =
+              ((variance >> 6) != 0) ? std::min(FloorLog2(variance >> 6), 12)
+                                     : 0;
+          const uint8_t primary_strength =
+              (variance != 0)
+                  ? (y_primary_strength * (4 + variance_strength) + 8) >> 4
+                  : 0;
+          if ((primary_strength | y_secondary_strength) == 0) {
+            if (thread_pool_ == nullptr) {
+              CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
+                         block_width, block_height, sizeof(Pixel));
+            }
+          } else {
+            const int strength_index =
+                y_strength_index |
+                (static_cast<int>(primary_strength == 0) << 1);
+            dsp_.cdef_filters[1][strength_index](
+                cdef_src, kCdefUnitSizeWithBorders, block_height,
+                primary_strength, y_secondary_strength,
+                frame_header_.cdef.damping, direction, cdef_buffer,
+                cdef_stride);
+          }
+        }
+        cdef_buffer_base += column_step[kPlaneY];
+        src_buffer_base += column_step[kPlaneY];
+        cdef_src_base += column_step[kPlaneY] / sizeof(Pixel);
+
+        column4x4 += kStep4x4;
+        y_index++;
+      } while (column4x4 < column4x4_start + block_width4x4);
+    }
+
+    cdef_buffer_row_base[kPlaneY] += cdef_buffer_row_base_stride[kPlaneY];
+    src_buffer_row_base[kPlaneY] += src_buffer_row_base_stride[kPlaneY];
+    cdef_src_row_base[kPlaneY] += cdef_src_row_base_stride[kPlaneY];
+    skip_row += skip_stride;
+    row4x4 += kStep4x4;
+  } while (row4x4 < row4x4_start + block_height4x4);
+
+  if (planes_ == kMaxPlanesMonochrome) {
+    return;
+  }
+
+  const uint8_t uv_primary_strength =
+      frame_header_.cdef.uv_primary_strength[index];
+  const uint8_t uv_secondary_strength =
+      frame_header_.cdef.uv_secondary_strength[index];
+
+  if ((uv_primary_strength | uv_secondary_strength) == 0) {
+    if (thread_pool_ == nullptr) {
+      for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
+        CopyPixels(src_buffer_row_base[plane], frame_buffer_.stride(plane),
+                   cdef_buffer_row_base[plane], frame_buffer_.stride(plane),
+                   MultiplyBy4(block_width4x4) >> subsampling_x_[plane],
+                   MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
+                   sizeof(Pixel));
+      }
+    }
+    use_border_columns[border_columns_dst_index][1] = false;
+    return;
+  }
+
+  if (!is_frame_right && thread_pool_ != nullptr) {
+    use_border_columns[border_columns_dst_index][1] = true;
+    for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
+      // Backup the last 2 columns for use in the next iteration.
+      const uint8_t* src_line =
+          GetSourceBuffer(static_cast<Plane>(plane), row4x4_start,
+                          column4x4_start + block_width4x4) -
+          kCdefBorder * sizeof(Pixel);
+      CopyPixels(src_line, frame_buffer_.stride(plane),
+                 border_columns[border_columns_dst_index][plane],
+                 kCdefBorder * sizeof(Pixel), kCdefBorder,
+                 MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
+                 sizeof(Pixel));
+    }
+  }
+
+  PrepareCdefBlock<Pixel>(
+      block_width4x4, block_height4x4, row4x4_start, column4x4_start,
+      cdef_block, kCdefUnitSizeWithBorders, false,
+      (border_columns != nullptr) ? border_columns[border_columns_src_index]
+                                  : nullptr,
+      use_border_columns[border_columns_src_index][1]);
+
+  // uv_strength_index is 0 for both primary and secondary strengths being
+  // non-zero, 1 for primary only, 2 for secondary only.
+  const int uv_strength_index =
+      (static_cast<int>(uv_primary_strength == 0) << 1) |
+      static_cast<int>(uv_secondary_strength == 0);
+  for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
+    const int8_t subsampling_x = subsampling_x_[plane];
+    const int8_t subsampling_y = subsampling_y_[plane];
+    const int block_width = kStep >> subsampling_x;
+    const int block_height = kStep >> subsampling_y;
+    int row4x4 = row4x4_start;
+
+    y_index = 0;
+    do {
+      uint8_t* cdef_buffer_base = cdef_buffer_row_base[plane];
+      const uint8_t* src_buffer_base = src_buffer_row_base[plane];
+      const uint16_t* cdef_src_base = cdef_src_row_base[plane];
+      int column4x4 = column4x4_start;
+      do {
+        const int cdef_stride = frame_buffer_.stride(plane);
+        uint8_t* const cdef_buffer = cdef_buffer_base;
+        const int src_stride = frame_buffer_.stride(plane);
+        const uint8_t* const src_buffer = src_buffer_base;
+        const uint16_t* const cdef_src = cdef_src_base;
+        const bool skip = (direction_y[y_index] & kCdefSkip) != 0;
+        int dual_cdef = 0;
+
+        if (skip) {  // No cdef filtering.
+          if (thread_pool_ == nullptr) {
+            CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
+                       block_width, block_height, sizeof(Pixel));
+          }
+        } else {
+          // Make sure block pair is not out of bounds.
+          if (column4x4 + (kStep4x4 * 2) <= column4x4_start + block_width4x4) {
+            // Enable dual processing if subsampling_x is 1.
+            dual_cdef = subsampling_x;
+          }
+
+          int direction = (uv_primary_strength == 0)
+                              ? 0
+                              : kCdefUvDirection[subsampling_x][subsampling_y]
+                                                [direction_y[y_index]];
+
+          if (dual_cdef != 0) {
+            if (uv_primary_strength &&
+                direction_y[y_index] != direction_y[y_index + 1]) {
+              // Disable dual processing if the second block of the pair does
+              // not have the same direction.
+              dual_cdef = 0;
+            }
+
+            // Disable dual processing if the second block of the pair is a
+            // skip.
+            if (direction_y[y_index + 1] == kCdefSkip) {
+              dual_cdef = 0;
+            }
+          }
+
+          // Block width is 8 if either dual_cdef is true or
+          // subsampling_x == 0.
+          const int width_index = dual_cdef | (subsampling_x ^ 1);
+          dsp_.cdef_filters[width_index][uv_strength_index](
+              cdef_src, kCdefUnitSizeWithBorders, block_height,
+              uv_primary_strength, uv_secondary_strength,
+              frame_header_.cdef.damping - 1, direction, cdef_buffer,
+              cdef_stride);
+        }
+        // When dual_cdef is set, the above cdef_filter() will process 2
+        // blocks, so adjust the pointers and indexes for 2 blocks.
+        cdef_buffer_base += column_step[plane] << dual_cdef;
+        src_buffer_base += column_step[plane] << dual_cdef;
+        cdef_src_base += (column_step[plane] / sizeof(Pixel)) << dual_cdef;
+        column4x4 += kStep4x4 << dual_cdef;
+        y_index += 1 << dual_cdef;
+      } while (column4x4 < column4x4_start + block_width4x4);
+
+      cdef_buffer_row_base[plane] += cdef_buffer_row_base_stride[plane];
+      src_buffer_row_base[plane] += src_buffer_row_base_stride[plane];
+      cdef_src_row_base[plane] += cdef_src_row_base_stride[plane];
+      row4x4 += kStep4x4;
+    } while (row4x4 < row4x4_start + block_height4x4);
+  }
+}
+
+void PostFilter::ApplyCdefForOneSuperBlockRowHelper(
+    uint16_t* cdef_block, uint8_t border_columns[2][kMaxPlanes][256],
+    int row4x4, int block_height4x4) {
+  bool use_border_columns[2][2] = {};
+  const bool non_zero_index = frame_header_.cdef.bits > 0;
+  const int8_t* cdef_index =
+      non_zero_index ? cdef_index_[DivideBy16(row4x4)] : nullptr;
+  int column4x4 = 0;
+  do {
+    const int index = non_zero_index ? *cdef_index++ : 0;
+    const int block_width4x4 =
+        std::min(kStep64x64, frame_header_.columns4x4 - column4x4);
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    if (bitdepth_ >= 10) {
+      ApplyCdefForOneUnit<uint16_t>(cdef_block, index, block_width4x4,
+                                    block_height4x4, row4x4, column4x4,
+                                    border_columns, use_border_columns);
+    } else  // NOLINT
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+    {
+      ApplyCdefForOneUnit<uint8_t>(cdef_block, index, block_width4x4,
+                                   block_height4x4, row4x4, column4x4,
+                                   border_columns, use_border_columns);
+    }
+    column4x4 += kStep64x64;
+  } while (column4x4 < frame_header_.columns4x4);
+}
+
+void PostFilter::ApplyCdefForOneSuperBlockRow(int row4x4_start, int sb4x4,
+                                              bool is_last_row) {
+  assert(row4x4_start >= 0);
+  assert(DoCdef());
+  int row4x4 = row4x4_start;
+  const int row4x4_limit = row4x4_start + sb4x4;
+  do {
+    if (row4x4 >= frame_header_.rows4x4) return;
+
+    // Apply cdef for the last 8 rows of the previous superblock row.
+    // One exception: If the superblock size is 128x128 and is_last_row is
+    // true, then we simply apply cdef for the entire superblock row without
+    // any lag. In that case, apply cdef for the previous superblock row only
+    // during the first iteration (row4x4 == row4x4_start).
+    if (row4x4 > 0 && (!is_last_row || row4x4 == row4x4_start)) {
+      assert(row4x4 >= 16);
+      ApplyCdefForOneSuperBlockRowHelper(cdef_block_, nullptr, row4x4 - 2, 2);
+    }
+
+    // Apply cdef for the current superblock row. If this is the last
+    // superblock row we apply cdef for all the rows, otherwise we leave out
+    // the last 8 rows.
+    const int block_height4x4 =
+        std::min(kStep64x64, frame_header_.rows4x4 - row4x4);
+    const int height4x4 = block_height4x4 - (is_last_row ? 0 : 2);
+    if (height4x4 > 0) {
+      ApplyCdefForOneSuperBlockRowHelper(cdef_block_, nullptr, row4x4,
+                                         height4x4);
+    }
+    row4x4 += kStep64x64;
+  } while (row4x4 < row4x4_limit);
+}
+
+void PostFilter::ApplyCdefWorker(std::atomic<int>* row4x4_atomic) {
+  int row4x4;
+  uint16_t cdef_block[kCdefUnitSizeWithBorders * kCdefUnitSizeWithBorders * 2];
+  // Each border_column buffer has to store 64 rows and 2 columns for each
+  // plane. For 10bit, that is 64 * 2 * 2 = 256 bytes.
+  alignas(kMaxAlignment) uint8_t border_columns[2][kMaxPlanes][256];
+  while ((row4x4 = row4x4_atomic->fetch_add(
+              kStep64x64, std::memory_order_relaxed)) <
+         frame_header_.rows4x4) {
+    const int block_height4x4 =
+        std::min(kStep64x64, frame_header_.rows4x4 - row4x4);
+    ApplyCdefForOneSuperBlockRowHelper(cdef_block, border_columns, row4x4,
+                                       block_height4x4);
+  }
+}
+
+}  // namespace libgav1
diff --git a/src/post_filter/deblock.cc b/src/post_filter/deblock.cc
new file mode 100644
index 0000000..48ad823
--- /dev/null
+++ b/src/post_filter/deblock.cc
@@ -0,0 +1,507 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <algorithm>
+
+#include "src/post_filter.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr uint8_t HevThresh(int level) { return DivideBy16(level); }
+
+// GetLoopFilterSize* functions depend on this exact ordering of the
+// LoopFilterSize enums.
+static_assert(dsp::kLoopFilterSize4 == 0, "");
+static_assert(dsp::kLoopFilterSize6 == 1, "");
+static_assert(dsp::kLoopFilterSize8 == 2, "");
+static_assert(dsp::kLoopFilterSize14 == 3, "");
+
+dsp::LoopFilterSize GetLoopFilterSizeY(int filter_length) {
+  // |filter_length| must be a power of 2.
+  assert((filter_length & (filter_length - 1)) == 0);
+  // This code is the branch free equivalent of:
+  //   if (filter_length == 4) return kLoopFilterSize4;
+  //   if (filter_length == 8) return kLoopFilterSize8;
+  //   return kLoopFilterSize14;
+  return static_cast<dsp::LoopFilterSize>(
+      MultiplyBy2(static_cast<int>(filter_length > 4)) +
+      static_cast<int>(filter_length > 8));
+}
+
+constexpr dsp::LoopFilterSize GetLoopFilterSizeUV(int filter_length) {
+  // For the U & V planes, the size is kLoopFilterSize4 if |filter_length| is
+  // 4, otherwise the size is kLoopFilterSize6.
+  return static_cast<dsp::LoopFilterSize>(filter_length != 4);
+}
+
+bool NonBlockBorderNeedsFilter(const BlockParameters& bp, int filter_id,
+                               uint8_t* const level) {
+  if (bp.deblock_filter_level[filter_id] == 0 || (bp.skip && bp.is_inter)) {
+    return false;
+  }
+  *level = bp.deblock_filter_level[filter_id];
+  return true;
+}
+
+// 7.14.5.
+void ComputeDeblockFilterLevelsHelper(
+    const ObuFrameHeader& frame_header, int segment_id, int level_index,
+    const int8_t delta_lf[kFrameLfCount],
+    uint8_t deblock_filter_levels[kNumReferenceFrameTypes][2]) {
+  const int delta = delta_lf[frame_header.delta_lf.multi ? level_index : 0];
+  uint8_t level = Clip3(frame_header.loop_filter.level[level_index] + delta, 0,
+                        kMaxLoopFilterValue);
+  const auto feature = static_cast<SegmentFeature>(
+      kSegmentFeatureLoopFilterYVertical + level_index);
+  level =
+      Clip3(level + frame_header.segmentation.feature_data[segment_id][feature],
+            0, kMaxLoopFilterValue);
+  if (!frame_header.loop_filter.delta_enabled) {
+    static_assert(sizeof(deblock_filter_levels[0][0]) == 1, "");
+    memset(deblock_filter_levels, level, kNumReferenceFrameTypes * 2);
+    return;
+  }
+  assert(frame_header.loop_filter.delta_enabled);
+  const int shift = level >> 5;
+  deblock_filter_levels[kReferenceFrameIntra][0] = Clip3(
+      level +
+          LeftShift(frame_header.loop_filter.ref_deltas[kReferenceFrameIntra],
+                    shift),
+      0, kMaxLoopFilterValue);
+  // deblock_filter_levels[kReferenceFrameIntra][1] is never used. So it does
+  // not have to be populated.
+  for (int reference_frame = kReferenceFrameIntra + 1;
+       reference_frame < kNumReferenceFrameTypes; ++reference_frame) {
+    for (int mode_id = 0; mode_id < 2; ++mode_id) {
+      deblock_filter_levels[reference_frame][mode_id] = Clip3(
+          level +
+              LeftShift(frame_header.loop_filter.ref_deltas[reference_frame] +
+                            frame_header.loop_filter.mode_deltas[mode_id],
+                        shift),
+          0, kMaxLoopFilterValue);
+    }
+  }
+}
+
+}  // namespace
+
+void PostFilter::ComputeDeblockFilterLevels(
+    const int8_t delta_lf[kFrameLfCount],
+    uint8_t deblock_filter_levels[kMaxSegments][kFrameLfCount]
+                                 [kNumReferenceFrameTypes][2]) const {
+  if (!DoDeblock()) return;
+  const int num_segments =
+      frame_header_.segmentation.enabled ? kMaxSegments : 1;
+  for (int segment_id = 0; segment_id < num_segments; ++segment_id) {
+    int level_index = 0;
+    for (; level_index < 2; ++level_index) {
+      ComputeDeblockFilterLevelsHelper(
+          frame_header_, segment_id, level_index, delta_lf,
+          deblock_filter_levels[segment_id][level_index]);
+    }
+    for (; level_index < kFrameLfCount; ++level_index) {
+      if (frame_header_.loop_filter.level[level_index] != 0) {
+        ComputeDeblockFilterLevelsHelper(
+            frame_header_, segment_id, level_index, delta_lf,
+            deblock_filter_levels[segment_id][level_index]);
+      }
+    }
+  }
+}
+
+bool PostFilter::GetHorizontalDeblockFilterEdgeInfo(int row4x4, int column4x4,
+                                                    uint8_t* level, int* step,
+                                                    int* filter_length) const {
+  *step = kTransformHeight[inter_transform_sizes_[row4x4][column4x4]];
+  if (row4x4 == 0) return false;
+
+  const BlockParameters* bp = block_parameters_.Find(row4x4, column4x4);
+  const int row4x4_prev = row4x4 - 1;
+  assert(row4x4_prev >= 0);
+  const BlockParameters* bp_prev =
+      block_parameters_.Find(row4x4_prev, column4x4);
+
+  if (bp == bp_prev) {
+    // Not a border.
+    if (!NonBlockBorderNeedsFilter(*bp, 1, level)) return false;
+  } else {
+    const uint8_t level_this = bp->deblock_filter_level[1];
+    *level = level_this;
+    if (level_this == 0) {
+      const uint8_t level_prev = bp_prev->deblock_filter_level[1];
+      if (level_prev == 0) return false;
+      *level = level_prev;
+    }
+  }
+  const int step_prev =
+      kTransformHeight[inter_transform_sizes_[row4x4_prev][column4x4]];
+  *filter_length = std::min(*step, step_prev);
+  return true;
+}
+
+void PostFilter::GetHorizontalDeblockFilterEdgeInfoUV(
+    int row4x4, int column4x4, uint8_t* level_u, uint8_t* level_v, int* step,
+    int* filter_length) const {
+  const int subsampling_x = subsampling_x_[kPlaneU];
+  const int subsampling_y = subsampling_y_[kPlaneU];
+  row4x4 = GetDeblockPosition(row4x4, subsampling_y);
+  column4x4 = GetDeblockPosition(column4x4, subsampling_x);
+  const BlockParameters* bp = block_parameters_.Find(row4x4, column4x4);
+  *level_u = 0;
+  *level_v = 0;
+  *step = kTransformHeight[bp->uv_transform_size];
+  if (row4x4 == subsampling_y) {
+    return;
+  }
+
+  bool need_filter_u = frame_header_.loop_filter.level[kPlaneU + 1] != 0;
+  bool need_filter_v = frame_header_.loop_filter.level[kPlaneV + 1] != 0;
+  assert(need_filter_u || need_filter_v);
+  const int filter_id_u =
+      kDeblockFilterLevelIndex[kPlaneU][kLoopFilterTypeHorizontal];
+  const int filter_id_v =
+      kDeblockFilterLevelIndex[kPlaneV][kLoopFilterTypeHorizontal];
+  const int row4x4_prev = row4x4 - (1 << subsampling_y);
+  assert(row4x4_prev >= 0);
+  const BlockParameters* bp_prev =
+      block_parameters_.Find(row4x4_prev, column4x4);
+
+  if (bp == bp_prev) {
+    // Not a border.
+    const bool skip = bp->skip && bp->is_inter;
+    need_filter_u =
+        need_filter_u && bp->deblock_filter_level[filter_id_u] != 0 && !skip;
+    need_filter_v =
+        need_filter_v && bp->deblock_filter_level[filter_id_v] != 0 && !skip;
+    if (!need_filter_u && !need_filter_v) return;
+    if (need_filter_u) *level_u = bp->deblock_filter_level[filter_id_u];
+    if (need_filter_v) *level_v = bp->deblock_filter_level[filter_id_v];
+    *filter_length = *step;
+    return;
+  }
+
+  // It is a border.
+ if (need_filter_u) { + const uint8_t level_u_this = bp->deblock_filter_level[filter_id_u]; + *level_u = level_u_this; + if (level_u_this == 0) { + *level_u = bp_prev->deblock_filter_level[filter_id_u]; + } + } + if (need_filter_v) { + const uint8_t level_v_this = bp->deblock_filter_level[filter_id_v]; + *level_v = level_v_this; + if (level_v_this == 0) { + *level_v = bp_prev->deblock_filter_level[filter_id_v]; + } + } + const int step_prev = kTransformHeight[bp_prev->uv_transform_size]; + *filter_length = std::min(*step, step_prev); +} + +bool PostFilter::GetVerticalDeblockFilterEdgeInfo( + int row4x4, int column4x4, BlockParameters* const* bp_ptr, uint8_t* level, + int* step, int* filter_length) const { + const BlockParameters* bp = *bp_ptr; + *step = kTransformWidth[inter_transform_sizes_[row4x4][column4x4]]; + if (column4x4 == 0) return false; + + const int filter_id = 0; + const int column4x4_prev = column4x4 - 1; + assert(column4x4_prev >= 0); + const BlockParameters* bp_prev = *(bp_ptr - 1); + if (bp == bp_prev) { + // Not a border. + if (!NonBlockBorderNeedsFilter(*bp, filter_id, level)) return false; + } else { + // It is a border. + const uint8_t level_this = bp->deblock_filter_level[filter_id]; + *level = level_this; + if (level_this == 0) { + const uint8_t level_prev = bp_prev->deblock_filter_level[filter_id]; + if (level_prev == 0) return false; + *level = level_prev; + } + } + const int step_prev = + kTransformWidth[inter_transform_sizes_[row4x4][column4x4_prev]]; + *filter_length = std::min(*step, step_prev); + return true; +} + +void PostFilter::GetVerticalDeblockFilterEdgeInfoUV( + int column4x4, BlockParameters* const* bp_ptr, uint8_t* level_u, + uint8_t* level_v, int* step, int* filter_length) const { + const int subsampling_x = subsampling_x_[kPlaneU]; + column4x4 = GetDeblockPosition(column4x4, subsampling_x); + const BlockParameters* bp = *bp_ptr; + *level_u = 0; + *level_v = 0; + *step = kTransformWidth[bp->uv_transform_size]; + if (column4x4 == subsampling_x) { + return; + } + + bool need_filter_u = frame_header_.loop_filter.level[kPlaneU + 1] != 0; + bool need_filter_v = frame_header_.loop_filter.level[kPlaneV + 1] != 0; + assert(need_filter_u || need_filter_v); + const int filter_id_u = + kDeblockFilterLevelIndex[kPlaneU][kLoopFilterTypeVertical]; + const int filter_id_v = + kDeblockFilterLevelIndex[kPlaneV][kLoopFilterTypeVertical]; + const BlockParameters* bp_prev = *(bp_ptr - (ptrdiff_t{1} << subsampling_x)); + + if (bp == bp_prev) { + // Not a border. + const bool skip = bp->skip && bp->is_inter; + need_filter_u = + need_filter_u && bp->deblock_filter_level[filter_id_u] != 0 && !skip; + need_filter_v = + need_filter_v && bp->deblock_filter_level[filter_id_v] != 0 && !skip; + if (!need_filter_u && !need_filter_v) return; + if (need_filter_u) *level_u = bp->deblock_filter_level[filter_id_u]; + if (need_filter_v) *level_v = bp->deblock_filter_level[filter_id_v]; + *filter_length = *step; + return; + } + + // It is a border. 
+ if (need_filter_u) { + const uint8_t level_u_this = bp->deblock_filter_level[filter_id_u]; + *level_u = level_u_this; + if (level_u_this == 0) { + *level_u = bp_prev->deblock_filter_level[filter_id_u]; + } + } + if (need_filter_v) { + const uint8_t level_v_this = bp->deblock_filter_level[filter_id_v]; + *level_v = level_v_this; + if (level_v_this == 0) { + *level_v = bp_prev->deblock_filter_level[filter_id_v]; + } + } + const int step_prev = kTransformWidth[bp_prev->uv_transform_size]; + *filter_length = std::min(*step, step_prev); +} + +void PostFilter::HorizontalDeblockFilter(int row4x4_start, int row4x4_end, + int column4x4_start, + int column4x4_end) { + const int height4x4 = row4x4_end - row4x4_start; + const int width4x4 = column4x4_end - column4x4_start; + if (height4x4 <= 0 || width4x4 <= 0) return; + + const int column_step = 1; + const int src_step = 4 << pixel_size_log2_; + const ptrdiff_t src_stride = frame_buffer_.stride(kPlaneY); + uint8_t* src = GetSourceBuffer(kPlaneY, row4x4_start, column4x4_start); + int row_step; + uint8_t level; + int filter_length; + + const int width = frame_header_.width; + const int height = frame_header_.height; + for (int column4x4 = 0; + column4x4 < width4x4 && MultiplyBy4(column4x4_start + column4x4) < width; + column4x4 += column_step, src += src_step) { + uint8_t* src_row = src; + for (int row4x4 = 0; + row4x4 < height4x4 && MultiplyBy4(row4x4_start + row4x4) < height; + row4x4 += row_step) { + const bool need_filter = GetHorizontalDeblockFilterEdgeInfo( + row4x4_start + row4x4, column4x4_start + column4x4, &level, &row_step, + &filter_length); + if (need_filter) { + assert(level > 0 && level <= kMaxLoopFilterValue); + const dsp::LoopFilterSize size = GetLoopFilterSizeY(filter_length); + dsp_.loop_filters[size][kLoopFilterTypeHorizontal]( + src_row, src_stride, outer_thresh_[level], inner_thresh_[level], + HevThresh(level)); + } + // TODO(chengchen): use shifts instead of multiplication. 
+ src_row += row_step * src_stride; + row_step = DivideBy4(row_step); + } + } + + if (needs_chroma_deblock_) { + const int8_t subsampling_x = subsampling_x_[kPlaneU]; + const int8_t subsampling_y = subsampling_y_[kPlaneU]; + const int column_step = 1 << subsampling_x; + const ptrdiff_t src_stride_u = frame_buffer_.stride(kPlaneU); + const ptrdiff_t src_stride_v = frame_buffer_.stride(kPlaneV); + uint8_t* src_u = GetSourceBuffer(kPlaneU, row4x4_start, column4x4_start); + uint8_t* src_v = GetSourceBuffer(kPlaneV, row4x4_start, column4x4_start); + int row_step; + uint8_t level_u; + uint8_t level_v; + int filter_length; + + for (int column4x4 = 0; column4x4 < width4x4 && + MultiplyBy4(column4x4_start + column4x4) < width; + column4x4 += column_step, src_u += src_step, src_v += src_step) { + uint8_t* src_row_u = src_u; + uint8_t* src_row_v = src_v; + for (int row4x4 = 0; + row4x4 < height4x4 && MultiplyBy4(row4x4_start + row4x4) < height; + row4x4 += row_step) { + GetHorizontalDeblockFilterEdgeInfoUV( + row4x4_start + row4x4, column4x4_start + column4x4, &level_u, + &level_v, &row_step, &filter_length); + if (level_u != 0) { + const dsp::LoopFilterSize size = GetLoopFilterSizeUV(filter_length); + dsp_.loop_filters[size][kLoopFilterTypeHorizontal]( + src_row_u, src_stride_u, outer_thresh_[level_u], + inner_thresh_[level_u], HevThresh(level_u)); + } + if (level_v != 0) { + const dsp::LoopFilterSize size = GetLoopFilterSizeUV(filter_length); + dsp_.loop_filters[size][kLoopFilterTypeHorizontal]( + src_row_v, src_stride_v, outer_thresh_[level_v], + inner_thresh_[level_v], HevThresh(level_v)); + } + src_row_u += row_step * src_stride_u; + src_row_v += row_step * src_stride_v; + row_step = DivideBy4(row_step << subsampling_y); + } + } + } +} + +void PostFilter::VerticalDeblockFilter(int row4x4_start, int row4x4_end, + int column4x4_start, int column4x4_end) { + const int height4x4 = row4x4_end - row4x4_start; + const int width4x4 = column4x4_end - column4x4_start; + if (height4x4 <= 0 || width4x4 <= 0) return; + + const ptrdiff_t row_stride = MultiplyBy4(frame_buffer_.stride(kPlaneY)); + const ptrdiff_t src_stride = frame_buffer_.stride(kPlaneY); + uint8_t* src = GetSourceBuffer(kPlaneY, row4x4_start, column4x4_start); + int column_step; + uint8_t level; + int filter_length; + + BlockParameters* const* bp_row_base = + block_parameters_.Address(row4x4_start, column4x4_start); + const int bp_stride = block_parameters_.columns4x4(); + const int column_step_shift = pixel_size_log2_; + const int width = frame_header_.width; + const int height = frame_header_.height; + for (int row4x4 = 0; + row4x4 < height4x4 && MultiplyBy4(row4x4_start + row4x4) < height; + ++row4x4, src += row_stride, bp_row_base += bp_stride) { + uint8_t* src_row = src; + BlockParameters* const* bp = bp_row_base; + for (int column4x4 = 0; column4x4 < width4x4 && + MultiplyBy4(column4x4_start + column4x4) < width; + column4x4 += column_step, bp += column_step) { + const bool need_filter = GetVerticalDeblockFilterEdgeInfo( + row4x4_start + row4x4, column4x4_start + column4x4, bp, &level, + &column_step, &filter_length); + if (need_filter) { + assert(level > 0 && level <= kMaxLoopFilterValue); + const dsp::LoopFilterSize size = GetLoopFilterSizeY(filter_length); + dsp_.loop_filters[size][kLoopFilterTypeVertical]( + src_row, src_stride, outer_thresh_[level], inner_thresh_[level], + HevThresh(level)); + } + src_row += column_step << column_step_shift; + column_step = DivideBy4(column_step); + } + } + + if (needs_chroma_deblock_) { + const 
int8_t subsampling_x = subsampling_x_[kPlaneU];
+    const int8_t subsampling_y = subsampling_y_[kPlaneU];
+    const int row_step = 1 << subsampling_y;
+    uint8_t* src_u = GetSourceBuffer(kPlaneU, row4x4_start, column4x4_start);
+    uint8_t* src_v = GetSourceBuffer(kPlaneV, row4x4_start, column4x4_start);
+    const ptrdiff_t src_stride_u = frame_buffer_.stride(kPlaneU);
+    const ptrdiff_t src_stride_v = frame_buffer_.stride(kPlaneV);
+    const ptrdiff_t row_stride_u = MultiplyBy4(frame_buffer_.stride(kPlaneU));
+    const ptrdiff_t row_stride_v = MultiplyBy4(frame_buffer_.stride(kPlaneV));
+    const LoopFilterType type = kLoopFilterTypeVertical;
+    int column_step;
+    uint8_t level_u;
+    uint8_t level_v;
+    int filter_length;
+
+    BlockParameters* const* bp_row_base = block_parameters_.Address(
+        GetDeblockPosition(row4x4_start, subsampling_y),
+        GetDeblockPosition(column4x4_start, subsampling_x));
+    const int bp_stride = block_parameters_.columns4x4() << subsampling_y;
+    for (int row4x4 = 0;
+         row4x4 < height4x4 && MultiplyBy4(row4x4_start + row4x4) < height;
+         row4x4 += row_step, src_u += row_stride_u, src_v += row_stride_v,
+         bp_row_base += bp_stride) {
+      uint8_t* src_row_u = src_u;
+      uint8_t* src_row_v = src_v;
+      BlockParameters* const* bp = bp_row_base;
+      for (int column4x4 = 0; column4x4 < width4x4 &&
+                              MultiplyBy4(column4x4_start + column4x4) < width;
+           column4x4 += column_step, bp += column_step) {
+        GetVerticalDeblockFilterEdgeInfoUV(column4x4_start + column4x4, bp,
+                                           &level_u, &level_v, &column_step,
+                                           &filter_length);
+        if (level_u != 0) {
+          const dsp::LoopFilterSize size = GetLoopFilterSizeUV(filter_length);
+          dsp_.loop_filters[size][type](
+              src_row_u, src_stride_u, outer_thresh_[level_u],
+              inner_thresh_[level_u], HevThresh(level_u));
+        }
+        if (level_v != 0) {
+          const dsp::LoopFilterSize size = GetLoopFilterSizeUV(filter_length);
+          dsp_.loop_filters[size][type](
+              src_row_v, src_stride_v, outer_thresh_[level_v],
+              inner_thresh_[level_v], HevThresh(level_v));
+        }
+        src_row_u += column_step << column_step_shift;
+        src_row_v += column_step << column_step_shift;
+        column_step = DivideBy4(column_step << subsampling_x);
+      }
+    }
+  }
+}
+
+template <LoopFilterType loop_filter_type>
+void PostFilter::DeblockFilterWorker(std::atomic<int>* row4x4_atomic) {
+  const int rows4x4 = frame_header_.rows4x4;
+  const int columns4x4 = frame_header_.columns4x4;
+  int row4x4;
+  while ((row4x4 = row4x4_atomic->fetch_add(
+              kNum4x4InLoopFilterUnit, std::memory_order_relaxed)) < rows4x4) {
+    (this->*deblock_filter_func_[loop_filter_type])(
+        row4x4, row4x4 + kNum4x4InLoopFilterUnit, 0, columns4x4);
+  }
+}
+
+template void PostFilter::DeblockFilterWorker<kLoopFilterTypeVertical>(
+    std::atomic<int>* row4x4_atomic);
+template void PostFilter::DeblockFilterWorker<kLoopFilterTypeHorizontal>(
+    std::atomic<int>* row4x4_atomic);
+
+void PostFilter::ApplyDeblockFilter(LoopFilterType loop_filter_type,
+                                    int row4x4_start, int column4x4_start,
+                                    int column4x4_end, int sb4x4) {
+  assert(row4x4_start >= 0);
+  assert(DoDeblock());
+  column4x4_end =
+      std::min(Align(column4x4_end, static_cast<int>(kNum4x4InLoopFilterUnit)),
+               frame_header_.columns4x4);
+  if (column4x4_start >= column4x4_end) return;
+  (this->*deblock_filter_func_[loop_filter_type])(
+      row4x4_start, row4x4_start + sb4x4, column4x4_start, column4x4_end);
+}
+
+}  // namespace libgav1
diff --git a/src/post_filter/deblock_thresholds.inc b/src/post_filter/deblock_thresholds.inc
new file mode 100644
index 0000000..ca12aaa
--- /dev/null
+++ b/src/post_filter/deblock_thresholds.inc
@@ -0,0 +1,85 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License,
Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Thresholds for the deblocking filter. Precomputed values of part of Section +// 7.14.4 for all possible values of sharpness. + +constexpr uint8_t kInnerThresh[8][kMaxLoopFilterValue + 1] = { + {1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}, + {1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}, + {1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7}, + {1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}, + {1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}, + {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4}, + {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, + {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}}; + +constexpr uint8_t kOuterThresh[8][kMaxLoopFilterValue + 1] = { + {5, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, + 43, 46, 49, 52, 55, 58, 61, 64, 67, 70, 73, 76, 79, + 82, 85, 88, 91, 94, 97, 100, 103, 106, 109, 112, 115, 118, + 121, 124, 127, 130, 133, 136, 139, 142, 145, 148, 151, 154, 157, + 160, 163, 166, 169, 172, 175, 178, 181, 184, 187, 190, 193}, + {5, 7, 9, 11, 14, 16, 19, 21, 24, 26, 29, 31, 34, + 36, 39, 41, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, + 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, + 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, + 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138}, + {5, 7, 9, 11, 14, 16, 19, 21, 24, 26, 29, 31, 34, + 36, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, + 63, 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, + 89, 91, 93, 95, 97, 99, 101, 103, 105, 107, 109, 111, 113, + 115, 117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137}, + {5, 7, 9, 11, 14, 16, 19, 21, 24, 26, 29, 31, 34, + 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, + 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, + 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, + 114, 116, 118, 120, 
122, 124, 126, 128, 130, 132, 134, 136}, + {5, 7, 9, 11, 14, 16, 19, 21, 24, 26, 29, 31, 33, + 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, + 61, 63, 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, + 87, 89, 91, 93, 95, 97, 99, 101, 103, 105, 107, 109, 111, + 113, 115, 117, 119, 121, 123, 125, 127, 129, 131, 133, 135}, + {5, 7, 9, 11, 13, 15, 17, 19, 22, 24, 26, 28, 31, + 33, 35, 37, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, + 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, + 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, + 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134}, + {5, 7, 9, 11, 13, 15, 17, 19, 22, 24, 26, 28, 31, + 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, + 59, 61, 63, 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, + 85, 87, 89, 91, 93, 95, 97, 99, 101, 103, 105, 107, 109, + 111, 113, 115, 117, 119, 121, 123, 125, 127, 129, 131, 133}, + {5, 7, 9, 11, 13, 15, 17, 19, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, + 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, + 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, + 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132}}; diff --git a/src/post_filter/loop_restoration.cc b/src/post_filter/loop_restoration.cc new file mode 100644 index 0000000..2e6982c --- /dev/null +++ b/src/post_filter/loop_restoration.cc @@ -0,0 +1,178 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
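+
+// Illustrative note (editor sketch, not part of the original source):
+// restoration units are anchored kRestorationUnitOffset (8) luma pixels above
+// the superblock grid. A minimal sketch of the row-to-unit mapping used by
+// ApplyLoopRestorationForOneSuperBlockRow() below, assuming a luma
+// |unit_height_offset| of 8 and clamping to the last unit row:
+//
+//   int UnitRow(int y, int unit_size_log2, int num_vertical_units) {
+//     return std::min((y + 8) >> unit_size_log2, num_vertical_units - 1);
+//   }
+//
+// For 64x64 units (unit_size_log2 == 6) and 4 unit rows:
+//   UnitRow(0, 6, 4) == 0, UnitRow(56, 6, 4) == 1, UnitRow(120, 6, 4) == 2.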
+#include "src/post_filter.h" +#include "src/utils/blocking_counter.h" + +namespace libgav1 { + +template +void PostFilter::ApplyLoopRestorationForOneRow( + const Pixel* src_buffer, const ptrdiff_t stride, const Plane plane, + const int plane_height, const int plane_width, const int unit_y, + const int unit_row, const int current_process_unit_height, + const int plane_unit_size, Pixel* dst_buffer) { + const int num_horizontal_units = + restoration_info_->num_horizontal_units(static_cast(plane)); + const RestorationUnitInfo* const restoration_info = + restoration_info_->loop_restoration_info(static_cast(plane), + unit_row * num_horizontal_units); + const bool in_place = DoCdef() || thread_pool_ != nullptr; + const Pixel* border = nullptr; + ptrdiff_t border_stride = 0; + src_buffer += unit_y * stride; + if (in_place) { + const int border_unit_y = std::max( + RightShiftWithCeiling(unit_y, 4 - subsampling_y_[plane]) - 4, 0); + border_stride = loop_restoration_border_.stride(plane) / sizeof(Pixel); + border = + reinterpret_cast(loop_restoration_border_.data(plane)) + + border_unit_y * border_stride; + } + int unit_column = 0; + int column = 0; + do { + const int current_process_unit_width = + std::min(plane_unit_size, plane_width - column); + const Pixel* src = src_buffer + column; + unit_column = std::min(unit_column, num_horizontal_units - 1); + if (restoration_info[unit_column].type == kLoopRestorationTypeNone) { + Pixel* dst = dst_buffer + column; + if (in_place) { + int k = current_process_unit_height; + do { + memmove(dst, src, current_process_unit_width * sizeof(Pixel)); + src += stride; + dst += stride; + } while (--k != 0); + } else { + CopyPlane(src, stride, current_process_unit_width, + current_process_unit_height, dst, stride); + } + } else { + const Pixel* top_border = src - kRestorationVerticalBorder * stride; + ptrdiff_t top_border_stride = stride; + const Pixel* bottom_border = src + current_process_unit_height * stride; + ptrdiff_t bottom_border_stride = stride; + const bool frame_bottom_border = + (unit_y + current_process_unit_height >= plane_height); + if (in_place && (unit_y != 0 || !frame_bottom_border)) { + const Pixel* loop_restoration_border = border + column; + if (unit_y != 0) { + top_border = loop_restoration_border; + top_border_stride = border_stride; + loop_restoration_border += 4 * border_stride; + } + if (!frame_bottom_border) { + bottom_border = loop_restoration_border + + kRestorationVerticalBorder * border_stride; + bottom_border_stride = border_stride; + } + } + RestorationBuffer restoration_buffer; + const LoopRestorationType type = restoration_info[unit_column].type; + assert(type == kLoopRestorationTypeSgrProj || + type == kLoopRestorationTypeWiener); + const dsp::LoopRestorationFunc restoration_func = + dsp_.loop_restorations[type - 2]; + restoration_func(restoration_info[unit_column], src, stride, top_border, + top_border_stride, bottom_border, bottom_border_stride, + current_process_unit_width, current_process_unit_height, + &restoration_buffer, dst_buffer + column); + } + ++unit_column; + column += plane_unit_size; + } while (column < plane_width); +} + +template +void PostFilter::ApplyLoopRestorationForOneSuperBlockRow(const int row4x4_start, + const int sb4x4) { + assert(row4x4_start >= 0); + assert(DoRestoration()); + int plane = kPlaneY; + const int upscaled_width = frame_header_.upscaled_width; + const int height = frame_header_.height; + do { + if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) { + continue; + } + const ptrdiff_t 
+
+template <typename Pixel>
+void PostFilter::ApplyLoopRestorationForOneSuperBlockRow(
+    const int row4x4_start, const int sb4x4) {
+  assert(row4x4_start >= 0);
+  assert(DoRestoration());
+  int plane = kPlaneY;
+  const int upscaled_width = frame_header_.upscaled_width;
+  const int height = frame_header_.height;
+  do {
+    if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) {
+      continue;
+    }
+    const ptrdiff_t stride = frame_buffer_.stride(plane) / sizeof(Pixel);
+    const int unit_height_offset =
+        kRestorationUnitOffset >> subsampling_y_[plane];
+    const int plane_height = SubsampledValue(height, subsampling_y_[plane]);
+    const int plane_width =
+        SubsampledValue(upscaled_width, subsampling_x_[plane]);
+    const int plane_unit_size = 1 << loop_restoration_.unit_size_log2[plane];
+    const int plane_process_unit_height =
+        kRestorationUnitHeight >> subsampling_y_[plane];
+    int y = (row4x4_start == 0)
+                ? 0
+                : (MultiplyBy4(row4x4_start) >> subsampling_y_[plane]) -
+                      unit_height_offset;
+    int expected_height = plane_process_unit_height -
+                          ((row4x4_start == 0) ? unit_height_offset : 0);
+    int current_process_unit_height;
+    for (int sb_y = 0; sb_y < sb4x4;
+         sb_y += 16, y += current_process_unit_height) {
+      if (y >= plane_height) break;
+      const int unit_row = std::min(
+          (y + unit_height_offset) >> loop_restoration_.unit_size_log2[plane],
+          restoration_info_->num_vertical_units(static_cast<Plane>(plane)) -
+              1);
+      current_process_unit_height = std::min(expected_height, plane_height - y);
+      expected_height = plane_process_unit_height;
+      ApplyLoopRestorationForOneRow<Pixel>(
+          reinterpret_cast<Pixel*>(superres_buffer_[plane]), stride,
+          static_cast<Plane>(plane), plane_height, plane_width, y, unit_row,
+          current_process_unit_height, plane_unit_size,
+          reinterpret_cast<Pixel*>(loop_restoration_buffer_[plane]) +
+              y * stride);
+    }
+  } while (++plane < planes_);
+}
+
+void PostFilter::ApplyLoopRestoration(const int row4x4_start, const int sb4x4) {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  if (bitdepth_ >= 10) {
+    ApplyLoopRestorationForOneSuperBlockRow<uint16_t>(row4x4_start, sb4x4);
+    return;
+  }
+#endif
+  ApplyLoopRestorationForOneSuperBlockRow<uint8_t>(row4x4_start, sb4x4);
+}
+
+void PostFilter::ApplyLoopRestorationWorker(std::atomic<int>* row4x4_atomic) {
+  int row4x4;
+  // Loop restoration operates with a lag of 8 rows (4 for chroma with
+  // subsampling), so the last 8 rows of the last superblock row must also be
+  // covered. Run this loop for one extra iteration to accomplish that.
+  const int row4x4_end = frame_header_.rows4x4 + kNum4x4InLoopRestorationUnit;
+  while ((row4x4 = row4x4_atomic->fetch_add(kNum4x4InLoopRestorationUnit,
+                                            std::memory_order_relaxed)) <
+         row4x4_end) {
+    CopyBordersForOneSuperBlockRow(row4x4, kNum4x4InLoopRestorationUnit,
+                                   /*for_loop_restoration=*/true);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    if (bitdepth_ >= 10) {
+      ApplyLoopRestorationForOneSuperBlockRow<uint16_t>(
+          row4x4, kNum4x4InLoopRestorationUnit);
+      continue;
+    }
+#endif
+    ApplyLoopRestorationForOneSuperBlockRow<uint8_t>(
+        row4x4, kNum4x4InLoopRestorationUnit);
+  }
+}
+
+}  // namespace libgav1
diff --git a/src/post_filter/post_filter.cc b/src/post_filter/post_filter.cc
new file mode 100644
index 0000000..bc71410
--- /dev/null
+++ b/src/post_filter/post_filter.cc
@@ -0,0 +1,626 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
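+
+// Illustrative note (editor sketch; kSuperResScaleBits == 14 is an
+// assumption here): the SuperRes |step| computed in the constructor below is
+// a 14-bit fixed-point horizontal increment. For a luma plane downscaled from
+// 1920 to 960 columns:
+//   superres_width = 960 << 14                    = 15728640
+//   step           = (15728640 + 1920 / 2) / 1920 = 8192
+// so each upscaled output pixel advances half an input pixel.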
+ +#include "src/post_filter.h" + +#include +#include +#include +#include +#include +#include + +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/array_2d.h" +#include "src/utils/blocking_counter.h" +#include "src/utils/common.h" +#include "src/utils/compiler_attributes.h" +#include "src/utils/constants.h" +#include "src/utils/memory.h" +#include "src/utils/types.h" + +namespace libgav1 { +namespace { + +// Import all the constants in the anonymous namespace. +#include "src/post_filter/deblock_thresholds.inc" + +// Row indices of loop restoration border. This is used to populate the +// |loop_restoration_border_| when either cdef is on or multithreading is +// enabled. The dimension is subsampling_y. +constexpr int kLoopRestorationBorderRows[2] = {54, 26}; + +} // namespace + +PostFilter::PostFilter(const ObuFrameHeader& frame_header, + const ObuSequenceHeader& sequence_header, + FrameScratchBuffer* const frame_scratch_buffer, + YuvBuffer* const frame_buffer, const dsp::Dsp* dsp, + int do_post_filter_mask) + : frame_header_(frame_header), + loop_restoration_(frame_header.loop_restoration), + dsp_(*dsp), + bitdepth_(sequence_header.color_config.bitdepth), + subsampling_x_{0, sequence_header.color_config.subsampling_x, + sequence_header.color_config.subsampling_x}, + subsampling_y_{0, sequence_header.color_config.subsampling_y, + sequence_header.color_config.subsampling_y}, + planes_(sequence_header.color_config.is_monochrome ? kMaxPlanesMonochrome + : kMaxPlanes), + pixel_size_log2_(static_cast((bitdepth_ == 8) ? sizeof(uint8_t) + : sizeof(uint16_t)) - + 1), + inner_thresh_(kInnerThresh[frame_header.loop_filter.sharpness]), + outer_thresh_(kOuterThresh[frame_header.loop_filter.sharpness]), + needs_chroma_deblock_(frame_header.loop_filter.level[kPlaneU + 1] != 0 || + frame_header.loop_filter.level[kPlaneV + 1] != 0), + do_cdef_(DoCdef(frame_header, do_post_filter_mask)), + do_deblock_(DoDeblock(frame_header, do_post_filter_mask)), + do_restoration_( + DoRestoration(loop_restoration_, do_post_filter_mask, planes_)), + do_superres_(DoSuperRes(frame_header, do_post_filter_mask)), + cdef_index_(frame_scratch_buffer->cdef_index), + cdef_skip_(frame_scratch_buffer->cdef_skip), + inter_transform_sizes_(frame_scratch_buffer->inter_transform_sizes), + restoration_info_(&frame_scratch_buffer->loop_restoration_info), + superres_coefficients_{ + frame_scratch_buffer->superres_coefficients[kPlaneTypeY].get(), + frame_scratch_buffer + ->superres_coefficients + [(sequence_header.color_config.is_monochrome || + sequence_header.color_config.subsampling_x == 0) + ? 
kPlaneTypeY + : kPlaneTypeUV] + .get()}, + superres_line_buffer_(frame_scratch_buffer->superres_line_buffer), + block_parameters_(frame_scratch_buffer->block_parameters_holder), + frame_buffer_(*frame_buffer), + cdef_border_(frame_scratch_buffer->cdef_border), + loop_restoration_border_(frame_scratch_buffer->loop_restoration_border), + thread_pool_( + frame_scratch_buffer->threading_strategy.post_filter_thread_pool()) { + const int8_t zero_delta_lf[kFrameLfCount] = {}; + ComputeDeblockFilterLevels(zero_delta_lf, deblock_filter_levels_); + if (DoSuperRes()) { + int plane = kPlaneY; + const int width = frame_header_.width; + const int upscaled_width_fh = frame_header_.upscaled_width; + do { + const int downscaled_width = + SubsampledValue(width, subsampling_x_[plane]); + const int upscaled_width = + SubsampledValue(upscaled_width_fh, subsampling_x_[plane]); + const int superres_width = downscaled_width << kSuperResScaleBits; + super_res_info_[plane].step = + (superres_width + upscaled_width / 2) / upscaled_width; + const int error = + super_res_info_[plane].step * upscaled_width - superres_width; + super_res_info_[plane].initial_subpixel_x = + ((-((upscaled_width - downscaled_width) << (kSuperResScaleBits - 1)) + + DivideBy2(upscaled_width)) / + upscaled_width + + (1 << (kSuperResExtraBits - 1)) - error / 2) & + kSuperResScaleMask; + super_res_info_[plane].upscaled_width = upscaled_width; + } while (++plane < planes_); + if (dsp->super_res_coefficients != nullptr) { + int plane = kPlaneY; + const int number_loops = (superres_coefficients_[kPlaneTypeY] == + superres_coefficients_[kPlaneTypeUV]) + ? kMaxPlanesMonochrome + : static_cast(kNumPlaneTypes); + do { + dsp->super_res_coefficients(super_res_info_[plane].upscaled_width, + super_res_info_[plane].initial_subpixel_x, + super_res_info_[plane].step, + superres_coefficients_[plane]); + } while (++plane < number_loops); + } + } + int plane = kPlaneY; + do { + loop_restoration_buffer_[plane] = frame_buffer_.data(plane); + cdef_buffer_[plane] = frame_buffer_.data(plane); + superres_buffer_[plane] = frame_buffer_.data(plane); + source_buffer_[plane] = frame_buffer_.data(plane); + } while (++plane < planes_); + if (DoCdef() || DoRestoration() || DoSuperRes()) { + plane = kPlaneY; + const int pixel_size_log2 = pixel_size_log2_; + do { + int horizontal_shift = 0; + int vertical_shift = 0; + if (DoRestoration() && + loop_restoration_.type[plane] != kLoopRestorationTypeNone) { + horizontal_shift += frame_buffer_.alignment(); + if (!DoCdef() && thread_pool_ == nullptr) { + vertical_shift += kRestorationVerticalBorder; + } + superres_buffer_[plane] += + vertical_shift * frame_buffer_.stride(plane) + + (horizontal_shift << pixel_size_log2); + } + if (DoSuperRes()) { + vertical_shift += kSuperResVerticalBorder; + } + cdef_buffer_[plane] += vertical_shift * frame_buffer_.stride(plane) + + (horizontal_shift << pixel_size_log2); + if (DoCdef() && thread_pool_ == nullptr) { + horizontal_shift += frame_buffer_.alignment(); + vertical_shift += kCdefBorder; + } + assert(horizontal_shift <= frame_buffer_.right_border(plane)); + assert(vertical_shift <= frame_buffer_.bottom_border(plane)); + source_buffer_[plane] += vertical_shift * frame_buffer_.stride(plane) + + (horizontal_shift << pixel_size_log2); + } while (++plane < planes_); + } +} + +// The following example illustrates how ExtendFrame() extends a frame. +// Suppose the frame width is 8 and height is 4, and left, right, top, and +// bottom are all equal to 3. 
+// +// Before: +// +// ABCDEFGH +// IJKLMNOP +// QRSTUVWX +// YZabcdef +// +// After: +// +// AAA|ABCDEFGH|HHH [3] +// AAA|ABCDEFGH|HHH +// AAA|ABCDEFGH|HHH +// ---+--------+--- +// AAA|ABCDEFGH|HHH [1] +// III|IJKLMNOP|PPP +// QQQ|QRSTUVWX|XXX +// YYY|YZabcdef|fff +// ---+--------+--- +// YYY|YZabcdef|fff [2] +// YYY|YZabcdef|fff +// YYY|YZabcdef|fff +// +// ExtendFrame() first extends the rows to the left and to the right[1]. Then +// it copies the extended last row to the bottom borders[2]. Finally it copies +// the extended first row to the top borders[3]. +// static +template +void PostFilter::ExtendFrame(Pixel* const frame_start, const int width, + const int height, const ptrdiff_t stride, + const int left, const int right, const int top, + const int bottom) { + Pixel* src = frame_start; + // Copy to left and right borders. + int y = height; + do { + ExtendLine(src, width, left, right); + src += stride; + } while (--y != 0); + // Copy to bottom borders. For performance we copy |stride| pixels + // (including some padding pixels potentially) in each row, ending at the + // bottom right border pixel. In the diagram the asterisks indicate padding + // pixels. + // + // |<--- stride --->| + // **YYY|YZabcdef|fff <-- Copy from the extended last row. + // -----+--------+--- + // **YYY|YZabcdef|fff + // **YYY|YZabcdef|fff + // **YYY|YZabcdef|fff <-- bottom right border pixel + assert(src == frame_start + height * stride); + Pixel* dst = src - left; + src = dst - stride; + for (int y = 0; y < bottom; ++y) { + memcpy(dst, src, sizeof(Pixel) * stride); + dst += stride; + } + // Copy to top borders. For performance we copy |stride| pixels (including + // some padding pixels potentially) in each row, starting from the top left + // border pixel. In the diagram the asterisks indicate padding pixels. + // + // +-- top left border pixel + // | + // v + // AAA|ABCDEFGH|HHH** + // AAA|ABCDEFGH|HHH** + // AAA|ABCDEFGH|HHH** + // ---+--------+----- + // AAA|ABCDEFGH|HHH** <-- Copy from the extended first row. 
+ // |<--- stride --->| + src = frame_start - left; + dst = frame_start - left - top * stride; + for (int y = 0; y < top; ++y) { + memcpy(dst, src, sizeof(Pixel) * stride); + dst += stride; + } +} + +template void PostFilter::ExtendFrame(uint8_t* const frame_start, + const int width, + const int height, + const ptrdiff_t stride, + const int left, const int right, + const int top, const int bottom); + +#if LIBGAV1_MAX_BITDEPTH >= 10 +template void PostFilter::ExtendFrame( + uint16_t* const frame_start, const int width, const int height, + const ptrdiff_t stride, const int left, const int right, const int top, + const int bottom); +#endif + +void PostFilter::ExtendFrameBoundary(uint8_t* const frame_start, + const int width, const int height, + const ptrdiff_t stride, const int left, + const int right, const int top, + const int bottom) const { +#if LIBGAV1_MAX_BITDEPTH >= 10 + if (bitdepth_ >= 10) { + ExtendFrame(reinterpret_cast(frame_start), width, + height, stride >> 1, left, right, top, bottom); + return; + } +#endif + ExtendFrame(frame_start, width, height, stride, left, right, top, + bottom); +} + +void PostFilter::ExtendBordersForReferenceFrame() { + if (frame_header_.refresh_frame_flags == 0) return; + const int upscaled_width = frame_header_.upscaled_width; + const int height = frame_header_.height; + int plane = kPlaneY; + do { + const int plane_width = + SubsampledValue(upscaled_width, subsampling_x_[plane]); + const int plane_height = SubsampledValue(height, subsampling_y_[plane]); + assert(frame_buffer_.left_border(plane) >= kMinLeftBorderPixels && + frame_buffer_.right_border(plane) >= kMinRightBorderPixels && + frame_buffer_.top_border(plane) >= kMinTopBorderPixels && + frame_buffer_.bottom_border(plane) >= kMinBottomBorderPixels); + // plane subsampling_x_ left_border + // Y N/A 64, 48 + // U,V 0 64, 48 + // U,V 1 32, 16 + assert(frame_buffer_.left_border(plane) >= 16); + // The |left| argument to ExtendFrameBoundary() must be at least + // kMinLeftBorderPixels (13) for warp. + static_assert(16 >= kMinLeftBorderPixels, ""); + ExtendFrameBoundary( + frame_buffer_.data(plane), plane_width, plane_height, + frame_buffer_.stride(plane), frame_buffer_.left_border(plane), + frame_buffer_.right_border(plane), frame_buffer_.top_border(plane), + frame_buffer_.bottom_border(plane)); + } while (++plane < planes_); +} + +void PostFilter::CopyDeblockedPixels(Plane plane, int row4x4) { + const ptrdiff_t src_stride = frame_buffer_.stride(plane); + const uint8_t* const src = GetSourceBuffer(plane, row4x4, 0); + const int row_offset = DivideBy4(row4x4); + const ptrdiff_t dst_stride = loop_restoration_border_.stride(plane); + uint8_t* dst = loop_restoration_border_.data(plane) + row_offset * dst_stride; + const int num_pixels = SubsampledValue(MultiplyBy4(frame_header_.columns4x4), + subsampling_x_[plane]); + const int row_width = num_pixels << pixel_size_log2_; + int last_valid_row = -1; + const int plane_height = + SubsampledValue(frame_header_.height, subsampling_y_[plane]); + int row = kLoopRestorationBorderRows[subsampling_y_[plane]]; + const int absolute_row = (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row; + for (int i = 0; i < 4; ++i, ++row) { + if (absolute_row + i >= plane_height) { + if (last_valid_row == -1) break; + // If we run out of rows, copy the last valid row (mimics the bottom + // border extension). 
+ row = last_valid_row; + } + memcpy(dst, src + row * src_stride, row_width); + last_valid_row = row; + dst += dst_stride; + } +} + +void PostFilter::CopyBordersForOneSuperBlockRow(int row4x4, int sb4x4, + bool for_loop_restoration) { + // Number of rows to be subtracted from the start position described by + // row4x4. We always lag by 8 rows (to account for in-loop post filters). + const int row_offset = (row4x4 == 0) ? 0 : 8; + // Number of rows to be subtracted from the height described by sb4x4. + const int height_offset = (row4x4 == 0) ? 8 : 0; + // If cdef is off and post filter multithreading is off, then loop restoration + // needs 2 extra rows for the bottom border in each plane. + const int extra_rows = + (for_loop_restoration && thread_pool_ == nullptr && !DoCdef()) ? 2 : 0; + const int upscaled_width = frame_header_.upscaled_width; + const int height = frame_header_.height; + int plane = kPlaneY; + do { + const int plane_width = + SubsampledValue(upscaled_width, subsampling_x_[plane]); + const int plane_height = SubsampledValue(height, subsampling_y_[plane]); + const int row = (MultiplyBy4(row4x4) - row_offset) >> subsampling_y_[plane]; + assert(row >= 0); + if (row >= plane_height) break; + const int num_rows = + std::min(SubsampledValue(MultiplyBy4(sb4x4) - height_offset, + subsampling_y_[plane]) + + extra_rows, + plane_height - row); + // We only need to track the progress of the Y plane since the progress of + // the U and V planes will be inferred from the progress of the Y plane. + if (!for_loop_restoration && plane == kPlaneY) { + progress_row_ = row + num_rows; + } + const bool copy_bottom = row + num_rows == plane_height; + const ptrdiff_t stride = frame_buffer_.stride(plane); + uint8_t* const start = (for_loop_restoration ? superres_buffer_[plane] + : frame_buffer_.data(plane)) + + row * stride; + const int left_border = for_loop_restoration + ? kRestorationHorizontalBorder + : frame_buffer_.left_border(plane); +#if LIBGAV1_MSAN + // The optimized loop restoration code will overread the visible frame + // buffer into the right border. Extend the right boundary further to + // prevent msan warnings. + const int right_border = for_loop_restoration + ? kRestorationHorizontalBorder + 16 + : frame_buffer_.right_border(plane); +#else + const int right_border = for_loop_restoration + ? kRestorationHorizontalBorder + : frame_buffer_.right_border(plane); +#endif + const int top_border = + (row == 0) ? (for_loop_restoration ? kRestorationVerticalBorder + : frame_buffer_.top_border(plane)) + : 0; + const int bottom_border = + copy_bottom + ? (for_loop_restoration ? 
kRestorationVerticalBorder + : frame_buffer_.bottom_border(plane)) + : 0; + ExtendFrameBoundary(start, plane_width, num_rows, stride, left_border, + right_border, top_border, bottom_border); + } while (++plane < planes_); +} + +void PostFilter::SetupLoopRestorationBorder(const int row4x4) { + assert(row4x4 >= 0); + assert(!DoCdef()); + assert(DoRestoration()); + const int upscaled_width = frame_header_.upscaled_width; + const int height = frame_header_.height; + int plane = kPlaneY; + do { + if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) { + continue; + } + const int row_offset = DivideBy4(row4x4); + const int num_pixels = + SubsampledValue(upscaled_width, subsampling_x_[plane]); + const int row_width = num_pixels << pixel_size_log2_; + const int plane_height = SubsampledValue(height, subsampling_y_[plane]); + const int row = kLoopRestorationBorderRows[subsampling_y_[plane]]; + const int absolute_row = + (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row; + const ptrdiff_t src_stride = frame_buffer_.stride(plane); + const uint8_t* src = + GetSuperResBuffer(static_cast(plane), row4x4, 0) + + row * src_stride; + const ptrdiff_t dst_stride = loop_restoration_border_.stride(plane); + uint8_t* dst = + loop_restoration_border_.data(plane) + row_offset * dst_stride; + for (int i = 0; i < 4; ++i) { + memcpy(dst, src, row_width); +#if LIBGAV1_MAX_BITDEPTH >= 10 + if (bitdepth_ >= 10) { + ExtendLine(dst, num_pixels, kRestorationHorizontalBorder, + kRestorationHorizontalBorder); + } else // NOLINT. +#endif + ExtendLine(dst, num_pixels, kRestorationHorizontalBorder, + kRestorationHorizontalBorder); + // If we run out of rows, copy the last valid row (mimics the bottom + // border extension). + if (absolute_row + i < plane_height - 1) src += src_stride; + dst += dst_stride; + } + } while (++plane < planes_); +} + +void PostFilter::SetupLoopRestorationBorder(int row4x4_start, int sb4x4) { + assert(row4x4_start >= 0); + assert(DoCdef()); + assert(DoRestoration()); + for (int sb_y = 0; sb_y < sb4x4; sb_y += 16) { + const int row4x4 = row4x4_start + sb_y; + const int row_offset_start = DivideBy4(row4x4); + const std::array dst = { + loop_restoration_border_.data(kPlaneY) + + row_offset_start * static_cast( + loop_restoration_border_.stride(kPlaneY)), + loop_restoration_border_.data(kPlaneU) + + row_offset_start * static_cast( + loop_restoration_border_.stride(kPlaneU)), + loop_restoration_border_.data(kPlaneV) + + row_offset_start * static_cast( + loop_restoration_border_.stride(kPlaneV))}; + // If SuperRes is enabled, then we apply SuperRes for the rows to be copied + // directly with |loop_restoration_border_| as the destination. Otherwise, + // we simply copy the rows. 
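+    // (Illustrative editor note: kLoopRestorationBorderRows == {54, 26} means
+    // the four saved rows are 54..57 of each 64-row luma superblock row, or
+    // 26..29 with chroma subsampling -- the deblocked rows on either side of
+    // the restoration stripe boundary, saved here before cdef overwrites
+    // them.)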
+ if (DoSuperRes()) { + std::array src; + std::array rows; + const int height = frame_header_.height; + int plane = kPlaneY; + do { + if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) { + rows[plane] = 0; + continue; + } + const int plane_height = SubsampledValue(height, subsampling_y_[plane]); + const int row = kLoopRestorationBorderRows[subsampling_y_[plane]]; + const int absolute_row = + (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row; + src[plane] = GetSourceBuffer(static_cast(plane), row4x4, 0) + + row * static_cast(frame_buffer_.stride(plane)); + rows[plane] = Clip3(plane_height - absolute_row, 0, 4); + } while (++plane < planes_); + ApplySuperRes(src, rows, /*line_buffer_row=*/-1, dst, + /*dst_is_loop_restoration_border=*/true); + // If we run out of rows, copy the last valid row (mimics the bottom + // border extension). + plane = kPlaneY; + do { + if (rows[plane] == 0 || rows[plane] >= 4) continue; + const ptrdiff_t stride = loop_restoration_border_.stride(plane); + uint8_t* dst_line = dst[plane] + rows[plane] * stride; + const uint8_t* const src_line = dst_line - stride; + const int upscaled_width = super_res_info_[plane].upscaled_width + << pixel_size_log2_; + for (int i = rows[plane]; i < 4; ++i) { + memcpy(dst_line, src_line, upscaled_width); + dst_line += stride; + } + } while (++plane < planes_); + } else { + int plane = kPlaneY; + do { + CopyDeblockedPixels(static_cast(plane), row4x4); + } while (++plane < planes_); + } + // Extend the left and right boundaries needed for loop restoration. + const int upscaled_width = frame_header_.upscaled_width; + int plane = kPlaneY; + do { + if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) { + continue; + } + uint8_t* dst_line = dst[plane]; + const int plane_width = + SubsampledValue(upscaled_width, subsampling_x_[plane]); + for (int i = 0; i < 4; ++i) { +#if LIBGAV1_MAX_BITDEPTH >= 10 + if (bitdepth_ >= 10) { + ExtendLine(dst_line, plane_width, + kRestorationHorizontalBorder, + kRestorationHorizontalBorder); + } else // NOLINT. +#endif + { + ExtendLine(dst_line, plane_width, + kRestorationHorizontalBorder, + kRestorationHorizontalBorder); + } + dst_line += loop_restoration_border_.stride(plane); + } + } while (++plane < planes_); + } +} + +void PostFilter::RunJobs(WorkerFunction worker) { + std::atomic row4x4(0); + const int num_workers = thread_pool_->num_threads(); + BlockingCounter pending_workers(num_workers); + for (int i = 0; i < num_workers; ++i) { + thread_pool_->Schedule([this, &row4x4, &pending_workers, worker]() { + (this->*worker)(&row4x4); + pending_workers.Decrement(); + }); + } + // Run the jobs on the current thread. + (this->*worker)(&row4x4); + // Wait for the threadpool jobs to finish. 
+  pending_workers.Wait();
+}
+
+void PostFilter::ApplyFilteringThreaded() {
+  if (DoDeblock()) {
+    RunJobs(&PostFilter::DeblockFilterWorker<kLoopFilterTypeVertical>);
+    RunJobs(&PostFilter::DeblockFilterWorker<kLoopFilterTypeHorizontal>);
+  }
+  if (DoCdef() && DoRestoration()) {
+    for (int row4x4 = 0; row4x4 < frame_header_.rows4x4;
+         row4x4 += kNum4x4InLoopFilterUnit) {
+      SetupLoopRestorationBorder(row4x4, kNum4x4InLoopFilterUnit);
+    }
+  }
+  if (DoCdef()) {
+    for (int row4x4 = 0; row4x4 < frame_header_.rows4x4;
+         row4x4 += kNum4x4InLoopFilterUnit) {
+      SetupCdefBorder(row4x4);
+    }
+    RunJobs(&PostFilter::ApplyCdefWorker);
+  }
+  if (DoSuperRes()) ApplySuperResThreaded();
+  if (DoRestoration()) {
+    if (!DoCdef()) {
+      int row4x4 = 0;
+      do {
+        SetupLoopRestorationBorder(row4x4);
+        row4x4 += kNum4x4InLoopFilterUnit;
+      } while (row4x4 < frame_header_.rows4x4);
+    }
+    RunJobs(&PostFilter::ApplyLoopRestorationWorker);
+  }
+  ExtendBordersForReferenceFrame();
+}
+
+int PostFilter::ApplyFilteringForOneSuperBlockRow(int row4x4, int sb4x4,
+                                                  bool is_last_row,
+                                                  bool do_deblock) {
+  if (row4x4 < 0) return -1;
+  if (DoDeblock() && do_deblock) {
+    VerticalDeblockFilter(row4x4, row4x4 + sb4x4, 0, frame_header_.columns4x4);
+    HorizontalDeblockFilter(row4x4, row4x4 + sb4x4, 0,
+                            frame_header_.columns4x4);
+  }
+  if (DoRestoration() && DoCdef()) {
+    SetupLoopRestorationBorder(row4x4, sb4x4);
+  }
+  if (DoCdef()) {
+    ApplyCdefForOneSuperBlockRow(row4x4, sb4x4, is_last_row);
+  }
+  if (DoSuperRes()) {
+    ApplySuperResForOneSuperBlockRow(row4x4, sb4x4, is_last_row);
+  }
+  if (DoRestoration()) {
+    CopyBordersForOneSuperBlockRow(row4x4, sb4x4, true);
+    ApplyLoopRestoration(row4x4, sb4x4);
+    if (is_last_row) {
+      // Loop restoration operates with a lag of 8 rows. So make sure to cover
+      // all the rows of the last superblock row.
+      CopyBordersForOneSuperBlockRow(row4x4 + sb4x4, 16, true);
+      ApplyLoopRestoration(row4x4 + sb4x4, 16);
+    }
+  }
+  if (frame_header_.refresh_frame_flags != 0 && DoBorderExtensionInLoop()) {
+    CopyBordersForOneSuperBlockRow(row4x4, sb4x4, false);
+    if (is_last_row) {
+      CopyBordersForOneSuperBlockRow(row4x4 + sb4x4, 16, false);
+    }
+  }
+  if (is_last_row && !DoBorderExtensionInLoop()) {
+    ExtendBordersForReferenceFrame();
+  }
+  return is_last_row ? frame_header_.height : progress_row_;
+}
+
+}  // namespace libgav1
diff --git a/src/post_filter/super_res.cc b/src/post_filter/super_res.cc
new file mode 100644
index 0000000..2133a8a
--- /dev/null
+++ b/src/post_filter/super_res.cc
@@ -0,0 +1,212 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
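+
+// Illustrative note (editor sketch, not part of the original source): the row
+// partitioning in ApplySuperResThreaded() below keeps each slice's luma row
+// count even whenever chroma is vertically subsampled. For example, with
+// height == 1080 and num_threads == 8 (7 pool threads plus the current one):
+//   thread_pool_rows    = 1080 / 8 = 135  -> odd, bumped to 136
+//   num_threads         = Clip3(1080 / 136, 1, 8) = 7
+//   current_thread_rows = 1080 - 136 * 6 = 264  (already even)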
+#include "src/post_filter.h" +#include "src/utils/blocking_counter.h" + +namespace libgav1 { + +void PostFilter::ApplySuperRes(const std::array& src, + const std::array& rows, + const int line_buffer_row, + const std::array& dst, + bool dst_is_loop_restoration_border /*=false*/) { + int plane = kPlaneY; + do { + const int plane_width = + MultiplyBy4(frame_header_.columns4x4) >> subsampling_x_[plane]; +#if LIBGAV1_MAX_BITDEPTH >= 10 + if (bitdepth_ >= 10) { + auto* input = reinterpret_cast(src[plane]); + auto* output = reinterpret_cast(dst[plane]); + const ptrdiff_t input_stride = + frame_buffer_.stride(plane) / sizeof(uint16_t); + const ptrdiff_t output_stride = + (dst_is_loop_restoration_border + ? loop_restoration_border_.stride(plane) + : frame_buffer_.stride(plane)) / + sizeof(uint16_t); + if (rows[plane] > 0) { + dsp_.super_res(superres_coefficients_[static_cast(plane != 0)], + input, input_stride, rows[plane], plane_width, + super_res_info_[plane].upscaled_width, + super_res_info_[plane].initial_subpixel_x, + super_res_info_[plane].step, output, output_stride); + } + // In the multi-threaded case, the |superres_line_buffer_| holds the last + // input row. Apply SuperRes for that row. + if (line_buffer_row >= 0) { + auto* const line_buffer_start = + reinterpret_cast(superres_line_buffer_.data(plane)) + + line_buffer_row * superres_line_buffer_.stride(plane) / + sizeof(uint16_t) + + kSuperResHorizontalBorder; + dsp_.super_res(superres_coefficients_[static_cast(plane != 0)], + line_buffer_start, /*source_stride=*/0, + /*height=*/1, plane_width, + super_res_info_[plane].upscaled_width, + super_res_info_[plane].initial_subpixel_x, + super_res_info_[plane].step, + output + rows[plane] * output_stride, /*dest_stride=*/0); + } + continue; + } +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + uint8_t* input = src[plane]; + uint8_t* output = dst[plane]; + const ptrdiff_t input_stride = frame_buffer_.stride(plane); + const ptrdiff_t output_stride = dst_is_loop_restoration_border + ? loop_restoration_border_.stride(plane) + : frame_buffer_.stride(plane); + if (rows[plane] > 0) { + dsp_.super_res(superres_coefficients_[static_cast(plane != 0)], + input, input_stride, rows[plane], plane_width, + super_res_info_[plane].upscaled_width, + super_res_info_[plane].initial_subpixel_x, + super_res_info_[plane].step, output, output_stride); + } + // In the multi-threaded case, the |superres_line_buffer_| holds the last + // input row. Apply SuperRes for that row. + if (line_buffer_row >= 0) { + uint8_t* const line_buffer_start = + superres_line_buffer_.data(plane) + + line_buffer_row * superres_line_buffer_.stride(plane) + + kSuperResHorizontalBorder; + dsp_.super_res( + superres_coefficients_[static_cast(plane != 0)], + line_buffer_start, /*source_stride=*/0, + /*height=*/1, plane_width, super_res_info_[plane].upscaled_width, + super_res_info_[plane].initial_subpixel_x, + super_res_info_[plane].step, output + rows[plane] * output_stride, + /*dest_stride=*/0); + } + } while (++plane < planes_); +} + +void PostFilter::ApplySuperResForOneSuperBlockRow(int row4x4_start, int sb4x4, + bool is_last_row) { + assert(row4x4_start >= 0); + assert(DoSuperRes()); + // If not doing cdef, then LR needs two rows of border with superres applied. + const int num_rows_extra = (DoCdef() || !DoRestoration()) ? 0 : 2; + std::array src; + std::array dst; + std::array rows; + const int num_rows4x4 = + std::min(sb4x4, frame_header_.rows4x4 - row4x4_start) - + (is_last_row ? 
0 : 2); + if (row4x4_start > 0) { + const int row4x4 = row4x4_start - 2; + int plane = kPlaneY; + do { + const int row = + (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + num_rows_extra; + const ptrdiff_t row_offset = row * frame_buffer_.stride(plane); + src[plane] = cdef_buffer_[plane] + row_offset; + dst[plane] = superres_buffer_[plane] + row_offset; + // Note that the |num_rows_extra| subtraction is done after the value is + // subsampled since we always need to work on |num_rows_extra| extra rows + // irrespective of the plane subsampling. + // Apply superres for the last 8-|num_rows_extra| rows of the previous + // superblock. + rows[plane] = (8 >> subsampling_y_[plane]) - num_rows_extra; + // Apply superres for the current superblock row (except for the last + // 8-|num_rows_extra| rows). + rows[plane] += (MultiplyBy4(num_rows4x4) >> subsampling_y_[plane]) + + (is_last_row ? 0 : num_rows_extra); + } while (++plane < planes_); + } else { + // Apply superres for the current superblock row (except for the last + // 8-|num_rows_extra| rows). + int plane = kPlaneY; + do { + const ptrdiff_t row_offset = + (MultiplyBy4(row4x4_start) >> subsampling_y_[plane]) * + frame_buffer_.stride(plane); + src[plane] = cdef_buffer_[plane] + row_offset; + dst[plane] = superres_buffer_[plane] + row_offset; + // Note that the |num_rows_extra| addition is done after the value is + // subsampled since we always need to work on |num_rows_extra| extra rows + // irrespective of the plane subsampling. + rows[plane] = (MultiplyBy4(num_rows4x4) >> subsampling_y_[plane]) + + (is_last_row ? 0 : num_rows_extra); + } while (++plane < planes_); + } + ApplySuperRes(src, rows, /*line_buffer_row=*/-1, dst); +} + +void PostFilter::ApplySuperResThreaded() { + int num_threads = thread_pool_->num_threads() + 1; + // The number of rows that will be processed by each thread in the thread pool + // (other than the current thread). + int thread_pool_rows = frame_header_.height / num_threads; + thread_pool_rows = std::max(thread_pool_rows, 1); + // Make rows of Y plane even when there is subsampling for the other planes. + if ((thread_pool_rows & 1) != 0 && subsampling_y_[kPlaneU] != 0) { + ++thread_pool_rows; + } + // Adjust the number of threads to what we really need. + num_threads = Clip3(frame_header_.height / thread_pool_rows, 1, num_threads); + // For the current thread, we round up to process all the remaining rows. + int current_thread_rows = + frame_header_.height - thread_pool_rows * (num_threads - 1); + // Make rows of Y plane even when there is subsampling for the other planes. + if ((current_thread_rows & 1) != 0 && subsampling_y_[kPlaneU] != 0) { + ++current_thread_rows; + } + assert(current_thread_rows > 0); + BlockingCounter pending_workers(num_threads - 1); + for (int line_buffer_row = 0, row_start = 0; line_buffer_row < num_threads; + ++line_buffer_row, row_start += thread_pool_rows) { + std::array src; + std::array dst; + std::array rows; + int plane = kPlaneY; + const int pixel_size_log2 = pixel_size_log2_; + do { + src[plane] = + GetBufferOffset(cdef_buffer_[plane], frame_buffer_.stride(plane), + static_cast(plane), row_start, 0); + dst[plane] = + GetBufferOffset(superres_buffer_[plane], frame_buffer_.stride(plane), + static_cast(plane), row_start, 0); + rows[plane] = + (((line_buffer_row < num_threads - 1) ? 
thread_pool_rows + : current_thread_rows) >> + subsampling_y_[plane]) - + 1; + const int plane_width = + MultiplyBy4(frame_header_.columns4x4) >> subsampling_x_[plane]; + uint8_t* const input = + src[plane] + rows[plane] * frame_buffer_.stride(plane); + uint8_t* const line_buffer_start = + superres_line_buffer_.data(plane) + + line_buffer_row * superres_line_buffer_.stride(plane) + + (kSuperResHorizontalBorder << pixel_size_log2); + memcpy(line_buffer_start, input, plane_width << pixel_size_log2); + } while (++plane < planes_); + if (line_buffer_row < num_threads - 1) { + thread_pool_->Schedule( + [this, src, rows, line_buffer_row, dst, &pending_workers]() { + ApplySuperRes(src, rows, line_buffer_row, dst); + pending_workers.Decrement(); + }); + } else { + ApplySuperRes(src, rows, line_buffer_row, dst); + } + } + // Wait for the threadpool jobs to finish. + pending_workers.Wait(); +} + +} // namespace libgav1 diff --git a/src/post_filter_test.cc b/src/post_filter_test.cc new file mode 100644 index 0000000..db9d0f4 --- /dev/null +++ b/src/post_filter_test.cc @@ -0,0 +1,956 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/post_filter.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/dsp/cdef.h" +#include "src/dsp/dsp.h" +#include "src/dsp/super_res.h" +#include "src/frame_scratch_buffer.h" +#include "src/obu_parser.h" +#include "src/threading_strategy.h" +#include "src/utils/array_2d.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/memory.h" +#include "src/utils/types.h" +#include "src/yuv_buffer.h" +#include "tests/block_utils.h" +#include "tests/third_party/libvpx/acm_random.h" +#include "tests/utils.h" + +namespace libgav1 { +namespace { + +constexpr char kCdef[] = "Cdef"; +constexpr char kApplyCdefName[] = "ApplyCdef"; +constexpr int kMaxBlockWidth4x4 = 32; +constexpr int kMaxBlockHeight4x4 = 32; +constexpr int kMaxTestFrameSize = 1920 * 1080; + +int GetIdFromInputParam(int subsampling_x, int subsampling_y, int height) { + int id = subsampling_x * 8 + subsampling_y * 4; + if (height == 288) { + id += 0; + } else if (height == 480) { + id += 1; + } else if (height == 1080) { + id += 2; + } else { + id += 3; + } + return id; +} + +const char* GetSuperResDigest8bpp(int id, int plane) { + static const char* const kDigestSuperRes[][kMaxPlanes] = { + { + // all input is 0. + "ff5f7a63d3b1f9176e216eb01a0387ad", // kPlaneY. + "38b6551d7ac3e86c8af407d5a1aa36dc", // kPlaneU. + "38b6551d7ac3e86c8af407d5a1aa36dc", // kPlaneV. + }, + { + // all input is 1. + "819f21dcce0e779180bbd613a9e3543c", // kPlaneY. + "e784bfa8f517d83b014c3dcd45b780a5", // kPlaneU. + "e784bfa8f517d83b014c3dcd45b780a5", // kPlaneV. + }, + { + // all input is 128. + "2d6ea5b39f9168d56c2e2b8846d208ec", // kPlaneY. + "8030b6e70f1544efbc37b902d3f88bd3", // kPlaneU. 
+ "8030b6e70f1544efbc37b902d3f88bd3", // kPlaneV. + }, + { + // all input is 255. + "5c0b4bc50e0980dc6ba7c042d3b50a5e", // kPlaneY. + "3c566ef847c45be09ddac297123a3bad", // kPlaneU. + "3c566ef847c45be09ddac297123a3bad", // kPlaneV. + }, + { + // random input. + "50514467dd6a5c3a8268eddaa542c41f", // kPlaneY. + "3ce720c2b5b44928e1477b11040e5c00", // kPlaneU. + "3ce720c2b5b44928e1477b11040e5c00", // kPlaneV. + }, + }; + return kDigestSuperRes[id][plane]; +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +const char* GetSuperResDigest10bpp(int id, int plane) { + // Digests are in Y/U/V order. + static const char* const kDigestSuperRes[][kMaxPlanes] = { + { + // all input is 0. + "fccb1f57b252b1a86d335aea929d1d58", + "2f244a56091c9705794e92e6bcc38058", + "2f244a56091c9705794e92e6bcc38058", + }, + { + // all input is 1. + "de8556204999d6e4bf74cfdde61a095b", + "e7d0f4ce6df81c46de95da7790a67384", + "e7d0f4ce6df81c46de95da7790a67384", + }, + { + // all input is 512. + "d3b6980363eb9b808885537b3485af87", + "bcffddb26210da6861e7b31414e58b77", + "bcffddb26210da6861e7b31414e58b77", + }, + { + // all input is 1023. + "ce0762aeee1cdef1db101e4ca39bcbd6", + "33aeaa7f5d7c032e3dfda43925c3dcb2", + "33aeaa7f5d7c032e3dfda43925c3dcb2", + }, + { + // random input. + "63c701bceb187ffa535be15ae58f8171", + "f570e30e9ea8d2a1e6d99202cd2f8994", + "f570e30e9ea8d2a1e6d99202cd2f8994", + }, + }; + return kDigestSuperRes[id][plane]; +} +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace + +// This type is used to parameterize the tests so is defined outside the +// anonymous namespace to avoid the GCC -Wsubobject-linkage warning. +struct FrameSizeParam { + FrameSizeParam(uint32_t width, uint32_t upscaled_width, uint32_t height, + int8_t ss_x, int8_t ss_y) + : width(width), + upscaled_width(upscaled_width), + height(height), + subsampling_x(ss_x), + subsampling_y(ss_y) {} + uint32_t width; + uint32_t upscaled_width; + uint32_t height; + int8_t subsampling_x; + int8_t subsampling_y; +}; + +// Print operators must be defined in the same namespace as the type for the +// lookup to work correctly. +static std::ostream& operator<<(std::ostream& os, const FrameSizeParam& param) { + return os << param.width << "x" << param.height + << ", upscaled_width: " << param.upscaled_width + << ", subsampling(x/y): " << static_cast(param.subsampling_x) + << "/" << static_cast(param.subsampling_y); +} + +// Note the following test classes access private functions/members of +// PostFilter. To be declared friends of PostFilter they must not have internal +// linkage (they must be outside the anonymous namespace). +template +class PostFilterTestBase : public testing::TestWithParam { + public: + PostFilterTestBase() = default; + PostFilterTestBase(const PostFilterTestBase&) = delete; + PostFilterTestBase& operator=(const PostFilterTestBase&) = delete; + ~PostFilterTestBase() override = default; + + void SetUp() override { + // Allocate buffer_ with a border size of kBorderPixels (which is + // subsampled for chroma planes). Some tests (for loop restoration) only use + // the nearest 2 or 3 pixels (for both luma and chroma planes) in the + // border. 
+    ASSERT_TRUE(buffer_.Realloc(
+        bitdepth, /*is_monochrome=*/false, frame_size_.upscaled_width,
+        frame_size_.height, frame_size_.subsampling_x,
+        frame_size_.subsampling_y, kBorderPixels, kBorderPixels, kBorderPixels,
+        kBorderPixels, nullptr, nullptr, nullptr));
+
+    ASSERT_TRUE(loop_restoration_border_.Realloc(
+        bitdepth, /*is_monochrome=*/false, frame_size_.upscaled_width,
+        frame_size_.height, frame_size_.subsampling_x,
+        frame_size_.subsampling_y, kBorderPixels, kBorderPixels, kBorderPixels,
+        kBorderPixels, nullptr, nullptr, nullptr));
+
+    for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
+      const int8_t subsampling_x =
+          (plane == kPlaneY) ? 0 : frame_size_.subsampling_x;
+      const int8_t subsampling_y =
+          (plane == kPlaneY) ? 0 : frame_size_.subsampling_y;
+      width_[plane] = frame_size_.width >> subsampling_x;
+      upscaled_width_[plane] = frame_size_.upscaled_width >> subsampling_x;
+      stride_[plane] =
+          (frame_size_.upscaled_width + 2 * kBorderPixels) >> subsampling_x;
+      height_[plane] =
+          (frame_size_.height + 2 * kBorderPixels) >> subsampling_y;
+
+      reference_buffer_[plane].reserve(stride_[plane] * height_[plane]);
+      reference_buffer_[plane].resize(stride_[plane] * height_[plane]);
+      std::fill(reference_buffer_[plane].begin(),
+                reference_buffer_[plane].end(), 0);
+    }
+  }
+
+ protected:
+  YuvBuffer buffer_;
+  YuvBuffer cdef_border_;
+  YuvBuffer loop_restoration_border_;
+  uint32_t width_[kMaxPlanes];
+  uint32_t upscaled_width_[kMaxPlanes];
+  uint32_t stride_[kMaxPlanes];
+  uint32_t height_[kMaxPlanes];
+  std::vector<Pixel> reference_buffer_[kMaxPlanes];
+  const FrameSizeParam frame_size_ = GetParam();
+};
+
+template <int bitdepth, typename Pixel>
+class PostFilterHelperFuncTest : public PostFilterTestBase<bitdepth, Pixel> {
+ public:
+  PostFilterHelperFuncTest() = default;
+  PostFilterHelperFuncTest(const PostFilterHelperFuncTest&) = delete;
+  PostFilterHelperFuncTest& operator=(const PostFilterHelperFuncTest&) = delete;
+  ~PostFilterHelperFuncTest() override = default;
+
+ protected:
+  using PostFilterTestBase<bitdepth, Pixel>::buffer_;
+  using PostFilterTestBase<bitdepth, Pixel>::cdef_border_;
+  using PostFilterTestBase<bitdepth, Pixel>::loop_restoration_border_;
+  using PostFilterTestBase<bitdepth, Pixel>::width_;
+  using PostFilterTestBase<bitdepth, Pixel>::upscaled_width_;
+  using PostFilterTestBase<bitdepth, Pixel>::stride_;
+  using PostFilterTestBase<bitdepth, Pixel>::height_;
+  using PostFilterTestBase<bitdepth, Pixel>::reference_buffer_;
+  using PostFilterTestBase<bitdepth, Pixel>::frame_size_;
+
+  void SetUp() override {
+    PostFilterTestBase<bitdepth, Pixel>::SetUp();
+
+    for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
+      const int8_t subsampling_x =
+          (plane == kPlaneY) ? 0 : frame_size_.subsampling_x;
+      const int8_t subsampling_y =
+          (plane == kPlaneY) ? 0 : frame_size_.subsampling_y;
+      width_[plane] = frame_size_.width >> subsampling_x;
+      upscaled_width_[plane] = frame_size_.upscaled_width >> subsampling_x;
+      stride_[plane] = (frame_size_.upscaled_width >> subsampling_x) +
+                       2 * kRestorationHorizontalBorder;
+      height_[plane] = (frame_size_.height >> subsampling_y) +
+                       2 * kRestorationVerticalBorder;
+      reference_buffer_[plane].reserve(stride_[plane] * height_[plane]);
+      reference_buffer_[plane].resize(stride_[plane] * height_[plane]);
+      std::fill(reference_buffer_[plane].begin(),
+                reference_buffer_[plane].end(), 0);
+      buffer_border_corner_[plane] =
+          reinterpret_cast<Pixel*>(buffer_.data(plane)) -
+          buffer_.stride(plane) / sizeof(Pixel) * kRestorationVerticalBorder -
+          kRestorationHorizontalBorder;
+      loop_restoration_border_corner_[plane] =
+          reinterpret_cast<Pixel*>(loop_restoration_border_.data(plane)) -
+          loop_restoration_border_.stride(plane) / sizeof(Pixel) *
+              kRestorationVerticalBorder -
+          kRestorationHorizontalBorder;
+    }
+  }
+
+  void TestExtendFrame(bool use_fixed_values, Pixel value);
+  void TestAdjustFrameBufferPointer();
+  void TestPrepareLoopRestorationBlock();
+
+  // Fill the frame buffer with either a fixed value or random values.
+  // If filling with random values, treat the buffer boundaries specially:
+  // make the outermost 3-pixel-wide borders the same value as their
+  // immediate inner neighbor. For example:
+  // 4 4 4 4 5 6 6 6 6
+  // 4 4 4 4 5 6 6 6 6
+  // 4 4 4 4 5 6 6 6 6
+  //       ---------
+  // 4 4 4 | 4 5 6 | 6 6 6
+  // 1 1 1 | 1 0 1 | 1 1 1
+  // 0 0 0 | 0 1 0 | 0 0 0
+  // 1 1 1 | 1 0 1 | 1 1 1
+  // 0 0 0 | 0 1 0 | 0 0 0
+  // 6 6 6 | 6 5 4 | 4 4 4
+  //       ---------
+  // 6 6 6 6 5 4 4 4 4
+  // 6 6 6 6 5 4 4 4 4
+  // 6 6 6 6 5 4 4 4 4
+  // Pixels within the box are the current block; pixels outside the box are
+  // the area extended from it.
+  void FillBuffer(bool use_fixed_values, Pixel value);
+
+  // Points to the upper left corner of the restoration border in buffer_.
+  Pixel* buffer_border_corner_[kMaxPlanes];
+  // Points to the upper left corner of the restoration border in
+  // loop_restoration_border_.
+  Pixel* loop_restoration_border_corner_[kMaxPlanes];
+};
+
+template <int bitdepth, typename Pixel>
+void PostFilterHelperFuncTest<bitdepth, Pixel>::FillBuffer(
+    bool use_fixed_values, Pixel value) {
+  if (use_fixed_values) {
+    for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
+      // Fill buffer with a fixed value.
+      std::fill(reference_buffer_[plane].begin(),
+                reference_buffer_[plane].end(), value);
+      // Fill frame buffer. Note that the border is not filled.
+      auto* row = reinterpret_cast<Pixel*>(buffer_.data(plane));
+      for (int i = 0; i < buffer_.height(plane); ++i) {
+        std::fill(row, row + width_[plane], value);
+        row += buffer_.stride(plane) / sizeof(Pixel);
+      }
+    }
+  } else {  // Random values.
+    libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+    const int mask = (1 << bitdepth) - 1;
+    for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
+      // Fill buffer with random values.
+      std::vector<Pixel> line_buffer(stride_[plane]);
+      std::fill(line_buffer.begin(), line_buffer.end(), 0);
+      for (int i = kRestorationHorizontalBorder;
+           i < stride_[plane] - kRestorationHorizontalBorder; ++i) {
+        line_buffer[i] = rnd.Rand16() & mask;
+      }
+      // Copy boundary values to the extended border.
+      for (int i = 0; i < kRestorationHorizontalBorder; ++i) {
+        line_buffer[i] = line_buffer[kRestorationHorizontalBorder];
+        line_buffer[stride_[plane] - i - 1] =
+            line_buffer[stride_[plane] - 1 - kRestorationHorizontalBorder];
+      }
+      // The first three rows are the same as the line_buffer.
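+      // In other words, the kRestorationVerticalBorder extended rows on top
+      // and the first visible row hold identical values, mirroring the
+      // diagram above the FillBuffer() declaration; e.g. with a 2-row border,
+      // rows 0..2 of reference_buffer_ are all copies of line_buffer.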
+      for (int i = 0; i < kRestorationVerticalBorder + 1; ++i) {
+        std::copy(line_buffer.begin(), line_buffer.end(),
+                  reference_buffer_[plane].begin() + i * stride_[plane]);
+      }
+      for (int i = kRestorationVerticalBorder + 1;
+           i < height_[plane] - kRestorationVerticalBorder; ++i) {
+        for (int j = kRestorationHorizontalBorder;
+             j < stride_[plane] - kRestorationHorizontalBorder; ++j) {
+          line_buffer[j] = rnd.Rand16() & mask;
+        }
+        for (int j = 0; j < kRestorationHorizontalBorder; ++j) {
+          line_buffer[j] = line_buffer[kRestorationHorizontalBorder];
+          line_buffer[stride_[plane] - j - 1] =
+              line_buffer[stride_[plane] - 1 - kRestorationHorizontalBorder];
+        }
+        std::copy(line_buffer.begin(), line_buffer.end(),
+                  reference_buffer_[plane].begin() + i * stride_[plane]);
+      }
+      // The extended border rows at the bottom are the same as the last
+      // line_buffer.
+      for (int i = 0; i < kRestorationVerticalBorder; ++i) {
+        std::copy(line_buffer.begin(), line_buffer.end(),
+                  reference_buffer_[plane].begin() +
+                      (height_[plane] - kRestorationVerticalBorder + i) *
+                          stride_[plane]);
+      }
+
+      // Fill frame buffer. Note that the border is not filled.
+      for (int i = 0; i < buffer_.height(plane); ++i) {
+        memcpy(buffer_.data(plane) + i * buffer_.stride(plane),
+               reference_buffer_[plane].data() + kRestorationHorizontalBorder +
+                   (i + kRestorationVerticalBorder) * stride_[plane],
+               sizeof(Pixel) * width_[plane]);
+      }
+    }
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void PostFilterHelperFuncTest<bitdepth, Pixel>::TestExtendFrame(
+    bool use_fixed_values, Pixel value) {
+  ObuFrameHeader frame_header = {};
+  frame_header.upscaled_width = frame_size_.upscaled_width;
+  frame_header.width = frame_size_.width;
+  frame_header.height = frame_size_.height;
+  ObuSequenceHeader sequence_header;
+  sequence_header.color_config.bitdepth = bitdepth;
+  sequence_header.color_config.is_monochrome = false;
+  sequence_header.color_config.subsampling_x = frame_size_.subsampling_x;
+  sequence_header.color_config.subsampling_y = frame_size_.subsampling_y;
+
+  const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+  ASSERT_NE(dsp, nullptr);
+  FrameScratchBuffer frame_scratch_buffer;
+
+  PostFilter post_filter(frame_header, sequence_header, &frame_scratch_buffer,
+                         &buffer_, dsp,
+                         /*do_post_filter_mask=*/0x00);
+  FillBuffer(use_fixed_values, value);
+  for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
+    const int plane_width =
+        plane == kPlaneY ? frame_header.upscaled_width
+                         : frame_header.upscaled_width >>
+                               sequence_header.color_config.subsampling_x;
+    const int plane_height =
+        plane == kPlaneY
+            ? frame_header.height
+            : frame_header.height >>
+                  sequence_header.color_config.subsampling_y;
+    PostFilter::ExtendFrame<Pixel>(
+        reinterpret_cast<Pixel*>(buffer_.data(plane)), plane_width,
+        plane_height, buffer_.stride(plane) / sizeof(Pixel),
+        kRestorationHorizontalBorder, kRestorationHorizontalBorder,
+        kRestorationVerticalBorder, kRestorationVerticalBorder);
+    const bool success = test_utils::CompareBlocks(
+        buffer_border_corner_[plane], reference_buffer_[plane].data(),
+        stride_[plane], height_[plane], buffer_.stride(plane) / sizeof(Pixel),
+        stride_[plane], /*check_padding=*/false, /*print_diff=*/false);
+    ASSERT_TRUE(success) << "Failure of extend frame at plane: " << plane;
+  }
+}
+
+template <int bitdepth, typename Pixel>
+class PostFilterSuperResTest : public PostFilterTestBase<bitdepth, Pixel> {
+ public:
+  PostFilterSuperResTest() {
+    test_utils::ResetDspTable(bitdepth);
+    dsp::SuperResInit_C();
+    dsp::SuperResInit_SSE4_1();
+    dsp::SuperResInit_NEON();
+  }
+  PostFilterSuperResTest(const PostFilterSuperResTest&) = delete;
+  PostFilterSuperResTest& operator=(const PostFilterSuperResTest&) = delete;
+  ~PostFilterSuperResTest() override = default;
+
+ protected:
+  using PostFilterTestBase<bitdepth, Pixel>::buffer_;
+  using PostFilterTestBase<bitdepth, Pixel>::width_;
+  using PostFilterTestBase<bitdepth, Pixel>::upscaled_width_;
+  using PostFilterTestBase<bitdepth, Pixel>::stride_;
+  using PostFilterTestBase<bitdepth, Pixel>::height_;
+  using PostFilterTestBase<bitdepth, Pixel>::reference_buffer_;
+  using PostFilterTestBase<bitdepth, Pixel>::frame_size_;
+
+  void TestApplySuperRes(bool use_fixed_values, Pixel value, int id,
+                         bool multi_threaded);
+};
+
+// This method must be defined in namespace libgav1 to access the private
+// members of class PostFilter in src/post_filter.h.
+template <int bitdepth, typename Pixel>
+void PostFilterSuperResTest<bitdepth, Pixel>::TestApplySuperRes(
+    bool use_fixed_values, Pixel value, int id, bool multi_threaded) {
+  ObuFrameHeader frame_header = {};
+  frame_header.width = frame_size_.width;
+  frame_header.upscaled_width = frame_size_.upscaled_width;
+  frame_header.height = frame_size_.height;
+  frame_header.rows4x4 = DivideBy4(frame_size_.height);
+  frame_header.columns4x4 = DivideBy4(frame_size_.width);
+  frame_header.tile_info.tile_count = 1;
+  ObuSequenceHeader sequence_header;
+  sequence_header.color_config.bitdepth = bitdepth;
+  sequence_header.color_config.is_monochrome = false;
+  sequence_header.color_config.subsampling_x = frame_size_.subsampling_x;
+  sequence_header.color_config.subsampling_y = frame_size_.subsampling_y;
+
+  // Apply SuperRes.
+  Array2D<int16_t> cdef_index;
+  Array2D<TransformSize> inter_transform_sizes;
+  const dsp::Dsp* const dsp = dsp::GetDspTable(bitdepth);
+  ASSERT_NE(dsp, nullptr);
+  constexpr int kNumThreads = 4;
+  FrameScratchBuffer frame_scratch_buffer;
+  if (multi_threaded) {
+    ASSERT_TRUE(frame_scratch_buffer.threading_strategy.Reset(frame_header,
+                                                              kNumThreads));
+  }
+  const int pixel_size = sequence_header.color_config.bitdepth == 8
+                             ? sizeof(uint8_t)
+                             : sizeof(uint16_t);
+  ASSERT_TRUE(frame_scratch_buffer.superres_coefficients[kPlaneTypeY].Resize(
+      kSuperResFilterTaps * Align(frame_header.upscaled_width, 16) *
+      pixel_size));
+  if (!sequence_header.color_config.is_monochrome &&
+      sequence_header.color_config.subsampling_x != 0) {
+    ASSERT_TRUE(frame_scratch_buffer.superres_coefficients[kPlaneTypeUV].Resize(
+        kSuperResFilterTaps *
+        Align(SubsampledValue(frame_header.upscaled_width, 1), 16) *
+        pixel_size));
+  }
+  ASSERT_TRUE(frame_scratch_buffer.superres_line_buffer.Realloc(
+      sequence_header.color_config.bitdepth,
+      sequence_header.color_config.is_monochrome,
+      MultiplyBy4(frame_header.columns4x4),
+      (multi_threaded ? kNumThreads : 1),
+      sequence_header.color_config.subsampling_x,
+      /*subsampling_y=*/0, 2 * kSuperResHorizontalBorder,
+      2 * (kSuperResHorizontalBorder + kSuperResHorizontalPadding), 0, 0,
+      nullptr, nullptr, nullptr));
+  PostFilter post_filter(frame_header, sequence_header, &frame_scratch_buffer,
+                         &buffer_, dsp,
+                         /*do_post_filter_mask=*/0x04);
+
+  const int num_planes = sequence_header.color_config.is_monochrome
+                             ? kMaxPlanesMonochrome
+                             : kMaxPlanes;
+  int width[kMaxPlanes];
+  int upscaled_width[kMaxPlanes];
+  int height[kMaxPlanes];
+
+  for (int plane = kPlaneY; plane < num_planes; ++plane) {
+    const int8_t subsampling_x =
+        (plane == kPlaneY) ? 0 : frame_size_.subsampling_x;
+    const int8_t subsampling_y =
+        (plane == kPlaneY) ? 0 : frame_size_.subsampling_y;
+    width[plane] = frame_size_.width >> subsampling_x;
+    upscaled_width[plane] = frame_size_.upscaled_width >> subsampling_x;
+    height[plane] = frame_size_.height >> subsampling_y;
+    if (use_fixed_values) {
+      auto* src = reinterpret_cast<Pixel*>(post_filter.cdef_buffer_[plane]);
+      for (int y = 0; y < height[plane]; ++y) {
+        for (int x = 0; x < width[plane]; ++x) {
+          src[x] = value;
+        }
+        src += buffer_.stride(plane) / sizeof(Pixel);
+      }
+    } else {  // Random input.
+      const int mask = (1 << bitdepth) - 1;
+      libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+      auto* src = reinterpret_cast<Pixel*>(post_filter.cdef_buffer_[plane]);
+      for (int y = 0; y < height[plane]; ++y) {
+        for (int x = 0; x < width[plane]; ++x) {
+          src[x] = rnd.Rand16() & mask;
+        }
+        src += buffer_.stride(plane) / sizeof(Pixel);
+      }
+    }
+  }
+
+  if (multi_threaded) {
+    post_filter.ApplySuperResThreaded();
+  } else {
+    std::array<uint8_t*, kMaxPlanes> buffers = {
+        post_filter.cdef_buffer_[kPlaneY], post_filter.cdef_buffer_[kPlaneU],
+        post_filter.cdef_buffer_[kPlaneV]};
+    std::array<uint8_t*, kMaxPlanes> dst = {
+        post_filter.GetSuperResBuffer(static_cast<Plane>(kPlaneY), 0, 0),
+        post_filter.GetSuperResBuffer(static_cast<Plane>(kPlaneU), 0, 0),
+        post_filter.GetSuperResBuffer(static_cast<Plane>(kPlaneV), 0, 0)};
+    std::array<int, kMaxPlanes> rows = {
+        frame_header.rows4x4 * 4,
+        (frame_header.rows4x4 * 4) >> frame_size_.subsampling_y,
+        (frame_header.rows4x4 * 4) >> frame_size_.subsampling_y};
+    post_filter.ApplySuperRes(buffers, rows, /*line_buffer_row=*/-1, dst);
+  }
+
+  // Check md5.
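+  // Note that the random-input path above re-seeds the RNG for every plane,
+  // so U and V receive identical samples and hash to identical digests; this
+  // is why the kPlaneU/kPlaneV entries in GetSuperResDigest8bpp() and
+  // GetSuperResDigest10bpp() are duplicated.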
+  std::vector<Pixel> output;
+  for (int plane = kPlaneY; plane < num_planes; ++plane) {
+    output.reserve(upscaled_width[plane] * height[plane]);
+    output.resize(upscaled_width[plane] * height[plane]);
+    auto* dst = reinterpret_cast<Pixel*>(
+        post_filter.GetSuperResBuffer(static_cast<Plane>(plane), 0, 0));
+    for (int y = 0; y < height[plane]; ++y) {
+      for (int x = 0; x < upscaled_width[plane]; ++x) {
+        output[y * upscaled_width[plane] + x] = dst[x];
+      }
+      dst += buffer_.stride(plane) / sizeof(Pixel);
+    }
+    const std::string digest = test_utils::GetMd5Sum(
+        output.data(), upscaled_width[plane] * height[plane] * sizeof(Pixel));
+    printf("MD5: %s\n", digest.c_str());
+    const char* expected_digest = nullptr;
+    switch (bitdepth) {
+      case 8:
+        expected_digest = GetSuperResDigest8bpp(id, plane);
+        break;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      case 10:
+        expected_digest = GetSuperResDigest10bpp(id, plane);
+        break;
+#endif
+    }
+    ASSERT_NE(expected_digest, nullptr);
+    EXPECT_STREQ(digest.c_str(), expected_digest);
+  }
+}
+
+using PostFilterSuperResTest8bpp = PostFilterSuperResTest<8, uint8_t>;
+
+const FrameSizeParam kTestParamSuperRes[] = {
+    FrameSizeParam(176, 352, 288, 1, 1)};
+
+TEST_P(PostFilterSuperResTest8bpp, ApplySuperRes) {
+  TestApplySuperRes(true, 0, 0, false);
+  TestApplySuperRes(true, 1, 1, false);
+  TestApplySuperRes(true, 128, 2, false);
+  TestApplySuperRes(true, 255, 3, false);
+  TestApplySuperRes(false, 0, 4, false);
+}
+
+TEST_P(PostFilterSuperResTest8bpp, ApplySuperResThreaded) {
+  TestApplySuperRes(true, 0, 0, true);
+  TestApplySuperRes(true, 1, 1, true);
+  TestApplySuperRes(true, 128, 2, true);
+  TestApplySuperRes(true, 255, 3, true);
+  TestApplySuperRes(false, 0, 4, true);
+}
+
+INSTANTIATE_TEST_SUITE_P(PostFilterSuperResTestInstance,
+                         PostFilterSuperResTest8bpp,
+                         testing::ValuesIn(kTestParamSuperRes));
+
+using PostFilterHelperFuncTest8bpp = PostFilterHelperFuncTest<8, uint8_t>;
+
+const FrameSizeParam kTestParamExtendFrame[] = {
+    FrameSizeParam(16, 16, 16, 1, 1),
+    FrameSizeParam(64, 64, 64, 1, 1),
+    FrameSizeParam(128, 128, 64, 1, 1),
+    FrameSizeParam(64, 64, 128, 1, 1),
+    FrameSizeParam(352, 352, 288, 1, 1),
+    FrameSizeParam(720, 720, 480, 1, 1),
+    FrameSizeParam(1080, 1080, 720, 1, 1),
+    FrameSizeParam(16, 16, 16, 0, 0),
+    FrameSizeParam(64, 64, 64, 0, 0),
+    FrameSizeParam(128, 128, 64, 0, 0),
+    FrameSizeParam(64, 64, 128, 0, 0),
+    FrameSizeParam(352, 352, 288, 0, 0),
+    FrameSizeParam(720, 720, 480, 0, 0),
+    FrameSizeParam(1080, 1080, 720, 0, 0)};
+
+TEST_P(PostFilterHelperFuncTest8bpp, ExtendFrame) {
+  TestExtendFrame(true, 0);
+  TestExtendFrame(true, 1);
+  TestExtendFrame(true, 128);
+  TestExtendFrame(true, 255);
+  TestExtendFrame(false, 0);
+}
+
+INSTANTIATE_TEST_SUITE_P(PostFilterHelperFuncTestInstance,
+                         PostFilterHelperFuncTest8bpp,
+                         testing::ValuesIn(kTestParamExtendFrame));
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using PostFilterSuperResTest10bpp = PostFilterSuperResTest<10, uint16_t>;
+
+TEST_P(PostFilterSuperResTest10bpp, ApplySuperRes) {
+  TestApplySuperRes(true, 0, 0, false);
+  TestApplySuperRes(true, 1, 1, false);
+  TestApplySuperRes(true, 1 << 9, 2, false);
+  TestApplySuperRes(true, (1 << 10) - 1, 3, false);
+  TestApplySuperRes(false, 0, 4, false);
+}
+
+TEST_P(PostFilterSuperResTest10bpp, ApplySuperResThreaded) {
+  TestApplySuperRes(true, 0, 0, true);
+  TestApplySuperRes(true, 1, 1, true);
+  TestApplySuperRes(true, 1 << 9, 2, true);
+  TestApplySuperRes(true, (1 << 10) - 1, 3, true);
+  TestApplySuperRes(false, 0, 4, true);
+}
+
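+// The |id| argument in the calls above selects a digest row: ids 0..3 pair
+// with the four fixed fill values and id 4 with random input. A hypothetical
+// extra case would pair a new fill value with a new digest row, e.g.:
+//   TestApplySuperRes(/*use_fixed_values=*/true, /*value=*/512, /*id=*/2,
+//                     /*multi_threaded=*/true);
+// (512 == 1 << 9, the mid-level sample value for 10-bit input.)
+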
+INSTANTIATE_TEST_SUITE_P(PostFilterSuperResTestInstance,
+                         PostFilterSuperResTest10bpp,
+                         testing::ValuesIn(kTestParamSuperRes));
+
+using PostFilterHelperFuncTest10bpp = PostFilterHelperFuncTest<10, uint16_t>;
+
+TEST_P(PostFilterHelperFuncTest10bpp, ExtendFrame) {
+  TestExtendFrame(true, 0);
+  TestExtendFrame(true, 1);
+  TestExtendFrame(true, 255);
+  TestExtendFrame(true, (1 << 10) - 1);
+  TestExtendFrame(false, 0);
+}
+
+INSTANTIATE_TEST_SUITE_P(PostFilterHelperFuncTestInstance,
+                         PostFilterHelperFuncTest10bpp,
+                         testing::ValuesIn(kTestParamExtendFrame));
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+namespace {
+
+const char* GetDigestApplyCdef8bpp(int id) {
+  static const char* const kDigest[] = {
+      "9593af24f9c6faecce53437f6e128edf", "ecb633cc2ecd6e7e0cf39d4439f4a6ea",
+      "9ec4cb4124f0a686a7bda72b447f5b8e", "7ebd859a23162bc864a69dbea60bc687",
+      "de7a15fc00664692a794aa68cf695980", "cf3fc8fe041f68d31ab4e34ad3643541",
+      "94c116b191b0268cf7ab4a0e6996e1ec", "1ad60c943a5a914aba7bc26706620a05",
+      "ce33c6f80e3608c4d18c49be2e393c20", "e140586ffc663798b74b8f6fb5b44736",
+      "b7379bba8bcb97f09a74655f4e0eee91", "02ce174061c98babd3987461b3984e47",
+      "64655dd1dfba8317e27d2fdcb211b7b4", "eeb6a61c70c5ee75a4c31dc5099b4dfb",
+      "ee944b31148fa2e30938084f7c046464", "db7b63497750fa4c51cf45c56a2da01c",
+  };
+  return kDigest[id];
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+const char* GetDigestApplyCdef10bpp(int id) {
+  static const char* const kDigest[] = {
+      "53f8d68ac7f3aea65151b2066f8501c9", "021e70d5406fa182dd9713380eb66d1d",
+      "bab1c84e7f06b87d81617d2d0a194b89", "58e302ff0522f64901909fb97535b270",
+      "5ff95a6a798eadc7207793c03d898ce4", "1483d28cc0f1bfffedd1128966719aa0",
+      "6af5a36890b465ae962c2878af874f70", "bd1ed4a2ff09d323ab98190d1805a010",
+      "5ff95a6a798eadc7207793c03d898ce4", "1483d28cc0f1bfffedd1128966719aa0",
+      "6af5a36890b465ae962c2878af874f70", "bd1ed4a2ff09d323ab98190d1805a010",
+      "6f0299645cd6f0655fd26044cd43a37c", "56d7febf5bbebdc82e8f157ab926a0bb",
+      "f54654f11006453f496be5883216a3bb", "9abc6e3230792ba78bcc65504a62075e",
+  };
+  return kDigest[id];
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+}  // namespace
+
+template <int bitdepth, typename Pixel>
+class PostFilterApplyCdefTest : public testing::TestWithParam<FrameSizeParam>,
+                                public test_utils::MaxAlignedAllocable {
+ public:
+  PostFilterApplyCdefTest() = default;
+  PostFilterApplyCdefTest(const PostFilterApplyCdefTest&) = delete;
+  PostFilterApplyCdefTest& operator=(const PostFilterApplyCdefTest&) = delete;
+  ~PostFilterApplyCdefTest() override = default;
+
+ protected:
+  void SetUp() override {
+    test_utils::ResetDspTable(bitdepth);
+    dsp::CdefInit_C();
+    dsp::CdefInit_SSE4_1();
+    dsp::CdefInit_NEON();
+
+    dsp_ = dsp::GetDspTable(bitdepth);
+    ASSERT_NE(dsp_, nullptr);
+  }
+
+  // Sets sequence_header_, frame_header_, cdef_index_ and cdef_skip_.
+  // Allocates yuv_buffer_ but does not set it.
+  void SetInput(libvpx_test::ACMRandom* rnd);
+  // Sets yuv_buffer_.
+  void SetInputBuffer(libvpx_test::ACMRandom* rnd, PostFilter* post_filter);
+  void CopyFilterOutputToDestBuffer();
+  void TestMultiThread(int num_threads);
+
+  ObuSequenceHeader sequence_header_;
+  ObuFrameHeader frame_header_ = {};
+  FrameScratchBuffer frame_scratch_buffer_;
+  YuvBuffer yuv_buffer_;
+  const dsp::Dsp* dsp_;
+  FrameSizeParam param_ = GetParam();
+  Pixel dest_[kMaxTestFrameSize * kMaxPlanes];
+  const size_t y_size_ = param_.width * param_.height;
+  const size_t uv_size_ = y_size_ >>
+                          (param_.subsampling_x + param_.subsampling_y);
+  const size_t size_ = y_size_ + uv_size_ * 2;
+};
+
+template <int bitdepth, typename Pixel>
+void PostFilterApplyCdefTest<bitdepth, Pixel>::SetInput(
+    libvpx_test::ACMRandom* rnd) {
+  sequence_header_.color_config.bitdepth = bitdepth;
+  sequence_header_.color_config.subsampling_x = param_.subsampling_x;
+  sequence_header_.color_config.subsampling_y = param_.subsampling_y;
+  sequence_header_.color_config.is_monochrome = false;
+  sequence_header_.use_128x128_superblock =
+      static_cast<bool>(rnd->Rand16() & 1);
+
+  ASSERT_TRUE(param_.width <= param_.upscaled_width);
+  ASSERT_TRUE(param_.upscaled_width * param_.height <= kMaxTestFrameSize)
+      << "Please adjust the max frame size.";
+
+  frame_header_.width = param_.width;
+  frame_header_.upscaled_width = param_.upscaled_width;
+  frame_header_.height = param_.height;
+  frame_header_.columns4x4 = DivideBy4(Align(frame_header_.width, 8));
+  frame_header_.rows4x4 = DivideBy4(Align(frame_header_.height, 8));
+  frame_header_.tile_info.tile_count = 1;
+  frame_header_.refresh_frame_flags = 0;
+  Cdef* const cdef = &frame_header_.cdef;
+  const int coeff_shift = bitdepth - 8;
+  do {
+    cdef->damping = (rnd->Rand16() & 3) + 3 + coeff_shift;
+    cdef->bits = rnd->Rand16() & 3;
+  } while (cdef->bits <= 0);
+  for (int i = 0; i < (1 << cdef->bits); ++i) {
+    cdef->y_primary_strength[i] = (rnd->Rand16() & 15) << coeff_shift;
+    cdef->y_secondary_strength[i] = rnd->Rand16() & 3;
+    if (cdef->y_secondary_strength[i] == 3) {
+      ++cdef->y_secondary_strength[i];
+    }
+    cdef->y_secondary_strength[i] <<= coeff_shift;
+    cdef->uv_primary_strength[i] = (rnd->Rand16() & 15) << coeff_shift;
+    cdef->uv_secondary_strength[i] = rnd->Rand16() & 3;
+    if (cdef->uv_secondary_strength[i] == 3) {
+      ++cdef->uv_secondary_strength[i];
+    }
+    cdef->uv_secondary_strength[i] <<= coeff_shift;
+  }
+
+  const int rows64x64 = DivideBy16(frame_header_.rows4x4 + kMaxBlockHeight4x4);
+  const int columns64x64 =
+      DivideBy16(frame_header_.columns4x4 + kMaxBlockWidth4x4);
+  ASSERT_TRUE(frame_scratch_buffer_.cdef_index.Reset(rows64x64, columns64x64));
+  for (int row = 0; row < rows64x64; ++row) {
+    for (int column = 0; column < columns64x64; ++column) {
+      frame_scratch_buffer_.cdef_index[row][column] =
+          rnd->Rand16() & ((1 << cdef->bits) - 1);
+    }
+  }
+
+  const int skip_rows = DivideBy2(frame_header_.rows4x4 + kMaxBlockHeight4x4);
+  const int skip_columns =
+      DivideBy16(frame_header_.columns4x4 + kMaxBlockWidth4x4);
+  ASSERT_TRUE(frame_scratch_buffer_.cdef_skip.Reset(skip_rows, skip_columns));
+  for (int row = 0; row < skip_rows; ++row) {
+    memset(frame_scratch_buffer_.cdef_skip[row], 0xFF, skip_columns);
+  }
+
+  ASSERT_TRUE(yuv_buffer_.Realloc(
+      sequence_header_.color_config.bitdepth,
+      sequence_header_.color_config.is_monochrome,
+      frame_header_.upscaled_width, frame_header_.height,
+      sequence_header_.color_config.subsampling_x,
+      sequence_header_.color_config.subsampling_y, kBorderPixels,
+      kBorderPixels, kBorderPixels, kBorderPixels, nullptr, nullptr, nullptr))
+      << "Failed to allocate source buffer.";
+}
+
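+// A note on the CDEF parameters randomized in SetInput() above: AV1 codes
+// secondary strengths in 2 bits with the value 3 promoted to 4 (a secondary
+// strength of exactly 3 is not representable), and both strengths are scaled
+// to the working bitdepth. In effect:
+//   strength = (raw & 3) == 3 ? 4 : (raw & 3);  // then << (bitdepth - 8)
+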
+template <int bitdepth, typename Pixel>
+void PostFilterApplyCdefTest<bitdepth, Pixel>::SetInputBuffer(
+    libvpx_test::ACMRandom* rnd, PostFilter* post_filter) {
+  for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
+    const int subsampling_x = (plane == 0) ? 0 : param_.subsampling_x;
+    const int subsampling_y = (plane == 0) ? 0 : param_.subsampling_y;
+    const int plane_width =
+        MultiplyBy4(frame_header_.columns4x4) >> subsampling_x;
+    const int plane_height =
+        MultiplyBy4(frame_header_.rows4x4) >> subsampling_y;
+    auto* src =
+        reinterpret_cast<Pixel*>(post_filter->GetUnfilteredBuffer(plane));
+    const int src_stride = yuv_buffer_.stride(plane) / sizeof(src[0]);
+    for (int y = 0; y < plane_height; ++y) {
+      for (int x = 0; x < plane_width; ++x) {
+        src[x] = rnd->Rand16() & ((1 << bitdepth) - 1);
+      }
+      src += src_stride;
+    }
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void PostFilterApplyCdefTest<bitdepth, Pixel>::CopyFilterOutputToDestBuffer() {
+  for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
+    const int subsampling_x = (plane == 0) ? 0 : param_.subsampling_x;
+    const int subsampling_y = (plane == 0) ? 0 : param_.subsampling_y;
+    const int plane_width = SubsampledValue(param_.width, subsampling_x);
+    const int plane_height = SubsampledValue(param_.height, subsampling_y);
+    auto* src = reinterpret_cast<Pixel*>(yuv_buffer_.data(plane));
+    const int src_stride = yuv_buffer_.stride(plane) / sizeof(src[0]);
+    Pixel* dest_plane =
+        dest_ +
+        ((plane == 0) ? 0 : ((plane == 1) ? y_size_ : y_size_ + uv_size_));
+    for (int y = 0; y < plane_height; ++y) {
+      for (int x = 0; x < plane_width; ++x) {
+        dest_plane[y * plane_width + x] = src[x];
+      }
+      src += src_stride;
+    }
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void PostFilterApplyCdefTest<bitdepth, Pixel>::TestMultiThread(
+    int num_threads) {
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+  SetInput(&rnd);
+
+  ASSERT_TRUE(frame_scratch_buffer_.threading_strategy.Reset(frame_header_,
+                                                             num_threads));
+  if (num_threads > 1) {
+    const int num_units =
+        MultiplyBy4(RightShiftWithCeiling(frame_header_.rows4x4, 4));
+    ASSERT_TRUE(frame_scratch_buffer_.cdef_border.Realloc(
+        bitdepth, /*is_monochrome=*/false,
+        MultiplyBy4(frame_header_.columns4x4), num_units,
+        sequence_header_.color_config.subsampling_x,
+        /*subsampling_y=*/0, kBorderPixels, kBorderPixels, kBorderPixels,
+        kBorderPixels, nullptr, nullptr, nullptr));
+  }
+
+  PostFilter post_filter(frame_header_, sequence_header_,
+                         &frame_scratch_buffer_, &yuv_buffer_, dsp_,
+                         /*do_post_filter_mask=*/0x02);
+  SetInputBuffer(&rnd, &post_filter);
+
+  const int id = GetIdFromInputParam(param_.subsampling_x,
+                                     param_.subsampling_y, param_.height);
+  absl::Duration elapsed_time;
+  const absl::Time start = absl::Now();
+
+  // Only ApplyCdef() and the frame copy inside ApplyFilteringThreaded() are
+  // triggered, since the filter mask is set to 0x02.
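+  // (do_post_filter_mask selects individual filters; this file relies on bit
+  // 0x02 enabling CDEF here and bit 0x04 enabling SuperRes in
+  // TestApplySuperRes() above.)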
+ post_filter.ApplyFilteringThreaded(); + elapsed_time += absl::Now() - start; + + CopyFilterOutputToDestBuffer(); + if (bitdepth == 8) { + test_utils::CheckMd5Digest(kCdef, kApplyCdefName, + GetDigestApplyCdef8bpp(id), dest_, size_, + elapsed_time); +#if LIBGAV1_MAX_BITDEPTH >= 10 + } else { + test_utils::CheckMd5Digest(kCdef, kApplyCdefName, + GetDigestApplyCdef10bpp(id), dest_, size_, + elapsed_time); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + } +} + +const FrameSizeParam kTestParamApplyCdef[] = { + FrameSizeParam(352, 352, 288, 0, 0), FrameSizeParam(720, 720, 480, 0, 0), + FrameSizeParam(1920, 1920, 1080, 0, 0), FrameSizeParam(251, 251, 187, 0, 0), + FrameSizeParam(352, 352, 288, 0, 1), FrameSizeParam(720, 720, 480, 0, 1), + FrameSizeParam(1920, 1920, 1080, 0, 1), FrameSizeParam(251, 251, 187, 0, 1), + FrameSizeParam(352, 352, 288, 1, 0), FrameSizeParam(720, 720, 480, 1, 0), + FrameSizeParam(1920, 1920, 1080, 1, 0), FrameSizeParam(251, 251, 187, 1, 0), + FrameSizeParam(352, 352, 288, 1, 1), FrameSizeParam(720, 720, 480, 1, 1), + FrameSizeParam(1920, 1920, 1080, 1, 1), FrameSizeParam(251, 251, 187, 1, 1), +}; + +using PostFilterApplyCdefTest8bpp = PostFilterApplyCdefTest<8, uint8_t>; + +TEST_P(PostFilterApplyCdefTest8bpp, ApplyCdef) { + TestMultiThread(2); + TestMultiThread(4); + TestMultiThread(8); +} + +INSTANTIATE_TEST_SUITE_P(PostFilterApplyCdefTestInstance, + PostFilterApplyCdefTest8bpp, + testing::ValuesIn(kTestParamApplyCdef)); + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using PostFilterApplyCdefTest10bpp = PostFilterApplyCdefTest<10, uint16_t>; + +TEST_P(PostFilterApplyCdefTest10bpp, ApplyCdef) { + TestMultiThread(2); + TestMultiThread(4); + TestMultiThread(8); +} + +INSTANTIATE_TEST_SUITE_P(PostFilterApplyCdefTestInstance, + PostFilterApplyCdefTest10bpp, + testing::ValuesIn(kTestParamApplyCdef)); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace libgav1 diff --git a/src/prediction_mask.cc b/src/prediction_mask.cc new file mode 100644 index 0000000..ab4d849 --- /dev/null +++ b/src/prediction_mask.cc @@ -0,0 +1,236 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "src/prediction_mask.h"
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <string>
+
+#include "src/utils/array_2d.h"
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int kWedgeDirectionTypes = 16;
+
+enum kWedgeDirection : uint8_t {
+  kWedgeHorizontal,
+  kWedgeVertical,
+  kWedgeOblique27,
+  kWedgeOblique63,
+  kWedgeOblique117,
+  kWedgeOblique153,
+};
+
+constexpr uint8_t kWedgeCodebook[3][16][3] = {{{kWedgeOblique27, 4, 4},
+                                               {kWedgeOblique63, 4, 4},
+                                               {kWedgeOblique117, 4, 4},
+                                               {kWedgeOblique153, 4, 4},
+                                               {kWedgeHorizontal, 4, 2},
+                                               {kWedgeHorizontal, 4, 4},
+                                               {kWedgeHorizontal, 4, 6},
+                                               {kWedgeVertical, 4, 4},
+                                               {kWedgeOblique27, 4, 2},
+                                               {kWedgeOblique27, 4, 6},
+                                               {kWedgeOblique153, 4, 2},
+                                               {kWedgeOblique153, 4, 6},
+                                               {kWedgeOblique63, 2, 4},
+                                               {kWedgeOblique63, 6, 4},
+                                               {kWedgeOblique117, 2, 4},
+                                               {kWedgeOblique117, 6, 4}},
+                                              {{kWedgeOblique27, 4, 4},
+                                               {kWedgeOblique63, 4, 4},
+                                               {kWedgeOblique117, 4, 4},
+                                               {kWedgeOblique153, 4, 4},
+                                               {kWedgeVertical, 2, 4},
+                                               {kWedgeVertical, 4, 4},
+                                               {kWedgeVertical, 6, 4},
+                                               {kWedgeHorizontal, 4, 4},
+                                               {kWedgeOblique27, 4, 2},
+                                               {kWedgeOblique27, 4, 6},
+                                               {kWedgeOblique153, 4, 2},
+                                               {kWedgeOblique153, 4, 6},
+                                               {kWedgeOblique63, 2, 4},
+                                               {kWedgeOblique63, 6, 4},
+                                               {kWedgeOblique117, 2, 4},
+                                               {kWedgeOblique117, 6, 4}},
+                                              {{kWedgeOblique27, 4, 4},
+                                               {kWedgeOblique63, 4, 4},
+                                               {kWedgeOblique117, 4, 4},
+                                               {kWedgeOblique153, 4, 4},
+                                               {kWedgeHorizontal, 4, 2},
+                                               {kWedgeHorizontal, 4, 6},
+                                               {kWedgeVertical, 2, 4},
+                                               {kWedgeVertical, 6, 4},
+                                               {kWedgeOblique27, 4, 2},
+                                               {kWedgeOblique27, 4, 6},
+                                               {kWedgeOblique153, 4, 2},
+                                               {kWedgeOblique153, 4, 6},
+                                               {kWedgeOblique63, 2, 4},
+                                               {kWedgeOblique63, 6, 4},
+                                               {kWedgeOblique117, 2, 4},
+                                               {kWedgeOblique117, 6, 4}}};
+
+constexpr BitMaskSet kWedgeFlipSignMasks[9] = {
+    BitMaskSet(0xBBFF),  // kBlock8x8
+    BitMaskSet(0xBBEF),  // kBlock8x16
+    BitMaskSet(0xBAEF),  // kBlock8x32
+    BitMaskSet(0xBBEF),  // kBlock16x8
+    BitMaskSet(0xBBFF),  // kBlock16x16
+    BitMaskSet(0xBBEF),  // kBlock16x32
+    BitMaskSet(0xABEF),  // kBlock32x8
+    BitMaskSet(0xBBEF),  // kBlock32x16
+    BitMaskSet(0xBBFF)   // kBlock32x32
+};
+
+// This table (and the one below) contains a few leading zeros and trailing
+// 64s to avoid some additional memcpys where it is actually used.
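+// A sketch of how that padding is consumed (see GenerateWedgeMask() below,
+// where |dst| stands in for the oblique master mask):
+//   for (int y = 0, shift = 0; y < kWedgeMaskMasterSize; y += 2, ++shift) {
+//     memcpy(dst[y], kWedgeMasterObliqueEven + shift, kWedgeMaskMasterSize);
+//   }
+// Every other row reads a kWedgeMaskMasterSize-wide window whose start
+// advances by one; the leading zeros and trailing 64s supply the samples that
+// fall outside the nominal ramp without extra copies.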
+constexpr uint8_t kWedgeMasterObliqueOdd[kWedgeMaskMasterSize * 3 / 2] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 6, 18, 37, + 53, 60, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64}; + +constexpr uint8_t kWedgeMasterObliqueEven[kWedgeMaskMasterSize * 3 / 2] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 11, 27, + 46, 58, 62, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64}; + +constexpr uint8_t kWedgeMasterVertical[kWedgeMaskMasterSize] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 7, 21, + 43, 57, 62, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64}; + +int BlockShape(BlockSize block_size) { + const int width = kNum4x4BlocksWide[block_size]; + const int height = kNum4x4BlocksHigh[block_size]; + if (height > width) return 0; + if (height < width) return 1; + return 2; +} + +uint8_t GetWedgeDirection(BlockSize block_size, int index) { + return kWedgeCodebook[BlockShape(block_size)][index][0]; +} + +uint8_t GetWedgeOffsetX(BlockSize block_size, int index) { + return kWedgeCodebook[BlockShape(block_size)][index][1]; +} + +uint8_t GetWedgeOffsetY(BlockSize block_size, int index) { + return kWedgeCodebook[BlockShape(block_size)][index][2]; +} + +} // namespace + +bool GenerateWedgeMask(WedgeMaskArray* const wedge_masks) { + // Generate master masks. + uint8_t master_mask[6][kWedgeMaskMasterSize][kWedgeMaskMasterSize]; + for (int y = 0; y < kWedgeMaskMasterSize; ++y) { + memcpy(master_mask[kWedgeVertical][y], kWedgeMasterVertical, + kWedgeMaskMasterSize); + } + + for (int y = 0, shift = 0; y < kWedgeMaskMasterSize; y += 2, ++shift) { + memcpy(master_mask[kWedgeOblique63][y], kWedgeMasterObliqueEven + shift, + kWedgeMaskMasterSize); + memcpy(master_mask[kWedgeOblique63][y + 1], kWedgeMasterObliqueOdd + shift, + kWedgeMaskMasterSize); + } + + for (int y = 0; y < kWedgeMaskMasterSize; ++y) { + for (int x = 0; x < kWedgeMaskMasterSize; ++x) { + const uint8_t mask_value = master_mask[kWedgeOblique63][y][x]; + master_mask[kWedgeHorizontal][x][y] = master_mask[kWedgeVertical][y][x]; + master_mask[kWedgeOblique27][x][y] = mask_value; + master_mask[kWedgeOblique117][y][kWedgeMaskMasterSize - 1 - x] = + 64 - mask_value; + master_mask[kWedgeOblique153][(kWedgeMaskMasterSize - 1 - x)][y] = + 64 - mask_value; + } + } + + // Generate wedge masks. 
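+  // Each wedge mask is a width x height window into the 64x64 master mask
+  // (kWedgeMaskMasterSize == 64), positioned by the codebook offsets. The
+  // window origin computed below is, in effect:
+  //   origin_x = 32 - ((codebook_offset_x * width) >> 3)
+  //   origin_y = 32 - ((codebook_offset_y * height) >> 3)
+  // so a codebook offset of 4 centers the window along that dimension.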
+  int block_size_index = 0;
+  for (int size = kBlock8x8; size <= kBlock32x32; ++size) {
+    if (!kIsWedgeCompoundModeAllowed.Contains(size)) continue;
+
+    const int width = kBlockWidthPixels[size];
+    const int height = kBlockHeightPixels[size];
+    assert(width >= 8);
+    assert(width <= 32);
+    assert(height >= 8);
+    assert(height <= 32);
+
+    const auto block_size = static_cast<BlockSize>(size);
+    for (int wedge_index = 0; wedge_index < kWedgeDirectionTypes;
+         ++wedge_index) {
+      const uint8_t direction = GetWedgeDirection(block_size, wedge_index);
+      const uint8_t offset_x =
+          DivideBy2(kWedgeMaskMasterSize) -
+          ((GetWedgeOffsetX(block_size, wedge_index) * width) >> 3);
+      const uint8_t offset_y =
+          DivideBy2(kWedgeMaskMasterSize) -
+          ((GetWedgeOffsetY(block_size, wedge_index) * height) >> 3);
+
+      // Allocate the 2d array.
+      for (int flip_sign = 0; flip_sign < 2; ++flip_sign) {
+        if (!((*wedge_masks)[block_size_index][flip_sign][wedge_index].Reset(
+                height, width, /*zero_initialize=*/false))) {
+          LIBGAV1_DLOG(ERROR, "Failed to allocate memory for wedge masks.");
+          return false;
+        }
+      }
+
+      const auto flip_sign = static_cast<int>(
+          kWedgeFlipSignMasks[block_size_index].Contains(wedge_index));
+      uint8_t* wedge_masks_row =
+          (*wedge_masks)[block_size_index][flip_sign][wedge_index][0];
+      uint8_t* wedge_masks_row_flip =
+          (*wedge_masks)[block_size_index][1 - flip_sign][wedge_index][0];
+      uint8_t* master_mask_row = &master_mask[direction][offset_y][offset_x];
+      for (int y = 0; y < height; ++y) {
+        memcpy(wedge_masks_row, master_mask_row, width);
+        for (int x = 0; x < width; ++x) {
+          wedge_masks_row_flip[x] = 64 - wedge_masks_row[x];
+        }
+        wedge_masks_row += width;
+        wedge_masks_row_flip += width;
+        master_mask_row += kWedgeMaskMasterSize;
+      }
+    }
+
+    block_size_index++;
+  }
+  return true;
+}
+
+}  // namespace libgav1
diff --git a/src/prediction_mask.h b/src/prediction_mask.h
new file mode 100644
index 0000000..827a0fa
--- /dev/null
+++ b/src/prediction_mask.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_PREDICTION_MASK_H_
+#define LIBGAV1_SRC_PREDICTION_MASK_H_
+
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+constexpr BitMaskSet kIsWedgeCompoundModeAllowed(kBlock8x8, kBlock8x16,
+                                                 kBlock8x32, kBlock16x8,
+                                                 kBlock16x16, kBlock16x32,
+                                                 kBlock32x8, kBlock32x16,
+                                                 kBlock32x32);
+
+// This function generates the wedge masks. It should be called only once for
+// the decoder. If the video contains only key frames, this function need not
+// be called. Returns true on success, false on allocation failure.
+// Section 7.11.3.11.
+bool GenerateWedgeMask(WedgeMaskArray* wedge_masks);
+
+}  // namespace libgav1
+#endif  // LIBGAV1_SRC_PREDICTION_MASK_H_
diff --git a/src/prediction_mask_test.cc b/src/prediction_mask_test.cc
new file mode 100644
index 0000000..d2a12c2
--- /dev/null
+++ b/src/prediction_mask_test.cc
@@ -0,0 +1,214 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/prediction_mask.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <string>
+
+#include "gtest/gtest.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int kWedgeDirectionTypes = 16;
+
+enum kWedgeDirection : uint8_t {
+  kWedgeHorizontal,
+  kWedgeVertical,
+  kWedgeOblique27,
+  kWedgeOblique63,
+  kWedgeOblique117,
+  kWedgeOblique153,
+};
+
+const char* const kExpectedWedgeMask[] = {
+    "cea09e4bf4227efef749672283f7369b", "2763ab02b70447b2f9d5ed4796ca33bc",
+    "8d83c4315eadda824893c3e79aa866d9", "a733fd7f143c1c6141983c5f816bb3d8",
+    "9a205bfca776ccde57a8031350f2f467", "d78b964719f52f302f4454df14e45e35",
+    "bdc3972cfeb44d0acebb49b2fcb76072", "c8872571833c165be99ada1c552bfd9b",
+    "26d2541e2f8efe48e2f4a1819b3a6896", "783871179337e78e5ef41a66c0c6937c",
+    "253d21c612d732fceedcf610c4ff099c", "c868d177dc2a2378ef362fa482f601e8",
+    "782d75e143d87cc1aeb5d040c48d3c2d", "718cbecf4db45c7d596eba07bd956601",
+    "3b60b9336c2cf699172eb4a3fef18787", "afe72d4bd206f1cb27e3736c3b0068cf",
+    "7b830a1a94bad23a1df1b8d9668708d0", "d3f421ff2b81686fd421f7c02622aac1",
+    "d9ac14dff8e3c415e85e99c3ce0fbd5b", "da493727a08773a950a0375881d912f2",
+    "2f4251fd1b4636a034e22611ea1223b6", "84f84f01900b8a894b19e353605846b0",
+    "bbf5dae73300b6a6789710ffc4fc59fd", "c711941a0889fbed9b926c1eb39a5616",
+    "2fcf270613df57a57e647f37bf9a19ec", "79ed9c2f828b765edf65027f1f0847f5",
+    "e8d3e821f4e7f2f39659071da8f2cc71", "823bb09e2c28f2a81bf8a2d030e8bab6",
+    "d598fb4f70ea6b705674497994aecbfa", "3737c39f058c57650be7e720dcd87aa1",
+    "eb1d9b1d30485d9870ca9380cbdfad43", "a23d3c24f291080fcd62c0a2a2aea181",
+    "968543d91aeae3b1814a5074b6aa9e8c", "6e2444d71a4f3ddfe643e72f9c3cf6c3",
+    "3bf78413aa04830849a3d9c7bfa41a84", "ece8306f9859bcfb042b0bda8f6750b6",
+    "608b29fcedb7fa054a599945b497c78c", "d69d622016872469dfbde4e589bfd679",
+    "38a2307174c27b634323c59da3339dc6", "5e44f0fad99dbe802ffd69c7dc239d56",
+    "a0eeaf3755a724fdf6469f43cb060d75", "7bcf8035c5057619ea8660c32802d6a1",
+    "6054e1c35fe13b9269ab01d1bc0d8848", "e0ec8f7c66ebabff60f5accd3d707788",
+    "0b9fd6e1053a706af5d0cd59dc7e1992", "709648ffab1992d8522b04ca23de577a",
+    "c576e378ed264d6cb00adfd3b4e428f1", "f6f3ae5348e7141775a8a6bc2be22f80",
+    "9289722adb38fa3b2fb775648f0cc3a8", "b7e02fa00b56aeea8e6098a92eac72e1",
+    "db2f6d66ffca8352271f1e3f0116838a", "5858c567b0719daaa364fb0e6d8aa5dc",
+    "db2d300f875d2465adabf4c1322cea6f", "05c66b54c4d32e5b64a7e77e751f0c51",
+    "f2c2a5a3ce510d21ef2e62eedba85afb", "3959d2191a11e800289e21fd283b2837",
+    "cc86023d079a4c5daadce8ad0cdd176f", "e853f3c6814a653a52926488184aae5e",
+ "8568b9d7215bb8dfb1b7ce66ef38e055", "42814ac5ed652afb4734465cca9e038c", + "dba6b7d5e93e6a20dac9a514824ad45c", "be77e0dce733b564e96024ea23c9db43", + "2aa7bd75a1d8eb1000f0ef9e19aa0d1d", "226d85741e3f35493e971dd13b689ec7", + "9e5a0cf4416f8afeaa3ddbe686b5b7db", "18389c77b362f6b4b727b99426251159", + "10c5d899de999bbdf35839be3f2d5ee3", "942ae479a36fb4b4d359bebd78a92f03", + "f14e4dd174958e16755cd1f456b083e0", "8a036cbd0aaf1bece25a1140109f688b", + "2e48eade95f9fa0b7dae147e66d83e13", "4387d723350a011e26b0e91bbeb3d7c2", + "5470f977d859232335945efc8bb49ff1", "6780fd81cf2561300c75c930e715c7a6", + "9786aca6b1b9abfc3eae51404bc3cbd5", "da65c1440fa370a0237284bf30e56b0b", + "8e0d5d83ab3c477fd11ef143a832f7bf", "97489c7a47aa69fef091e7e6e4049a8f", + "28787beac9e69001c2999976742764a3", "67760c48ff5f7bc50cd92727694ba271", + "57c2b0b7de5de0f40fb739ed095d82a4", "7b2a663ca7da4b73f1adfc7e0ca1eff1", + "980869e1795efb63ca623ce2f0043fb3", "575497eb213b05bab24017cc6ea4e56a", + "ca3b31382439f0bdd87b61fa10c7863b", "72c65bf29afb288f4d4ff51816429aa7", + "1fe8929387be982993cd2309e3eeae7a", "994246e2585179e00f49537713f33796", + "82ae324ba01002370e918724ce452738", "fb3bcb4811b8251f0cc5ec40859617e7", + "a2e24b21c1d3661412e00411d719210c", "7adc2b60d7d62df1d07e3e4458a46dc2", + "e71c1b2f9ccb1af0868c3869dc296506", "3e33e087c7e6f724528abbc658a1b631", + "19b80d80f6b83eedac4bab6226865ae1", "7d9293641c4ed3b21c14964ec785cfb9", + "5dd0fb9700f30c25bf7b65367c8f098d", "f96b55ec2d012807c972ef4731acd73d", + "5fc70808c3fa5b3c511926b434bfba66", "768c3ce37acfcd4e5ba05152e5710bc9", + "1271a52682566ebfc01d5c239177ffd4", "52d4fc11a7507695b2548e0424be50ab", + "729e7d421aaaf74daa27b0ce1ca0a305", "92d2ff4a9a679cdf0ff765a2d30bced1", + "d160ec6f1bd864eb2ac8fabf5af7fedd", "ad323dbcb4a651e96bd5c81bc185385d", + "937c1b7106a2e6aef0adf2c858b4df18", "0f9ad42d1c48970f8462921ac79849ee", + "32ed1e1a16ddbf816f81caca7cb56c93", "e91aa6389d8255b7744aaa875ba2ceec", + "88f9dedf6d565b2f60b511e389cf366a", "d0428fd42ca311cd3680ff4670d4f047", + "b9c7eeb7c9733f0220587643952602cb", "65adf32a5e03d161a411815179078ba3", + "4984a4e9a5bdf732c071d5b60029daf4", "b9b65a2a9f04b59766d305221e4cda5a", + "7b2d372fe33d6db1fcf75820b7523ed5", "9a07593316707f8e59fe09c7647ade15", + "33e75e0d2aa73e3410095c2f98c27a14", "f9ddb33b16431ff9cf6ae96dd4acc792", + "2df1a8655b2ef23f642b11b76b20f557", "9faba399ccf555c25a33c336cdd54d94", + "c94404e263c2dae2e955ead645348c08", "3d16d4be87cd4467c3f7be17287940c8", + "99d0fdae81d61680c7a5b1df38dc98fc", "a23b402d699a00c5c349b17e77f73552", + "c6f76c81c4050939a6bd5d30ca00b307", "bc3d035bd6e8f55497bfc6d1f81fc8be", + "99b10db073e13b49bd90655f7516383b", "ddfd0e434efe076e2706c5669c788566", + "e1d836f814e6eca80ef530f8676e0599", "ed3e4c64e9fd1006e0016e460970a423", + "0282542e21fa0dea0bf48ec0a2d25b2d", "7482eb8a7bf1417a61c21d82bc7c95f9", + "e98e9bb3d5edf7b943d0bbf1eec9bef6", "ad4d313beecf609ff3a7d30da3e54a1d", + "b98f8db9fa62fb73d26415f6fa31b330", "0591b3c34bf4750f20a74eee165a54bd", + "3054b56fec6968255f21d40f80f5121c", "59ecf60cbb8408e042816e73446fa79c", + "8fa8c996209a1ddb8a00c14ca19953f8", "e20d2462bc43a1a1bfbc5efe7a905666", + "b5065e40d5d103e21daabcf4d5fea805", "b65aba0f8e307ef08951f1abdb7c8f62", + "5fbec6e57c1c651bd7be69fccb0b39a6", "9dfc362f7212d086418b0def54a7c76c", + "6644928e9aaac5e5d64f4a2c437c778a", "1bf63c7539ea32489bec222d5bc5305f", + "755ec607a5edf116d188353a96a025c3", "bdc4cc354c4f57c38d3be3dbc9380e2d", + "7851752b4ae36793ab6f03cd91e7ba6f", "99b9834ea2f6ea8d9168c5c1ba7fe790", + "75a155c83b618b28d48f5f343cdfef62", "38821c97e04d2294766699a6846fefaf", + 
"14be7f588461273862c9d9b83d2f6f0a", "8c38ce521671f0eee7e6f6349ef4f981", + "043347de994f2fe68c08e7c06a7f6735", "cda15ea2caccbdd8a7342a6144278578", + "244d586e88c9d6a9a59059a82c3b8e57", "3712928dd0dd77f027370f22d61366a0", + "e4f1cd4785fc331ad6e3100da4a934f3", "3181459434921b5b15b64cfd2ee734c4", + "2d588831e98c7178c5370421a6f2fc60", "135cf6a67fc1b51dbcf9fcddb3ae1237", + "d701da4e1a890a37bb0e9af4a2f0b048", "02138b5a4882181f248945c3a8262050", + "7fbd4d06965b1d152d6c037b0302f307", "7917a20573da241868689ed49c0d5972", + "ffdd4257d91fe00e61de4d2668f1ee07", "72999b6d3bf1ee189e9269a27105991f", + "1b63d7f25388c9af4adac60d46b7a8ca", "e3ce0977224197ade58aa979f3206d68", + "73178ffd388b46891fc4a0440686b554", "f1f99faf52cea98c825470c6edd1d973", + "e6fae5d5682862ec3377b714b6b69825", "a4f96cca8da155204b0cc4258b068d3c", + "75c7674c2356325dcb14c222266c46f8", "932b23521c9d9d06096879a665a14e28", + "8ed48a84a99b4a5bf2ec8a7a2c1f1c79", "4f6f0214857a92ad92eca1c33a762424", + "34865190c3e91200a0609a6e770ebc5c", "e793f1f2e46876b1e417da5d59475fda", + "e83cd9a228941a152f6878aa939e1290", "d6f5cd74ba386bd98282e1fcb0528dbd", + "131b55ec66ffe76f9088f7b35d38c0dd", "2d0ae8ee059cbd8c7816e3c862efdf37", + "65baadd2cb85ffbc6480bf8c1f128d1a", "2b8e8af333c464b4213bbd9185a9b751", + "951fd5faed77a1ae9bf5ef8f30bd65c3", "41d38d40dfe9da2b9ff2146711bf6ab5", + "7430bde28aed5a9429db54ea663a5e26", "46576d59a13756c494793ad4b3a663e5", + "21802d0db30caa44cbdba2ac84cc49b5", "591cad82ae106d9e9670acd5b60e4548", + "c0484c58c6c009939e7f3ec0c1aa8e2d", "6405c55d0a1830cfdd37950bfd65fd6f", + "3bd74c067d2ba027fc004e9bf62254db", "6e920e6dbdbe55a97ff2bf3dfb38a3e0", + "e2ed20f89da293516b14be766a624299", "0a613ee53ec38cad995faa17a24fcb8f", + "0de937145c030d766c3f9fff09d7e39c", "4a560325b804fcb6643866e971ade8e8", + "be82c41d3a0f8bd4032c3e5e45b453da", "b27219f02db167bf5a416831b908b031", + "7cf5437e25d362bc373dd53d8fd78186", "39c801e28cc08150c2016083113d1a03", + "785a21219d9c42a7c5bd417f365535a3", "008c79298a87837bcb504c4dc39ca628", + "af24d1d6f4d3ee94f2af52471a64ca1f", "cd82218aae9815c106336aec7ce18833", + "9f405c66d4ce7533213c4ca82feaf252", "7ceda4ea6ddeccd04dbf6d3237fe956a", + "ae21b52869b85a64fa4e3a85a2a8bb8d", "a004927cdbf48e0dafcccfb6066cdd0c", + "949337a963a8a5c0f46cf774b078a7cd", "24f58b8db17d02f66d04d22ca6c5e026", + "2b1315a2e7c5d5309a7621651e741616", "5b317ef820e6c8e7ea7a7d7022e8349d", + "debd504650d35d9deca4c2461094949f", "19d0ca33e5b3a0afff1f39f0f42238e0", + "df1c6c7582bfa5ceb147a8dd253cfa43", "176647077c5e2d985b3807134aac118f", + "dd2850172602688eaaa768f705c1ba67", "6ba1a3929ae9725fc688b8189b16314f", + "639189abb754dfa6be3c813ee8342954", "d5d1b8bff370f280fba13827d6bdf0fb", + "4b0ad4ea387a952724cab42730f712d2", "8c9c1f09946b61315e9a45c7e39f1992", + "50ef75c2b7a17f972586ce053eb62d24", "d5922dd01d8d02ca00ab9648a3db343f", + "091f517b18f4438ea9c581b7471f2fc0", "fede855bfb936caaa8fb4a434adac1d3", + "081b612f810f38c5ff6dc1cd03bf2eb6", "bd10e764eaf7d7e0ec89de96423d0afe", + "3e64cb1355e05b0a4b0237fae3f33bb2", "7cb92e0ecc0dd06d0a5d248efba48630", + "ec875f2e155a2e124ef52bf35e9a876c", "15529c83eae41bfa804f2c386f480e90", + "ee0e59567874155fb54de63fc901ded7", "4ad160b0d0f5166f9cddf7235725406e", + "176b64b3883c33e2aa251159983ccaa1", "d9cca01946d2a47c0114b1f49e4d688f", + "73d706a13afa279d9c716b3ba3a2ed68", "dea5a7f010d2f1385fe2b7d1d36aafb0", + "b5432fbc22d2f96c1230cc33178da09e", "8b0e7399ce98b68de4048411ab649468", + "3d52c986a5a5852a4620fbb38259a109", "eb61882738fefdd105094d4c104cf8b0", + "24fbc0d3ee28e937cfa1a3fbbc4e8214", "c69eb0687e477c27ac0d4c5fe54bbe8b", + 
"00a4f498f05b2b348252927ecc82c8a3", "c76471a61250be52e8d5933e582b1e19", + "22ebb8812dd795fdc14f20a7f9f89844", "f7c7d5c04bc234545726f4b116b623ec", + "9fc323d6619af0101edfacb4e9c2b647", "902d7888215d6aac1cf41f1fb6a916d8", + "5817d80a0504a5b08627502aeece4f38", "a1afa4b4065c143bc4857e364cec7f3d", + "506d5a6ff434411ea893bb2dc021aa25", "31cd3ca39015ccee1e217e1c83fff2a0", + "eb1ed4ef292c7d8fead1f113c9fd998f", "35f3abf3a056b778e3d7885f8df6c07a", + "299d71ee557382f5e64f26f1a8e4e156", "12f8c591a4e257bcc26b385424cd8d47", + "0b273b03d817af587c8fb23de71f346d", "1d7592fe89c661e9f61d215d235aa2ee", + "331dc544956ee14064ab432c85d52828", "a0a4ccbe1c442717ad40b7d40ed81a40", + "45009d915bf1d4ab855b5b670d314839", "641dfe93841aaa18888cebb17b8566eb", + "2b177c880ce0c2b4e891abc1dc23dfc2", "23984491f7d6c206fb8babafc9aacfdb", + "5841b93edb22c702035e31b26c58a728", "9852506766cb47f48783640d14753089", + "8a43698d32f63b1e7191482e4b274fc3", "7bdef02623beae507a651ad398422876", + "b105138645ad27657a08a3a8e8871a7e", "913e40ebbf1b983ca4956b85364b9459", + "5776f97b4f0cfa435a99d5d90822922d", "a0ae92a24c2b20039d996ee2a7d8b107", + "a925cc792412e2a7abe89367c9fe28b1", "778183eab5c9e0ee559d828d8347a21c", + "c4b4777355a4c8e8858faec37ba23eec", "4cdd41c3648e8d05c3e8f58d08385f8b", + "7c1246737874f984feb1b5827a1f95db", "c75d766ff5af8db39d400962d5aba0b4", + "964f010f5aa6748461ca5573b013091d", "b003f3eab3b118e5a8a85c1873b3bb55"}; + +TEST(WedgePredictionMaskTest, GenerateWedgeMask) { + WedgeMaskArray wedge_masks; + ASSERT_TRUE(GenerateWedgeMask(&wedge_masks)); + + // Check wedge masks. + int block_size_index = 0; + int index = 0; + for (int block_size = kBlock8x8; block_size < kMaxBlockSizes; ++block_size) { + const int width = kBlockWidthPixels[block_size]; + const int height = kBlockHeightPixels[block_size]; + if (width < 8 || height < 8 || width > 32 || height > 32) continue; + + for (int flip_sign = 0; flip_sign <= 1; ++flip_sign) { + for (int direction = 0; direction < kWedgeDirectionTypes; ++direction) { + uint8_t* const block_wedge_mask = + wedge_masks[block_size_index][flip_sign][direction][0]; + const std::string digest = + test_utils::GetMd5Sum(block_wedge_mask, width * height); + EXPECT_STREQ(digest.c_str(), kExpectedWedgeMask[index]); + index++; + } + } + block_size_index++; + } +} + +} // namespace +} // namespace libgav1 diff --git a/src/quantizer.cc b/src/quantizer.cc new file mode 100644 index 0000000..cd720d6 --- /dev/null +++ b/src/quantizer.cc @@ -0,0 +1,269 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/quantizer.h" + +#include +#include + +#include "src/utils/common.h" +#include "src/utils/constants.h" + +#if LIBGAV1_MAX_BITDEPTH != 8 && LIBGAV1_MAX_BITDEPTH != 10 +#error LIBGAV1_MAX_BITDEPTH must be 8 or 10 +#endif + +namespace libgav1 { +namespace { + +// Import all the constants in the anonymous namespace. 
+#include "src/quantizer_tables.inc" + +// Format the kDcLookup and kAcLookup arrays manually for easier comparison +// with the Dc_Qlookup and Ac_Qlookup arrays in Section 7.12.2. + +// clang-format off +constexpr int16_t kDcLookup[][256] = { + // Lookup table for 8 bit. + { + 4, 8, 8, 9, 10, 11, 12, 12, 13, 14, 15, 16, + 17, 18, 19, 19, 20, 21, 22, 23, 24, 25, 26, 26, + 27, 28, 29, 30, 31, 32, 32, 33, 34, 35, 36, 37, + 38, 38, 39, 40, 41, 42, 43, 43, 44, 45, 46, 47, + 48, 48, 49, 50, 51, 52, 53, 53, 54, 55, 56, 57, + 57, 58, 59, 60, 61, 62, 62, 63, 64, 65, 66, 66, + 67, 68, 69, 70, 70, 71, 72, 73, 74, 74, 75, 76, + 77, 78, 78, 79, 80, 81, 81, 82, 83, 84, 85, 85, + 87, 88, 90, 92, 93, 95, 96, 98, 99, 101, 102, 104, + 105, 107, 108, 110, 111, 113, 114, 116, 117, 118, 120, 121, + 123, 125, 127, 129, 131, 134, 136, 138, 140, 142, 144, 146, + 148, 150, 152, 154, 156, 158, 161, 164, 166, 169, 172, 174, + 177, 180, 182, 185, 187, 190, 192, 195, 199, 202, 205, 208, + 211, 214, 217, 220, 223, 226, 230, 233, 237, 240, 243, 247, + 250, 253, 257, 261, 265, 269, 272, 276, 280, 284, 288, 292, + 296, 300, 304, 309, 313, 317, 322, 326, 330, 335, 340, 344, + 349, 354, 359, 364, 369, 374, 379, 384, 389, 395, 400, 406, + 411, 417, 423, 429, 435, 441, 447, 454, 461, 467, 475, 482, + 489, 497, 505, 513, 522, 530, 539, 549, 559, 569, 579, 590, + 602, 614, 626, 640, 654, 668, 684, 700, 717, 736, 755, 775, + 796, 819, 843, 869, 896, 925, 955, 988, 1022, 1058, 1098, 1139, + 1184, 1232, 1282, 1336 + }, +#if LIBGAV1_MAX_BITDEPTH >= 10 + // Lookup table for 10 bit. + { + 4, 9, 10, 13, 15, 17, 20, 22, 25, 28, 31, 34, + 37, 40, 43, 47, 50, 53, 57, 60, 64, 68, 71, 75, + 78, 82, 86, 90, 93, 97, 101, 105, 109, 113, 116, 120, + 124, 128, 132, 136, 140, 143, 147, 151, 155, 159, 163, 166, + 170, 174, 178, 182, 185, 189, 193, 197, 200, 204, 208, 212, + 215, 219, 223, 226, 230, 233, 237, 241, 244, 248, 251, 255, + 259, 262, 266, 269, 273, 276, 280, 283, 287, 290, 293, 297, + 300, 304, 307, 310, 314, 317, 321, 324, 327, 331, 334, 337, + 343, 350, 356, 362, 369, 375, 381, 387, 394, 400, 406, 412, + 418, 424, 430, 436, 442, 448, 454, 460, 466, 472, 478, 484, + 490, 499, 507, 516, 525, 533, 542, 550, 559, 567, 576, 584, + 592, 601, 609, 617, 625, 634, 644, 655, 666, 676, 687, 698, + 708, 718, 729, 739, 749, 759, 770, 782, 795, 807, 819, 831, + 844, 856, 868, 880, 891, 906, 920, 933, 947, 961, 975, 988, + 1001, 1015, 1030, 1045, 1061, 1076, 1090, 1105, 1120, 1137, 1153, 1170, + 1186, 1202, 1218, 1236, 1253, 1271, 1288, 1306, 1323, 1342, 1361, 1379, + 1398, 1416, 1436, 1456, 1476, 1496, 1516, 1537, 1559, 1580, 1601, 1624, + 1647, 1670, 1692, 1717, 1741, 1766, 1791, 1817, 1844, 1871, 1900, 1929, + 1958, 1990, 2021, 2054, 2088, 2123, 2159, 2197, 2236, 2276, 2319, 2363, + 2410, 2458, 2508, 2561, 2616, 2675, 2737, 2802, 2871, 2944, 3020, 3102, + 3188, 3280, 3375, 3478, 3586, 3702, 3823, 3953, 4089, 4236, 4394, 4559, + 4737, 4929, 5130, 5347 + }, +#endif // LIBGAV1_MAX_BITDEPTH >= 10 +}; + +constexpr int16_t kAcLookup[][256] = { + // Lookup table for 8 bit. 
+    {
+         4,    8,    9,   10,   11,   12,   13,   14,   15,   16,   17,   18,
+        19,   20,   21,   22,   23,   24,   25,   26,   27,   28,   29,   30,
+        31,   32,   33,   34,   35,   36,   37,   38,   39,   40,   41,   42,
+        43,   44,   45,   46,   47,   48,   49,   50,   51,   52,   53,   54,
+        55,   56,   57,   58,   59,   60,   61,   62,   63,   64,   65,   66,
+        67,   68,   69,   70,   71,   72,   73,   74,   75,   76,   77,   78,
+        79,   80,   81,   82,   83,   84,   85,   86,   87,   88,   89,   90,
+        91,   92,   93,   94,   95,   96,   97,   98,   99,  100,  101,  102,
+       104,  106,  108,  110,  112,  114,  116,  118,  120,  122,  124,  126,
+       128,  130,  132,  134,  136,  138,  140,  142,  144,  146,  148,  150,
+       152,  155,  158,  161,  164,  167,  170,  173,  176,  179,  182,  185,
+       188,  191,  194,  197,  200,  203,  207,  211,  215,  219,  223,  227,
+       231,  235,  239,  243,  247,  251,  255,  260,  265,  270,  275,  280,
+       285,  290,  295,  300,  305,  311,  317,  323,  329,  335,  341,  347,
+       353,  359,  366,  373,  380,  387,  394,  401,  408,  416,  424,  432,
+       440,  448,  456,  465,  474,  483,  492,  501,  510,  520,  530,  540,
+       550,  560,  571,  582,  593,  604,  615,  627,  639,  651,  663,  676,
+       689,  702,  715,  729,  743,  757,  771,  786,  801,  816,  832,  848,
+       864,  881,  898,  915,  933,  951,  969,  988, 1007, 1026, 1046, 1066,
+      1087, 1108, 1129, 1151, 1173, 1196, 1219, 1243, 1267, 1292, 1317, 1343,
+      1369, 1396, 1423, 1451, 1479, 1508, 1537, 1567, 1597, 1628, 1660, 1692,
+      1725, 1759, 1793, 1828
+    },
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    // Lookup table for 10 bit.
+    {
+         4,    9,   11,   13,   16,   18,   21,   24,   27,   30,   33,   37,
+        40,   44,   48,   51,   55,   59,   63,   67,   71,   75,   79,   83,
+        88,   92,   96,  100,  105,  109,  114,  118,  122,  127,  131,  136,
+       140,  145,  149,  154,  158,  163,  168,  172,  177,  181,  186,  190,
+       195,  199,  204,  208,  213,  217,  222,  226,  231,  235,  240,  244,
+       249,  253,  258,  262,  267,  271,  275,  280,  284,  289,  293,  297,
+       302,  306,  311,  315,  319,  324,  328,  332,  337,  341,  345,  349,
+       354,  358,  362,  367,  371,  375,  379,  384,  388,  392,  396,  401,
+       409,  417,  425,  433,  441,  449,  458,  466,  474,  482,  490,  498,
+       506,  514,  523,  531,  539,  547,  555,  563,  571,  579,  588,  596,
+       604,  616,  628,  640,  652,  664,  676,  688,  700,  713,  725,  737,
+       749,  761,  773,  785,  797,  809,  825,  841,  857,  873,  889,  905,
+       922,  938,  954,  970,  986, 1002, 1018, 1038, 1058, 1078, 1098, 1118,
+      1138, 1158, 1178, 1198, 1218, 1242, 1266, 1290, 1314, 1338, 1362, 1386,
+      1411, 1435, 1463, 1491, 1519, 1547, 1575, 1603, 1631, 1663, 1695, 1727,
+      1759, 1791, 1823, 1859, 1895, 1931, 1967, 2003, 2039, 2079, 2119, 2159,
+      2199, 2239, 2283, 2327, 2371, 2415, 2459, 2507, 2555, 2603, 2651, 2703,
+      2755, 2807, 2859, 2915, 2971, 3027, 3083, 3143, 3203, 3263, 3327, 3391,
+      3455, 3523, 3591, 3659, 3731, 3803, 3876, 3952, 4028, 4104, 4184, 4264,
+      4348, 4432, 4516, 4604, 4692, 4784, 4876, 4972, 5068, 5168, 5268, 5372,
+      5476, 5584, 5692, 5804, 5916, 6032, 6148, 6268, 6388, 6512, 6640, 6768,
+      6900, 7036, 7172, 7312
+    },
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+};
+// clang-format on
+
+void Transpose(uint8_t* const dst, const uint8_t* const src, int src_width,
+               int src_height) {
+  const int dst_width = src_height;
+  const int dst_height = src_width;
+  Array2DView<const uint8_t> source(src_height, src_width, src);
+  Array2DView<uint8_t> dest(dst_height, dst_width, dst);
+  for (int y = 0; y < dst_height; ++y) {
+    for (int x = 0; x < dst_width; ++x) {
+      dest[y][x] = source[x][y];
+    }
+  }
+}
+
+// Copies the lower triangle and fills the upper triangle of |dst| using |src|
+// as the source.
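+// For example, with size == 3 and src == {a, b, c, d, e, f} (the lower
+// triangle in row-major order), the result is:
+//   a b d
+//   b c e
+//   d e f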
+void FillUpperTriangle(uint8_t* dst, const uint8_t* src, int size) {
+  Array2DView<uint8_t> dest(size, size, dst);
+  int k = 0;
+  for (int y = 0; y < size; ++y) {
+    for (int x = 0; x <= y; ++x) {
+      dest[y][x] = dest[x][y] = src[k++];
+    }
+  }
+}
+
+}  // namespace
+
+bool InitializeQuantizerMatrix(QuantizerMatrix* quantizer_matrix_ptr) {
+  for (int level = 0; level < kNumQuantizerLevelsForQuantizerMatrix; ++level) {
+    for (int plane_type = kPlaneTypeY; plane_type < kNumPlaneTypes;
+         ++plane_type) {
+      auto& quantizer_matrix = (*quantizer_matrix_ptr)[level][plane_type];
+      // Notes about how these matrices are populated:
+      // * For square transforms, we store only the lower left triangle (it is
+      //   symmetric about the main diagonal), so when populating the matrix
+      //   we have to fill in the upper right triangle.
+      // * For rectangular transforms, the matrices are transposes of each
+      //   other when the width and height are reversed. So we populate with
+      //   memcpy when w < h and by transposing when w > h.
+      // * There is a special case for 16x16, where the matrix is the same as
+      //   32x32 with some offsets.
+      // * We use the "adjusted transform size" when using these matrices, so
+      //   we won't have to populate them for transform sizes with one of the
+      //   dimensions equal to 64.
+      for (int tx_size = 0; tx_size < kNumTransformSizes; ++tx_size) {
+        if (kTransformWidth[tx_size] == 64 ||
+            kTransformHeight[tx_size] == 64) {
+          continue;
+        }
+        const int size = kTransformWidth[tx_size] * kTransformHeight[tx_size];
+        if (!quantizer_matrix[tx_size].Resize(size)) {
+          return false;
+        }
+      }
+#define QUANTIZER_MEMCPY(W, H)                            \
+  memcpy(quantizer_matrix[kTransformSize##W##x##H].get(), \
+         kQuantizerMatrix##W##x##H[level][plane_type], (W) * (H))
+#define QUANTIZER_TRANSPOSE(W, H)                            \
+  Transpose(quantizer_matrix[kTransformSize##W##x##H].get(), \
+            kQuantizerMatrix##H##x##W[level][plane_type], H, W)
+#define QUANTIZER_FILL_UPPER_TRIANGLE(SIZE)                                \
+  FillUpperTriangle(quantizer_matrix[kTransformSize##SIZE##x##SIZE].get(), \
+                    kQuantizerMatrix##SIZE##x##SIZE[level][plane_type], SIZE)
+      QUANTIZER_FILL_UPPER_TRIANGLE(4);   // 4x4
+      QUANTIZER_MEMCPY(4, 8);             // 4x8
+      QUANTIZER_MEMCPY(4, 16);            // 4x16
+      QUANTIZER_TRANSPOSE(8, 4);          // 8x4
+      QUANTIZER_FILL_UPPER_TRIANGLE(8);   // 8x8
+      QUANTIZER_MEMCPY(8, 16);            // 8x16
+      QUANTIZER_MEMCPY(8, 32);            // 8x32
+      QUANTIZER_TRANSPOSE(16, 4);         // 16x4
+      QUANTIZER_TRANSPOSE(16, 8);         // 16x8
+      QUANTIZER_MEMCPY(16, 32);           // 16x32
+      QUANTIZER_TRANSPOSE(32, 8);         // 32x8
+      QUANTIZER_TRANSPOSE(32, 16);        // 32x16
+      QUANTIZER_FILL_UPPER_TRIANGLE(32);  // 32x32
+      // 16x16.
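+      // The 16x16 matrix is not stored; it is derived from the 32x32 matrix
+      // by sampling every other row and column:
+      //   dst16x16[y][x] = src32x32[2 * y][2 * x].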
+      Array2DView<uint8_t> dst16x16(
+          16, 16, quantizer_matrix[kTransformSize16x16].get());
+      Array2DView<uint8_t> src32x32(
+          32, 32, quantizer_matrix[kTransformSize32x32].get());
+      for (int y = 0; y < 16; ++y) {
+        for (int x = 0; x < 16; ++x) {
+          dst16x16[y][x] = src32x32[MultiplyBy2(y)][MultiplyBy2(x)];
+        }
+      }
+#undef QUANTIZER_FILL_UPPER_TRIANGLE
+#undef QUANTIZER_TRANSPOSE
+#undef QUANTIZER_MEMCPY
+    }
+  }
+  return true;
+}
+
+int GetQIndex(const Segmentation& segmentation, int index, int base_qindex) {
+  if (segmentation.FeatureActive(index, kSegmentFeatureQuantizer)) {
+    const int segment_qindex =
+        base_qindex +
+        segmentation.feature_data[index][kSegmentFeatureQuantizer];
+    return Clip3(segment_qindex, kMinQuantizer, kMaxQuantizer);
+  }
+  return base_qindex;
+}
+
+Quantizer::Quantizer(int bitdepth, const QuantizerParameters* params)
+    : params_(*params) {
+  assert(bitdepth >= 8 && bitdepth <= LIBGAV1_MAX_BITDEPTH);
+  const int index = BitdepthToArrayIndex(bitdepth);
+  dc_lookup_ = kDcLookup[index];
+  ac_lookup_ = kAcLookup[index];
+}
+
+int Quantizer::GetDcValue(Plane plane, int qindex) const {
+  return dc_lookup_[Clip3(qindex + params_.delta_dc[plane], kMinQuantizer,
+                          kMaxQuantizer)];
+}
+
+int Quantizer::GetAcValue(Plane plane, int qindex) const {
+  return ac_lookup_[Clip3(qindex + params_.delta_ac[plane], kMinQuantizer,
+                          kMaxQuantizer)];
+}
+
+}  // namespace libgav1
diff --git a/src/quantizer.h b/src/quantizer.h
new file mode 100644
index 0000000..c60756c
--- /dev/null
+++ b/src/quantizer.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_QUANTIZER_H_
+#define LIBGAV1_SRC_QUANTIZER_H_
+
+#include <array>
+#include <cstdint>
+
+#include "src/utils/constants.h"
+#include "src/utils/dynamic_buffer.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+using QuantizerMatrix = std::array<
+    std::array<std::array<DynamicBuffer<uint8_t>, kNumTransformSizes>,
+               kNumPlaneTypes>,
+    kNumQuantizerLevelsForQuantizerMatrix>;
+
+// Implements the dequantization functions of Section 7.12.2.
+class Quantizer {
+ public:
+  Quantizer(int bitdepth, const QuantizerParameters* params);
+
+  // Returns the quantizer value for the dc coefficient for the given plane.
+  // The caller should call GetQIndex() with Tile::current_quantizer_index_ as
+  // the |base_qindex| argument, and pass the return value as the |qindex|
+  // argument to this method.
+  int GetDcValue(Plane plane, int qindex) const;
+
+  // Returns the quantizer value for the ac coefficient for the given plane.
+  // The caller should call GetQIndex() with Tile::current_quantizer_index_ as
+  // the |base_qindex| argument, and pass the return value as the |qindex|
+  // argument to this method.
+  int GetAcValue(Plane plane, int qindex) const;
+
+ private:
+  const QuantizerParameters& params_;
+  const int16_t* dc_lookup_;
+  const int16_t* ac_lookup_;
+};
+
+// Initialize the quantizer matrix.
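+// Allocates the per-level, per-plane-type buffers and fills them from the
+// packed tables in quantizer_tables.inc. Returns false if a buffer
+// allocation fails.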
+bool InitializeQuantizerMatrix(QuantizerMatrix* quantizer_matrix);
+
+// Get the quantizer index for the |index|th segment.
+//
+// This function has two use cases. What should be passed as the |base_qindex|
+// argument depends on the use case.
+// 1. While parsing the uncompressed header or transform type, pass
+//    Quantizer::base_index.
+//    Note: In this use case, the caller only cares about whether the return
+//    value is zero.
+// 2. To generate the |qindex| argument to Quantizer::GetDcValue() or
+//    Quantizer::GetAcValue(), pass Tile::current_quantizer_index_.
+int GetQIndex(const Segmentation& segmentation, int index, int base_qindex);
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_QUANTIZER_H_
diff --git a/src/quantizer_tables.inc b/src/quantizer_tables.inc
new file mode 100644
index 0000000..34342c4
--- /dev/null
+++ b/src/quantizer_tables.inc
@@ -0,0 +1,3080 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file is just a convenience to separate out all the quantizer table
+// definitions from the quantizer functions.
+
+constexpr uint8_t kQuantizerMatrix4x8
+    [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][32] = {
+        {{32, 42, 75, 91, 33, 42, 69, 86, 37, 58, 84,
+          91, 49, 71, 103, 110, 65, 84, 125, 128, 80, 97,
+          142, 152, 91, 100, 145, 178, 104, 112, 146, 190},
+         {31, 47, 60, 66, 40, 45, 54, 61, 46, 56, 64, 64, 48, 61, 75, 73,
+          54, 65, 85, 82, 61, 69, 92, 92, 64, 68, 90, 102, 68, 71, 87, 105}},
+        {{32, 42, 69, 88, 33, 42, 64, 83, 36, 56, 77,
+          88, 46, 67, 93, 105, 60, 79, 112, 122, 75, 92,
+          130, 144, 86, 95, 136, 167, 98, 105, 136, 177},
+         {31, 47, 57, 65, 40, 45, 52, 61, 46, 55, 61, 63, 47, 60, 70, 72,
+          52, 64, 79, 81, 59, 68, 87, 90, 63, 66, 88, 99, 66, 69, 85, 102}},
+        {{32, 38, 62, 86, 32, 40, 58, 80, 34, 51, 68,
+          85, 44, 61, 85, 101, 54, 69, 98, 117, 72, 84,
+          118, 136, 82, 89, 129, 157, 92, 98, 127, 165},
+         {31, 47, 54, 64, 38, 46, 50, 60, 46, 53, 57, 62, 46, 56, 66, 71,
+          50, 59, 74, 79, 57, 64, 82, 88, 61, 65, 85, 97, 65, 67, 82, 99}},
+        {{32, 35, 59, 83, 32, 36, 57, 78, 34, 47, 65,
+          82, 41, 53, 78, 97, 51, 61, 92, 111, 65, 73,
+          108, 129, 75, 81, 117, 148, 86, 92, 119, 154},
+         {31, 47, 53, 63, 36, 47, 50, 59, 46, 52, 55, 61, 45, 53, 63, 70,
+          49, 55, 71, 77, 54, 58, 77, 86, 59, 61, 81, 94, 63, 65, 80, 95}},
+        {{32, 35, 51, 77, 32, 36, 50, 72, 34, 42, 54, 75, 38, 51, 67, 87,
+          48, 59, 80, 103, 60, 68, 92, 119, 72, 79, 104, 135, 81, 86, 112, 144},
+         {31, 47, 50, 61, 36, 47, 47, 57, 43, 50, 50, 58, 45, 53, 58, 65,
+          47, 54, 66, 74, 52, 56, 70, 82, 57, 60, 75, 90, 61, 63, 77, 93}},
+        {{32, 35, 51, 75, 32, 36, 50, 71, 34, 42, 54, 73, 37, 50, 65, 84,
+          45, 56, 76, 96, 54, 63, 87, 110, 65, 73, 97, 125, 75, 81, 106, 136},
+         {31, 47, 50, 60, 36, 47, 47, 56, 43, 50, 50, 57, 46, 53, 57, 64,
+          46, 54, 64, 71, 50, 55, 68, 78, 54, 58, 72, 85, 59, 61, 75, 90}},
+        {{32, 34, 43, 62, 32, 34, 42, 59, 33, 37, 44, 58, 35, 43, 54, 68,
+          41, 48, 64, 79, 49, 54, 71, 91, 57, 60, 78, 101, 66, 68, 86, 111},
+         {31, 42,
47, 54, 33, 44, 45, 51, 40, 47, 46, 50, 47, 50, 54, 57, + 45, 49, 59, 64, 48, 50, 61, 70, 51, 52, 63, 75, 55, 55, 66, 79}}, + {{32, 32, 42, 56, 32, 33, 41, 53, 32, 35, 42, 52, 34, 37, 50, 59, + 38, 40, 58, 68, 44, 45, 66, 78, 50, 50, 71, 86, 61, 58, 79, 97}, + {31, 38, 47, 52, 32, 40, 45, 49, 39, 47, 45, 48, 44, 47, 51, 53, + 46, 47, 56, 58, 47, 46, 59, 64, 48, 47, 61, 68, 53, 50, 64, 73}}, + {{32, 32, 37, 52, 32, 33, 36, 49, 32, 34, 38, 49, 34, 37, 44, 54, + 35, 38, 49, 60, 40, 42, 55, 69, 46, 46, 59, 76, 52, 51, 64, 83}, + {31, 38, 47, 50, 31, 40, 46, 48, 36, 44, 47, 47, 42, 47, 50, 50, + 47, 48, 53, 54, 46, 46, 54, 60, 48, 46, 55, 64, 50, 48, 56, 67}}, + {{31, 32, 35, 43, 32, 33, 34, 41, 32, 34, 36, 42, 32, 35, 38, 42, + 34, 37, 43, 49, 37, 40, 49, 56, 42, 43, 53, 63, 46, 46, 56, 67}, + {31, 38, 47, 48, 31, 40, 46, 45, 35, 43, 47, 46, 39, 47, 47, 45, + 43, 47, 50, 50, 47, 47, 53, 55, 46, 46, 53, 58, 48, 46, 54, 59}}, + {{31, 32, 34, 39, 32, 32, 34, 38, 32, 33, 34, 38, 32, 33, 36, 40, + 33, 34, 38, 42, 34, 36, 41, 47, 37, 38, 44, 52, 40, 40, 46, 56}, + {31, 34, 42, 48, 31, 35, 42, 46, 33, 37, 44, 46, 36, 41, 46, 46, + 40, 44, 48, 48, 45, 46, 49, 51, 47, 47, 50, 54, 47, 46, 49, 55}}, + {{31, 31, 32, 35, 32, 32, 32, 35, 32, 32, 33, 34, 32, 32, 34, 36, + 32, 33, 35, 38, 33, 33, 36, 40, 34, 34, 37, 42, 35, 34, 38, 48}, + {31, 31, 37, 48, 31, 31, 38, 47, 31, 32, 40, 46, 34, 36, 43, 47, + 37, 39, 46, 47, 39, 41, 47, 48, 42, 43, 47, 50, 48, 46, 48, 53}}, + {{31, 31, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 32, 32, 33, 34, 32, 32, 34, 34, 32, 33, 34, 35, 33, 33, 35, 36}, + {31, 31, 35, 37, 31, 31, 36, 38, 31, 32, 37, 39, 31, 32, 37, 40, + 34, 36, 40, 43, 35, 37, 42, 44, 38, 40, 45, 47, 41, 42, 45, 47}}, + {{31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + 31, 32, 32, 32, 31, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33}, + {31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 31, 35, 31, 32, 32, 36, + 31, 32, 32, 36, 31, 33, 33, 37, 34, 36, 36, 40, 34, 36, 36, 40}}, + {{31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 32, 32, 32, + 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32}, + {31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 30, 31, 32, 32}}}; +constexpr uint8_t kQuantizerMatrix4x16 + [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][64] = { + {{31, 44, 79, 96, 32, 41, 72, 90, 32, 42, 71, 86, 34, + 48, 73, 83, 34, 54, 78, 89, 41, 63, 90, 95, 45, 67, + 96, 102, 54, 75, 110, 111, 60, 79, 118, 123, 72, 90, 133, + 135, 75, 92, 136, 149, 83, 100, 142, 160, 88, 100, 140, 173, + 94, 101, 144, 180, 101, 108, 141, 188, 108, 115, 151, 197}, + {31, 49, 63, 69, 32, 45, 57, 65, 36, 46, 56, 62, 43, 49, 57, 60, + 46, 53, 60, 63, 45, 58, 67, 66, 46, 59, 71, 70, 50, 62, 78, 74, + 52, 64, 82, 80, 57, 67, 89, 85, 59, 68, 90, 91, 62, 71, 91, 96, + 63, 69, 89, 101, 65, 68, 89, 103, 67, 70, 86, 105, 69, 72, 88, 107}}, + {{31, 44, 73, 93, 32, 41, 67, 87, 32, 42, 65, 83, 33, + 44, 66, 81, 34, 54, 74, 86, 37, 58, 79, 92, 44, 66, + 90, 98, 49, 71, 99, 107, 56, 77, 107, 117, 65, 84, 119, + 129, 72, 90, 127, 141, 78, 95, 133, 151, 84, 95, 132, 163, + 89, 95, 136, 169, 95, 101, 132, 175, 101, 108, 141, 183}, + {31, 49, 61, 69, 32, 45, 55, 64, 36, 46, 54, 61, 41, 47, 54, 59, + 46, 53, 59, 62, 46, 56, 62, 65, 46, 59, 68, 68, 48, 61, 73, 73, + 51, 63, 77, 78, 54, 65, 82, 84, 57, 67, 86, 89, 60, 69, 88, 93, + 62, 67, 86, 98, 64, 66, 87, 100, 65, 68, 83, 102, 67, 70, 86, 103}}, + {{31, 39, 65, 90, 32, 38, 60, 84, 
32, 39, 59, 81, 33, + 40, 58, 78, 34, 47, 65, 83, 37, 54, 73, 89, 41, 58, + 79, 94, 46, 62, 86, 102, 53, 68, 97, 112, 60, 73, 105, + 123, 65, 78, 111, 134, 74, 85, 120, 143, 79, 90, 125, 154, + 84, 90, 128, 158, 89, 95, 124, 164, 94, 101, 131, 170}, + {31, 48, 57, 68, 32, 46, 53, 63, 36, 46, 51, 60, 40, 46, 50, 58, + 44, 51, 54, 61, 46, 54, 60, 64, 45, 56, 64, 67, 47, 57, 68, 71, + 49, 58, 73, 77, 52, 60, 76, 82, 54, 62, 79, 87, 58, 64, 82, 91, + 60, 66, 84, 95, 62, 64, 84, 97, 64, 66, 81, 99, 65, 68, 83, 100}}, + {{31, 36, 62, 88, 32, 35, 58, 82, 32, 36, 57, 79, 33, + 38, 56, 76, 34, 42, 61, 81, 34, 48, 66, 85, 39, 51, + 74, 91, 44, 56, 82, 98, 49, 60, 90, 107, 54, 63, 95, + 117, 60, 68, 102, 127, 68, 75, 110, 135, 75, 81, 117, 145, + 79, 85, 120, 148, 84, 89, 116, 153, 88, 94, 123, 159}, + {31, 48, 56, 67, 32, 46, 52, 62, 35, 47, 50, 60, 40, 47, 49, 57, + 43, 50, 53, 60, 46, 53, 56, 63, 45, 53, 61, 66, 46, 54, 65, 70, + 48, 54, 70, 75, 50, 55, 72, 80, 52, 56, 75, 85, 56, 59, 79, 89, + 58, 61, 81, 93, 60, 63, 82, 94, 62, 64, 79, 96, 63, 66, 81, 97}}, + {{31, 36, 53, 81, 32, 35, 51, 76, 32, 35, 49, 73, 32, + 37, 49, 71, 33, 41, 53, 74, 34, 48, 60, 80, 37, 50, + 65, 85, 41, 53, 71, 91, 45, 56, 76, 98, 49, 60, 82, + 105, 54, 63, 87, 112, 61, 69, 93, 121, 68, 75, 100, 130, + 74, 80, 105, 137, 78, 84, 109, 142, 83, 88, 114, 148}, + {31, 48, 52, 64, 31, 47, 49, 60, 33, 46, 48, 57, 38, 47, 47, 56, + 42, 49, 50, 57, 46, 53, 54, 61, 46, 53, 57, 64, 45, 53, 61, 68, + 46, 54, 64, 71, 48, 54, 66, 75, 50, 55, 68, 78, 52, 57, 71, 83, + 56, 59, 73, 87, 58, 61, 75, 90, 60, 62, 76, 92, 62, 64, 78, 94}}, + {{31, 36, 53, 79, 32, 35, 51, 75, 32, 34, 49, 72, 32, 36, 50, 71, + 33, 38, 49, 69, 34, 42, 54, 73, 34, 48, 60, 78, 37, 50, 65, 84, + 41, 53, 71, 90, 45, 56, 76, 96, 49, 60, 82, 103, 54, 63, 87, 110, + 60, 68, 92, 118, 65, 73, 97, 125, 72, 79, 104, 133, 75, 81, 106, 136}, + {31, 48, 52, 63, 31, 47, 50, 60, 32, 46, 48, 57, 36, 47, 47, 56, + 40, 47, 47, 54, 43, 50, 50, 57, 46, 53, 54, 60, 46, 53, 57, 64, + 45, 53, 61, 67, 46, 54, 64, 71, 48, 54, 66, 75, 50, 55, 68, 78, + 52, 56, 70, 82, 54, 58, 72, 85, 57, 60, 75, 89, 59, 61, 75, 90}}, + {{31, 34, 44, 65, 32, 34, 43, 62, 32, 33, 41, 59, 32, 35, 43, 59, + 32, 37, 43, 58, 34, 39, 48, 63, 34, 42, 53, 67, 36, 44, 57, 71, + 39, 46, 60, 76, 42, 48, 64, 81, 45, 51, 67, 85, 50, 54, 72, 92, + 54, 58, 76, 98, 60, 63, 80, 105, 66, 68, 85, 111, 73, 74, 91, 118}, + {31, 42, 49, 57, 31, 42, 47, 54, 32, 42, 45, 52, 35, 45, 46, 51, + 40, 47, 46, 50, 43, 48, 49, 53, 46, 50, 53, 56, 46, 50, 55, 58, + 46, 49, 57, 61, 46, 49, 59, 64, 47, 50, 60, 67, 48, 50, 61, 71, + 50, 52, 63, 73, 52, 53, 64, 76, 55, 55, 66, 79, 58, 58, 68, 82}}, + {{31, 32, 44, 58, 32, 32, 42, 55, 32, 33, 41, 53, 32, 34, 42, 53, + 32, 34, 42, 53, 32, 35, 42, 52, 34, 37, 48, 57, 35, 38, 54, 63, + 37, 40, 57, 67, 39, 41, 60, 70, 41, 43, 63, 74, 45, 46, 67, 79, + 50, 50, 71, 86, 54, 53, 74, 90, 57, 56, 77, 93, 61, 58, 79, 97}, + {31, 37, 49, 54, 31, 38, 47, 51, 32, 40, 45, 49, 34, 42, 45, 49, + 37, 44, 45, 48, 39, 47, 45, 48, 42, 47, 49, 51, 47, 48, 53, 55, + 46, 47, 55, 58, 46, 46, 57, 60, 46, 46, 58, 62, 47, 46, 59, 65, + 48, 47, 61, 68, 50, 48, 62, 70, 51, 49, 63, 71, 53, 50, 64, 73}}, + {{31, 32, 38, 53, 32, 32, 37, 51, 32, 32, 36, 49, 32, 33, 36, 49, + 32, 34, 38, 50, 32, 35, 39, 49, 33, 36, 41, 51, 34, 37, 44, 54, + 35, 38, 49, 60, 37, 40, 51, 63, 38, 40, 52, 65, 42, 43, 56, 71, + 45, 45, 58, 75, 47, 47, 60, 77, 51, 50, 63, 82, 55, 54, 67, 87}, + {31, 37, 48, 52, 31, 38, 47, 50, 31, 39, 46, 48, 32, 40, 
46, 48, + 35, 43, 46, 47, 39, 47, 47, 47, 40, 47, 48, 48, 42, 47, 50, 50, + 47, 48, 53, 54, 47, 47, 53, 56, 46, 47, 54, 57, 46, 46, 55, 61, + 47, 46, 55, 63, 48, 47, 55, 64, 49, 47, 56, 66, 51, 49, 57, 68}}, + {{31, 32, 36, 44, 32, 32, 35, 42, 32, 32, 35, 41, 32, 33, 34, 41, + 32, 34, 36, 42, 32, 34, 36, 42, 32, 35, 38, 42, 33, 36, 40, 44, + 34, 37, 42, 48, 35, 38, 47, 52, 35, 38, 48, 54, 38, 40, 50, 58, + 40, 41, 51, 60, 42, 43, 53, 63, 45, 45, 56, 66, 46, 46, 56, 67}, + {31, 37, 48, 49, 31, 38, 47, 47, 31, 39, 46, 46, 31, 40, 46, 45, + 34, 42, 47, 45, 35, 43, 47, 46, 39, 47, 47, 45, 40, 47, 48, 47, + 42, 47, 50, 49, 46, 48, 52, 53, 47, 48, 53, 53, 47, 47, 53, 56, + 47, 46, 53, 57, 46, 46, 53, 58, 48, 46, 54, 59, 48, 46, 54, 59}}, + {{31, 32, 34, 39, 32, 32, 34, 38, 32, 32, 34, 38, 32, 32, 33, 37, + 32, 32, 33, 37, 32, 33, 35, 39, 32, 33, 35, 39, 32, 34, 37, 40, + 32, 34, 37, 40, 34, 35, 39, 45, 34, 35, 39, 45, 35, 36, 43, 51, + 35, 36, 43, 51, 38, 39, 45, 54, 38, 39, 45, 54, 42, 42, 48, 58}, + {31, 33, 42, 48, 31, 34, 42, 47, 31, 34, 42, 47, 31, 35, 42, 45, + 31, 35, 42, 45, 34, 39, 45, 46, 34, 39, 45, 46, 38, 43, 47, 46, + 38, 43, 47, 46, 42, 45, 48, 50, 42, 45, 48, 50, 48, 47, 50, 53, + 48, 47, 50, 53, 47, 46, 50, 54, 47, 46, 50, 54, 47, 45, 49, 56}}, + {{31, 31, 32, 36, 31, 32, 32, 35, 32, 32, 32, 35, 32, 32, 32, 35, + 32, 32, 33, 34, 32, 32, 33, 34, 32, 32, 34, 36, 32, 32, 34, 36, + 32, 32, 34, 37, 32, 33, 35, 38, 32, 33, 35, 38, 33, 33, 36, 41, + 34, 34, 37, 42, 34, 34, 37, 44, 35, 34, 38, 48, 35, 34, 38, 48}, + {31, 31, 37, 48, 31, 31, 38, 47, 31, 31, 38, 47, 31, 32, 39, 46, + 31, 32, 40, 46, 31, 32, 40, 46, 34, 35, 42, 47, 34, 36, 43, 47, + 36, 37, 44, 47, 38, 40, 47, 47, 38, 40, 47, 47, 41, 42, 47, 49, + 42, 43, 47, 50, 44, 44, 47, 51, 48, 46, 48, 53, 48, 46, 48, 53}}, + {{31, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 33, 33, + 32, 32, 33, 34, 32, 32, 33, 34, 32, 32, 33, 34, 32, 32, 34, 35, + 32, 33, 34, 35, 32, 33, 34, 35, 33, 33, 35, 36, 34, 34, 36, 37}, + {31, 31, 35, 37, 31, 31, 35, 38, 31, 31, 36, 38, 31, 31, 36, 38, + 31, 32, 36, 39, 31, 32, 37, 40, 31, 32, 37, 40, 31, 33, 38, 40, + 33, 35, 40, 42, 34, 36, 40, 43, 34, 36, 40, 43, 36, 38, 43, 45, + 38, 40, 45, 47, 38, 40, 45, 47, 39, 41, 45, 47, 42, 43, 46, 47}}, + {{31, 31, 31, 32, 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, + 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33, + 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33}, + {31, 31, 31, 34, 31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 31, 35, + 31, 31, 31, 35, 31, 31, 31, 35, 31, 32, 32, 36, 31, 32, 32, 36, + 31, 32, 32, 36, 31, 32, 32, 36, 31, 32, 32, 36, 32, 33, 33, 37, + 33, 35, 35, 39, 34, 36, 36, 40, 34, 36, 36, 40, 34, 36, 36, 40}}, + {{31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32}, + {31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 31, 31, 32, 32, 31, 31, 32, 32, 30, 31, 32, 32}}}; +constexpr uint8_t kQuantizerMatrix8x16 + [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][128] = { + {{32, 32, 36, 53, 65, 87, 93, 
99, 31, 33, 34, 49, 59, + 78, 86, 93, 32, 34, 36, 50, 59, 77, 82, 89, 34, 37, + 42, 54, 63, 79, 80, 88, 36, 38, 48, 60, 68, 84, 86, + 90, 44, 43, 53, 71, 79, 95, 94, 97, 48, 46, 56, 76, + 85, 102, 105, 105, 58, 54, 63, 87, 98, 116, 112, 115, 65, + 58, 68, 92, 105, 124, 122, 124, 79, 70, 79, 104, 118, 141, + 135, 135, 82, 72, 81, 106, 121, 144, 149, 146, 91, 80, 88, + 106, 130, 148, 162, 159, 97, 86, 94, 107, 128, 157, 167, 171, + 103, 93, 98, 114, 131, 150, 174, 186, 110, 100, 101, 117, 138, + 161, 183, 193, 118, 107, 105, 118, 136, 157, 182, 203}, + {32, 37, 48, 52, 57, 66, 68, 71, 30, 40, 46, 48, 52, 60, 63, 66, + 33, 43, 47, 47, 51, 59, 60, 63, 42, 47, 50, 50, 53, 60, 59, 62, + 49, 48, 53, 54, 57, 62, 62, 62, 49, 46, 53, 61, 64, 69, 66, 66, + 50, 46, 54, 64, 67, 73, 72, 70, 54, 49, 55, 68, 73, 80, 76, 75, + 57, 50, 56, 70, 76, 84, 80, 79, 63, 55, 60, 75, 82, 92, 87, 84, + 64, 56, 61, 75, 83, 93, 93, 89, 68, 59, 64, 74, 86, 94, 98, 94, + 70, 62, 66, 73, 83, 96, 99, 98, 72, 64, 66, 75, 83, 92, 101, 104, + 74, 67, 66, 74, 84, 94, 103, 106, 76, 69, 67, 73, 82, 91, 101, 109}}, + {{32, 32, 36, 47, 65, 79, 90, 96, 31, 32, 35, 44, 60, + 72, 84, 90, 32, 34, 36, 45, 59, 71, 80, 87, 32, 35, + 40, 47, 60, 71, 78, 85, 36, 37, 48, 56, 68, 78, 83, + 87, 39, 40, 50, 60, 73, 84, 91, 94, 47, 45, 56, 69, + 84, 95, 101, 101, 53, 50, 60, 75, 92, 103, 108, 110, 61, + 56, 65, 81, 100, 113, 116, 118, 71, 64, 73, 89, 111, 125, + 129, 129, 79, 70, 79, 95, 118, 133, 142, 138, 86, 76, 84, + 100, 124, 140, 153, 150, 92, 82, 89, 101, 121, 148, 157, 161, + 98, 88, 93, 108, 124, 141, 163, 174, 104, 94, 95, 110, 129, + 151, 171, 181, 110, 100, 98, 111, 127, 147, 169, 188}, + {32, 35, 48, 50, 57, 63, 68, 70, 30, 38, 46, 46, 52, 58, 63, 65, + 33, 41, 47, 46, 51, 56, 60, 63, 39, 46, 48, 47, 51, 55, 58, 61, + 49, 48, 53, 54, 57, 60, 61, 61, 48, 46, 53, 56, 60, 64, 65, 65, + 50, 46, 54, 61, 66, 70, 71, 69, 52, 47, 54, 63, 71, 75, 75, 74, + 55, 49, 56, 65, 74, 79, 79, 78, 60, 53, 58, 68, 79, 85, 85, 82, + 63, 55, 60, 70, 82, 89, 91, 87, 66, 58, 62, 72, 84, 91, 95, 91, + 68, 60, 64, 71, 81, 94, 97, 96, 70, 62, 65, 73, 81, 89, 98, 101, + 72, 65, 65, 72, 82, 92, 100, 103, 74, 67, 65, 71, 79, 89, 98, 105}}, + {{32, 32, 36, 44, 58, 79, 88, 93, 31, 32, 35, 41, 54, + 73, 81, 88, 32, 33, 36, 42, 53, 71, 78, 84, 32, 34, + 38, 42, 52, 69, 76, 82, 34, 36, 44, 50, 59, 75, 81, + 84, 39, 39, 50, 58, 68, 84, 88, 90, 44, 42, 53, 63, + 74, 90, 97, 97, 49, 46, 57, 67, 81, 97, 104, 105, 57, + 53, 63, 74, 90, 108, 111, 113, 65, 59, 68, 79, 97, 118, + 123, 122, 71, 64, 73, 84, 102, 125, 135, 131, 81, 72, 80, + 91, 110, 135, 145, 141, 87, 77, 85, 96, 114, 140, 148, 151, + 92, 83, 88, 102, 117, 133, 153, 163, 98, 88, 89, 103, 121, + 141, 160, 169, 103, 94, 92, 103, 119, 137, 158, 175}, + {32, 34, 48, 49, 54, 63, 67, 69, 31, 36, 46, 46, 50, 58, 62, 65, + 33, 40, 47, 46, 49, 56, 59, 62, 37, 44, 47, 45, 48, 54, 57, 60, + 44, 46, 51, 51, 53, 59, 60, 61, 48, 46, 53, 56, 58, 64, 64, 64, + 49, 45, 53, 58, 62, 67, 70, 68, 51, 47, 54, 60, 65, 71, 73, 72, + 54, 49, 55, 62, 70, 77, 77, 76, 57, 51, 56, 64, 73, 82, 83, 81, + 60, 53, 58, 65, 75, 85, 89, 85, 64, 57, 61, 68, 78, 89, 93, 89, + 66, 59, 63, 69, 79, 91, 94, 93, 68, 61, 63, 71, 79, 87, 96, 98, + 70, 63, 63, 70, 80, 89, 97, 100, 72, 65, 63, 69, 77, 86, 95, 102}}, + {{32, 31, 35, 44, 53, 65, 82, 90, 31, 32, 34, 41, 50, 61, 76, + 85, 31, 33, 35, 42, 49, 59, 73, 81, 32, 34, 37, 42, 49, 58, + 71, 79, 34, 35, 41, 48, 54, 63, 76, 81, 36, 36, 46, 54, 60, + 68, 80, 87, 41, 40, 49, 60, 67, 76, 88, 93, 47, 44, 
53, 66, + 75, 84, 97, 101, 53, 50, 57, 71, 82, 92, 106, 108, 58, 54, 61, + 75, 87, 98, 112, 116, 65, 59, 66, 79, 92, 105, 120, 124, 74, 67, + 73, 86, 100, 113, 131, 134, 82, 73, 79, 92, 105, 120, 139, 142, 87, + 78, 83, 96, 110, 125, 144, 153, 92, 83, 84, 97, 114, 132, 150, 157, + 97, 88, 86, 97, 111, 128, 147, 163}, + {32, 33, 45, 49, 52, 57, 64, 68, 31, 34, 45, 46, 49, 53, 60, 64, + 33, 37, 46, 45, 47, 51, 57, 61, 37, 43, 47, 45, 47, 50, 55, 59, + 42, 44, 49, 49, 50, 53, 58, 60, 49, 47, 52, 53, 54, 57, 61, 63, + 48, 46, 51, 57, 59, 61, 66, 67, 50, 46, 52, 59, 63, 66, 71, 71, + 52, 47, 53, 61, 66, 71, 75, 74, 54, 49, 54, 62, 68, 73, 79, 79, + 57, 51, 55, 64, 70, 76, 83, 83, 61, 55, 58, 66, 73, 80, 87, 87, + 64, 57, 60, 68, 75, 83, 91, 91, 66, 59, 61, 69, 77, 84, 93, 95, + 68, 61, 61, 68, 77, 86, 94, 97, 70, 63, 61, 67, 75, 83, 92, 98}}, + {{32, 31, 33, 40, 51, 65, 79, 87, 31, 32, 33, 39, 49, 61, 74, + 82, 31, 32, 34, 38, 47, 59, 71, 79, 32, 33, 36, 40, 48, 58, + 69, 77, 33, 34, 38, 44, 52, 62, 72, 78, 36, 35, 42, 51, 58, + 68, 78, 84, 39, 38, 44, 54, 63, 73, 84, 89, 44, 41, 46, 59, + 69, 79, 90, 96, 48, 45, 50, 62, 74, 85, 96, 103, 53, 49, 53, + 66, 79, 92, 103, 111, 58, 54, 57, 70, 84, 98, 110, 118, 66, 60, + 63, 75, 90, 106, 119, 126, 74, 67, 69, 81, 97, 113, 128, 134, 81, + 73, 75, 86, 102, 120, 135, 143, 86, 78, 78, 90, 106, 124, 140, 147, + 91, 82, 80, 90, 103, 119, 137, 151}, + {32, 32, 40, 49, 51, 57, 63, 67, 31, 33, 41, 47, 49, 54, 59, 63, + 31, 35, 43, 46, 47, 51, 57, 60, 35, 39, 46, 46, 47, 50, 55, 58, + 41, 43, 48, 49, 49, 52, 57, 59, 49, 47, 50, 53, 54, 57, 60, 62, + 48, 46, 49, 54, 57, 60, 64, 65, 49, 45, 48, 56, 61, 64, 67, 69, + 50, 46, 49, 57, 63, 67, 71, 73, 52, 48, 50, 58, 65, 71, 75, 77, + 54, 50, 51, 59, 67, 73, 78, 81, 57, 52, 53, 61, 69, 77, 82, 85, + 61, 55, 56, 63, 72, 80, 86, 88, 64, 58, 58, 65, 73, 82, 89, 92, + 66, 59, 59, 66, 75, 84, 91, 94, 68, 61, 59, 65, 72, 81, 89, 95}}, + {{32, 31, 32, 36, 44, 53, 65, 79, 31, 32, 32, 35, 42, 51, 62, 75, + 31, 32, 33, 34, 41, 49, 59, 72, 32, 32, 34, 36, 42, 50, 59, 71, + 32, 33, 35, 38, 42, 49, 58, 69, 34, 34, 37, 42, 48, 54, 63, 73, + 36, 34, 38, 48, 54, 60, 68, 78, 39, 37, 40, 50, 58, 65, 73, 84, + 44, 41, 43, 53, 63, 71, 79, 90, 48, 45, 46, 56, 67, 76, 85, 96, + 53, 49, 50, 60, 71, 82, 92, 103, 58, 54, 54, 63, 75, 87, 98, 110, + 65, 60, 58, 68, 79, 92, 105, 118, 71, 65, 63, 73, 84, 97, 111, 125, + 79, 72, 70, 79, 90, 104, 118, 133, 82, 75, 72, 81, 92, 106, 121, 136}, + {32, 31, 37, 48, 49, 52, 57, 63, 31, 31, 38, 47, 47, 50, 54, 60, + 30, 32, 40, 46, 45, 48, 52, 57, 33, 36, 43, 47, 46, 47, 51, 56, + 37, 40, 47, 47, 45, 47, 50, 54, 42, 43, 47, 50, 49, 50, 53, 57, + 49, 46, 48, 53, 53, 54, 57, 60, 48, 46, 47, 53, 56, 57, 60, 64, + 49, 45, 46, 53, 58, 61, 64, 67, 50, 46, 46, 54, 59, 64, 67, 71, + 52, 48, 47, 54, 61, 66, 71, 75, 54, 50, 49, 55, 62, 68, 73, 78, + 57, 52, 50, 56, 64, 70, 76, 82, 60, 54, 52, 58, 65, 72, 79, 85, + 63, 57, 55, 60, 67, 75, 82, 89, 64, 59, 56, 61, 68, 75, 83, 90}}, + {{32, 31, 32, 36, 44, 53, 62, 73, 31, 32, 32, 35, 42, 51, 59, 69, + 31, 32, 33, 34, 41, 49, 57, 66, 32, 32, 34, 36, 42, 50, 57, 65, + 32, 33, 35, 38, 42, 49, 56, 64, 34, 34, 37, 42, 48, 54, 61, 69, + 35, 34, 38, 47, 52, 59, 65, 73, 38, 36, 40, 49, 56, 63, 69, 77, + 41, 39, 41, 51, 60, 67, 74, 81, 44, 42, 43, 54, 64, 72, 79, 86, + 48, 45, 46, 56, 67, 76, 83, 91, 53, 49, 50, 60, 71, 82, 90, 99, + 58, 54, 54, 63, 75, 87, 95, 105, 65, 60, 58, 68, 79, 92, 102, 112, + 71, 65, 63, 73, 84, 97, 108, 119, 79, 72, 70, 79, 90, 104, 115, 127}, + 
{32, 31, 37, 48, 49, 52, 56, 61, 31, 31, 38, 47, 47, 50, 53, 57, + 30, 32, 40, 46, 45, 48, 51, 55, 33, 36, 43, 47, 46, 47, 50, 54, + 37, 40, 47, 47, 45, 47, 49, 52, 42, 43, 47, 50, 49, 50, 53, 56, + 47, 46, 48, 52, 53, 53, 55, 58, 48, 46, 47, 53, 55, 56, 58, 61, + 48, 45, 46, 53, 57, 59, 61, 63, 49, 45, 46, 53, 58, 62, 64, 66, + 50, 46, 46, 54, 59, 64, 66, 69, 52, 48, 47, 54, 61, 66, 70, 73, + 54, 50, 49, 55, 62, 68, 72, 76, 57, 52, 50, 56, 64, 70, 75, 79, + 60, 54, 52, 58, 65, 72, 77, 82, 63, 57, 55, 60, 67, 75, 80, 86}}, + {{32, 31, 32, 35, 39, 44, 53, 65, 31, 32, 32, 35, 38, 42, 51, 62, + 31, 32, 33, 34, 37, 41, 49, 59, 31, 32, 34, 35, 38, 42, 49, 59, + 32, 32, 34, 36, 39, 42, 49, 58, 32, 33, 35, 37, 40, 42, 49, 58, + 34, 34, 37, 41, 44, 48, 54, 63, 36, 34, 38, 46, 50, 54, 60, 68, + 38, 37, 40, 47, 52, 57, 64, 72, 41, 39, 41, 49, 54, 60, 67, 76, + 44, 41, 43, 51, 57, 63, 71, 79, 48, 45, 46, 54, 60, 67, 76, 85, + 53, 49, 50, 57, 64, 71, 82, 92, 57, 53, 53, 60, 67, 74, 86, 97, + 61, 56, 56, 63, 69, 77, 89, 100, 65, 60, 58, 66, 72, 79, 92, 105}, + {32, 31, 37, 45, 48, 49, 52, 57, 31, 31, 38, 45, 47, 47, 50, 54, + 30, 32, 40, 44, 45, 45, 48, 52, 33, 35, 42, 46, 46, 45, 47, 51, + 35, 37, 44, 46, 46, 45, 47, 51, 37, 40, 47, 47, 47, 45, 47, 50, + 42, 43, 47, 49, 50, 49, 50, 53, 49, 46, 48, 52, 53, 53, 54, 57, + 48, 46, 47, 51, 54, 55, 57, 59, 48, 45, 46, 51, 54, 57, 59, 61, + 49, 45, 46, 51, 55, 58, 61, 64, 50, 46, 46, 52, 56, 59, 64, 67, + 52, 48, 47, 53, 57, 61, 66, 71, 54, 49, 48, 54, 58, 62, 68, 73, + 55, 51, 49, 54, 58, 63, 69, 74, 57, 52, 50, 55, 59, 64, 70, 76}}, + {{32, 31, 32, 32, 36, 44, 47, 53, 31, 32, 32, 33, 35, 42, 45, 51, + 31, 32, 32, 33, 35, 41, 44, 49, 31, 32, 33, 33, 35, 41, 44, 49, + 32, 32, 34, 34, 36, 42, 45, 50, 32, 33, 35, 36, 38, 42, 45, 49, + 32, 33, 35, 36, 40, 44, 47, 51, 34, 34, 36, 38, 42, 48, 50, 54, + 36, 34, 37, 40, 48, 54, 56, 60, 38, 36, 39, 41, 49, 56, 58, 63, + 39, 37, 40, 42, 50, 58, 60, 65, 44, 41, 42, 45, 53, 63, 66, 71, + 47, 44, 45, 47, 56, 66, 69, 75, 49, 46, 47, 48, 57, 67, 71, 77, + 53, 49, 50, 51, 60, 71, 75, 82, 58, 54, 54, 55, 63, 75, 79, 87}, + {32, 31, 35, 38, 48, 49, 50, 52, 31, 31, 37, 40, 47, 47, 48, 50, + 30, 32, 38, 40, 46, 45, 46, 48, 31, 33, 38, 41, 46, 45, 46, 48, + 33, 36, 41, 44, 47, 46, 46, 47, 37, 40, 45, 47, 47, 45, 46, 47, + 39, 41, 46, 47, 48, 47, 47, 48, 42, 43, 46, 48, 50, 49, 50, 50, + 49, 46, 48, 49, 53, 53, 54, 54, 48, 46, 47, 48, 53, 55, 55, 56, + 48, 46, 46, 48, 53, 56, 56, 57, 49, 45, 45, 47, 53, 58, 59, 61, + 50, 46, 46, 48, 54, 59, 61, 63, 51, 47, 47, 48, 54, 60, 61, 64, + 52, 48, 47, 48, 54, 61, 63, 66, 54, 50, 49, 50, 55, 62, 65, 68}}, + {{32, 31, 31, 32, 35, 36, 44, 47, 31, 32, 32, 32, 35, 35, 42, 45, + 31, 32, 32, 32, 34, 35, 41, 45, 31, 32, 32, 33, 34, 34, 41, 44, + 31, 32, 33, 34, 35, 36, 42, 44, 32, 32, 33, 34, 36, 36, 42, 45, + 32, 33, 34, 35, 37, 38, 42, 45, 32, 33, 34, 36, 39, 40, 44, 47, + 34, 34, 35, 37, 41, 42, 48, 50, 35, 34, 36, 38, 45, 47, 52, 55, + 36, 34, 36, 38, 46, 48, 54, 56, 39, 37, 39, 40, 48, 50, 58, 60, + 41, 39, 40, 41, 49, 51, 60, 62, 44, 41, 42, 43, 51, 53, 63, 66, + 47, 44, 44, 45, 53, 56, 66, 69, 48, 45, 45, 46, 54, 56, 67, 70}, + {32, 31, 33, 37, 45, 48, 49, 50, 31, 31, 34, 38, 45, 47, 47, 48, + 31, 32, 34, 39, 45, 46, 46, 47, 30, 32, 35, 40, 44, 46, 45, 46, + 33, 35, 37, 42, 46, 47, 45, 46, 33, 36, 38, 43, 46, 47, 46, 46, + 37, 40, 43, 47, 47, 47, 45, 46, 39, 41, 43, 47, 48, 48, 47, 47, + 42, 43, 44, 47, 49, 50, 49, 50, 47, 46, 46, 48, 51, 52, 53, 53, + 49, 46, 47, 48, 52, 53, 53, 54, 48, 
46, 46, 47, 51, 53, 56, 56, + 48, 45, 46, 46, 51, 53, 57, 57, 49, 45, 45, 46, 51, 53, 58, 59, + 50, 46, 46, 46, 52, 54, 59, 61, 50, 46, 46, 46, 52, 54, 59, 61}}, + {{32, 31, 31, 32, 32, 36, 36, 44, 31, 32, 32, 32, 32, 35, 35, 42, + 31, 32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 33, 33, 34, 34, 41, + 31, 32, 32, 33, 33, 34, 34, 41, 32, 32, 32, 34, 34, 36, 36, 42, + 32, 32, 32, 34, 34, 36, 36, 42, 32, 33, 33, 35, 35, 38, 38, 42, + 32, 33, 33, 35, 35, 38, 38, 42, 34, 34, 34, 37, 37, 42, 42, 48, + 34, 34, 34, 37, 37, 42, 42, 48, 36, 34, 34, 38, 38, 48, 48, 54, + 36, 34, 34, 38, 38, 48, 48, 54, 39, 37, 37, 40, 40, 50, 50, 58, + 39, 37, 37, 40, 40, 50, 50, 58, 44, 41, 41, 43, 43, 53, 53, 63}, + {32, 31, 31, 37, 37, 48, 48, 49, 31, 31, 31, 38, 38, 47, 47, 47, + 31, 31, 31, 38, 38, 47, 47, 47, 30, 32, 32, 40, 40, 46, 46, 45, + 30, 32, 32, 40, 40, 46, 46, 45, 33, 36, 36, 43, 43, 47, 47, 46, + 33, 36, 36, 43, 43, 47, 47, 46, 37, 40, 40, 47, 47, 47, 47, 45, + 37, 40, 40, 47, 47, 47, 47, 45, 42, 43, 43, 47, 47, 50, 50, 49, + 42, 43, 43, 47, 47, 50, 50, 49, 49, 46, 46, 48, 48, 53, 53, 53, + 49, 46, 46, 48, 48, 53, 53, 53, 48, 46, 46, 47, 47, 53, 53, 56, + 48, 46, 46, 47, 47, 53, 53, 56, 49, 45, 45, 46, 46, 53, 53, 58}}, + {{32, 31, 31, 31, 32, 32, 35, 36, 31, 32, 32, 32, 32, 32, 35, 35, + 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 34, 35, + 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 32, 33, 33, 34, 34, + 31, 32, 32, 33, 34, 34, 35, 36, 32, 32, 32, 33, 34, 34, 36, 36, + 32, 32, 32, 33, 34, 34, 36, 37, 32, 32, 33, 34, 35, 35, 37, 38, + 32, 32, 33, 34, 35, 35, 37, 38, 33, 33, 33, 35, 36, 36, 40, 41, + 34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35, 37, 37, 43, 44, + 36, 35, 34, 36, 38, 38, 46, 48, 36, 35, 34, 36, 38, 38, 46, 48}, + {32, 31, 31, 33, 37, 37, 45, 48, 31, 31, 31, 34, 38, 38, 45, 47, + 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 32, 34, 39, 39, 45, 46, + 30, 32, 32, 35, 40, 40, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46, + 33, 34, 35, 37, 42, 42, 46, 47, 33, 35, 36, 38, 43, 43, 46, 47, + 35, 37, 37, 40, 44, 44, 46, 47, 37, 39, 40, 43, 47, 47, 47, 47, + 37, 39, 40, 43, 47, 47, 47, 47, 41, 42, 42, 44, 47, 47, 49, 49, + 42, 42, 43, 44, 47, 47, 49, 50, 44, 44, 44, 45, 47, 47, 50, 51, + 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 48, 48, 52, 53}}, + {{32, 31, 31, 31, 31, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 33, + 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, + 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 33, 33, 33, + 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33, + 31, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34, + 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 35, 35, 35, + 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36, + 32, 33, 33, 33, 34, 36, 36, 36, 34, 34, 34, 34, 35, 37, 37, 38}, + {32, 31, 31, 31, 33, 37, 37, 38, 31, 31, 31, 31, 33, 38, 38, 39, + 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40, + 31, 31, 32, 32, 34, 39, 39, 40, 30, 31, 32, 32, 35, 40, 40, 41, + 30, 31, 32, 32, 35, 40, 40, 41, 31, 32, 33, 33, 35, 40, 40, 41, + 33, 34, 35, 35, 37, 42, 42, 43, 33, 35, 36, 36, 38, 43, 43, 44, + 33, 35, 36, 36, 38, 43, 43, 44, 35, 37, 38, 38, 41, 45, 45, 46, + 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47, + 39, 40, 41, 41, 43, 47, 47, 47, 42, 42, 43, 43, 44, 47, 47, 48}}, + {{32, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 
32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 33, 33, + 31, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 34, + 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34}, + {32, 31, 31, 31, 31, 31, 33, 35, 31, 31, 31, 31, 31, 31, 33, 36, + 31, 31, 31, 31, 31, 31, 34, 36, 31, 31, 31, 31, 31, 31, 34, 37, + 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, + 31, 31, 31, 32, 32, 32, 34, 37, 30, 31, 31, 32, 32, 32, 34, 38, + 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, + 30, 31, 32, 32, 32, 32, 35, 38, 31, 32, 33, 33, 33, 33, 36, 39, + 33, 34, 34, 35, 35, 35, 37, 40, 33, 34, 35, 36, 36, 36, 38, 41, + 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41}}, + {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32}, + {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, + 30, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 32, 32, 32, 32}}}; +constexpr uint8_t kQuantizerMatrix8x32 + [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][256] = { + {{32, 32, 36, 53, 65, 87, 93, 99, 31, 32, 35, 51, 62, 82, + 88, 94, 31, 33, 34, 49, 59, 78, 86, 93, 31, 33, 35, 49, + 59, 78, 84, 90, 32, 34, 36, 50, 59, 77, 82, 89, 32, 35, + 38, 49, 58, 75, 82, 89, 34, 37, 42, 54, 63, 79, 80, 88, + 35, 37, 45, 57, 65, 82, 84, 87, 36, 38, 48, 60, 68, 84, + 86, 90, 39, 40, 50, 65, 73, 89, 91, 93, 44, 43, 53, 71, + 79, 95, 94, 97, 46, 44, 55, 73, 82, 98, 98, 99, 48, 46, + 56, 76, 85, 102, 105, 105, 53, 50, 60, 82, 92, 109, 107, 107, + 58, 54, 63, 87, 98, 116, 112, 115, 61, 56, 66, 89, 101, 120, + 119, 116, 65, 58, 68, 92, 105, 124, 122, 124, 71, 63, 73, 97, + 111, 132, 130, 127, 79, 70, 79, 104, 118, 141, 135, 135, 81, 71, + 80, 105, 119, 142, 140, 139, 82, 72, 81, 106, 121, 144, 149, 146, + 88, 77, 85, 108, 126, 149, 153, 152, 91, 80, 88, 106, 130, 148, + 162, 159, 94, 83, 91, 105, 131, 153, 165, 166, 97, 86, 94, 107, + 128, 157, 167, 171, 100, 89, 97, 111, 127, 152, 173, 182, 103, 93, + 98, 114, 131, 150, 174, 186, 107, 96, 100, 117, 136, 155, 177, 191, + 110, 100, 101, 117, 138, 161, 183, 193, 114, 104, 103, 117, 137, 159, + 185, 201, 118, 107, 105, 118, 136, 157, 182, 203, 122, 111, 107, 119, + 136, 156, 179, 204}, + {32, 37, 48, 52, 57, 66, 68, 71, 31, 38, 47, 50, 54, 63, 65, 67, + 30, 40, 46, 48, 52, 60, 63, 66, 32, 41, 46, 48, 51, 59, 62, 64, + 33, 43, 47, 47, 51, 59, 60, 63, 37, 47, 47, 47, 50, 57, 60, 62, + 42, 47, 50, 50, 53, 60, 59, 62, 45, 47, 51, 52, 55, 61, 61, 61, + 49, 48, 53, 54, 57, 62, 62, 62, 48, 47, 53, 57, 60, 66, 65, 64, + 49, 46, 53, 61, 64, 69, 
66, 66, 49, 46, 53, 62, 65, 71, 68, 67, + 50, 46, 54, 64, 67, 73, 72, 70, 52, 47, 54, 66, 71, 77, 73, 71, + 54, 49, 55, 68, 73, 80, 76, 75, 55, 49, 56, 69, 75, 82, 79, 76, + 57, 50, 56, 70, 76, 84, 80, 79, 60, 52, 58, 72, 79, 88, 84, 81, + 63, 55, 60, 75, 82, 92, 87, 84, 64, 55, 61, 75, 82, 92, 89, 86, + 64, 56, 61, 75, 83, 93, 93, 89, 67, 58, 63, 76, 85, 95, 94, 91, + 68, 59, 64, 74, 86, 94, 98, 94, 69, 60, 65, 72, 85, 95, 99, 97, + 70, 62, 66, 73, 83, 96, 99, 98, 71, 63, 67, 74, 82, 93, 102, 102, + 72, 64, 66, 75, 83, 92, 101, 104, 73, 65, 66, 75, 84, 93, 102, 106, + 74, 67, 66, 74, 84, 94, 103, 106, 75, 68, 66, 74, 83, 93, 103, 109, + 76, 69, 67, 73, 82, 91, 101, 109, 77, 70, 67, 73, 81, 90, 99, 108}}, + {{32, 32, 36, 47, 65, 79, 90, 96, 31, 32, 35, 45, 62, 75, + 86, 91, 31, 32, 35, 44, 60, 72, 84, 90, 31, 33, 35, 44, + 59, 71, 82, 87, 32, 34, 36, 45, 59, 71, 80, 87, 32, 35, + 38, 45, 58, 69, 80, 86, 32, 35, 40, 47, 60, 71, 78, 85, + 34, 36, 42, 50, 63, 73, 82, 84, 36, 37, 48, 56, 68, 78, + 83, 87, 38, 39, 49, 58, 71, 81, 88, 90, 39, 40, 50, 60, + 73, 84, 91, 94, 44, 42, 53, 66, 79, 90, 94, 96, 47, 45, + 56, 69, 84, 95, 101, 101, 49, 47, 57, 71, 86, 97, 103, 102, + 53, 50, 60, 75, 92, 103, 108, 110, 58, 54, 63, 79, 98, 110, + 114, 111, 61, 56, 65, 81, 100, 113, 116, 118, 65, 59, 68, 84, + 105, 118, 124, 121, 71, 64, 73, 89, 111, 125, 129, 129, 76, 68, + 76, 92, 115, 130, 134, 132, 79, 70, 79, 95, 118, 133, 142, 138, + 82, 73, 81, 97, 121, 136, 145, 144, 86, 76, 84, 100, 124, 140, + 153, 150, 89, 79, 87, 99, 124, 145, 156, 156, 92, 82, 89, 101, + 121, 148, 157, 161, 95, 85, 92, 105, 120, 143, 163, 171, 98, 88, + 93, 108, 124, 141, 163, 174, 101, 91, 94, 110, 128, 146, 166, 179, + 104, 94, 95, 110, 129, 151, 171, 181, 107, 97, 96, 110, 128, 149, + 173, 188, 110, 100, 98, 111, 127, 147, 169, 188, 114, 104, 100, 111, + 127, 145, 166, 190}, + {32, 35, 48, 50, 57, 63, 68, 70, 31, 37, 47, 48, 54, 60, 64, 66, + 30, 38, 46, 46, 52, 58, 63, 65, 31, 38, 46, 46, 52, 57, 61, 63, + 33, 41, 47, 46, 51, 56, 60, 63, 37, 45, 47, 46, 50, 54, 59, 62, + 39, 46, 48, 47, 51, 55, 58, 61, 42, 46, 50, 50, 53, 57, 60, 60, + 49, 48, 53, 54, 57, 60, 61, 61, 48, 47, 53, 55, 58, 62, 64, 63, + 48, 46, 53, 56, 60, 64, 65, 65, 49, 45, 53, 59, 64, 67, 67, 66, + 50, 46, 54, 61, 66, 70, 71, 69, 51, 47, 54, 61, 68, 71, 72, 70, + 52, 47, 54, 63, 71, 75, 75, 74, 54, 49, 55, 65, 73, 78, 78, 74, + 55, 49, 56, 65, 74, 79, 79, 78, 57, 50, 56, 66, 76, 82, 83, 79, + 60, 53, 58, 68, 79, 85, 85, 82, 62, 54, 60, 69, 81, 87, 87, 84, + 63, 55, 60, 70, 82, 89, 91, 87, 64, 56, 61, 71, 83, 90, 92, 89, + 66, 58, 62, 72, 84, 91, 95, 91, 67, 59, 63, 71, 83, 93, 96, 94, + 68, 60, 64, 71, 81, 94, 97, 96, 69, 61, 65, 72, 80, 91, 99, 100, + 70, 62, 65, 73, 81, 89, 98, 101, 71, 64, 65, 73, 82, 90, 99, 103, + 72, 65, 65, 72, 82, 92, 100, 103, 73, 66, 65, 72, 81, 90, 100, 105, + 74, 67, 65, 71, 79, 89, 98, 105, 75, 68, 65, 71, 78, 87, 96, 105}}, + {{32, 32, 36, 44, 58, 79, 88, 93, 31, 32, 35, 42, 55, 75, + 83, 88, 31, 32, 35, 41, 54, 73, 81, 88, 31, 32, 34, 41, + 53, 72, 79, 84, 32, 33, 36, 42, 53, 71, 78, 84, 32, 34, + 37, 42, 53, 70, 77, 83, 32, 34, 38, 42, 52, 69, 76, 82, + 34, 35, 42, 48, 57, 73, 79, 81, 34, 36, 44, 50, 59, 75, + 81, 84, 36, 37, 48, 54, 63, 78, 85, 86, 39, 39, 50, 58, + 68, 84, 88, 90, 40, 40, 51, 59, 70, 85, 91, 92, 44, 42, + 53, 63, 74, 90, 97, 97, 47, 45, 56, 66, 79, 95, 99, 98, + 49, 46, 57, 67, 81, 97, 104, 105, 53, 50, 60, 71, 86, 103, + 109, 106, 57, 53, 63, 74, 90, 108, 111, 113, 59, 54, 64, 75, + 91, 111, 119, 115, 65, 
59, 68, 79, 97, 118, 123, 122, 69, 62, + 71, 83, 100, 122, 127, 125, 71, 64, 73, 84, 102, 125, 135, 131, + 79, 71, 79, 90, 109, 133, 137, 136, 81, 72, 80, 91, 110, 135, + 145, 141, 82, 73, 81, 92, 111, 136, 147, 147, 87, 77, 85, 96, + 114, 140, 148, 151, 90, 80, 87, 99, 113, 135, 153, 160, 92, 83, + 88, 102, 117, 133, 153, 163, 95, 85, 88, 103, 120, 137, 155, 168, + 98, 88, 89, 103, 121, 141, 160, 169, 100, 91, 90, 103, 120, 139, + 161, 175, 103, 94, 92, 103, 119, 137, 158, 175, 106, 97, 93, 104, + 118, 135, 155, 176}, + {32, 34, 48, 49, 54, 63, 67, 69, 31, 35, 47, 47, 51, 60, 63, 65, + 31, 36, 46, 46, 50, 58, 62, 65, 30, 36, 46, 45, 49, 57, 60, 62, + 33, 40, 47, 46, 49, 56, 59, 62, 35, 42, 47, 45, 48, 55, 58, 61, + 37, 44, 47, 45, 48, 54, 57, 60, 42, 45, 50, 49, 51, 57, 59, 59, + 44, 46, 51, 51, 53, 59, 60, 61, 49, 47, 53, 53, 55, 60, 63, 62, + 48, 46, 53, 56, 58, 64, 64, 64, 48, 46, 53, 56, 59, 65, 66, 65, + 49, 45, 53, 58, 62, 67, 70, 68, 50, 46, 54, 59, 65, 70, 70, 68, + 51, 47, 54, 60, 65, 71, 73, 72, 52, 47, 54, 61, 68, 75, 76, 73, + 54, 49, 55, 62, 70, 77, 77, 76, 54, 49, 55, 62, 70, 78, 81, 77, + 57, 51, 56, 64, 73, 82, 83, 81, 59, 52, 58, 65, 74, 84, 85, 82, + 60, 53, 58, 65, 75, 85, 89, 85, 63, 56, 60, 67, 77, 89, 90, 87, + 64, 57, 61, 68, 78, 89, 93, 89, 64, 57, 61, 68, 78, 90, 94, 92, + 66, 59, 63, 69, 79, 91, 94, 93, 67, 60, 63, 70, 78, 88, 96, 97, + 68, 61, 63, 71, 79, 87, 96, 98, 69, 62, 63, 71, 80, 88, 96, 100, + 70, 63, 63, 70, 80, 89, 97, 100, 71, 64, 63, 70, 78, 88, 97, 102, + 72, 65, 63, 69, 77, 86, 95, 102, 73, 66, 63, 69, 76, 84, 93, 101}}, + {{32, 31, 35, 44, 53, 65, 82, 90, 31, 32, 35, 42, 51, 62, + 78, 86, 31, 32, 34, 41, 50, 61, 76, 85, 31, 32, 34, 41, + 49, 59, 74, 82, 31, 33, 35, 42, 49, 59, 73, 81, 32, 33, + 36, 42, 50, 59, 73, 80, 32, 34, 37, 42, 49, 58, 71, 79, + 32, 34, 39, 44, 51, 60, 73, 78, 34, 35, 41, 48, 54, 63, + 76, 81, 35, 36, 45, 52, 59, 67, 79, 83, 36, 36, 46, 54, + 60, 68, 80, 87, 39, 39, 48, 58, 65, 73, 86, 88, 41, 40, + 49, 60, 67, 76, 88, 93, 44, 42, 51, 63, 71, 79, 92, 94, + 47, 44, 53, 66, 75, 84, 97, 101, 48, 45, 54, 67, 76, 85, + 98, 101, 53, 50, 57, 71, 82, 92, 106, 108, 55, 51, 59, 72, + 84, 94, 108, 110, 58, 54, 61, 75, 87, 98, 112, 116, 63, 58, + 65, 78, 91, 103, 118, 119, 65, 59, 66, 79, 92, 105, 120, 124, + 71, 64, 71, 84, 97, 111, 127, 129, 74, 67, 73, 86, 100, 113, + 131, 134, 79, 71, 77, 90, 104, 118, 136, 139, 82, 73, 79, 92, + 105, 120, 139, 142, 82, 74, 79, 92, 106, 121, 139, 150, 87, 78, + 83, 96, 110, 125, 144, 153, 89, 81, 83, 97, 113, 128, 145, 157, + 92, 83, 84, 97, 114, 132, 150, 157, 94, 85, 85, 97, 112, 130, + 151, 163, 97, 88, 86, 97, 111, 128, 147, 163, 99, 91, 87, 97, + 110, 126, 144, 163}, + {32, 33, 45, 49, 52, 57, 64, 68, 31, 34, 45, 47, 50, 54, 61, 64, + 31, 34, 45, 46, 49, 53, 60, 64, 30, 35, 44, 45, 48, 52, 58, 61, + 33, 37, 46, 45, 47, 51, 57, 61, 33, 38, 46, 46, 47, 51, 57, 60, + 37, 43, 47, 45, 47, 50, 55, 59, 39, 43, 48, 47, 48, 51, 56, 58, + 42, 44, 49, 49, 50, 53, 58, 60, 47, 46, 51, 53, 53, 56, 61, 61, + 49, 47, 52, 53, 54, 57, 61, 63, 48, 46, 51, 56, 57, 60, 64, 64, + 48, 46, 51, 57, 59, 61, 66, 67, 49, 45, 51, 58, 61, 64, 68, 67, + 50, 46, 52, 59, 63, 66, 71, 71, 50, 46, 52, 59, 64, 67, 71, 71, + 52, 47, 53, 61, 66, 71, 75, 74, 53, 48, 53, 61, 67, 72, 77, 75, + 54, 49, 54, 62, 68, 73, 79, 79, 56, 51, 55, 63, 70, 76, 82, 80, + 57, 51, 55, 64, 70, 76, 83, 83, 60, 54, 57, 65, 72, 79, 86, 85, + 61, 55, 58, 66, 73, 80, 87, 87, 63, 56, 59, 67, 75, 82, 90, 89, + 64, 57, 60, 68, 75, 83, 91, 91, 64, 58, 60, 68, 
75, 83, 91, 94, + 66, 59, 61, 69, 77, 84, 93, 95, 67, 60, 61, 69, 78, 85, 93, 97, + 68, 61, 61, 68, 77, 86, 94, 97, 69, 62, 61, 68, 76, 85, 94, 99, + 70, 63, 61, 67, 75, 83, 92, 98, 70, 64, 61, 67, 74, 82, 90, 98}}, + {{32, 31, 33, 40, 51, 65, 79, 87, 31, 32, 33, 39, 49, 62, + 75, 83, 31, 32, 33, 39, 49, 61, 74, 82, 31, 32, 33, 38, + 47, 59, 72, 79, 31, 32, 34, 38, 47, 59, 71, 79, 32, 33, + 35, 39, 48, 59, 71, 78, 32, 33, 36, 40, 48, 58, 69, 77, + 32, 33, 36, 41, 48, 58, 69, 75, 33, 34, 38, 44, 52, 62, + 72, 78, 34, 34, 39, 45, 53, 63, 73, 80, 36, 35, 42, 51, + 58, 68, 78, 84, 36, 35, 42, 51, 59, 68, 79, 85, 39, 38, + 44, 54, 63, 73, 84, 89, 40, 39, 45, 56, 65, 75, 85, 90, + 44, 41, 46, 59, 69, 79, 90, 96, 46, 43, 48, 60, 72, 82, + 93, 97, 48, 45, 50, 62, 74, 85, 96, 103, 52, 48, 52, 65, + 78, 90, 101, 105, 53, 49, 53, 66, 79, 92, 103, 111, 58, 53, + 57, 69, 83, 97, 109, 113, 58, 54, 57, 70, 84, 98, 110, 118, + 65, 59, 62, 74, 89, 105, 118, 122, 66, 60, 63, 75, 90, 106, + 119, 126, 71, 65, 67, 79, 94, 111, 125, 131, 74, 67, 69, 81, + 97, 113, 128, 134, 79, 72, 73, 85, 101, 118, 133, 141, 81, 73, + 75, 86, 102, 120, 135, 143, 82, 74, 75, 87, 103, 121, 136, 147, + 86, 78, 78, 90, 106, 124, 140, 147, 88, 80, 80, 90, 105, 122, + 140, 152, 91, 82, 80, 90, 103, 119, 137, 151, 93, 85, 81, 90, + 103, 117, 134, 152}, + {32, 32, 40, 49, 51, 57, 63, 67, 31, 33, 41, 47, 49, 54, 60, 63, + 31, 33, 41, 47, 49, 54, 59, 63, 30, 33, 42, 45, 47, 52, 57, 60, + 31, 35, 43, 46, 47, 51, 57, 60, 33, 37, 44, 46, 47, 51, 56, 59, + 35, 39, 46, 46, 47, 50, 55, 58, 37, 41, 47, 46, 46, 50, 54, 57, + 41, 43, 48, 49, 49, 52, 57, 59, 42, 43, 48, 49, 50, 53, 57, 60, + 49, 47, 50, 53, 54, 57, 60, 62, 49, 47, 50, 53, 54, 57, 61, 63, + 48, 46, 49, 54, 57, 60, 64, 65, 48, 46, 49, 55, 58, 61, 65, 66, + 49, 45, 48, 56, 61, 64, 67, 69, 49, 46, 49, 57, 62, 65, 69, 70, + 50, 46, 49, 57, 63, 67, 71, 73, 51, 47, 49, 58, 64, 69, 73, 74, + 52, 48, 50, 58, 65, 71, 75, 77, 54, 49, 51, 59, 67, 73, 77, 78, + 54, 50, 51, 59, 67, 73, 78, 81, 57, 52, 52, 60, 69, 76, 82, 83, + 57, 52, 53, 61, 69, 77, 82, 85, 60, 54, 55, 62, 71, 79, 85, 87, + 61, 55, 56, 63, 72, 80, 86, 88, 63, 57, 57, 64, 73, 82, 89, 92, + 64, 58, 58, 65, 73, 82, 89, 92, 64, 58, 58, 65, 74, 83, 90, 94, + 66, 59, 59, 66, 75, 84, 91, 94, 67, 60, 59, 66, 74, 82, 91, 96, + 68, 61, 59, 65, 72, 81, 89, 95, 68, 62, 59, 65, 71, 79, 87, 95}}, + {{32, 31, 32, 36, 44, 53, 65, 79, 31, 32, 32, 35, 42, 51, 62, 75, + 31, 32, 32, 35, 42, 51, 62, 75, 31, 32, 33, 34, 41, 49, 59, 72, + 31, 32, 33, 34, 41, 49, 59, 72, 32, 32, 34, 36, 42, 50, 59, 71, + 32, 32, 34, 36, 42, 50, 59, 71, 32, 33, 35, 38, 42, 49, 58, 69, + 32, 33, 35, 38, 42, 49, 58, 69, 34, 34, 37, 42, 48, 54, 63, 73, + 34, 34, 37, 42, 48, 54, 63, 73, 36, 34, 38, 48, 54, 60, 68, 78, + 36, 34, 38, 48, 54, 60, 68, 78, 39, 37, 40, 50, 58, 65, 73, 84, + 39, 37, 40, 50, 58, 65, 73, 84, 44, 41, 43, 53, 63, 71, 79, 90, + 44, 41, 43, 53, 63, 71, 79, 90, 48, 45, 46, 56, 67, 76, 85, 96, + 48, 45, 46, 56, 67, 76, 85, 96, 53, 49, 50, 60, 71, 82, 92, 103, + 53, 49, 50, 60, 71, 82, 92, 103, 58, 54, 54, 63, 75, 87, 98, 110, + 58, 54, 54, 63, 75, 87, 98, 110, 65, 60, 58, 68, 79, 92, 105, 118, + 65, 60, 58, 68, 79, 92, 105, 118, 71, 65, 63, 73, 84, 97, 111, 125, + 71, 65, 63, 73, 84, 97, 111, 125, 79, 72, 70, 79, 90, 104, 118, 133, + 79, 72, 70, 79, 90, 104, 118, 133, 82, 75, 72, 81, 92, 106, 121, 136, + 82, 75, 72, 81, 92, 106, 121, 136, 87, 79, 76, 84, 96, 109, 124, 141}, + {32, 31, 37, 48, 49, 52, 57, 63, 31, 31, 38, 47, 47, 50, 54, 60, + 31, 31, 
38, 47, 47, 50, 54, 60, 30, 32, 40, 46, 45, 48, 52, 57, + 30, 32, 40, 46, 45, 48, 52, 57, 33, 36, 43, 47, 46, 47, 51, 56, + 33, 36, 43, 47, 46, 47, 51, 56, 37, 40, 47, 47, 45, 47, 50, 54, + 37, 40, 47, 47, 45, 47, 50, 54, 42, 43, 47, 50, 49, 50, 53, 57, + 42, 43, 47, 50, 49, 50, 53, 57, 49, 46, 48, 53, 53, 54, 57, 60, + 49, 46, 48, 53, 53, 54, 57, 60, 48, 46, 47, 53, 56, 57, 60, 64, + 48, 46, 47, 53, 56, 57, 60, 64, 49, 45, 46, 53, 58, 61, 64, 67, + 49, 45, 46, 53, 58, 61, 64, 67, 50, 46, 46, 54, 59, 64, 67, 71, + 50, 46, 46, 54, 59, 64, 67, 71, 52, 48, 47, 54, 61, 66, 71, 75, + 52, 48, 47, 54, 61, 66, 71, 75, 54, 50, 49, 55, 62, 68, 73, 78, + 54, 50, 49, 55, 62, 68, 73, 78, 57, 52, 50, 56, 64, 70, 76, 82, + 57, 52, 50, 56, 64, 70, 76, 82, 60, 54, 52, 58, 65, 72, 79, 85, + 60, 54, 52, 58, 65, 72, 79, 85, 63, 57, 55, 60, 67, 75, 82, 89, + 63, 57, 55, 60, 67, 75, 82, 89, 64, 59, 56, 61, 68, 75, 83, 90, + 64, 59, 56, 61, 68, 75, 83, 90, 66, 60, 57, 63, 69, 77, 84, 92}}, + {{32, 31, 32, 36, 44, 53, 62, 73, 31, 32, 32, 35, 42, 51, 60, 70, + 31, 32, 32, 35, 42, 51, 59, 69, 31, 32, 32, 35, 41, 50, 58, 67, + 31, 32, 33, 34, 41, 49, 57, 66, 31, 32, 33, 35, 41, 49, 57, 66, + 32, 32, 34, 36, 42, 50, 57, 65, 32, 32, 34, 37, 42, 49, 56, 65, + 32, 33, 35, 38, 42, 49, 56, 64, 32, 33, 35, 39, 43, 50, 56, 64, + 34, 34, 37, 42, 48, 54, 61, 69, 34, 34, 37, 42, 48, 54, 61, 69, + 35, 34, 38, 47, 52, 59, 65, 73, 36, 34, 38, 48, 54, 60, 66, 74, + 38, 36, 40, 49, 56, 63, 69, 77, 39, 37, 40, 50, 58, 65, 71, 79, + 41, 39, 41, 51, 60, 67, 74, 81, 44, 41, 43, 53, 63, 71, 78, 85, + 44, 42, 43, 54, 64, 72, 79, 86, 48, 45, 46, 56, 67, 76, 83, 91, + 48, 45, 46, 56, 67, 76, 83, 91, 53, 49, 49, 59, 71, 81, 89, 98, + 53, 49, 50, 60, 71, 82, 90, 99, 57, 52, 52, 62, 74, 85, 94, 103, + 58, 54, 54, 63, 75, 87, 95, 105, 61, 57, 56, 66, 77, 89, 98, 108, + 65, 60, 58, 68, 79, 92, 102, 112, 67, 61, 60, 69, 81, 94, 103, 114, + 71, 65, 63, 73, 84, 97, 108, 119, 72, 66, 64, 73, 85, 98, 108, 119, + 79, 72, 70, 79, 90, 104, 115, 127, 79, 72, 70, 79, 90, 104, 115, 127}, + {32, 31, 37, 48, 49, 52, 56, 61, 31, 31, 38, 47, 47, 50, 54, 58, + 31, 31, 38, 47, 47, 50, 53, 57, 30, 32, 39, 46, 46, 48, 52, 56, + 30, 32, 40, 46, 45, 48, 51, 55, 32, 34, 41, 46, 45, 48, 51, 54, + 33, 36, 43, 47, 46, 47, 50, 54, 34, 37, 44, 47, 45, 47, 50, 53, + 37, 40, 47, 47, 45, 47, 49, 52, 37, 40, 47, 48, 46, 47, 49, 53, + 42, 43, 47, 50, 49, 50, 53, 56, 42, 43, 47, 50, 49, 50, 53, 56, + 47, 46, 48, 52, 53, 53, 55, 58, 49, 46, 48, 53, 53, 54, 56, 59, + 48, 46, 47, 53, 55, 56, 58, 61, 48, 46, 47, 53, 56, 57, 59, 62, + 48, 45, 46, 53, 57, 59, 61, 63, 49, 45, 46, 53, 58, 61, 63, 66, + 49, 45, 46, 53, 58, 62, 64, 66, 50, 46, 46, 54, 59, 64, 66, 69, + 50, 46, 46, 54, 59, 64, 66, 69, 52, 48, 47, 54, 61, 66, 69, 72, + 52, 48, 47, 54, 61, 66, 70, 73, 53, 49, 48, 55, 62, 68, 71, 75, + 54, 50, 49, 55, 62, 68, 72, 76, 55, 51, 49, 56, 63, 69, 74, 78, + 57, 52, 50, 56, 64, 70, 75, 79, 58, 53, 51, 57, 64, 71, 76, 80, + 60, 54, 52, 58, 65, 72, 77, 82, 60, 55, 53, 59, 65, 73, 78, 83, + 63, 57, 55, 60, 67, 75, 80, 86, 63, 57, 55, 60, 67, 75, 80, 86}}, + {{32, 31, 32, 35, 39, 44, 53, 65, 31, 32, 32, 35, 38, 42, 52, 63, + 31, 32, 32, 35, 38, 42, 51, 62, 31, 32, 32, 34, 37, 41, 50, 61, + 31, 32, 33, 34, 37, 41, 49, 59, 31, 32, 33, 34, 37, 41, 49, 59, + 31, 32, 34, 35, 38, 42, 49, 59, 32, 32, 34, 36, 38, 42, 50, 59, + 32, 32, 34, 36, 39, 42, 49, 58, 32, 33, 35, 37, 40, 42, 49, 58, + 32, 33, 35, 37, 40, 42, 49, 58, 33, 33, 36, 40, 43, 46, 53, 62, + 34, 34, 37, 41, 44, 48, 54, 63, 34, 
34, 37, 43, 46, 50, 56, 65, + 36, 34, 38, 46, 50, 54, 60, 68, 36, 34, 38, 46, 50, 54, 60, 68, + 38, 37, 40, 47, 52, 57, 64, 72, 39, 37, 40, 48, 53, 58, 65, 73, + 41, 39, 41, 49, 54, 60, 67, 76, 44, 41, 43, 51, 57, 63, 71, 79, + 44, 41, 43, 51, 57, 63, 71, 79, 47, 44, 45, 53, 59, 66, 75, 84, + 48, 45, 46, 54, 60, 67, 76, 85, 50, 46, 47, 55, 61, 68, 78, 88, + 53, 49, 50, 57, 64, 71, 82, 92, 53, 49, 50, 57, 64, 71, 82, 92, + 57, 53, 53, 60, 67, 74, 86, 97, 58, 54, 54, 61, 68, 75, 87, 98, + 61, 56, 56, 63, 69, 77, 89, 100, 65, 60, 58, 66, 72, 79, 92, 105, + 65, 60, 58, 66, 72, 79, 92, 105, 70, 64, 62, 70, 76, 83, 96, 109}, + {32, 31, 37, 45, 48, 49, 52, 57, 31, 31, 38, 45, 47, 47, 50, 55, + 31, 31, 38, 45, 47, 47, 50, 54, 31, 32, 39, 45, 46, 46, 49, 53, + 30, 32, 40, 44, 45, 45, 48, 52, 30, 32, 40, 44, 45, 45, 48, 52, + 33, 35, 42, 46, 46, 45, 47, 51, 33, 36, 43, 46, 46, 46, 47, 51, + 35, 37, 44, 46, 46, 45, 47, 51, 37, 40, 47, 47, 47, 45, 47, 50, + 37, 40, 47, 47, 47, 45, 47, 50, 41, 42, 47, 49, 49, 48, 50, 52, + 42, 43, 47, 49, 50, 49, 50, 53, 44, 44, 47, 50, 51, 51, 52, 54, + 49, 46, 48, 52, 53, 53, 54, 57, 49, 46, 48, 52, 53, 53, 54, 57, + 48, 46, 47, 51, 54, 55, 57, 59, 48, 46, 47, 51, 54, 56, 57, 60, + 48, 45, 46, 51, 54, 57, 59, 61, 49, 45, 46, 51, 55, 58, 61, 64, + 49, 45, 46, 51, 55, 58, 61, 64, 50, 46, 46, 52, 56, 59, 63, 66, + 50, 46, 46, 52, 56, 59, 64, 67, 51, 47, 47, 52, 56, 60, 65, 68, + 52, 48, 47, 53, 57, 61, 66, 71, 52, 48, 47, 53, 57, 61, 66, 71, + 54, 49, 48, 54, 58, 62, 68, 73, 54, 50, 49, 54, 58, 62, 68, 73, + 55, 51, 49, 54, 58, 63, 69, 74, 57, 52, 50, 55, 59, 64, 70, 76, + 57, 52, 50, 55, 59, 64, 70, 76, 59, 54, 52, 57, 61, 65, 72, 78}}, + {{32, 31, 32, 32, 36, 44, 47, 53, 31, 32, 32, 33, 35, 43, 46, 52, + 31, 32, 32, 33, 35, 42, 45, 51, 31, 32, 32, 33, 35, 42, 45, 51, + 31, 32, 32, 33, 35, 41, 44, 49, 31, 32, 32, 33, 34, 41, 44, 49, + 31, 32, 33, 33, 35, 41, 44, 49, 32, 32, 33, 34, 36, 42, 45, 49, + 32, 32, 34, 34, 36, 42, 45, 50, 32, 32, 34, 35, 37, 42, 45, 49, + 32, 33, 35, 36, 38, 42, 45, 49, 32, 33, 35, 36, 38, 42, 45, 49, + 32, 33, 35, 36, 40, 44, 47, 51, 34, 34, 36, 38, 42, 48, 50, 54, + 34, 34, 36, 38, 42, 48, 50, 54, 35, 34, 37, 39, 45, 50, 53, 57, + 36, 34, 37, 40, 48, 54, 56, 60, 36, 34, 37, 40, 48, 54, 56, 60, + 38, 36, 39, 41, 49, 56, 58, 63, 39, 37, 40, 42, 50, 58, 60, 65, + 39, 37, 40, 42, 50, 58, 60, 65, 42, 40, 42, 44, 52, 61, 64, 69, + 44, 41, 42, 45, 53, 63, 66, 71, 44, 41, 43, 45, 54, 63, 66, 72, + 47, 44, 45, 47, 56, 66, 69, 75, 48, 45, 46, 48, 56, 67, 70, 76, + 49, 46, 47, 48, 57, 67, 71, 77, 53, 49, 49, 51, 59, 71, 74, 81, + 53, 49, 50, 51, 60, 71, 75, 82, 55, 51, 51, 53, 61, 72, 76, 83, + 58, 54, 54, 55, 63, 75, 79, 87, 58, 54, 54, 55, 63, 75, 79, 87}, + {32, 31, 35, 38, 48, 49, 50, 52, 31, 31, 36, 39, 47, 48, 49, 50, + 31, 31, 37, 40, 47, 47, 48, 50, 31, 31, 37, 40, 47, 47, 48, 50, + 30, 32, 38, 40, 46, 45, 46, 48, 30, 32, 38, 41, 46, 45, 46, 48, + 31, 33, 38, 41, 46, 45, 46, 48, 33, 35, 41, 43, 47, 45, 46, 47, + 33, 36, 41, 44, 47, 46, 46, 47, 34, 37, 42, 45, 47, 45, 46, 47, + 37, 40, 45, 47, 47, 45, 46, 47, 37, 40, 45, 47, 47, 45, 46, 47, + 39, 41, 46, 47, 48, 47, 47, 48, 42, 43, 46, 48, 50, 49, 50, 50, + 42, 43, 46, 48, 50, 49, 50, 50, 45, 44, 47, 48, 51, 51, 52, 52, + 49, 46, 48, 49, 53, 53, 54, 54, 49, 46, 48, 49, 53, 53, 54, 54, + 48, 46, 47, 48, 53, 55, 55, 56, 48, 46, 46, 48, 53, 56, 56, 57, + 48, 46, 46, 48, 53, 56, 56, 57, 49, 45, 46, 47, 53, 57, 58, 60, + 49, 45, 45, 47, 53, 58, 59, 61, 49, 45, 46, 47, 53, 58, 60, 61, + 50, 46, 46, 
48, 54, 59, 61, 63, 50, 46, 46, 48, 54, 59, 61, 64, + 51, 47, 47, 48, 54, 60, 61, 64, 52, 48, 47, 48, 54, 61, 63, 66, + 52, 48, 47, 48, 54, 61, 63, 66, 53, 48, 48, 49, 54, 61, 63, 67, + 54, 50, 49, 50, 55, 62, 65, 68, 54, 50, 49, 50, 55, 62, 65, 68}}, + {{32, 31, 31, 32, 35, 36, 44, 47, 31, 32, 32, 32, 35, 35, 43, 46, + 31, 32, 32, 32, 35, 35, 42, 45, 31, 32, 32, 32, 35, 35, 42, 45, + 31, 32, 32, 32, 34, 35, 41, 45, 31, 32, 32, 33, 34, 34, 41, 44, + 31, 32, 32, 33, 34, 34, 41, 44, 31, 32, 32, 33, 34, 35, 41, 44, + 31, 32, 33, 34, 35, 36, 42, 44, 32, 32, 33, 34, 36, 36, 42, 45, + 32, 32, 33, 34, 36, 36, 42, 45, 32, 32, 33, 35, 37, 37, 42, 45, + 32, 33, 34, 35, 37, 38, 42, 45, 32, 33, 34, 35, 37, 38, 42, 45, + 32, 33, 34, 36, 39, 40, 44, 47, 34, 34, 35, 37, 41, 42, 48, 50, + 34, 34, 35, 37, 41, 42, 48, 50, 34, 34, 35, 37, 42, 43, 49, 51, + 35, 34, 36, 38, 45, 47, 52, 55, 36, 34, 36, 38, 46, 48, 54, 56, + 36, 34, 36, 38, 46, 48, 54, 56, 38, 36, 37, 40, 47, 49, 56, 58, + 39, 37, 39, 40, 48, 50, 58, 60, 39, 37, 39, 40, 48, 50, 58, 60, + 41, 39, 40, 41, 49, 51, 60, 62, 44, 41, 42, 43, 51, 53, 63, 66, + 44, 41, 42, 43, 51, 53, 63, 66, 44, 42, 42, 43, 51, 54, 64, 67, + 47, 44, 44, 45, 53, 56, 66, 69, 48, 45, 45, 46, 54, 56, 67, 70, + 48, 45, 45, 46, 54, 56, 67, 70, 51, 47, 48, 48, 56, 58, 69, 73}, + {32, 31, 33, 37, 45, 48, 49, 50, 31, 31, 33, 38, 45, 47, 48, 49, + 31, 31, 34, 38, 45, 47, 47, 48, 31, 31, 34, 38, 45, 47, 47, 48, + 31, 32, 34, 39, 45, 46, 46, 47, 30, 32, 35, 40, 44, 46, 45, 46, + 30, 32, 35, 40, 44, 46, 45, 46, 31, 33, 35, 40, 45, 46, 45, 46, + 33, 35, 37, 42, 46, 47, 45, 46, 33, 36, 38, 43, 46, 47, 46, 46, + 33, 36, 38, 43, 46, 47, 46, 46, 35, 38, 41, 45, 47, 47, 45, 46, + 37, 40, 43, 47, 47, 47, 45, 46, 37, 40, 43, 47, 47, 47, 45, 46, + 39, 41, 43, 47, 48, 48, 47, 47, 42, 43, 44, 47, 49, 50, 49, 50, + 42, 43, 44, 47, 49, 50, 49, 50, 43, 43, 45, 47, 50, 50, 50, 50, + 47, 46, 46, 48, 51, 52, 53, 53, 49, 46, 47, 48, 52, 53, 53, 54, + 49, 46, 47, 48, 52, 53, 53, 54, 48, 46, 46, 47, 52, 53, 55, 55, + 48, 46, 46, 47, 51, 53, 56, 56, 48, 46, 46, 47, 51, 53, 56, 56, + 48, 45, 46, 46, 51, 53, 57, 57, 49, 45, 45, 46, 51, 53, 58, 59, + 49, 45, 45, 46, 51, 53, 58, 59, 49, 45, 45, 46, 52, 53, 58, 60, + 50, 46, 46, 46, 52, 54, 59, 61, 50, 46, 46, 46, 52, 54, 59, 61, + 50, 46, 46, 46, 52, 54, 59, 61, 51, 47, 47, 47, 52, 54, 60, 62}}, + {{32, 31, 31, 32, 32, 36, 36, 44, 31, 31, 31, 32, 32, 35, 35, 43, + 31, 32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 32, 32, 35, 35, 42, + 31, 32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 32, 32, 35, 35, 41, + 31, 32, 32, 33, 33, 34, 34, 41, 31, 32, 32, 33, 33, 34, 34, 41, + 31, 32, 32, 33, 33, 34, 34, 41, 31, 32, 32, 33, 33, 35, 35, 41, + 32, 32, 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, 34, 36, 36, 42, + 32, 32, 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, 34, 37, 37, 42, + 32, 33, 33, 35, 35, 38, 38, 42, 32, 33, 33, 35, 35, 38, 38, 42, + 32, 33, 33, 35, 35, 38, 38, 42, 33, 33, 33, 36, 36, 40, 40, 45, + 34, 34, 34, 37, 37, 42, 42, 48, 34, 34, 34, 37, 37, 42, 42, 48, + 34, 34, 34, 37, 37, 42, 42, 48, 35, 34, 34, 37, 37, 45, 45, 50, + 36, 34, 34, 38, 38, 48, 48, 54, 36, 34, 34, 38, 38, 48, 48, 54, + 36, 34, 34, 38, 38, 48, 48, 54, 37, 36, 36, 39, 39, 49, 49, 56, + 39, 37, 37, 40, 40, 50, 50, 58, 39, 37, 37, 40, 40, 50, 50, 58, + 39, 37, 37, 40, 40, 50, 50, 58, 41, 39, 39, 42, 42, 52, 52, 60, + 44, 41, 41, 43, 43, 53, 53, 63, 44, 41, 41, 43, 43, 53, 53, 63}, + {32, 31, 31, 37, 37, 48, 48, 49, 31, 31, 31, 37, 37, 47, 47, 48, + 31, 31, 31, 38, 38, 47, 47, 47, 31, 31, 31, 38, 38, 47, 
47, 47, + 31, 31, 31, 38, 38, 47, 47, 47, 31, 32, 32, 39, 39, 46, 46, 46, + 30, 32, 32, 40, 40, 46, 46, 45, 30, 32, 32, 40, 40, 46, 46, 45, + 30, 32, 32, 40, 40, 46, 46, 45, 32, 34, 34, 41, 41, 46, 46, 45, + 33, 36, 36, 43, 43, 47, 47, 46, 33, 36, 36, 43, 43, 47, 47, 46, + 33, 36, 36, 43, 43, 47, 47, 46, 35, 38, 38, 45, 45, 47, 47, 45, + 37, 40, 40, 47, 47, 47, 47, 45, 37, 40, 40, 47, 47, 47, 47, 45, + 37, 40, 40, 47, 47, 47, 47, 45, 39, 41, 41, 47, 47, 49, 49, 47, + 42, 43, 43, 47, 47, 50, 50, 49, 42, 43, 43, 47, 47, 50, 50, 49, + 42, 43, 43, 47, 47, 50, 50, 49, 45, 44, 44, 47, 47, 51, 51, 51, + 49, 46, 46, 48, 48, 53, 53, 53, 49, 46, 46, 48, 48, 53, 53, 53, + 49, 46, 46, 48, 48, 53, 53, 53, 48, 46, 46, 47, 47, 53, 53, 54, + 48, 46, 46, 47, 47, 53, 53, 56, 48, 46, 46, 47, 47, 53, 53, 56, + 48, 46, 46, 47, 47, 53, 53, 56, 48, 45, 45, 46, 46, 53, 53, 57, + 49, 45, 45, 46, 46, 53, 53, 58, 49, 45, 45, 46, 46, 53, 53, 58}}, + {{32, 31, 31, 31, 32, 32, 35, 36, 31, 31, 31, 32, 32, 32, 35, 35, + 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35, + 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35, + 31, 32, 32, 32, 32, 32, 34, 35, 31, 32, 32, 32, 32, 32, 34, 35, + 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 32, 33, 33, 34, 34, + 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 33, 33, 33, 35, 35, + 31, 32, 32, 33, 34, 34, 35, 36, 32, 32, 32, 33, 34, 34, 36, 36, + 32, 32, 32, 33, 34, 34, 36, 36, 32, 32, 32, 33, 34, 34, 36, 36, + 32, 32, 32, 33, 34, 34, 36, 37, 32, 32, 33, 33, 35, 35, 37, 38, + 32, 32, 33, 34, 35, 35, 37, 38, 32, 32, 33, 34, 35, 35, 37, 38, + 32, 32, 33, 34, 35, 35, 37, 38, 32, 33, 33, 34, 36, 36, 39, 40, + 33, 33, 33, 35, 36, 36, 40, 41, 34, 34, 34, 35, 37, 37, 41, 42, + 34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35, 37, 37, 41, 42, + 34, 34, 34, 35, 37, 37, 43, 44, 35, 34, 34, 36, 38, 38, 45, 47, + 36, 35, 34, 36, 38, 38, 46, 48, 36, 35, 34, 36, 38, 38, 46, 48, + 36, 35, 34, 36, 38, 38, 46, 48, 37, 36, 36, 37, 39, 39, 46, 49}, + {32, 31, 31, 33, 37, 37, 45, 48, 31, 31, 31, 33, 37, 37, 45, 48, + 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 31, 34, 38, 38, 45, 47, + 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 31, 34, 38, 38, 45, 47, + 31, 31, 32, 34, 39, 39, 45, 46, 30, 31, 32, 34, 39, 39, 44, 46, + 30, 32, 32, 35, 40, 40, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46, + 30, 32, 32, 35, 40, 40, 44, 46, 31, 33, 33, 36, 41, 41, 45, 46, + 33, 34, 35, 37, 42, 42, 46, 47, 33, 35, 36, 38, 43, 43, 46, 47, + 33, 35, 36, 38, 43, 43, 46, 47, 33, 35, 36, 38, 43, 43, 46, 47, + 35, 37, 37, 40, 44, 44, 46, 47, 36, 38, 39, 42, 46, 46, 47, 47, + 37, 39, 40, 43, 47, 47, 47, 47, 37, 39, 40, 43, 47, 47, 47, 47, + 37, 39, 40, 43, 47, 47, 47, 47, 39, 40, 41, 43, 47, 47, 48, 48, + 41, 42, 42, 44, 47, 47, 49, 49, 42, 42, 43, 44, 47, 47, 49, 50, + 42, 42, 43, 44, 47, 47, 49, 50, 42, 42, 43, 44, 47, 47, 49, 50, + 44, 44, 44, 45, 47, 47, 50, 51, 47, 46, 46, 46, 48, 48, 51, 52, + 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 48, 48, 52, 53, + 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 47, 47, 52, 53}}, + {{32, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 33, + 31, 31, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, + 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, + 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, + 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, + 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33, + 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33, + 31, 32, 32, 32, 32, 33, 33, 33, 31, 
32, 32, 32, 33, 33, 33, 34, + 31, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34, + 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34, + 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 35, + 32, 32, 32, 32, 33, 35, 35, 35, 32, 32, 33, 33, 33, 35, 35, 36, + 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36, + 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36, + 32, 33, 33, 33, 34, 36, 36, 36, 33, 33, 33, 33, 34, 36, 36, 37, + 34, 34, 34, 34, 35, 37, 37, 38, 34, 34, 34, 34, 35, 37, 37, 38}, + {32, 31, 31, 31, 33, 37, 37, 38, 31, 31, 31, 31, 33, 37, 37, 39, + 31, 31, 31, 31, 33, 38, 38, 39, 31, 31, 31, 31, 34, 38, 38, 40, + 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40, + 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40, + 31, 31, 32, 32, 34, 39, 39, 40, 30, 31, 32, 32, 34, 39, 39, 40, + 30, 31, 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, 35, 40, 40, 41, + 30, 31, 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, 35, 40, 40, 41, + 31, 32, 33, 33, 35, 40, 40, 41, 32, 33, 34, 34, 36, 41, 41, 42, + 33, 34, 35, 35, 37, 42, 42, 43, 33, 35, 36, 36, 38, 43, 43, 44, + 33, 35, 36, 36, 38, 43, 43, 44, 33, 35, 36, 36, 38, 43, 43, 44, + 33, 35, 36, 36, 38, 43, 43, 44, 34, 36, 37, 37, 39, 44, 44, 45, + 35, 37, 38, 38, 41, 45, 45, 46, 36, 38, 39, 39, 42, 47, 47, 47, + 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47, + 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47, + 39, 40, 41, 41, 43, 47, 47, 47, 40, 41, 42, 42, 44, 47, 47, 47, + 42, 42, 43, 43, 44, 47, 47, 48, 42, 42, 43, 43, 44, 47, 47, 48}}, + {{32, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 32, 32, + 31, 31, 31, 31, 31, 31, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33, + 31, 32, 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 33, 33, + 31, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 34, + 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, + 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, + 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34}, + {32, 31, 31, 31, 31, 31, 33, 35, 31, 31, 31, 31, 31, 31, 33, 35, + 31, 31, 31, 31, 31, 31, 33, 36, 31, 31, 31, 31, 31, 31, 33, 36, + 31, 31, 31, 31, 31, 31, 34, 36, 31, 31, 31, 31, 31, 31, 34, 37, + 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, + 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, + 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, + 31, 31, 31, 32, 32, 32, 34, 37, 31, 31, 31, 32, 32, 32, 34, 37, + 30, 31, 31, 32, 32, 32, 34, 38, 30, 31, 32, 32, 32, 32, 35, 38, + 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, + 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, + 30, 31, 32, 32, 32, 32, 35, 38, 31, 31, 32, 33, 33, 33, 35, 38, + 31, 32, 33, 33, 33, 33, 36, 39, 32, 33, 34, 34, 34, 34, 37, 40, + 33, 34, 34, 35, 35, 35, 37, 40, 33, 34, 35, 36, 36, 36, 38, 41, + 33, 34, 35, 36, 
36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, + 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, + 33, 34, 35, 36, 36, 36, 38, 41, 34, 35, 36, 36, 36, 36, 39, 42}}, + {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32}, + {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, + 31, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 31, 32, 32, 32, + 30, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 32, 32, 32, 32, + 30, 31, 31, 31, 32, 32, 32, 32, 30, 31, 31, 31, 32, 32, 32, 32}}}; +constexpr uint8_t kQuantizerMatrix16x32 + [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][512] = { + {{32, 31, 32, 34, 36, 44, 53, 59, 65, 79, 87, 90, 93, 96, + 99, 102, 31, 32, 32, 34, 35, 42, 51, 56, 62, 75, 82, 85, + 88, 91, 94, 97, 31, 32, 33, 33, 34, 41, 49, 54, 59, 72, + 78, 82, 86, 90, 93, 97, 31, 32, 33, 34, 35, 41, 49, 54, + 59, 71, 78, 81, 84, 87, 90, 93, 32, 32, 34, 35, 36, 42, + 50, 54, 59, 71, 77, 80, 82, 86, 89, 93, 32, 33, 35, 37, + 38, 42, 49, 53, 58, 69, 75, 78, 82, 86, 89, 92, 34, 34, + 37, 39, 42, 48, 54, 58, 63, 73, 79, 78, 80, 83, 88, 92, + 35, 34, 37, 41, 45, 50, 57, 61, 65, 76, 82, 83, 84, 84, + 87, 90, 36, 34, 38, 43, 48, 54, 60, 64, 68, 78, 84, 87, + 86, 89, 90, 90, 39, 37, 40, 45, 50, 58, 65, 69, 73, 84, + 89, 89, 91, 91, 93, 96, 44, 41, 43, 48, 53, 63, 71, 75, + 79, 90, 95, 93, 94, 95, 97, 97, 46, 43, 44, 49, 55, 65, + 73, 78, 82, 93, 98, 100, 98, 100, 99, 103, 48, 45, 46, 51, + 56, 67, 76, 80, 85, 96, 102, 102, 105, 102, 105, 104, 53, 49, + 50, 54, 60, 71, 82, 87, 92, 103, 109, 107, 107, 110, 107, 111, + 58, 54, 54, 58, 63, 75, 87, 92, 98, 110, 116, 115, 112, 111, + 115, 112, 61, 57, 56, 60, 66, 77, 89, 95, 101, 114, 120, 118, + 119, 118, 116, 120, 65, 60, 58, 63, 68, 79, 92, 98, 105, 118, + 124, 
123, 122, 123, 124, 121, 71, 65, 63, 68, 73, 84, 97, 103, + 111, 125, 132, 132, 130, 128, 127, 130, 79, 72, 70, 74, 79, 90, + 104, 110, 118, 133, 141, 136, 135, 135, 135, 131, 81, 74, 71, 75, + 80, 91, 105, 112, 119, 135, 142, 140, 140, 138, 139, 142, 82, 75, + 72, 76, 81, 92, 106, 113, 121, 136, 144, 151, 149, 149, 146, 143, + 88, 80, 77, 80, 85, 97, 108, 115, 126, 142, 149, 153, 153, 152, + 152, 154, 91, 83, 80, 81, 88, 100, 106, 114, 130, 142, 148, 155, + 162, 160, 159, 155, 94, 85, 83, 82, 91, 100, 105, 118, 131, 137, + 153, 160, 165, 167, 166, 168, 97, 88, 86, 85, 94, 100, 107, 123, + 128, 140, 157, 161, 167, 173, 171, 169, 100, 91, 89, 87, 97, 100, + 111, 121, 127, 145, 152, 164, 173, 178, 182, 181, 103, 94, 93, 90, + 98, 101, 114, 120, 131, 144, 150, 170, 174, 180, 186, 183, 107, 97, + 96, 93, 100, 104, 117, 119, 136, 142, 155, 168, 177, 187, 191, 198, + 110, 101, 100, 97, 101, 108, 117, 123, 138, 141, 161, 165, 183, 188, + 193, 200, 114, 104, 104, 100, 103, 112, 117, 127, 137, 146, 159, 167, + 185, 190, 201, 206, 118, 108, 107, 103, 105, 115, 118, 131, 136, 151, + 157, 172, 182, 197, 203, 208, 122, 111, 111, 107, 107, 119, 119, 136, + 136, 156, 156, 178, 179, 203, 204, 217}, + {32, 31, 37, 42, 48, 49, 52, 54, 57, 63, 66, 67, 68, 69, 71, 72, + 31, 31, 38, 42, 47, 47, 50, 52, 54, 60, 63, 64, 65, 66, 67, 68, + 30, 32, 40, 42, 46, 45, 48, 50, 52, 57, 60, 62, 63, 65, 66, 68, + 32, 34, 41, 44, 46, 45, 48, 49, 51, 57, 59, 61, 62, 63, 64, 65, + 33, 36, 43, 45, 47, 46, 47, 49, 51, 56, 59, 60, 60, 62, 63, 65, + 37, 40, 47, 47, 47, 45, 47, 48, 50, 54, 57, 58, 60, 61, 62, 63, + 42, 43, 47, 48, 50, 49, 50, 52, 53, 57, 60, 58, 59, 60, 62, 63, + 45, 44, 47, 49, 51, 51, 52, 54, 55, 59, 61, 61, 61, 60, 61, 61, + 49, 46, 48, 50, 53, 53, 54, 55, 57, 60, 62, 63, 62, 63, 62, 62, + 48, 46, 47, 50, 53, 56, 57, 59, 60, 64, 66, 65, 65, 64, 64, 65, + 49, 45, 46, 49, 53, 58, 61, 62, 64, 67, 69, 67, 66, 66, 66, 65, + 49, 46, 46, 49, 53, 59, 62, 64, 65, 69, 71, 70, 68, 68, 67, 68, + 50, 46, 46, 50, 54, 59, 64, 65, 67, 71, 73, 72, 72, 70, 70, 69, + 52, 48, 47, 50, 54, 61, 66, 68, 71, 75, 77, 74, 73, 73, 71, 72, + 54, 50, 49, 52, 55, 62, 68, 71, 73, 78, 80, 78, 76, 74, 75, 73, + 55, 51, 49, 52, 56, 63, 69, 72, 75, 80, 82, 80, 79, 78, 76, 77, + 57, 52, 50, 53, 56, 64, 70, 73, 76, 82, 84, 82, 80, 80, 79, 77, + 60, 54, 52, 55, 58, 65, 72, 75, 79, 85, 88, 86, 84, 82, 81, 81, + 63, 57, 55, 58, 60, 67, 75, 78, 82, 89, 92, 88, 87, 85, 84, 81, + 64, 58, 55, 58, 61, 68, 75, 78, 82, 89, 92, 90, 89, 87, 86, 86, + 64, 59, 56, 58, 61, 68, 75, 79, 83, 90, 93, 95, 93, 91, 89, 87, + 67, 61, 58, 60, 63, 69, 76, 79, 85, 92, 95, 96, 94, 92, 91, 91, + 68, 62, 59, 60, 64, 71, 74, 78, 86, 91, 94, 96, 98, 96, 94, 91, + 69, 62, 60, 60, 65, 70, 72, 79, 85, 88, 95, 98, 99, 98, 97, 96, + 70, 63, 62, 60, 66, 69, 73, 81, 83, 89, 96, 97, 99, 101, 98, 97, + 71, 64, 63, 61, 67, 68, 74, 79, 82, 90, 93, 98, 102, 102, 102, 101, + 72, 65, 64, 62, 66, 68, 75, 78, 83, 89, 92, 100, 101, 103, 104, 102, + 73, 66, 65, 63, 66, 69, 75, 76, 84, 87, 93, 98, 102, 105, 106, 107, + 74, 67, 67, 64, 66, 70, 74, 77, 84, 86, 94, 96, 103, 105, 106, 107, + 75, 68, 68, 65, 66, 71, 74, 78, 83, 87, 93, 96, 103, 105, 109, 109, + 76, 69, 69, 66, 67, 72, 73, 80, 82, 88, 91, 97, 101, 107, 109, 110, + 77, 70, 70, 67, 67, 73, 73, 81, 81, 90, 90, 99, 99, 108, 108, 113}}, + {{32, 31, 32, 32, 36, 44, 47, 53, 65, 73, 79, 87, 90, 93, + 96, 99, 31, 32, 32, 33, 35, 42, 45, 51, 62, 69, 75, 83, + 86, 88, 91, 94, 31, 32, 32, 33, 35, 41, 44, 49, 60, 67, + 72, 80, 84, 87, 90, 94, 31, 
32, 33, 33, 35, 41, 44, 49, + 59, 66, 71, 79, 82, 84, 87, 90, 32, 32, 34, 34, 36, 42, + 45, 50, 59, 65, 71, 78, 80, 83, 87, 90, 32, 33, 35, 36, + 38, 42, 45, 49, 58, 64, 69, 76, 80, 83, 86, 88, 32, 33, + 35, 36, 40, 44, 47, 51, 60, 66, 71, 76, 78, 81, 85, 89, + 34, 34, 36, 38, 42, 48, 50, 54, 63, 69, 73, 80, 82, 81, + 84, 86, 36, 34, 37, 40, 48, 54, 56, 60, 68, 74, 78, 84, + 83, 86, 87, 87, 38, 36, 39, 41, 49, 56, 58, 63, 71, 77, + 81, 86, 88, 88, 90, 93, 39, 37, 40, 42, 50, 58, 60, 65, + 73, 79, 84, 90, 91, 92, 94, 93, 44, 41, 42, 45, 53, 63, + 66, 71, 79, 85, 90, 96, 94, 96, 96, 99, 47, 44, 45, 47, + 56, 66, 69, 75, 84, 90, 95, 99, 101, 98, 101, 99, 49, 46, + 47, 48, 57, 67, 71, 77, 86, 93, 97, 103, 103, 105, 102, 106, + 53, 49, 50, 51, 60, 71, 75, 82, 92, 99, 103, 111, 108, 107, + 110, 107, 58, 54, 54, 55, 63, 75, 79, 87, 98, 105, 110, 114, + 114, 113, 111, 115, 61, 56, 56, 57, 65, 77, 81, 89, 100, 107, + 113, 118, 116, 117, 118, 116, 65, 60, 59, 60, 68, 79, 84, 92, + 105, 112, 118, 126, 124, 122, 121, 124, 71, 65, 64, 65, 73, 84, + 89, 97, 111, 119, 125, 130, 129, 129, 129, 125, 76, 69, 68, 69, + 76, 88, 92, 101, 115, 123, 130, 134, 134, 131, 132, 135, 79, 72, + 70, 71, 79, 90, 95, 104, 118, 127, 133, 143, 142, 141, 138, 136, + 82, 75, 73, 74, 81, 92, 97, 106, 121, 130, 136, 146, 145, 144, + 144, 145, 86, 78, 76, 77, 84, 95, 100, 109, 124, 133, 140, 147, + 153, 151, 150, 146, 89, 81, 79, 78, 87, 95, 99, 112, 124, 130, + 145, 152, 156, 157, 156, 158, 92, 84, 82, 80, 89, 95, 101, 116, + 121, 132, 148, 151, 157, 163, 161, 159, 95, 86, 85, 83, 92, 95, + 105, 114, 120, 136, 143, 155, 163, 167, 171, 170, 98, 89, 88, 85, + 93, 95, 108, 113, 124, 136, 141, 160, 163, 169, 174, 171, 101, 92, + 91, 88, 94, 98, 110, 112, 128, 133, 146, 158, 166, 175, 179, 185, + 104, 95, 94, 91, 95, 101, 110, 115, 129, 132, 151, 154, 171, 175, + 181, 186, 107, 98, 97, 94, 96, 105, 110, 119, 128, 136, 149, 156, + 173, 177, 188, 192, 110, 101, 100, 97, 98, 108, 111, 123, 127, 141, + 147, 161, 169, 183, 188, 193, 114, 104, 104, 100, 100, 111, 111, 126, + 127, 145, 145, 166, 166, 189, 190, 201}, + {32, 31, 35, 38, 48, 49, 50, 52, 57, 61, 63, 67, 68, 69, 70, 71, + 31, 31, 37, 40, 47, 47, 48, 50, 54, 57, 60, 63, 64, 65, 66, 67, + 30, 32, 38, 40, 46, 45, 46, 48, 52, 55, 58, 61, 63, 64, 65, 67, + 31, 33, 38, 41, 46, 45, 46, 48, 52, 55, 57, 60, 61, 62, 63, 64, + 33, 36, 41, 44, 47, 46, 46, 47, 51, 54, 56, 59, 60, 61, 63, 64, + 37, 40, 45, 47, 47, 45, 46, 47, 50, 52, 54, 57, 59, 61, 62, 62, + 39, 41, 46, 47, 48, 47, 47, 48, 51, 54, 55, 57, 58, 59, 61, 62, + 42, 43, 46, 48, 50, 49, 50, 50, 53, 56, 57, 60, 60, 59, 60, 60, + 49, 46, 48, 49, 53, 53, 54, 54, 57, 59, 60, 63, 61, 62, 61, 61, + 48, 46, 47, 48, 53, 55, 55, 56, 58, 61, 62, 64, 64, 63, 63, 64, + 48, 46, 46, 48, 53, 56, 56, 57, 60, 62, 64, 66, 65, 65, 65, 64, + 49, 45, 45, 47, 53, 58, 59, 61, 64, 66, 67, 69, 67, 67, 66, 67, + 50, 46, 46, 48, 54, 59, 61, 63, 66, 68, 70, 71, 71, 68, 69, 67, + 51, 47, 47, 48, 54, 60, 61, 64, 68, 70, 71, 73, 72, 72, 70, 71, + 52, 48, 47, 48, 54, 61, 63, 66, 71, 73, 75, 77, 75, 73, 74, 71, + 54, 50, 49, 50, 55, 62, 65, 68, 73, 76, 78, 79, 78, 76, 74, 75, + 55, 51, 49, 50, 56, 63, 65, 69, 74, 77, 79, 81, 79, 78, 78, 75, + 57, 52, 50, 51, 56, 64, 66, 70, 76, 79, 82, 85, 83, 81, 79, 79, + 60, 54, 53, 53, 58, 65, 68, 72, 79, 82, 85, 87, 85, 84, 82, 80, + 62, 56, 54, 55, 60, 66, 69, 74, 81, 84, 87, 88, 87, 85, 84, 84, + 63, 57, 55, 56, 60, 67, 70, 75, 82, 86, 89, 92, 91, 89, 87, 84, + 64, 59, 56, 57, 61, 68, 71, 75, 83, 87, 90, 93, 92, 90, 89, 
89, + 66, 60, 58, 58, 62, 69, 72, 76, 84, 88, 91, 94, 95, 93, 91, 89, + 67, 61, 59, 58, 63, 68, 71, 78, 83, 86, 93, 96, 96, 96, 94, 94, + 68, 62, 60, 59, 64, 67, 71, 79, 81, 86, 94, 95, 97, 98, 96, 94, + 69, 63, 61, 60, 65, 66, 72, 77, 80, 88, 91, 96, 99, 99, 100, 98, + 70, 64, 62, 60, 65, 66, 73, 76, 81, 87, 89, 97, 98, 100, 101, 99, + 71, 65, 64, 61, 65, 67, 73, 74, 82, 85, 90, 95, 99, 102, 103, 104, + 72, 65, 65, 62, 65, 68, 72, 75, 82, 83, 92, 93, 100, 102, 103, 104, + 73, 66, 66, 63, 65, 69, 72, 76, 81, 85, 90, 93, 100, 102, 105, 106, + 74, 67, 67, 64, 65, 70, 71, 77, 79, 86, 89, 94, 98, 103, 105, 106, + 75, 68, 68, 65, 65, 71, 71, 78, 78, 87, 87, 96, 96, 105, 105, 109}}, + {{32, 31, 32, 32, 36, 39, 44, 53, 58, 65, 79, 81, 88, 90, + 93, 96, 31, 32, 32, 32, 35, 38, 42, 51, 55, 62, 75, 77, + 83, 86, 88, 91, 31, 32, 32, 32, 35, 38, 41, 50, 54, 60, + 73, 75, 81, 84, 88, 91, 31, 32, 32, 33, 34, 37, 41, 49, + 53, 59, 72, 74, 79, 82, 84, 87, 32, 32, 33, 34, 36, 39, + 42, 50, 53, 59, 71, 72, 78, 81, 84, 87, 32, 32, 34, 34, + 37, 40, 42, 49, 53, 58, 70, 71, 77, 80, 83, 85, 32, 33, + 34, 35, 38, 40, 42, 49, 52, 58, 69, 70, 76, 78, 82, 86, + 34, 34, 35, 37, 42, 45, 48, 54, 57, 63, 73, 75, 79, 79, + 81, 83, 34, 34, 36, 37, 44, 47, 50, 56, 59, 65, 75, 77, + 81, 83, 84, 84, 36, 34, 37, 38, 48, 51, 54, 60, 63, 68, + 78, 80, 85, 85, 86, 89, 39, 37, 39, 40, 50, 54, 58, 65, + 68, 73, 84, 85, 88, 89, 90, 89, 40, 38, 40, 41, 51, 55, + 59, 67, 70, 75, 85, 87, 91, 92, 92, 95, 44, 41, 42, 43, + 53, 58, 63, 71, 74, 79, 90, 91, 97, 94, 97, 95, 47, 44, + 45, 46, 56, 61, 66, 75, 79, 85, 95, 97, 99, 101, 98, 102, + 49, 46, 46, 47, 57, 62, 67, 77, 81, 86, 97, 99, 104, 102, + 105, 102, 53, 49, 50, 50, 60, 65, 71, 82, 86, 92, 103, 105, + 109, 108, 106, 110, 57, 53, 53, 53, 63, 68, 74, 86, 90, 97, + 108, 110, 111, 112, 113, 110, 59, 54, 54, 54, 64, 69, 75, 87, + 91, 98, 111, 112, 119, 117, 115, 118, 65, 60, 59, 58, 68, 73, + 79, 92, 97, 105, 118, 119, 123, 123, 122, 119, 69, 63, 62, 62, + 71, 76, 83, 96, 100, 109, 122, 124, 127, 125, 125, 128, 71, 65, + 64, 63, 73, 78, 84, 97, 102, 111, 125, 127, 135, 134, 131, 129, + 79, 72, 71, 70, 79, 84, 90, 104, 109, 118, 133, 135, 137, 136, + 136, 137, 81, 74, 72, 71, 80, 85, 91, 105, 110, 120, 135, 137, + 145, 143, 141, 138, 82, 75, 73, 72, 81, 86, 92, 106, 111, 121, + 136, 139, 147, 148, 147, 149, 87, 79, 77, 76, 85, 90, 96, 110, + 114, 125, 140, 143, 148, 154, 151, 149, 90, 82, 80, 78, 87, 89, + 99, 108, 113, 129, 135, 146, 153, 157, 160, 159, 92, 84, 83, 81, + 88, 90, 102, 106, 117, 128, 133, 150, 153, 158, 163, 160, 95, 87, + 85, 83, 88, 92, 103, 105, 120, 125, 137, 148, 155, 164, 168, 173, + 98, 89, 88, 85, 89, 95, 103, 108, 121, 124, 141, 144, 160, 164, + 169, 174, 100, 92, 91, 88, 90, 98, 103, 111, 120, 127, 139, 146, + 161, 165, 175, 179, 103, 94, 94, 90, 92, 101, 103, 114, 119, 131, + 137, 150, 158, 170, 175, 180, 106, 97, 97, 93, 93, 104, 104, 118, + 118, 135, 135, 154, 155, 175, 176, 187}, + {32, 31, 34, 37, 48, 48, 49, 52, 54, 57, 63, 64, 67, 68, 69, 69, + 31, 31, 35, 38, 47, 47, 47, 50, 51, 54, 60, 61, 63, 64, 65, 66, + 31, 32, 36, 39, 46, 46, 46, 48, 50, 53, 58, 59, 62, 63, 65, 66, + 30, 32, 36, 40, 46, 45, 45, 48, 49, 52, 57, 58, 60, 61, 62, 63, + 33, 36, 40, 43, 47, 46, 46, 47, 49, 51, 56, 57, 59, 60, 62, 63, + 35, 38, 42, 45, 47, 46, 45, 47, 48, 50, 55, 56, 58, 60, 61, 61, + 37, 40, 44, 47, 47, 46, 45, 47, 48, 50, 54, 55, 57, 58, 60, 61, + 42, 43, 45, 47, 50, 50, 49, 50, 51, 53, 57, 58, 59, 58, 59, 59, + 44, 44, 46, 47, 51, 51, 51, 52, 53, 54, 59, 59, 60, 61, 
61, 60, + 49, 46, 47, 48, 53, 53, 53, 54, 55, 57, 60, 61, 63, 62, 62, 63, + 48, 46, 46, 47, 53, 54, 56, 57, 58, 60, 64, 64, 64, 64, 64, 63, + 48, 45, 46, 46, 53, 55, 56, 58, 59, 61, 65, 65, 66, 66, 65, 66, + 49, 45, 45, 46, 53, 56, 58, 61, 62, 64, 67, 68, 70, 67, 68, 66, + 50, 46, 46, 46, 54, 56, 59, 63, 65, 66, 70, 71, 70, 71, 68, 70, + 51, 47, 47, 47, 54, 57, 60, 64, 65, 68, 71, 72, 73, 71, 72, 70, + 52, 48, 47, 47, 54, 57, 61, 66, 68, 71, 75, 75, 76, 75, 73, 73, + 54, 49, 49, 48, 55, 58, 62, 68, 70, 73, 77, 78, 77, 77, 76, 74, + 54, 50, 49, 49, 55, 59, 62, 68, 70, 74, 78, 79, 81, 79, 77, 78, + 57, 52, 51, 50, 56, 60, 64, 70, 73, 76, 82, 82, 83, 82, 81, 78, + 59, 54, 52, 52, 58, 61, 65, 72, 74, 78, 84, 85, 85, 83, 82, 82, + 60, 54, 53, 52, 58, 62, 65, 72, 75, 79, 85, 86, 89, 87, 85, 82, + 63, 57, 56, 55, 60, 64, 67, 75, 77, 82, 89, 90, 90, 88, 87, 86, + 64, 58, 57, 55, 61, 64, 68, 75, 78, 82, 89, 90, 93, 91, 89, 87, + 64, 59, 57, 56, 61, 65, 68, 75, 78, 83, 90, 91, 94, 93, 92, 91, + 66, 60, 59, 57, 63, 66, 69, 77, 79, 84, 91, 93, 94, 95, 93, 91, + 67, 61, 60, 58, 63, 65, 70, 75, 78, 85, 88, 93, 96, 97, 97, 95, + 68, 62, 61, 59, 63, 64, 71, 74, 79, 84, 87, 94, 96, 97, 98, 96, + 69, 63, 62, 60, 63, 65, 71, 72, 80, 82, 88, 93, 96, 99, 100, 101, + 70, 64, 63, 60, 63, 66, 70, 73, 80, 81, 89, 90, 97, 99, 100, 101, + 71, 65, 64, 61, 63, 67, 70, 74, 78, 82, 88, 90, 97, 99, 102, 103, + 72, 65, 65, 62, 63, 68, 69, 75, 77, 83, 86, 92, 95, 100, 102, 103, + 73, 66, 66, 63, 63, 69, 69, 76, 76, 84, 84, 93, 93, 101, 101, 105}}, + {{32, 31, 31, 32, 35, 36, 44, 47, 53, 62, 65, 79, 82, 88, + 90, 93, 31, 32, 32, 32, 35, 35, 42, 45, 51, 59, 62, 75, + 78, 83, 86, 88, 31, 32, 32, 32, 34, 35, 41, 45, 50, 58, + 61, 74, 76, 82, 85, 88, 31, 32, 32, 33, 34, 34, 41, 44, + 49, 57, 59, 72, 74, 79, 82, 84, 31, 32, 33, 34, 35, 36, + 42, 44, 49, 57, 59, 71, 73, 79, 81, 84, 32, 32, 33, 34, + 36, 36, 42, 45, 50, 57, 59, 71, 73, 78, 80, 82, 32, 33, + 34, 35, 37, 38, 42, 45, 49, 56, 58, 69, 71, 76, 79, 83, + 32, 33, 34, 36, 39, 40, 44, 47, 51, 58, 60, 71, 73, 76, + 78, 80, 34, 34, 35, 37, 41, 42, 48, 50, 54, 61, 63, 73, + 76, 81, 81, 80, 35, 34, 36, 38, 45, 47, 52, 55, 59, 65, + 67, 77, 79, 82, 83, 86, 36, 34, 36, 38, 46, 48, 54, 56, + 60, 66, 68, 78, 80, 85, 87, 86, 39, 37, 39, 40, 48, 50, + 58, 60, 65, 71, 73, 84, 86, 89, 88, 91, 41, 39, 40, 41, + 49, 51, 60, 62, 67, 74, 76, 86, 88, 91, 93, 91, 44, 41, + 42, 43, 51, 53, 63, 66, 71, 78, 79, 90, 92, 97, 94, 97, + 47, 44, 44, 45, 53, 56, 66, 69, 75, 82, 84, 95, 97, 98, + 101, 98, 48, 45, 45, 46, 54, 56, 67, 70, 76, 83, 85, 96, + 98, 104, 101, 105, 53, 49, 50, 50, 57, 60, 71, 75, 82, 90, + 92, 103, 106, 107, 108, 105, 55, 51, 51, 51, 59, 61, 72, 77, + 84, 92, 94, 106, 108, 111, 110, 112, 58, 54, 54, 54, 61, 63, + 75, 79, 87, 95, 98, 110, 112, 117, 116, 113, 63, 58, 58, 57, + 65, 67, 78, 83, 91, 100, 103, 116, 118, 119, 119, 121, 65, 60, + 59, 58, 66, 68, 79, 84, 92, 102, 105, 118, 120, 127, 124, 122, + 71, 65, 64, 63, 71, 73, 84, 89, 97, 108, 111, 125, 127, 129, + 129, 130, 74, 68, 67, 66, 73, 75, 86, 91, 100, 110, 113, 128, + 131, 135, 134, 130, 79, 72, 71, 70, 77, 79, 90, 95, 104, 115, + 118, 133, 136, 140, 139, 140, 82, 75, 73, 72, 79, 81, 92, 97, + 105, 117, 120, 136, 139, 145, 142, 140, 82, 75, 74, 72, 79, 81, + 92, 97, 106, 117, 121, 136, 139, 148, 150, 149, 87, 79, 78, 76, + 83, 85, 96, 100, 110, 120, 125, 141, 144, 148, 153, 150, 89, 82, + 81, 78, 83, 87, 97, 99, 113, 118, 128, 139, 145, 153, 157, 161, + 92, 84, 83, 80, 84, 89, 97, 101, 114, 116, 132, 135, 150, 153, + 
157, 162, 94, 86, 85, 82, 85, 92, 97, 104, 112, 119, 130, 136, + 151, 154, 163, 166, 97, 88, 88, 85, 86, 94, 97, 107, 111, 123, + 128, 140, 147, 159, 163, 167, 99, 91, 91, 87, 87, 97, 97, 110, + 110, 126, 126, 144, 144, 163, 163, 173}, + {32, 31, 33, 37, 45, 48, 49, 50, 52, 56, 57, 63, 64, 67, 68, 68, 31, + 31, 34, 38, 45, 47, 47, 48, 50, 53, 54, 60, 61, 63, 64, 65, 31, 32, + 34, 39, 45, 46, 46, 47, 49, 52, 53, 59, 60, 62, 64, 65, 30, 32, 35, + 40, 44, 46, 45, 46, 48, 51, 52, 57, 58, 60, 61, 62, 33, 35, 37, 42, + 46, 47, 45, 46, 47, 50, 51, 56, 57, 60, 61, 62, 33, 36, 38, 43, 46, + 47, 46, 46, 47, 50, 51, 56, 57, 59, 60, 60, 37, 40, 43, 47, 47, 47, + 45, 46, 47, 49, 50, 54, 55, 57, 59, 61, 39, 41, 43, 47, 48, 48, 47, + 47, 48, 50, 51, 55, 56, 57, 58, 59, 42, 43, 44, 47, 49, 50, 49, 50, + 50, 53, 53, 57, 58, 60, 60, 59, 47, 46, 46, 48, 51, 52, 53, 53, 53, + 55, 56, 60, 61, 61, 61, 62, 49, 46, 47, 48, 52, 53, 53, 54, 54, 56, + 57, 60, 61, 63, 63, 62, 48, 46, 46, 47, 51, 53, 56, 56, 57, 59, 60, + 64, 64, 65, 64, 65, 48, 45, 46, 46, 51, 53, 57, 57, 59, 61, 61, 65, + 66, 66, 67, 65, 49, 45, 45, 46, 51, 53, 58, 59, 61, 63, 64, 67, 68, + 70, 67, 68, 50, 46, 46, 46, 52, 54, 59, 61, 63, 65, 66, 70, 71, 70, + 71, 68, 50, 46, 46, 46, 52, 54, 59, 61, 64, 66, 67, 71, 71, 73, 71, + 72, 52, 48, 47, 47, 53, 54, 61, 63, 66, 70, 71, 75, 75, 75, 74, 72, + 53, 49, 48, 48, 53, 55, 61, 64, 67, 71, 72, 76, 77, 77, 75, 76, 54, + 50, 49, 49, 54, 55, 62, 65, 68, 72, 73, 78, 79, 80, 79, 76, 56, 51, + 51, 50, 55, 56, 63, 66, 70, 74, 76, 81, 82, 81, 80, 80, 57, 52, 51, + 50, 55, 56, 64, 66, 70, 75, 76, 82, 83, 85, 83, 80, 60, 54, 54, 52, + 57, 58, 65, 68, 72, 77, 79, 85, 86, 86, 85, 84, 61, 56, 55, 53, 58, + 59, 66, 69, 73, 79, 80, 86, 87, 89, 87, 84, 63, 57, 56, 55, 59, 60, + 67, 70, 75, 80, 82, 89, 90, 91, 89, 89, 64, 58, 57, 56, 60, 61, 68, + 71, 75, 81, 83, 90, 91, 93, 91, 89, 64, 59, 58, 56, 60, 61, 68, 71, + 75, 81, 83, 90, 91, 94, 94, 93, 66, 60, 59, 57, 61, 63, 69, 72, 77, + 82, 84, 92, 93, 94, 95, 93, 67, 61, 60, 58, 61, 63, 69, 70, 78, 80, + 85, 90, 93, 96, 97, 97, 68, 62, 61, 59, 61, 64, 68, 71, 77, 79, 86, + 88, 94, 96, 97, 98, 69, 63, 62, 59, 61, 65, 68, 72, 76, 80, 85, 88, + 94, 95, 99, 99, 70, 63, 63, 60, 61, 66, 67, 73, 75, 81, 83, 89, 92, + 97, 98, 99, 70, 64, 64, 61, 61, 67, 67, 74, 74, 82, 82, 90, 90, 98, + 98, 102}}, + {{32, 31, 31, 32, 33, 36, 40, 44, 51, 53, 65, 66, 79, 81, + 87, 90, 31, 32, 32, 32, 33, 35, 39, 42, 49, 51, 62, 63, + 75, 77, 83, 85, 31, 32, 32, 32, 33, 35, 39, 42, 49, 51, + 61, 62, 74, 76, 82, 85, 31, 32, 32, 33, 33, 34, 38, 41, + 47, 49, 59, 60, 72, 74, 79, 81, 31, 32, 32, 33, 34, 35, + 38, 41, 47, 49, 59, 60, 71, 73, 79, 81, 32, 32, 33, 34, + 35, 36, 39, 42, 48, 50, 59, 60, 71, 72, 78, 80, 32, 32, + 33, 35, 36, 37, 40, 42, 48, 49, 58, 59, 69, 71, 77, 80, + 32, 33, 33, 35, 36, 38, 41, 42, 48, 49, 58, 59, 69, 70, + 75, 77, 33, 33, 34, 36, 38, 41, 44, 46, 52, 53, 62, 63, + 72, 74, 78, 78, 34, 34, 34, 37, 39, 42, 45, 48, 53, 54, + 63, 64, 73, 75, 80, 83, 36, 34, 35, 38, 42, 48, 51, 54, + 58, 60, 68, 69, 78, 80, 84, 83, 36, 35, 35, 38, 42, 48, + 51, 54, 59, 60, 68, 69, 79, 80, 85, 87, 39, 37, 38, 40, + 44, 50, 54, 58, 63, 65, 73, 74, 84, 85, 89, 88, 40, 38, + 39, 41, 45, 51, 56, 59, 65, 67, 75, 76, 85, 87, 90, 93, + 44, 41, 41, 43, 46, 53, 59, 63, 69, 71, 79, 80, 90, 91, + 96, 93, 46, 43, 43, 44, 48, 55, 60, 65, 72, 73, 82, 83, + 93, 94, 97, 100, 48, 45, 45, 46, 50, 56, 62, 67, 74, 76, + 85, 86, 96, 98, 103, 100, 52, 48, 48, 49, 52, 59, 65, 70, + 78, 80, 90, 91, 101, 103, 105, 107, 
53, 49, 49, 50, 53, 60, + 66, 71, 79, 82, 92, 93, 103, 105, 111, 107, 58, 53, 53, 53, + 57, 63, 69, 74, 83, 86, 97, 98, 109, 111, 113, 115, 58, 54, + 54, 54, 57, 63, 70, 75, 84, 87, 98, 99, 110, 112, 118, 115, + 65, 60, 59, 58, 62, 68, 74, 79, 89, 92, 105, 106, 118, 119, + 122, 123, 66, 61, 60, 59, 63, 69, 75, 80, 90, 93, 106, 107, + 119, 121, 126, 123, 71, 65, 65, 63, 67, 73, 79, 84, 94, 97, + 111, 112, 125, 127, 131, 132, 74, 68, 67, 66, 69, 75, 81, 86, + 97, 100, 113, 115, 128, 130, 134, 132, 79, 72, 72, 70, 73, 79, + 85, 90, 101, 104, 118, 119, 133, 135, 141, 140, 81, 74, 73, 71, + 75, 80, 86, 91, 102, 105, 120, 121, 135, 137, 143, 140, 82, 75, + 74, 72, 75, 81, 87, 92, 103, 106, 121, 122, 136, 139, 147, 151, + 86, 78, 78, 75, 78, 84, 90, 95, 106, 109, 124, 125, 140, 142, + 147, 151, 88, 81, 80, 77, 80, 86, 90, 98, 105, 112, 122, 127, + 140, 144, 152, 155, 91, 83, 82, 79, 80, 88, 90, 100, 103, 114, + 119, 130, 137, 148, 151, 155, 93, 85, 85, 81, 81, 90, 90, 102, + 103, 117, 117, 134, 134, 151, 152, 160}, + {32, 31, 32, 37, 40, 48, 49, 49, 51, 52, 57, 58, 63, 64, 67, 67, 31, + 31, 33, 38, 41, 47, 47, 47, 49, 50, 54, 55, 60, 61, 63, 64, 31, 31, + 33, 38, 41, 47, 47, 47, 49, 49, 54, 54, 59, 60, 63, 64, 30, 32, 33, + 40, 42, 46, 45, 45, 47, 48, 52, 52, 57, 58, 60, 61, 31, 33, 35, 41, + 43, 46, 46, 45, 47, 48, 51, 52, 57, 57, 60, 61, 33, 36, 37, 43, 44, + 47, 46, 46, 47, 47, 51, 52, 56, 57, 59, 60, 35, 38, 39, 45, 46, 47, + 46, 45, 47, 47, 50, 51, 55, 56, 58, 60, 37, 40, 41, 47, 47, 47, 46, + 45, 46, 47, 50, 50, 54, 55, 57, 58, 41, 42, 43, 47, 48, 49, 49, 48, + 49, 50, 52, 53, 57, 57, 59, 58, 42, 43, 43, 47, 48, 50, 49, 49, 50, + 50, 53, 54, 57, 58, 60, 61, 49, 46, 47, 48, 50, 53, 53, 53, 54, 54, + 57, 57, 60, 61, 62, 61, 49, 46, 47, 48, 50, 53, 53, 54, 54, 55, 57, + 57, 61, 61, 63, 64, 48, 46, 46, 47, 49, 53, 54, 56, 57, 57, 60, 60, + 64, 64, 65, 64, 48, 45, 46, 46, 49, 53, 55, 56, 58, 58, 61, 61, 65, + 65, 66, 67, 49, 45, 45, 46, 48, 53, 56, 58, 61, 61, 64, 64, 67, 68, + 69, 67, 49, 46, 46, 46, 49, 53, 57, 59, 62, 62, 65, 66, 69, 69, 70, + 70, 50, 46, 46, 46, 49, 54, 57, 59, 63, 64, 67, 67, 71, 71, 73, 71, + 51, 47, 47, 47, 49, 54, 58, 61, 64, 66, 69, 70, 73, 74, 74, 74, 52, + 48, 48, 47, 50, 54, 58, 61, 65, 66, 71, 71, 75, 75, 77, 74, 54, 50, + 49, 48, 51, 55, 59, 62, 67, 68, 73, 73, 77, 78, 78, 78, 54, 50, 50, + 49, 51, 55, 59, 62, 67, 68, 73, 74, 78, 78, 81, 78, 57, 52, 52, 50, + 52, 56, 60, 64, 69, 70, 76, 77, 82, 82, 83, 82, 57, 52, 52, 51, 53, + 57, 61, 64, 69, 71, 77, 77, 82, 83, 85, 82, 60, 54, 54, 52, 55, 58, + 62, 65, 71, 72, 79, 79, 85, 86, 87, 86, 61, 56, 55, 53, 56, 59, 63, + 66, 72, 73, 80, 81, 86, 87, 88, 86, 63, 57, 57, 55, 57, 60, 64, 67, + 73, 75, 82, 82, 89, 90, 92, 90, 64, 58, 58, 55, 58, 61, 65, 68, 73, + 75, 82, 83, 89, 90, 92, 90, 64, 59, 58, 56, 58, 61, 65, 68, 74, 75, + 83, 83, 90, 91, 94, 95, 66, 60, 59, 57, 59, 62, 66, 69, 75, 76, 84, + 85, 91, 92, 94, 95, 67, 61, 60, 58, 59, 63, 66, 70, 74, 77, 82, 85, + 91, 93, 96, 96, 68, 62, 61, 58, 59, 64, 65, 71, 72, 78, 81, 86, 89, + 94, 95, 96, 68, 62, 62, 59, 59, 65, 65, 71, 71, 79, 79, 87, 87, 95, + 95, 98}}, + {{32, 31, 31, 32, 32, 36, 36, 44, 44, 53, 53, 65, 65, 79, + 79, 87, 31, 32, 32, 32, 32, 35, 35, 42, 42, 51, 51, 62, + 62, 75, 75, 82, 31, 32, 32, 32, 32, 35, 35, 42, 42, 51, + 51, 62, 62, 75, 75, 82, 31, 32, 32, 33, 33, 34, 34, 41, + 41, 49, 49, 59, 59, 72, 72, 78, 31, 32, 32, 33, 33, 34, + 34, 41, 41, 49, 49, 59, 59, 72, 72, 78, 32, 32, 32, 34, + 34, 36, 36, 42, 42, 50, 50, 59, 59, 71, 71, 77, 32, 32, + 32, 34, 
34, 36, 36, 42, 42, 50, 50, 59, 59, 71, 71, 77, + 32, 33, 33, 35, 35, 38, 38, 42, 42, 49, 49, 58, 58, 69, + 69, 75, 32, 33, 33, 35, 35, 38, 38, 42, 42, 49, 49, 58, + 58, 69, 69, 75, 34, 34, 34, 37, 37, 42, 42, 48, 48, 54, + 54, 63, 63, 73, 73, 79, 34, 34, 34, 37, 37, 42, 42, 48, + 48, 54, 54, 63, 63, 73, 73, 79, 36, 34, 34, 38, 38, 48, + 48, 54, 54, 60, 60, 68, 68, 78, 78, 84, 36, 34, 34, 38, + 38, 48, 48, 54, 54, 60, 60, 68, 68, 78, 78, 84, 39, 37, + 37, 40, 40, 50, 50, 58, 58, 65, 65, 73, 73, 84, 84, 89, + 39, 37, 37, 40, 40, 50, 50, 58, 58, 65, 65, 73, 73, 84, + 84, 89, 44, 41, 41, 43, 43, 53, 53, 63, 63, 71, 71, 79, + 79, 90, 90, 95, 44, 41, 41, 43, 43, 53, 53, 63, 63, 71, + 71, 79, 79, 90, 90, 95, 48, 45, 45, 46, 46, 56, 56, 67, + 67, 76, 76, 85, 85, 96, 96, 102, 48, 45, 45, 46, 46, 56, + 56, 67, 67, 76, 76, 85, 85, 96, 96, 102, 53, 49, 49, 50, + 50, 60, 60, 71, 71, 82, 82, 92, 92, 103, 103, 109, 53, 49, + 49, 50, 50, 60, 60, 71, 71, 82, 82, 92, 92, 103, 103, 109, + 58, 54, 54, 54, 54, 63, 63, 75, 75, 87, 87, 98, 98, 110, + 110, 116, 58, 54, 54, 54, 54, 63, 63, 75, 75, 87, 87, 98, + 98, 110, 110, 116, 65, 60, 60, 58, 58, 68, 68, 79, 79, 92, + 92, 105, 105, 118, 118, 124, 65, 60, 60, 58, 58, 68, 68, 79, + 79, 92, 92, 105, 105, 118, 118, 124, 71, 65, 65, 63, 63, 73, + 73, 84, 84, 97, 97, 111, 111, 125, 125, 132, 71, 65, 65, 63, + 63, 73, 73, 84, 84, 97, 97, 111, 111, 125, 125, 132, 79, 72, + 72, 70, 70, 79, 79, 90, 90, 104, 104, 118, 118, 133, 133, 141, + 79, 72, 72, 70, 70, 79, 79, 90, 90, 104, 104, 118, 118, 133, + 133, 141, 82, 75, 75, 72, 72, 81, 81, 92, 92, 106, 106, 121, + 121, 136, 136, 144, 82, 75, 75, 72, 72, 81, 81, 92, 92, 106, + 106, 121, 121, 136, 136, 144, 87, 79, 79, 76, 76, 84, 84, 96, + 96, 109, 109, 124, 124, 141, 141, 149}, + {32, 31, 31, 37, 37, 48, 48, 49, 49, 52, 52, 57, 57, 63, 63, 66, 31, + 31, 31, 38, 38, 47, 47, 47, 47, 50, 50, 54, 54, 60, 60, 63, 31, 31, + 31, 38, 38, 47, 47, 47, 47, 50, 50, 54, 54, 60, 60, 63, 30, 32, 32, + 40, 40, 46, 46, 45, 45, 48, 48, 52, 52, 57, 57, 60, 30, 32, 32, 40, + 40, 46, 46, 45, 45, 48, 48, 52, 52, 57, 57, 60, 33, 36, 36, 43, 43, + 47, 47, 46, 46, 47, 47, 51, 51, 56, 56, 59, 33, 36, 36, 43, 43, 47, + 47, 46, 46, 47, 47, 51, 51, 56, 56, 59, 37, 40, 40, 47, 47, 47, 47, + 45, 45, 47, 47, 50, 50, 54, 54, 57, 37, 40, 40, 47, 47, 47, 47, 45, + 45, 47, 47, 50, 50, 54, 54, 57, 42, 43, 43, 47, 47, 50, 50, 49, 49, + 50, 50, 53, 53, 57, 57, 60, 42, 43, 43, 47, 47, 50, 50, 49, 49, 50, + 50, 53, 53, 57, 57, 60, 49, 46, 46, 48, 48, 53, 53, 53, 53, 54, 54, + 57, 57, 60, 60, 62, 49, 46, 46, 48, 48, 53, 53, 53, 53, 54, 54, 57, + 57, 60, 60, 62, 48, 46, 46, 47, 47, 53, 53, 56, 56, 57, 57, 60, 60, + 64, 64, 66, 48, 46, 46, 47, 47, 53, 53, 56, 56, 57, 57, 60, 60, 64, + 64, 66, 49, 45, 45, 46, 46, 53, 53, 58, 58, 61, 61, 64, 64, 67, 67, + 69, 49, 45, 45, 46, 46, 53, 53, 58, 58, 61, 61, 64, 64, 67, 67, 69, + 50, 46, 46, 46, 46, 54, 54, 59, 59, 64, 64, 67, 67, 71, 71, 73, 50, + 46, 46, 46, 46, 54, 54, 59, 59, 64, 64, 67, 67, 71, 71, 73, 52, 48, + 48, 47, 47, 54, 54, 61, 61, 66, 66, 71, 71, 75, 75, 77, 52, 48, 48, + 47, 47, 54, 54, 61, 61, 66, 66, 71, 71, 75, 75, 77, 54, 50, 50, 49, + 49, 55, 55, 62, 62, 68, 68, 73, 73, 78, 78, 80, 54, 50, 50, 49, 49, + 55, 55, 62, 62, 68, 68, 73, 73, 78, 78, 80, 57, 52, 52, 50, 50, 56, + 56, 64, 64, 70, 70, 76, 76, 82, 82, 84, 57, 52, 52, 50, 50, 56, 56, + 64, 64, 70, 70, 76, 76, 82, 82, 84, 60, 54, 54, 52, 52, 58, 58, 65, + 65, 72, 72, 79, 79, 85, 85, 88, 60, 54, 54, 52, 52, 58, 58, 65, 65, + 72, 72, 79, 79, 85, 85, 
88, 63, 57, 57, 55, 55, 60, 60, 67, 67, 75, + 75, 82, 82, 89, 89, 92, 63, 57, 57, 55, 55, 60, 60, 67, 67, 75, 75, + 82, 82, 89, 89, 92, 64, 59, 59, 56, 56, 61, 61, 68, 68, 75, 75, 83, + 83, 90, 90, 93, 64, 59, 59, 56, 56, 61, 61, 68, 68, 75, 75, 83, 83, + 90, 90, 93, 66, 60, 60, 57, 57, 63, 63, 69, 69, 77, 77, 84, 84, 92, + 92, 95}}, + {{32, 31, 31, 32, 32, 34, 36, 38, 44, 44, 53, 53, 62, 65, 73, 79, + 31, 32, 32, 32, 32, 34, 35, 37, 42, 43, 51, 51, 60, 62, 70, 75, + 31, 32, 32, 32, 32, 34, 35, 37, 42, 43, 51, 51, 59, 62, 69, 75, + 31, 32, 32, 32, 32, 33, 35, 36, 41, 42, 50, 50, 58, 60, 67, 73, + 31, 32, 32, 32, 33, 33, 34, 36, 41, 41, 49, 49, 57, 59, 66, 72, + 31, 32, 32, 33, 33, 34, 35, 37, 41, 42, 49, 49, 57, 59, 66, 71, + 32, 32, 32, 33, 34, 35, 36, 38, 42, 43, 50, 50, 57, 59, 65, 71, + 32, 32, 32, 34, 34, 35, 37, 38, 42, 43, 49, 49, 56, 59, 65, 70, + 32, 32, 33, 34, 35, 37, 38, 39, 42, 43, 49, 49, 56, 58, 64, 69, + 32, 33, 33, 34, 35, 37, 39, 40, 43, 44, 50, 50, 56, 58, 64, 69, + 34, 34, 34, 36, 37, 39, 42, 44, 48, 48, 54, 54, 61, 63, 69, 73, + 34, 34, 34, 36, 37, 39, 42, 44, 48, 48, 54, 54, 61, 63, 69, 73, + 35, 34, 34, 37, 38, 42, 47, 48, 52, 53, 59, 59, 65, 67, 73, 77, + 36, 35, 34, 37, 38, 43, 48, 49, 54, 54, 60, 60, 66, 68, 74, 78, + 38, 36, 36, 38, 40, 44, 49, 51, 56, 57, 63, 63, 69, 71, 77, 81, + 39, 38, 37, 40, 40, 45, 50, 52, 58, 58, 65, 65, 71, 73, 79, 84, + 41, 39, 39, 41, 41, 46, 51, 54, 60, 60, 67, 67, 74, 76, 81, 86, + 44, 41, 41, 42, 43, 48, 53, 56, 63, 64, 71, 71, 78, 79, 85, 90, + 44, 42, 42, 43, 43, 48, 54, 56, 64, 64, 72, 72, 79, 81, 86, 91, + 48, 45, 45, 46, 46, 51, 56, 59, 67, 67, 76, 76, 83, 85, 91, 96, + 48, 45, 45, 46, 46, 51, 56, 59, 67, 67, 76, 76, 83, 85, 91, 96, + 53, 49, 49, 49, 49, 54, 59, 62, 71, 71, 81, 81, 89, 91, 98, 103, + 53, 50, 49, 50, 50, 54, 60, 63, 71, 72, 82, 82, 90, 92, 99, 103, + 57, 53, 52, 52, 52, 57, 62, 65, 74, 75, 85, 85, 94, 96, 103, 108, + 58, 54, 54, 54, 54, 58, 63, 67, 75, 76, 87, 87, 95, 98, 105, 110, + 61, 57, 57, 56, 56, 60, 66, 69, 77, 78, 89, 89, 98, 101, 108, 114, + 65, 60, 60, 59, 58, 63, 68, 71, 79, 80, 92, 92, 102, 105, 112, 118, + 67, 62, 61, 60, 60, 64, 69, 72, 81, 82, 94, 94, 103, 106, 114, 120, + 71, 66, 65, 64, 63, 68, 73, 76, 84, 85, 97, 97, 108, 111, 119, 125, + 72, 66, 66, 64, 64, 68, 73, 76, 85, 86, 98, 98, 108, 111, 119, 125, + 79, 73, 72, 71, 70, 74, 79, 82, 90, 91, 104, 104, 115, 118, 127, 133, + 79, 73, 72, 71, 70, 74, 79, 82, 90, 91, 104, 104, 115, 118, 127, 133}, + {32, 31, 31, 35, 37, 42, 48, 48, 49, 49, 52, 52, 56, 57, 61, 63, 31, + 31, 31, 36, 38, 42, 47, 47, 47, 47, 50, 50, 54, 54, 58, 60, 31, 31, + 31, 36, 38, 42, 47, 47, 47, 47, 50, 50, 53, 54, 57, 60, 30, 32, 32, + 37, 39, 42, 46, 46, 46, 46, 48, 48, 52, 52, 56, 58, 30, 32, 32, 37, + 40, 42, 46, 46, 45, 45, 48, 48, 51, 52, 55, 57, 32, 33, 34, 39, 41, + 44, 46, 46, 45, 45, 48, 48, 51, 51, 54, 57, 33, 35, 36, 40, 43, 45, + 47, 46, 46, 46, 47, 47, 50, 51, 54, 56, 34, 37, 37, 42, 44, 45, 47, + 47, 45, 46, 47, 47, 50, 51, 53, 55, 37, 40, 40, 45, 47, 47, 47, 47, + 45, 46, 47, 47, 49, 50, 52, 54, 37, 40, 40, 45, 47, 47, 48, 47, 46, + 46, 47, 47, 49, 50, 53, 55, 42, 43, 43, 46, 47, 48, 50, 50, 49, 49, + 50, 50, 53, 53, 56, 57, 42, 43, 43, 46, 47, 48, 50, 50, 49, 49, 50, + 50, 53, 53, 56, 57, 47, 46, 46, 47, 48, 50, 52, 52, 53, 53, 53, 53, + 55, 56, 58, 60, 49, 47, 46, 47, 48, 50, 53, 53, 53, 54, 54, 54, 56, + 57, 59, 60, 48, 46, 46, 47, 47, 50, 53, 53, 55, 55, 56, 56, 58, 58, + 61, 62, 48, 46, 46, 46, 47, 50, 53, 54, 56, 56, 57, 57, 59, 60, 62, + 64, 48, 46, 
45, 46, 46, 49, 53, 54, 57, 57, 59, 59, 61, 61, 63, 65, + 49, 45, 45, 45, 46, 49, 53, 55, 58, 59, 61, 61, 63, 64, 66, 67, 49, + 46, 45, 46, 46, 49, 53, 55, 58, 59, 62, 62, 64, 64, 66, 68, 50, 47, + 46, 46, 46, 50, 54, 55, 59, 60, 64, 64, 66, 67, 69, 71, 50, 47, 46, + 46, 46, 50, 54, 55, 59, 60, 64, 64, 66, 67, 69, 71, 52, 48, 48, 47, + 47, 50, 54, 56, 61, 61, 66, 66, 69, 70, 72, 74, 52, 48, 48, 47, 47, + 50, 54, 56, 61, 61, 66, 66, 70, 71, 73, 75, 53, 50, 49, 48, 48, 51, + 55, 57, 62, 62, 68, 68, 71, 72, 75, 77, 54, 50, 50, 49, 49, 52, 55, + 57, 62, 63, 68, 68, 72, 73, 76, 78, 55, 51, 51, 50, 49, 52, 56, 58, + 63, 63, 69, 69, 74, 75, 78, 80, 57, 52, 52, 51, 50, 53, 56, 58, 64, + 64, 70, 70, 75, 76, 79, 82, 58, 53, 53, 51, 51, 54, 57, 59, 64, 65, + 71, 71, 76, 77, 80, 83, 60, 55, 54, 53, 52, 55, 58, 60, 65, 66, 72, + 72, 77, 79, 82, 85, 60, 55, 55, 53, 53, 55, 59, 60, 65, 66, 73, 73, + 78, 79, 83, 85, 63, 58, 57, 56, 55, 58, 60, 62, 67, 68, 75, 75, 80, + 82, 86, 89, 63, 58, 57, 56, 55, 58, 60, 62, 67, 68, 75, 75, 80, 82, + 86, 89}}, + {{32, 31, 31, 31, 32, 32, 35, 36, 39, 44, 44, 51, 53, 58, 65, 65, + 31, 32, 32, 32, 32, 32, 35, 35, 38, 42, 42, 49, 52, 56, 63, 63, + 31, 32, 32, 32, 32, 32, 35, 35, 38, 42, 42, 49, 51, 55, 62, 62, + 31, 32, 32, 32, 32, 32, 34, 35, 37, 41, 41, 48, 50, 54, 61, 61, + 31, 32, 32, 32, 33, 33, 34, 34, 37, 41, 41, 47, 49, 53, 59, 59, + 31, 32, 32, 32, 33, 33, 34, 34, 37, 41, 41, 47, 49, 53, 59, 59, + 31, 32, 32, 33, 34, 34, 35, 36, 38, 42, 42, 48, 49, 53, 59, 59, + 32, 32, 32, 33, 34, 34, 36, 36, 38, 42, 42, 48, 50, 53, 59, 59, + 32, 32, 32, 33, 34, 34, 36, 37, 39, 42, 42, 48, 49, 53, 58, 58, + 32, 32, 33, 34, 35, 35, 37, 38, 40, 42, 42, 48, 49, 52, 58, 58, + 32, 32, 33, 34, 35, 35, 37, 38, 40, 42, 42, 48, 49, 52, 58, 58, + 33, 33, 33, 35, 36, 36, 40, 41, 43, 46, 46, 52, 53, 56, 62, 62, + 34, 34, 34, 35, 37, 37, 41, 42, 44, 48, 48, 53, 54, 57, 63, 63, + 34, 34, 34, 35, 37, 37, 43, 44, 46, 50, 50, 55, 56, 59, 65, 65, + 36, 35, 34, 36, 38, 38, 46, 48, 50, 54, 54, 58, 60, 63, 68, 68, + 36, 35, 34, 36, 38, 38, 46, 48, 50, 54, 54, 58, 60, 63, 68, 68, + 38, 37, 37, 38, 40, 40, 47, 50, 52, 57, 57, 62, 64, 67, 72, 72, + 39, 38, 37, 39, 40, 40, 48, 50, 53, 58, 58, 63, 65, 68, 73, 73, + 41, 39, 39, 40, 41, 41, 49, 51, 54, 60, 60, 66, 67, 70, 76, 76, + 44, 41, 41, 42, 43, 43, 51, 53, 57, 63, 63, 69, 71, 74, 79, 79, + 44, 41, 41, 42, 43, 43, 51, 53, 57, 63, 63, 69, 71, 74, 79, 79, + 47, 44, 44, 44, 45, 45, 53, 56, 59, 66, 66, 73, 75, 78, 84, 84, + 48, 45, 45, 45, 46, 46, 54, 56, 60, 67, 67, 74, 76, 79, 85, 85, + 50, 47, 46, 47, 47, 47, 55, 58, 61, 68, 68, 76, 78, 82, 88, 88, + 53, 50, 49, 50, 50, 50, 57, 60, 64, 71, 71, 79, 82, 86, 92, 92, + 53, 50, 49, 50, 50, 50, 57, 60, 64, 71, 71, 79, 82, 86, 92, 92, + 57, 54, 53, 53, 53, 53, 60, 63, 67, 74, 74, 83, 86, 90, 97, 97, + 58, 55, 54, 54, 54, 54, 61, 63, 68, 75, 75, 84, 87, 91, 98, 98, + 61, 57, 56, 56, 56, 56, 63, 65, 69, 77, 77, 86, 89, 93, 100, 100, + 65, 61, 60, 59, 58, 58, 66, 68, 72, 79, 79, 89, 92, 97, 105, 105, + 65, 61, 60, 59, 58, 58, 66, 68, 72, 79, 79, 89, 92, 97, 105, 105, + 70, 65, 64, 63, 62, 62, 70, 72, 76, 83, 83, 93, 96, 101, 109, 109}, + {32, 31, 31, 33, 37, 37, 45, 48, 48, 49, 49, 51, 52, 54, 57, 57, 31, + 31, 31, 34, 38, 38, 45, 47, 47, 47, 47, 50, 50, 52, 55, 55, 31, 31, + 31, 34, 38, 38, 45, 47, 47, 47, 47, 49, 50, 51, 54, 54, 31, 31, 32, + 34, 39, 39, 45, 46, 46, 46, 46, 48, 49, 51, 53, 53, 30, 32, 32, 35, + 40, 40, 44, 46, 45, 45, 45, 47, 48, 49, 52, 52, 30, 32, 32, 35, 40, + 40, 44, 46, 45, 45, 45, 47, 
48, 49, 52, 52, 33, 34, 35, 37, 42, 42, + 46, 47, 46, 45, 45, 47, 47, 49, 51, 51, 33, 35, 36, 38, 43, 43, 46, + 47, 46, 46, 46, 47, 47, 49, 51, 51, 35, 37, 37, 40, 44, 44, 46, 47, + 46, 45, 45, 47, 47, 48, 51, 51, 37, 39, 40, 43, 47, 47, 47, 47, 47, + 45, 45, 46, 47, 48, 50, 50, 37, 39, 40, 43, 47, 47, 47, 47, 47, 45, + 45, 46, 47, 48, 50, 50, 41, 42, 42, 44, 47, 47, 49, 49, 49, 48, 48, + 49, 50, 51, 52, 52, 42, 42, 43, 44, 47, 47, 49, 50, 50, 49, 49, 50, + 50, 51, 53, 53, 44, 44, 44, 45, 47, 47, 50, 51, 51, 51, 51, 52, 52, + 53, 54, 54, 49, 47, 46, 47, 48, 48, 52, 53, 53, 53, 53, 54, 54, 55, + 57, 57, 49, 47, 46, 47, 48, 48, 52, 53, 53, 53, 53, 54, 54, 55, 57, + 57, 48, 46, 46, 46, 47, 47, 51, 53, 54, 55, 55, 56, 57, 58, 59, 59, + 48, 46, 46, 46, 47, 47, 51, 53, 54, 56, 56, 57, 57, 58, 60, 60, 48, + 46, 45, 46, 46, 46, 51, 53, 54, 57, 57, 58, 59, 60, 61, 61, 49, 46, + 45, 45, 46, 46, 51, 53, 55, 58, 58, 61, 61, 62, 64, 64, 49, 46, 45, + 45, 46, 46, 51, 53, 55, 58, 58, 61, 61, 62, 64, 64, 50, 47, 46, 46, + 46, 46, 52, 54, 56, 59, 59, 62, 63, 64, 66, 66, 50, 47, 46, 46, 46, + 46, 52, 54, 56, 59, 59, 63, 64, 65, 67, 67, 51, 48, 47, 47, 47, 47, + 52, 54, 56, 60, 60, 64, 65, 66, 68, 68, 52, 48, 48, 47, 47, 47, 53, + 54, 57, 61, 61, 65, 66, 68, 71, 71, 52, 48, 48, 47, 47, 47, 53, 54, + 57, 61, 61, 65, 66, 68, 71, 71, 54, 50, 49, 49, 48, 48, 54, 55, 58, + 62, 62, 67, 68, 70, 73, 73, 54, 51, 50, 49, 49, 49, 54, 55, 58, 62, + 62, 67, 68, 70, 73, 73, 55, 51, 51, 50, 49, 49, 54, 56, 58, 63, 63, + 68, 69, 71, 74, 74, 57, 53, 52, 51, 50, 50, 55, 56, 59, 64, 64, 69, + 70, 73, 76, 76, 57, 53, 52, 51, 50, 50, 55, 56, 59, 64, 64, 69, 70, + 73, 76, 76, 59, 55, 54, 53, 52, 52, 57, 58, 61, 65, 65, 70, 72, 74, + 78, 78}}, + {{32, 31, 31, 31, 32, 32, 32, 35, 36, 38, 44, 44, 47, 53, 53, 59, 31, + 32, 32, 32, 32, 32, 33, 35, 35, 37, 43, 43, 46, 52, 52, 57, 31, 32, + 32, 32, 32, 32, 33, 35, 35, 37, 42, 42, 45, 51, 51, 56, 31, 32, 32, + 32, 32, 32, 33, 35, 35, 37, 42, 42, 45, 51, 51, 56, 31, 32, 32, 32, + 32, 32, 33, 34, 35, 36, 41, 41, 44, 49, 49, 54, 31, 32, 32, 32, 32, + 33, 33, 34, 34, 36, 41, 41, 44, 49, 49, 54, 31, 32, 32, 32, 33, 33, + 33, 35, 35, 36, 41, 41, 44, 49, 49, 54, 32, 32, 32, 32, 33, 34, 34, + 36, 36, 38, 42, 42, 45, 49, 49, 54, 32, 32, 32, 33, 34, 34, 34, 36, + 36, 38, 42, 42, 45, 50, 50, 54, 32, 32, 32, 33, 34, 34, 35, 37, 37, + 38, 42, 42, 45, 49, 49, 54, 32, 32, 33, 33, 35, 35, 36, 38, 38, 39, + 42, 42, 45, 49, 49, 53, 32, 32, 33, 33, 35, 35, 36, 38, 38, 39, 42, + 42, 45, 49, 49, 53, 32, 33, 33, 33, 35, 36, 36, 39, 40, 41, 44, 44, + 47, 51, 51, 55, 34, 34, 34, 34, 36, 37, 38, 42, 42, 44, 48, 48, 50, + 54, 54, 58, 34, 34, 34, 34, 36, 37, 38, 42, 42, 44, 48, 48, 50, 54, + 54, 58, 35, 34, 34, 34, 37, 37, 39, 44, 45, 46, 50, 50, 53, 57, 57, + 61, 36, 35, 34, 35, 37, 38, 40, 47, 48, 49, 54, 54, 56, 60, 60, 64, + 36, 35, 34, 35, 37, 38, 40, 47, 48, 49, 54, 54, 56, 60, 60, 64, 38, + 37, 36, 37, 39, 40, 41, 48, 49, 51, 56, 56, 58, 63, 63, 67, 39, 38, + 37, 38, 40, 40, 42, 49, 50, 52, 58, 58, 60, 65, 65, 69, 39, 38, 37, + 38, 40, 40, 42, 49, 50, 52, 58, 58, 60, 65, 65, 69, 42, 40, 40, 40, + 42, 42, 44, 51, 52, 55, 61, 61, 64, 69, 69, 73, 44, 42, 41, 41, 42, + 43, 45, 52, 53, 56, 63, 63, 66, 71, 71, 75, 44, 42, 41, 41, 43, 43, + 45, 52, 54, 56, 63, 63, 66, 72, 72, 76, 47, 45, 44, 44, 45, 45, 47, + 54, 56, 58, 66, 66, 69, 75, 75, 79, 48, 46, 45, 45, 46, 46, 48, 55, + 56, 59, 67, 67, 70, 76, 76, 80, 49, 47, 46, 46, 47, 47, 48, 56, 57, + 60, 67, 67, 71, 77, 77, 81, 53, 50, 49, 49, 49, 49, 51, 58, 59, 62, 
+ 71, 71, 74, 81, 81, 86, 53, 51, 49, 49, 50, 50, 51, 59, 60, 63, 71, + 71, 75, 82, 82, 87, 55, 52, 51, 51, 51, 51, 53, 60, 61, 64, 72, 72, + 76, 83, 83, 88, 58, 55, 54, 54, 54, 54, 55, 62, 63, 67, 75, 75, 79, + 87, 87, 92, 58, 55, 54, 54, 54, 54, 55, 62, 63, 67, 75, 75, 79, 87, + 87, 92}, + {32, 31, 31, 31, 35, 37, 38, 47, 48, 48, 49, 49, 50, 52, 52, 54, 31, + 31, 31, 32, 36, 38, 39, 46, 47, 47, 48, 48, 49, 50, 50, 53, 31, 31, + 31, 32, 37, 38, 40, 46, 47, 47, 47, 47, 48, 50, 50, 52, 31, 31, 31, + 32, 37, 38, 40, 46, 47, 47, 47, 47, 48, 50, 50, 52, 30, 31, 32, 32, + 38, 39, 40, 45, 46, 46, 45, 45, 46, 48, 48, 50, 30, 31, 32, 33, 38, + 40, 41, 45, 46, 46, 45, 45, 46, 48, 48, 50, 31, 32, 33, 33, 38, 40, + 41, 45, 46, 46, 45, 45, 46, 48, 48, 50, 33, 35, 35, 36, 41, 43, 43, + 46, 47, 46, 45, 45, 46, 47, 47, 49, 33, 35, 36, 36, 41, 43, 44, 46, + 47, 46, 46, 46, 46, 47, 47, 49, 34, 36, 37, 37, 42, 44, 45, 47, 47, + 47, 45, 45, 46, 47, 47, 49, 37, 39, 40, 41, 45, 47, 47, 47, 47, 47, + 45, 45, 46, 47, 47, 48, 37, 39, 40, 41, 45, 47, 47, 47, 47, 47, 45, + 45, 46, 47, 47, 48, 39, 40, 41, 42, 46, 47, 47, 48, 48, 48, 47, 47, + 47, 48, 48, 50, 42, 42, 43, 43, 46, 47, 48, 50, 50, 50, 49, 49, 50, + 50, 50, 52, 42, 42, 43, 43, 46, 47, 48, 50, 50, 50, 49, 49, 50, 50, + 50, 52, 45, 45, 44, 45, 47, 47, 48, 51, 51, 51, 51, 51, 52, 52, 52, + 54, 49, 47, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 54, 54, 54, 55, + 49, 47, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 54, 54, 54, 55, 48, + 47, 46, 46, 47, 47, 48, 52, 53, 53, 55, 55, 55, 56, 56, 57, 48, 46, + 46, 46, 46, 47, 48, 52, 53, 54, 56, 56, 56, 57, 57, 59, 48, 46, 46, + 46, 46, 47, 48, 52, 53, 54, 56, 56, 56, 57, 57, 59, 49, 46, 45, 45, + 46, 46, 47, 52, 53, 54, 57, 57, 58, 60, 60, 61, 49, 46, 45, 45, 45, + 46, 47, 52, 53, 55, 58, 58, 59, 61, 61, 62, 49, 46, 45, 45, 46, 46, + 47, 52, 53, 55, 58, 58, 60, 61, 61, 63, 50, 47, 46, 46, 46, 46, 48, + 53, 54, 55, 59, 59, 61, 63, 63, 65, 50, 48, 46, 46, 46, 46, 48, 53, + 54, 55, 59, 59, 61, 64, 64, 65, 51, 48, 47, 47, 47, 47, 48, 53, 54, + 55, 60, 60, 61, 64, 64, 66, 52, 49, 48, 48, 47, 47, 48, 53, 54, 56, + 61, 61, 63, 66, 66, 68, 52, 49, 48, 48, 47, 47, 48, 53, 54, 56, 61, + 61, 63, 66, 66, 68, 53, 50, 48, 48, 48, 48, 49, 54, 54, 56, 61, 61, + 63, 67, 67, 69, 54, 51, 50, 50, 49, 49, 50, 55, 55, 57, 62, 62, 65, + 68, 68, 71, 54, 51, 50, 50, 49, 49, 50, 55, 55, 57, 62, 62, 65, 68, + 68, 71}}, + {{32, 31, 31, 31, 31, 32, 32, 32, 35, 36, 36, 40, 44, 44, 47, 53, 31, + 31, 32, 32, 32, 32, 32, 33, 35, 35, 35, 39, 43, 43, 46, 52, 31, 32, + 32, 32, 32, 32, 32, 33, 35, 35, 35, 39, 42, 42, 45, 51, 31, 32, 32, + 32, 32, 32, 32, 33, 35, 35, 35, 39, 42, 42, 45, 51, 31, 32, 32, 32, + 32, 32, 32, 33, 34, 35, 35, 39, 41, 41, 45, 50, 31, 32, 32, 32, 32, + 33, 33, 33, 34, 34, 34, 38, 41, 41, 44, 49, 31, 32, 32, 32, 32, 33, + 33, 33, 34, 34, 34, 38, 41, 41, 44, 49, 31, 32, 32, 32, 32, 33, 33, + 33, 34, 35, 35, 38, 41, 41, 44, 49, 31, 32, 32, 32, 33, 34, 34, 34, + 35, 36, 36, 39, 42, 42, 44, 49, 32, 32, 32, 32, 33, 34, 34, 34, 36, + 36, 36, 39, 42, 42, 45, 50, 32, 32, 32, 32, 33, 34, 34, 34, 36, 36, + 36, 39, 42, 42, 45, 50, 32, 32, 32, 32, 33, 35, 35, 35, 37, 37, 37, + 40, 42, 42, 45, 49, 32, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 41, + 42, 42, 45, 49, 32, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 41, 42, + 42, 45, 49, 32, 33, 33, 33, 34, 36, 36, 36, 39, 40, 40, 42, 44, 44, + 47, 51, 34, 34, 34, 34, 35, 37, 37, 38, 41, 42, 42, 45, 48, 48, 50, + 54, 34, 34, 34, 34, 35, 37, 37, 38, 41, 42, 42, 45, 48, 48, 50, 54, + 34, 34, 34, 34, 35, 37, 
37, 38, 42, 43, 43, 46, 49, 49, 51, 55, 35, + 35, 34, 34, 36, 38, 38, 39, 45, 47, 47, 50, 52, 52, 55, 59, 36, 35, + 34, 34, 36, 38, 38, 40, 46, 48, 48, 51, 54, 54, 56, 60, 36, 35, 34, + 34, 36, 38, 38, 40, 46, 48, 48, 51, 54, 54, 56, 60, 38, 37, 36, 36, + 37, 40, 40, 41, 47, 49, 49, 53, 56, 56, 58, 63, 39, 38, 37, 37, 39, + 40, 40, 42, 48, 50, 50, 54, 58, 58, 60, 65, 39, 38, 37, 37, 39, 40, + 40, 42, 48, 50, 50, 54, 58, 58, 60, 65, 41, 40, 39, 39, 40, 41, 41, + 43, 49, 51, 51, 56, 60, 60, 62, 67, 44, 42, 41, 41, 42, 43, 43, 45, + 51, 53, 53, 59, 63, 63, 66, 71, 44, 42, 41, 41, 42, 43, 43, 45, 51, + 53, 53, 59, 63, 63, 66, 71, 44, 43, 42, 42, 42, 43, 43, 45, 51, 54, + 54, 59, 64, 64, 67, 72, 47, 45, 44, 44, 44, 45, 45, 47, 53, 56, 56, + 61, 66, 66, 69, 75, 48, 46, 45, 45, 45, 46, 46, 48, 54, 56, 56, 62, + 67, 67, 70, 76, 48, 46, 45, 45, 45, 46, 46, 48, 54, 56, 56, 62, 67, + 67, 70, 76, 51, 49, 47, 47, 48, 48, 48, 50, 56, 58, 58, 64, 69, 69, + 73, 79}, + {32, 31, 31, 31, 33, 37, 37, 38, 45, 48, 48, 49, 49, 49, 50, 52, 31, + 31, 31, 31, 33, 38, 38, 39, 45, 47, 47, 48, 48, 48, 49, 51, 31, 31, + 31, 31, 34, 38, 38, 40, 45, 47, 47, 47, 47, 47, 48, 50, 31, 31, 31, + 31, 34, 38, 38, 40, 45, 47, 47, 47, 47, 47, 48, 50, 31, 31, 32, 32, + 34, 39, 39, 40, 45, 46, 46, 46, 46, 46, 47, 49, 30, 31, 32, 32, 35, + 40, 40, 41, 44, 46, 46, 45, 45, 45, 46, 48, 30, 31, 32, 32, 35, 40, + 40, 41, 44, 46, 46, 45, 45, 45, 46, 48, 31, 32, 33, 33, 35, 40, 40, + 41, 45, 46, 46, 45, 45, 45, 46, 48, 33, 34, 35, 35, 37, 42, 42, 43, + 46, 47, 47, 46, 45, 45, 46, 47, 33, 35, 36, 36, 38, 43, 43, 44, 46, + 47, 47, 46, 46, 46, 46, 47, 33, 35, 36, 36, 38, 43, 43, 44, 46, 47, + 47, 46, 46, 46, 46, 47, 35, 37, 38, 38, 41, 45, 45, 46, 47, 47, 47, + 46, 45, 45, 46, 47, 37, 39, 40, 40, 43, 47, 47, 47, 47, 47, 47, 46, + 45, 45, 46, 47, 37, 39, 40, 40, 43, 47, 47, 47, 47, 47, 47, 46, 45, + 45, 46, 47, 39, 40, 41, 41, 43, 47, 47, 47, 48, 48, 48, 47, 47, 47, + 47, 48, 42, 42, 43, 43, 44, 47, 47, 48, 49, 50, 50, 49, 49, 49, 50, + 50, 42, 42, 43, 43, 44, 47, 47, 48, 49, 50, 50, 49, 49, 49, 50, 50, + 43, 43, 43, 43, 45, 47, 47, 48, 50, 50, 50, 50, 50, 50, 50, 51, 47, + 46, 46, 46, 46, 48, 48, 48, 51, 52, 52, 52, 53, 53, 53, 53, 49, 47, + 46, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 53, 54, 54, 49, 47, 46, + 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 53, 54, 54, 48, 47, 46, 46, + 46, 47, 47, 48, 52, 53, 53, 54, 55, 55, 55, 56, 48, 47, 46, 46, 46, + 47, 47, 48, 51, 53, 53, 54, 56, 56, 56, 57, 48, 47, 46, 46, 46, 47, + 47, 48, 51, 53, 53, 54, 56, 56, 56, 57, 48, 47, 45, 45, 46, 46, 46, + 47, 51, 53, 53, 55, 57, 57, 57, 59, 49, 46, 45, 45, 45, 46, 46, 47, + 51, 53, 53, 56, 58, 58, 59, 61, 49, 46, 45, 45, 45, 46, 46, 47, 51, + 53, 53, 56, 58, 58, 59, 61, 49, 47, 45, 45, 45, 46, 46, 47, 52, 53, + 53, 56, 58, 58, 60, 62, 50, 48, 46, 46, 46, 46, 46, 48, 52, 54, 54, + 57, 59, 59, 61, 63, 50, 48, 46, 46, 46, 46, 46, 48, 52, 54, 54, 57, + 59, 59, 61, 64, 50, 48, 46, 46, 46, 46, 46, 48, 52, 54, 54, 57, 59, + 59, 61, 64, 51, 49, 47, 47, 47, 47, 47, 48, 52, 54, 54, 58, 60, 60, + 62, 65}}, + {{32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 39, 44, 44, 31, + 31, 31, 31, 31, 32, 32, 32, 32, 34, 35, 35, 35, 39, 43, 43, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 34, 35, 35, 35, 38, 41, 41, 31, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 34, 34, 34, 37, 41, 41, 31, 32, 32, 
32, 32, 32, 33, + 33, 33, 33, 34, 34, 34, 37, 41, 41, 31, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 34, 34, 34, 37, 41, 41, 31, 32, 32, 32, 32, 33, 33, 33, 33, + 34, 35, 35, 35, 38, 41, 41, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, + 36, 36, 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, + 36, 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, 36, + 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34, 34, 36, 37, 37, 37, + 40, 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40, + 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42, + 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42, 42, + 33, 33, 33, 33, 33, 34, 36, 36, 36, 38, 40, 40, 40, 42, 45, 45, 34, + 34, 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 34, 34, + 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 34, 34, 34, + 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 35, 34, 34, 34, + 34, 36, 37, 37, 37, 41, 45, 45, 45, 47, 50, 50, 36, 35, 34, 34, 34, + 36, 38, 38, 38, 43, 48, 48, 48, 51, 54, 54, 36, 35, 34, 34, 34, 36, + 38, 38, 38, 43, 48, 48, 48, 51, 54, 54, 36, 35, 34, 34, 34, 36, 38, + 38, 38, 43, 48, 48, 48, 51, 54, 54, 37, 37, 36, 36, 36, 38, 39, 39, + 39, 44, 49, 49, 49, 52, 56, 56, 39, 38, 37, 37, 37, 39, 40, 40, 40, + 45, 50, 50, 50, 54, 58, 58, 39, 38, 37, 37, 37, 39, 40, 40, 40, 45, + 50, 50, 50, 54, 58, 58, 39, 38, 37, 37, 37, 39, 40, 40, 40, 45, 50, + 50, 50, 54, 58, 58, 41, 40, 39, 39, 39, 40, 42, 42, 42, 46, 52, 52, + 52, 56, 60, 60, 44, 42, 41, 41, 41, 42, 43, 43, 43, 48, 53, 53, 53, + 58, 63, 63, 44, 42, 41, 41, 41, 42, 43, 43, 43, 48, 53, 53, 53, 58, + 63, 63}, + {32, 31, 31, 31, 31, 33, 37, 37, 37, 42, 48, 48, 48, 48, 49, 49, 31, + 31, 31, 31, 31, 34, 37, 37, 37, 42, 47, 47, 47, 48, 48, 48, 31, 31, + 31, 31, 31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 31, + 31, 31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, + 31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 32, 32, 32, + 35, 39, 39, 39, 42, 46, 46, 46, 46, 46, 46, 30, 31, 32, 32, 32, 35, + 40, 40, 40, 42, 46, 46, 46, 45, 45, 45, 30, 31, 32, 32, 32, 35, 40, + 40, 40, 42, 46, 46, 46, 45, 45, 45, 30, 31, 32, 32, 32, 35, 40, 40, + 40, 42, 46, 46, 46, 45, 45, 45, 32, 33, 34, 34, 34, 37, 41, 41, 41, + 44, 46, 46, 46, 46, 45, 45, 33, 34, 36, 36, 36, 39, 43, 43, 43, 45, + 47, 47, 47, 46, 46, 46, 33, 34, 36, 36, 36, 39, 43, 43, 43, 45, 47, + 47, 47, 46, 46, 46, 33, 34, 36, 36, 36, 39, 43, 43, 43, 45, 47, 47, + 47, 46, 46, 46, 35, 36, 38, 38, 38, 41, 45, 45, 45, 46, 47, 47, 47, + 46, 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46, + 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46, 45, + 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46, 45, 45, + 39, 40, 41, 41, 41, 44, 47, 47, 47, 48, 49, 49, 49, 48, 47, 47, 42, + 42, 43, 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 42, 42, + 43, 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 42, 42, 43, + 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 45, 45, 44, 44, + 44, 46, 47, 47, 47, 49, 51, 51, 51, 51, 51, 51, 49, 48, 46, 46, 46, + 47, 48, 48, 48, 50, 53, 53, 53, 53, 53, 53, 49, 48, 46, 46, 46, 47, + 48, 48, 48, 50, 53, 53, 53, 53, 53, 53, 49, 48, 46, 46, 46, 47, 48, + 48, 48, 50, 53, 53, 53, 53, 53, 53, 48, 47, 46, 46, 46, 47, 47, 47, + 47, 50, 53, 53, 53, 54, 54, 54, 48, 47, 46, 46, 46, 46, 47, 47, 47, + 50, 53, 53, 53, 54, 56, 56, 48, 47, 46, 46, 46, 46, 47, 47, 47, 50, + 53, 53, 53, 54, 56, 56, 48, 47, 46, 46, 46, 46, 47, 47, 47, 50, 53, + 53, 53, 54, 56, 56, 48, 
47, 45, 45, 45, 46, 46, 46, 46, 49, 53, 53, + 53, 55, 57, 57, 49, 47, 45, 45, 45, 45, 46, 46, 46, 49, 53, 53, 53, + 56, 58, 58, 49, 47, 45, 45, 45, 45, 46, 46, 46, 49, 53, 53, 53, 56, + 58, 58}}, + {{32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, + 34, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, + 35, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, + 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, + 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, + 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 37, 37, 37, + 32, 32, 32, 33, 33, 33, 33, 34, 35, 35, 35, 36, 37, 38, 38, 38, 32, + 32, 32, 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 32, + 32, 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 32, 32, + 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 33, 33, 33, + 33, 33, 34, 35, 36, 36, 36, 37, 39, 40, 40, 40, 33, 33, 33, 33, 33, + 33, 35, 36, 36, 36, 36, 38, 40, 41, 41, 41, 34, 34, 34, 34, 34, 34, + 35, 36, 37, 37, 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, + 36, 37, 37, 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, 36, + 37, 37, 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, 37, 37, + 37, 37, 40, 43, 44, 44, 44, 35, 35, 34, 34, 34, 34, 36, 37, 38, 38, + 38, 41, 45, 47, 47, 47, 36, 35, 35, 34, 34, 34, 36, 37, 38, 38, 38, + 42, 46, 48, 48, 48, 36, 35, 35, 34, 34, 34, 36, 37, 38, 38, 38, 42, + 46, 48, 48, 48, 36, 35, 35, 34, 34, 34, 36, 37, 38, 38, 38, 42, 46, + 48, 48, 48, 37, 36, 36, 36, 36, 36, 37, 38, 39, 39, 39, 42, 46, 49, + 49, 49}, + {32, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 40, 45, 48, 48, 48, 31, + 31, 31, 31, 31, 31, 33, 36, 37, 37, 37, 41, 45, 48, 48, 48, 31, 31, + 31, 31, 31, 31, 34, 36, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, + 31, 31, 31, 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31, + 31, 31, 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31, 31, + 31, 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 32, 32, 32, + 34, 37, 39, 39, 39, 41, 45, 46, 46, 46, 30, 31, 31, 32, 32, 32, 34, + 38, 39, 39, 39, 42, 44, 46, 46, 46, 30, 31, 32, 32, 32, 32, 35, 38, + 40, 40, 40, 42, 44, 46, 46, 46, 30, 31, 32, 32, 32, 32, 35, 38, 40, + 40, 40, 42, 44, 46, 46, 46, 30, 31, 32, 32, 32, 32, 35, 38, 40, 40, + 40, 42, 44, 46, 46, 46, 31, 32, 33, 33, 33, 33, 36, 39, 41, 41, 41, + 43, 45, 46, 46, 46, 33, 34, 34, 35, 35, 35, 37, 40, 42, 42, 42, 44, + 46, 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44, 46, + 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44, 46, 47, + 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44, 46, 47, 47, + 47, 35, 36, 37, 37, 37, 37, 40, 43, 44, 44, 44, 45, 46, 47, 47, 47, + 36, 37, 38, 39, 39, 39, 42, 44, 46, 46, 46, 47, 47, 47, 47, 47, 37, + 38, 39, 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 
47, 47, 37, 38, + 39, 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 37, 38, 39, + 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 39, 39, 40, 41, + 41, 41, 43, 46, 47, 47, 47, 48, 48, 48, 48, 48, 41, 41, 42, 42, 42, + 42, 44, 46, 47, 47, 47, 48, 49, 49, 49, 49, 42, 42, 42, 43, 43, 43, + 44, 46, 47, 47, 47, 48, 49, 50, 50, 50, 42, 42, 42, 43, 43, 43, 44, + 46, 47, 47, 47, 48, 49, 50, 50, 50, 42, 42, 42, 43, 43, 43, 44, 46, + 47, 47, 47, 48, 49, 50, 50, 50, 44, 44, 44, 44, 44, 44, 45, 47, 47, + 47, 47, 49, 50, 51, 51, 51, 47, 46, 46, 46, 46, 46, 46, 47, 48, 48, + 48, 49, 51, 52, 52, 52, 49, 48, 47, 46, 46, 46, 47, 48, 48, 48, 48, + 50, 52, 53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 48, 48, 48, 48, 50, + 52, 53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 48, 48, 48, 48, 50, 52, + 53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 47, 47, 47, 47, 49, 52, 53, + 53, 53}}, + {{32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, + 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, + 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 34, 35, 35, 35, 35, 35, 36, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 32, 32, 32, 32, 33, 33, 33, + 33, 34, 34, 35, 35, 35, 35, 36, 37, 32, 32, 32, 32, 33, 33, 33, 33, + 34, 34, 35, 35, 35, 35, 36, 37, 32, 32, 32, 32, 33, 33, 33, 33, 34, + 34, 35, 35, 35, 35, 36, 37, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, + 35, 35, 35, 35, 36, 37, 32, 33, 33, 33, 33, 33, 33, 33, 34, 35, 36, + 36, 36, 36, 36, 38, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36, + 36, 36, 37, 38, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37, + 37, 38, 39, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37, 37, + 38, 39}, + {32, 31, 31, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 37, 38, 42, 31, + 31, 31, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 37, 39, 42, 31, 31, + 31, 31, 31, 31, 31, 32, 33, 35, 38, 38, 38, 38, 39, 42, 31, 31, 31, + 31, 31, 31, 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, + 31, 31, 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, + 31, 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, + 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, + 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 32, 32, 32, 32, + 34, 36, 
39, 39, 39, 39, 40, 42, 30, 31, 31, 32, 32, 32, 32, 32, 34, + 37, 39, 39, 39, 39, 40, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37, + 40, 40, 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37, 40, + 40, 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37, 40, 40, + 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37, 40, 40, 40, + 40, 41, 42, 31, 31, 32, 32, 33, 33, 33, 33, 35, 38, 40, 40, 40, 40, + 41, 43, 32, 32, 33, 33, 34, 34, 34, 34, 36, 39, 41, 41, 41, 41, 42, + 44, 33, 33, 34, 35, 35, 35, 35, 35, 37, 40, 42, 42, 42, 42, 43, 44, + 33, 34, 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, + 34, 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, 34, + 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, 34, 35, + 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 34, 35, 36, 37, + 37, 37, 37, 37, 39, 42, 44, 44, 44, 44, 45, 45, 35, 36, 37, 38, 38, + 38, 38, 39, 41, 43, 45, 45, 45, 45, 46, 46, 36, 37, 38, 39, 39, 39, + 39, 40, 42, 44, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, + 41, 43, 45, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, + 43, 45, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43, + 45, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43, 45, + 47, 47, 47, 47, 47, 47, 39, 39, 40, 41, 41, 41, 41, 42, 43, 45, 47, + 47, 47, 47, 47, 48, 40, 41, 41, 42, 42, 42, 42, 42, 44, 45, 47, 47, + 47, 47, 47, 48, 42, 42, 42, 43, 43, 43, 43, 43, 44, 46, 47, 47, 47, + 47, 48, 48, 42, 42, 42, 43, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47, + 48, 48}}, + {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 33, 33, + 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 34, 34}, + {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 37, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 37, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 36, 37, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 35, 36, 38, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 34, 35, 36, 38, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, + 34, 36, 37, 39, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, + 36, 37, 39, 30, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 36, + 38, 39, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, + 40, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, + 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 31, 31, 31, 32, + 32, 33, 33, 33, 33, 33, 33, 34, 35, 37, 38, 40, 31, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 35, 36, 37, 39, 41, 32, 32, 33, 33, 34, 34, + 34, 34, 34, 34, 34, 35, 37, 38, 40, 41, 33, 33, 34, 34, 34, 35, 35, + 35, 35, 35, 35, 36, 37, 39, 40, 42, 33, 34, 34, 35, 35, 36, 36, 36, + 36, 36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, + 36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, + 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, + 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, + 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, + 40, 41, 43, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 36, 38, 39, 40, + 42, 44}}, + {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, + 31, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32}, + {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32}}}; +constexpr uint8_t + kQuantizerMatrix4x4[kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes] + [10] = {{{32, 43, 67, 73, 94, 137, 97, 110, 150, 200}, + {35, 46, 60, 57, 69, 90, 66, 71, 90, 109}}, + {{32, 41, 63, 69, 88, 127, 92, 103, 140, 184}, + {33, 45, 58, 56, 66, 86, 64, 69, 87, 105}}, + {{32, 38, 56, 63, 78, 113, 86, 97, 130, 169}, + {32, 45, 55, 53, 62, 80, 63, 67, 84, 101}}, + {{32, 37, 54, 58, 72, 102, 81, 91, 121, 156}, + {32, 45, 54, 51, 59, 75, 61, 65, 81, 97}}, + {{32, 34, 49, 53, 64, 91, 75, 81, 112, 140}, + {32, 46, 53, 49, 55, 70, 58, 62, 78, 91}}, + {{32, 34, 48, 49, 60, 82, 72, 79, 104, 134}, + {32, 46, 53, 47, 54, 66, 57, 60, 75, 89}}, 
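+        // Each [kNumPlaneTypes] pair above holds 10 = 4 * 5 / 2 values
+        // (likewise 36 and 528 in the 8x8 and 32x32 tables below): judging
+        // by these sizes alone, the triangular half of a symmetric
+        // quantizer matrix, which the decoder would presumably mirror into
+        // the full 4x4/8x8/32x32 grid when the tables are loaded.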
+ {{32, 33, 39, 45, 51, 71, 62, 64, 87, 108}, + {31, 42, 48, 47, 50, 61, 53, 54, 67, 78}}, + {{32, 33, 38, 42, 46, 63, 55, 57, 75, 92}, + {31, 41, 48, 46, 48, 58, 51, 51, 62, 71}}, + {{32, 32, 35, 38, 40, 54, 51, 49, 64, 81}, + {31, 38, 47, 47, 46, 54, 49, 46, 57, 66}}, + {{32, 32, 34, 35, 37, 48, 43, 43, 54, 65}, + {31, 37, 44, 47, 47, 53, 47, 45, 53, 59}}, + {{32, 32, 33, 34, 35, 39, 38, 39, 45, 54}, + {31, 34, 39, 42, 45, 48, 47, 46, 49, 54}}, + {{32, 32, 32, 32, 33, 35, 35, 35, 38, 46}, + {31, 32, 34, 38, 41, 47, 46, 46, 47, 52}}, + {{31, 32, 32, 32, 32, 33, 32, 33, 34, 35}, + {31, 31, 32, 34, 35, 39, 38, 40, 43, 47}}, + {{31, 31, 32, 31, 32, 32, 32, 32, 32, 33}, + {31, 31, 31, 31, 31, 32, 34, 35, 35, 39}}, + {{31, 31, 32, 31, 32, 32, 31, 32, 32, 32}, + {31, 31, 31, 31, 31, 31, 31, 31, 31, 31}}}; +constexpr uint8_t kQuantizerMatrix8x8 + [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][36] = { + {{32, 32, 35, 38, 40, 54, 51, 49, 65, 82, 68, 63, + 78, 97, 117, 84, 76, 91, 111, 134, 152, 95, 89, 98, + 113, 138, 159, 183, 109, 102, 106, 121, 142, 168, 199, 220}, + {31, 38, 47, 47, 46, 54, 50, 47, 57, 66, 57, 52, + 61, 72, 82, 63, 57, 66, 77, 88, 96, 67, 62, 67, + 75, 86, 95, 104, 71, 67, 68, 75, 84, 95, 107, 113}}, + {{32, 32, 35, 37, 39, 51, 47, 46, 60, 73, 62, 58, + 71, 87, 105, 78, 72, 84, 100, 121, 140, 90, 84, 93, + 106, 129, 148, 169, 102, 96, 100, 113, 132, 155, 183, 201}, + {31, 38, 47, 47, 47, 53, 48, 46, 55, 62, 54, 50, + 58, 67, 76, 61, 55, 63, 72, 83, 91, 66, 61, 65, + 73, 84, 92, 101, 69, 65, 66, 73, 82, 92, 103, 109}}, + {{32, 32, 34, 35, 37, 48, 46, 45, 56, 70, 57, 54, + 64, 80, 93, 76, 70, 79, 96, 111, 134, 85, 79, 87, + 100, 121, 138, 156, 96, 90, 93, 105, 122, 144, 168, 184}, + {31, 36, 43, 47, 47, 53, 48, 46, 54, 61, 52, 49, + 55, 65, 71, 60, 55, 60, 70, 78, 89, 64, 59, 63, + 71, 81, 89, 97, 67, 63, 64, 71, 79, 89, 99, 104}}, + {{32, 32, 33, 35, 36, 46, 42, 42, 52, 63, 53, 51, + 60, 73, 86, 68, 64, 72, 84, 100, 117, 78, 74, 80, + 92, 109, 128, 140, 90, 84, 87, 98, 114, 133, 155, 168}, + {31, 34, 39, 46, 47, 52, 47, 45, 52, 58, 50, 48, + 54, 62, 68, 57, 53, 58, 65, 73, 82, 61, 57, 61, + 68, 77, 86, 91, 65, 61, 62, 68, 76, 86, 95, 100}}, + {{32, 32, 33, 34, 35, 39, 39, 40, 46, 56, 50, 48, + 53, 65, 78, 62, 59, 63, 75, 90, 105, 76, 71, 74, + 86, 101, 118, 134, 84, 79, 81, 92, 106, 123, 142, 153}, + {31, 34, 39, 42, 45, 48, 47, 46, 49, 55, 49, 47, + 50, 58, 65, 54, 51, 53, 61, 69, 76, 60, 56, 57, + 65, 73, 82, 89, 64, 59, 60, 66, 74, 83, 92, 96}}, + {{32, 32, 33, 34, 35, 39, 38, 39, 45, 54, 46, 45, + 51, 61, 71, 56, 54, 58, 69, 80, 92, 68, 64, 68, + 78, 90, 103, 117, 78, 74, 76, 86, 99, 113, 128, 140}, + {31, 34, 39, 42, 45, 48, 47, 46, 49, 54, 48, 46, + 50, 56, 61, 52, 49, 52, 58, 65, 71, 57, 53, 55, + 61, 68, 75, 82, 61, 57, 58, 64, 71, 79, 86, 91}}, + {{31, 32, 32, 32, 33, 35, 35, 35, 38, 48, 42, 41, + 43, 54, 63, 51, 49, 49, 59, 71, 81, 59, 56, 56, + 66, 77, 89, 98, 69, 65, 64, 73, 85, 97, 108, 119}, + {31, 32, 35, 38, 42, 47, 48, 47, 48, 53, 47, 45, + 45, 53, 58, 50, 47, 47, 54, 61, 66, 53, 50, 49, + 56, 63, 69, 73, 57, 54, 52, 58, 65, 72, 77, 82}}, + {{31, 32, 32, 32, 32, 35, 34, 34, 37, 42, 38, 37, + 40, 47, 54, 46, 44, 45, 52, 60, 69, 52, 49, 49, + 56, 65, 75, 82, 63, 59, 58, 65, 73, 84, 92, 105}, + {31, 31, 32, 38, 40, 47, 44, 44, 47, 50, 47, 45, + 46, 51, 54, 48, 46, 46, 51, 56, 61, 50, 47, 47, + 52, 57, 63, 66, 55, 52, 50, 54, 60, 66, 70, 76}}, + {{31, 32, 32, 32, 32, 34, 34, 33, 35, 39, 35, 34, + 37, 42, 48, 41, 40, 41, 47, 53, 60, 47, 44, 45, + 51, 57, 
65, 71, 53, 50, 51, 55, 61, 70, 77, 85}, + {31, 31, 32, 35, 36, 41, 42, 42, 45, 48, 48, 46, + 47, 50, 53, 47, 45, 45, 49, 53, 57, 49, 46, 46, + 50, 54, 59, 61, 51, 48, 48, 51, 54, 60, 64, 68}}, + {{31, 31, 32, 32, 32, 33, 32, 32, 34, 35, 34, 34, + 35, 37, 41, 37, 36, 38, 39, 45, 51, 43, 41, 42, + 42, 49, 56, 63, 47, 44, 45, 46, 52, 59, 67, 71}, + {31, 31, 32, 34, 35, 39, 37, 40, 43, 47, 43, 43, + 45, 47, 49, 48, 46, 46, 47, 50, 53, 47, 45, 45, + 45, 50, 55, 58, 49, 46, 46, 46, 50, 55, 60, 61}}, + {{31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 33, 33, + 34, 35, 37, 34, 34, 35, 36, 39, 43, 37, 36, 37, + 38, 41, 46, 51, 41, 39, 40, 41, 44, 49, 54, 58}, + {31, 31, 31, 32, 33, 35, 35, 37, 39, 43, 39, 41, + 42, 45, 47, 45, 44, 45, 47, 48, 50, 48, 46, 46, + 47, 48, 51, 53, 48, 46, 45, 46, 47, 51, 54, 56}}, + {{31, 31, 32, 31, 32, 32, 32, 32, 32, 33, 32, 32, + 32, 34, 35, 32, 33, 33, 34, 35, 36, 34, 34, 33, + 35, 36, 38, 39, 35, 35, 34, 36, 38, 40, 42, 48}, + {31, 31, 31, 30, 31, 32, 34, 34, 35, 39, 36, 37, + 39, 42, 46, 39, 40, 41, 44, 47, 47, 42, 42, 42, + 45, 47, 48, 48, 48, 47, 46, 47, 47, 49, 50, 53}}, + {{31, 31, 32, 31, 32, 32, 31, 32, 32, 32, 32, 32, + 32, 32, 33, 32, 32, 32, 32, 33, 34, 32, 32, 32, + 32, 34, 34, 35, 33, 33, 33, 33, 35, 35, 36, 38}, + {31, 31, 31, 31, 31, 31, 30, 31, 31, 32, 34, 34, + 35, 35, 39, 35, 35, 36, 36, 40, 41, 37, 38, 39, + 40, 43, 44, 47, 40, 41, 41, 42, 44, 45, 47, 48}}, + {{31, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, + 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 33, 33}, + {31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, + 31, 31, 32, 31, 32, 32, 32, 32, 33, 33, 34, 34, + 35, 35, 36, 39, 33, 34, 34, 35, 35, 36, 39, 39}}, + {{31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31, + 32, 32, 32, 31, 31, 32, 32, 32, 32, 31, 31, 32, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32}, + {31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31}}}; +constexpr uint8_t kQuantizerMatrix32x32 + [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][528] = { + {{32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 33, + 33, 32, 32, 32, 33, 34, 35, 34, 34, 33, 34, 35, 37, 39, + 35, 34, 34, 35, 36, 37, 41, 43, 36, 35, 34, 35, 36, 38, + 42, 45, 48, 39, 38, 37, 38, 39, 40, 45, 47, 50, 54, 44, + 42, 41, 41, 42, 42, 47, 50, 54, 58, 63, 46, 44, 42, 43, + 44, 44, 49, 52, 55, 59, 65, 67, 48, 46, 44, 45, 45, 46, + 51, 53, 57, 61, 67, 69, 71, 54, 51, 49, 49, 50, 49, 54, + 57, 60, 65, 71, 74, 76, 82, 59, 56, 54, 54, 54, 53, 58, + 61, 64, 69, 75, 78, 80, 87, 92, 62, 59, 56, 56, 56, 55, + 60, 63, 66, 71, 77, 80, 83, 89, 95, 98, 65, 62, 59, 59, + 59, 58, 63, 65, 68, 73, 79, 82, 85, 92, 98, 101, 105, 71, + 68, 65, 64, 64, 63, 68, 70, 73, 78, 84, 87, 90, 97, 103, + 107, 111, 117, 80, 76, 72, 72, 71, 69, 74, 76, 79, 84, 90, + 93, 96, 104, 110, 114, 118, 125, 134, 81, 77, 73, 73, 72, 70, + 75, 77, 80, 85, 91, 94, 97, 105, 111, 115, 119, 126, 135, 137, + 83, 78, 75, 74, 74, 72, 76, 79, 81, 86, 92, 95, 99, 106, + 113, 117, 121, 128, 137, 138, 140, 88, 84, 80, 79, 78, 76, 80, + 82, 85, 91, 95, 98, 103, 111, 115, 119, 126, 134, 139, 144, 147, + 152, 91, 86, 83, 82, 81, 79, 81, 84, 88, 92, 95, 100, 107, + 110, 115, 123, 127, 132, 140, 147, 151, 154, 159, 94, 89, 86, 85, + 84, 82, 82, 86, 90, 92, 97, 103, 105, 111, 119, 121, 128, 136, + 139, 146, 156, 158, 161, 166, 97, 92, 90, 88, 86, 85, 84, 89, + 91, 95, 100, 102, 108, 114, 116, 125, 130, 133, 143, 148, 152, 163, + 
166, 168, 174, 101, 95, 93, 91, 89, 89, 87, 91, 93, 98, 101, + 105, 111, 113, 120, 126, 130, 138, 142, 149, 157, 159, 171, 174, 176, + 183, 104, 99, 97, 94, 93, 93, 90, 92, 96, 100, 102, 108, 111, + 116, 122, 125, 134, 137, 144, 151, 155, 165, 169, 179, 182, 184, 191, + 107, 102, 101, 97, 96, 96, 93, 93, 99, 101, 105, 110, 113, 120, + 122, 129, 133, 140, 146, 150, 161, 163, 173, 178, 187, 191, 193, 200, + 111, 105, 104, 101, 100, 99, 97, 96, 102, 103, 109, 111, 117, 120, + 125, 131, 135, 143, 146, 156, 158, 168, 173, 180, 189, 195, 200, 202, + 210, 115, 109, 108, 104, 104, 102, 101, 100, 103, 106, 111, 113, 119, + 121, 129, 131, 140, 142, 151, 155, 162, 168, 176, 183, 188, 199, 204, + 210, 212, 220, 119, 113, 112, 107, 107, 106, 105, 103, 105, 110, 112, + 117, 120, 125, 130, 135, 140, 145, 152, 157, 165, 169, 179, 183, 193, + 197, 210, 214, 220, 222, 231, 123, 116, 116, 111, 111, 109, 110, 107, + 107, 114, 114, 121, 122, 130, 130, 140, 140, 150, 151, 163, 164, 176, + 177, 190, 191, 204, 206, 222, 224, 230, 232, 242}, + {32, 31, 31, 30, 31, 32, 32, 33, 33, 35, 33, 34, 35, 37, + 39, 36, 38, 40, 41, 43, 47, 41, 42, 42, 43, 45, 47, 48, + 45, 45, 44, 45, 46, 47, 49, 50, 49, 47, 46, 47, 47, 48, + 50, 51, 53, 48, 47, 45, 46, 46, 46, 49, 51, 53, 54, 49, + 47, 45, 45, 45, 45, 49, 51, 53, 55, 58, 50, 47, 45, 46, + 46, 46, 49, 51, 54, 56, 59, 60, 50, 48, 46, 46, 46, 46, + 50, 52, 54, 56, 60, 60, 61, 52, 50, 47, 47, 47, 47, 50, + 52, 54, 57, 61, 62, 63, 66, 54, 52, 49, 49, 49, 48, 52, + 53, 55, 58, 62, 64, 65, 68, 71, 56, 53, 51, 50, 50, 49, + 52, 54, 56, 59, 63, 64, 66, 69, 72, 73, 57, 54, 52, 51, + 51, 50, 53, 55, 56, 60, 63, 65, 67, 70, 73, 75, 76, 60, + 57, 54, 54, 53, 52, 55, 57, 58, 61, 65, 67, 68, 72, 75, + 77, 79, 82, 63, 60, 57, 57, 56, 54, 57, 59, 60, 63, 67, + 69, 71, 75, 78, 80, 82, 85, 89, 64, 61, 58, 57, 57, 55, + 58, 59, 61, 64, 67, 69, 71, 75, 78, 80, 82, 85, 89, 90, + 65, 61, 58, 58, 57, 55, 58, 60, 61, 64, 68, 70, 71, 75, + 79, 81, 83, 86, 90, 91, 91, 67, 63, 61, 60, 59, 57, 60, + 61, 63, 66, 69, 70, 73, 77, 79, 81, 85, 88, 90, 92, 94, + 96, 68, 64, 62, 61, 60, 58, 59, 61, 64, 66, 67, 71, 74, + 75, 78, 82, 84, 86, 90, 93, 94, 96, 98, 69, 65, 63, 62, + 61, 59, 59, 62, 64, 65, 68, 71, 72, 75, 79, 80, 83, 87, + 89, 92, 96, 97, 98, 100, 70, 66, 64, 63, 62, 61, 60, 63, + 64, 66, 69, 70, 73, 76, 77, 81, 84, 85, 89, 92, 93, 98, + 99, 100, 102, 71, 67, 66, 64, 63, 62, 61, 63, 64, 67, 68, + 70, 74, 75, 78, 81, 83, 86, 88, 91, 94, 95, 100, 101, 102, + 104, 72, 68, 67, 65, 64, 64, 61, 63, 65, 67, 68, 71, 73, + 75, 78, 79, 84, 85, 88, 91, 93, 97, 98, 102, 103, 104, 106, + 73, 69, 68, 66, 65, 65, 63, 63, 66, 67, 69, 71, 73, 76, + 77, 81, 82, 85, 88, 90, 94, 95, 99, 101, 104, 105, 106, 109, + 74, 70, 70, 67, 66, 66, 64, 63, 66, 67, 70, 71, 74, 75, + 78, 80, 82, 86, 87, 91, 92, 96, 98, 101, 104, 106, 108, 108, + 111, 75, 71, 71, 68, 68, 67, 66, 64, 66, 68, 70, 71, 74, + 75, 79, 79, 84, 84, 88, 90, 93, 95, 98, 101, 103, 107, 108, + 110, 111, 113, 76, 72, 72, 69, 69, 68, 67, 65, 66, 69, 70, + 72, 74, 76, 78, 81, 83, 85, 88, 90, 93, 95, 98, 100, 104, + 105, 109, 111, 112, 113, 116, 78, 74, 74, 70, 70, 69, 69, 66, + 66, 70, 70, 74, 74, 77, 78, 82, 82, 86, 87, 92, 92, 96, + 97, 102, 102, 107, 107, 112, 113, 115, 115, 118}}, + {{32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + 33, 32, 32, 32, 33, 34, 35, 32, 33, 33, 33, 34, 36, 36, + 34, 34, 33, 34, 35, 37, 38, 39, 36, 35, 34, 35, 36, 38, + 40, 42, 48, 38, 37, 36, 36, 38, 39, 41, 44, 50, 51, 39, + 38, 37, 38, 39, 40, 42, 45, 50, 52, 54, 44, 42, 
41, 41, + 42, 42, 44, 47, 54, 56, 58, 63, 47, 45, 44, 44, 45, 45, + 47, 50, 56, 58, 60, 66, 69, 49, 47, 46, 45, 46, 46, 48, + 51, 57, 60, 62, 68, 71, 73, 54, 51, 50, 49, 50, 49, 51, + 54, 60, 63, 65, 71, 75, 77, 82, 59, 56, 54, 54, 54, 53, + 55, 58, 64, 67, 69, 75, 79, 81, 87, 92, 61, 58, 56, 56, + 56, 55, 57, 60, 65, 68, 70, 77, 81, 83, 89, 94, 97, 65, + 62, 60, 59, 59, 58, 60, 63, 68, 71, 73, 79, 84, 87, 92, + 98, 101, 105, 71, 68, 65, 65, 64, 63, 65, 68, 73, 76, 78, + 84, 89, 92, 97, 103, 106, 111, 117, 76, 72, 70, 69, 68, 66, + 68, 71, 76, 79, 81, 88, 92, 95, 101, 107, 110, 115, 122, 127, + 80, 76, 73, 72, 71, 69, 71, 74, 79, 82, 84, 90, 95, 98, + 104, 110, 113, 118, 125, 130, 134, 83, 78, 76, 75, 74, 72, 73, + 76, 81, 84, 86, 92, 97, 100, 106, 113, 116, 121, 128, 133, 137, + 140, 86, 82, 79, 78, 77, 74, 76, 79, 84, 87, 89, 95, 100, + 103, 109, 116, 119, 124, 131, 136, 140, 144, 147, 89, 85, 82, 81, + 79, 78, 78, 82, 86, 87, 92, 97, 100, 105, 112, 114, 120, 128, + 131, 136, 146, 147, 150, 155, 92, 88, 85, 84, 82, 81, 80, 85, + 86, 90, 95, 97, 102, 107, 110, 117, 122, 125, 134, 138, 142, 152, + 154, 156, 162, 95, 90, 88, 86, 85, 84, 82, 86, 88, 93, 95, + 99, 105, 106, 113, 118, 121, 129, 132, 139, 146, 148, 159, 161, 163, + 169, 98, 93, 91, 89, 88, 87, 85, 87, 90, 94, 96, 102, 104, + 109, 114, 117, 126, 128, 134, 141, 145, 154, 157, 166, 168, 170, 176, + 101, 96, 95, 92, 91, 90, 88, 88, 93, 95, 99, 103, 106, 112, + 114, 121, 124, 131, 136, 140, 149, 151, 160, 165, 173, 176, 178, 184, + 104, 99, 98, 95, 94, 93, 91, 90, 95, 96, 102, 103, 109, 112, + 117, 122, 125, 133, 136, 145, 146, 156, 160, 167, 174, 180, 184, 186, + 193, 108, 102, 101, 98, 97, 96, 95, 93, 97, 100, 104, 106, 111, + 113, 121, 122, 130, 132, 140, 143, 150, 155, 162, 169, 174, 183, 188, + 192, 194, 201, 111, 105, 105, 101, 100, 99, 98, 96, 98, 103, 105, + 109, 112, 117, 121, 125, 130, 135, 141, 146, 152, 156, 165, 169, 178, + 181, 193, 196, 201, 202, 210, 114, 109, 109, 104, 104, 102, 102, 99, + 100, 106, 106, 113, 113, 120, 121, 129, 130, 139, 140, 151, 151, 162, + 162, 175, 176, 187, 188, 203, 204, 210, 211, 219}, + {32, 31, 31, 30, 31, 31, 31, 32, 32, 33, 33, 34, 35, 36, 39, + 36, 38, 39, 40, 43, 47, 38, 40, 41, 41, 44, 47, 47, 41, 42, + 42, 43, 45, 47, 48, 48, 49, 47, 46, 46, 47, 48, 49, 50, 53, + 49, 47, 46, 46, 46, 47, 48, 50, 53, 53, 48, 47, 46, 45, 46, + 46, 48, 49, 53, 54, 54, 49, 47, 45, 45, 45, 45, 47, 49, 53, + 55, 55, 58, 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59, + 61, 51, 48, 47, 46, 47, 46, 47, 50, 54, 55, 56, 60, 61, 62, + 52, 50, 48, 47, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, 66, + 54, 52, 50, 49, 49, 48, 49, 52, 55, 57, 58, 62, 64, 66, 68, + 71, 55, 53, 51, 50, 50, 49, 50, 52, 56, 58, 59, 63, 65, 66, + 69, 72, 73, 57, 54, 52, 51, 51, 50, 51, 53, 56, 58, 60, 63, + 66, 67, 70, 73, 74, 76, 60, 57, 55, 54, 53, 52, 53, 55, 58, + 60, 61, 65, 68, 69, 72, 75, 77, 79, 82, 62, 59, 57, 56, 55, + 53, 54, 56, 59, 61, 63, 66, 69, 70, 74, 77, 78, 80, 84, 86, + 63, 60, 58, 57, 56, 54, 55, 57, 60, 62, 63, 67, 70, 71, 75, + 78, 79, 82, 85, 87, 89, 65, 61, 59, 58, 57, 55, 56, 58, 61, + 63, 64, 68, 71, 72, 75, 79, 80, 83, 86, 88, 90, 91, 66, 63, + 60, 59, 58, 56, 58, 59, 62, 64, 65, 69, 72, 73, 76, 80, 81, + 84, 87, 90, 91, 93, 94, 67, 64, 62, 61, 59, 58, 58, 60, 63, + 64, 66, 69, 71, 73, 77, 78, 81, 85, 86, 89, 93, 94, 95, 97, + 68, 65, 63, 62, 60, 59, 58, 61, 62, 64, 67, 68, 71, 74, 75, + 79, 81, 83, 87, 89, 91, 95, 96, 97, 99, 69, 66, 64, 63, 61, + 61, 59, 61, 62, 65, 66, 68, 72, 73, 76, 78, 80, 84, 85, 88, + 91, 
92, 97, 98, 98, 101, 70, 67, 65, 63, 62, 62, 60, 61, 63, + 65, 66, 69, 71, 73, 76, 77, 81, 83, 85, 88, 90, 94, 95, 99, + 100, 100, 103, 71, 67, 67, 64, 63, 63, 61, 61, 64, 65, 67, 69, + 71, 74, 75, 78, 80, 83, 85, 87, 91, 92, 95, 97, 100, 102, 102, + 105, 72, 68, 68, 65, 65, 64, 62, 62, 64, 65, 68, 69, 72, 73, + 76, 78, 80, 83, 84, 88, 89, 93, 95, 97, 100, 102, 104, 104, 107, + 73, 69, 69, 66, 66, 65, 64, 63, 64, 66, 68, 69, 72, 73, 77, + 77, 81, 82, 86, 87, 90, 92, 95, 97, 99, 103, 104, 106, 106, 109, + 74, 70, 70, 67, 67, 66, 65, 63, 64, 67, 68, 70, 72, 74, 76, + 78, 80, 82, 85, 87, 90, 91, 95, 96, 100, 101, 105, 106, 108, 108, + 111, 75, 71, 71, 68, 68, 66, 66, 64, 64, 68, 68, 71, 71, 75, + 75, 79, 79, 83, 84, 88, 89, 93, 93, 98, 98, 102, 103, 108, 108, + 110, 110, 113}}, + {{32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + 33, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 34, 34, 35, + 34, 34, 33, 33, 35, 36, 37, 39, 34, 34, 34, 34, 36, 36, + 37, 41, 42, 36, 35, 34, 34, 36, 37, 38, 42, 45, 48, 39, + 38, 38, 37, 39, 40, 40, 45, 47, 50, 54, 41, 39, 39, 38, + 40, 40, 41, 46, 48, 51, 55, 56, 44, 42, 41, 41, 42, 42, + 42, 47, 50, 54, 58, 59, 63, 48, 46, 45, 44, 45, 45, 45, + 50, 53, 56, 61, 62, 66, 70, 49, 47, 46, 45, 46, 46, 46, + 51, 53, 57, 62, 63, 68, 71, 73, 54, 51, 50, 49, 50, 49, + 49, 54, 56, 60, 65, 67, 71, 76, 77, 82, 58, 55, 54, 53, + 53, 53, 52, 57, 59, 63, 68, 70, 74, 79, 81, 86, 90, 59, + 57, 55, 54, 54, 54, 54, 59, 61, 64, 69, 71, 75, 80, 82, + 87, 91, 93, 65, 62, 60, 59, 59, 58, 58, 63, 65, 68, 73, + 75, 79, 85, 87, 92, 97, 99, 105, 69, 66, 64, 63, 63, 62, + 61, 66, 68, 71, 76, 78, 83, 88, 90, 96, 100, 102, 109, 113, + 71, 68, 66, 65, 64, 63, 63, 68, 70, 73, 78, 80, 84, 90, + 92, 97, 102, 104, 111, 115, 117, 80, 76, 73, 72, 71, 70, 69, + 74, 76, 79, 84, 86, 90, 96, 98, 104, 109, 111, 118, 123, 125, + 134, 81, 77, 75, 74, 73, 72, 71, 75, 77, 80, 85, 87, 91, + 97, 99, 105, 110, 112, 120, 125, 127, 136, 137, 83, 78, 76, 75, + 74, 73, 72, 76, 78, 81, 86, 88, 92, 98, 100, 106, 111, 113, + 121, 126, 128, 137, 139, 140, 87, 83, 81, 79, 78, 77, 75, 80, + 82, 85, 90, 91, 96, 101, 103, 110, 114, 117, 125, 129, 133, 142, + 143, 145, 150, 90, 85, 83, 81, 80, 79, 78, 81, 83, 87, 89, + 93, 98, 100, 106, 110, 114, 121, 124, 130, 136, 138, 148, 149, 151, + 156, 93, 88, 86, 84, 83, 82, 80, 82, 85, 89, 90, 96, 98, + 102, 107, 109, 118, 120, 125, 131, 134, 143, 145, 153, 156, 157, 163, + 95, 90, 89, 86, 85, 85, 83, 83, 88, 89, 93, 97, 99, 105, + 106, 113, 116, 122, 127, 130, 139, 140, 148, 153, 159, 162, 164, 169, + 98, 93, 92, 89, 88, 87, 86, 85, 89, 90, 96, 97, 102, 105, + 109, 114, 117, 124, 126, 134, 136, 144, 148, 154, 160, 166, 169, 170, + 176, 101, 96, 95, 91, 91, 90, 89, 87, 90, 93, 97, 99, 104, + 105, 112, 113, 121, 122, 130, 133, 139, 144, 150, 155, 160, 168, 172, + 176, 177, 184, 104, 99, 98, 94, 94, 92, 92, 90, 92, 96, 98, + 102, 104, 109, 112, 116, 121, 125, 130, 135, 141, 144, 152, 155, 163, + 166, 177, 179, 184, 185, 191, 107, 101, 101, 97, 97, 95, 95, 93, + 93, 99, 99, 105, 105, 112, 112, 120, 120, 129, 129, 139, 140, 149, + 149, 161, 161, 172, 172, 185, 186, 191, 192, 199}, + {32, 31, 31, 30, 31, 31, 30, 31, 31, 32, 33, 34, 35, 35, 39, + 35, 36, 37, 37, 41, 43, 36, 38, 39, 40, 43, 45, 47, 41, 42, + 42, 42, 45, 46, 47, 48, 44, 44, 44, 44, 46, 46, 47, 49, 50, + 49, 47, 47, 46, 47, 47, 48, 50, 51, 53, 48, 47, 46, 45, 46, + 46, 46, 49, 51, 53, 54, 48, 47, 46, 45, 46, 46, 46, 49, 51, + 53, 54, 55, 49, 47, 46, 45, 45, 45, 45, 49, 51, 53, 55, 56, + 58, 50, 48, 47, 46, 46, 46, 46, 
50, 51, 54, 56, 57, 59, 61, + 51, 48, 47, 46, 47, 46, 46, 50, 51, 54, 56, 57, 60, 62, 62, + 52, 50, 48, 47, 47, 47, 47, 50, 52, 54, 57, 58, 61, 63, 64, + 66, 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 64, + 65, 68, 70, 55, 52, 51, 50, 49, 49, 48, 52, 53, 55, 59, 60, + 62, 65, 66, 68, 70, 71, 57, 54, 53, 52, 51, 50, 50, 53, 54, + 56, 60, 61, 63, 66, 67, 70, 73, 73, 76, 59, 56, 54, 53, 53, + 52, 51, 54, 56, 58, 61, 62, 65, 68, 69, 72, 74, 75, 78, 80, + 60, 57, 55, 54, 53, 53, 52, 55, 56, 58, 61, 63, 65, 68, 69, + 72, 75, 76, 79, 81, 82, 63, 60, 58, 57, 56, 55, 54, 57, 59, + 60, 63, 65, 67, 70, 71, 75, 77, 78, 82, 84, 85, 89, 64, 61, + 59, 58, 57, 56, 55, 58, 59, 61, 64, 65, 68, 71, 72, 75, 78, + 79, 82, 85, 86, 89, 90, 65, 61, 60, 58, 57, 56, 55, 58, 59, + 61, 64, 65, 68, 71, 72, 75, 78, 79, 83, 85, 86, 90, 91, 91, + 67, 63, 61, 60, 59, 58, 57, 60, 61, 63, 65, 66, 69, 72, 73, + 77, 79, 80, 84, 86, 88, 92, 93, 93, 95, 68, 64, 63, 61, 60, + 59, 58, 60, 61, 63, 65, 67, 70, 71, 74, 76, 78, 81, 83, 86, + 88, 89, 94, 94, 95, 97, 68, 65, 64, 62, 61, 60, 58, 59, 61, + 64, 64, 68, 69, 71, 74, 75, 79, 80, 83, 86, 87, 91, 92, 95, + 96, 97, 99, 69, 66, 65, 63, 62, 61, 59, 59, 62, 63, 65, 67, + 69, 72, 72, 76, 78, 80, 83, 84, 88, 89, 92, 94, 97, 98, 99, + 101, 70, 67, 66, 63, 63, 62, 61, 60, 63, 63, 66, 67, 69, 71, + 73, 76, 77, 81, 82, 85, 86, 90, 91, 94, 96, 99, 100, 100, 103, + 71, 67, 67, 64, 64, 63, 62, 61, 62, 64, 66, 67, 70, 71, 74, + 74, 78, 79, 83, 84, 87, 89, 91, 94, 95, 99, 100, 102, 102, 104, + 72, 68, 68, 65, 65, 64, 63, 61, 62, 65, 66, 68, 69, 71, 73, + 75, 77, 79, 82, 84, 87, 88, 92, 93, 96, 97, 101, 102, 104, 104, + 106, 73, 69, 69, 66, 66, 64, 64, 62, 62, 66, 66, 69, 69, 72, + 73, 76, 77, 81, 81, 85, 85, 89, 90, 94, 94, 99, 99, 104, 104, + 106, 106, 108}}, + {{32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + 33, 31, 32, 32, 32, 33, 33, 32, 32, 32, 32, 33, 34, 35, + 32, 33, 33, 33, 34, 34, 36, 36, 34, 34, 34, 33, 35, 35, + 37, 38, 39, 35, 35, 34, 34, 36, 36, 38, 39, 42, 46, 36, + 35, 35, 34, 36, 36, 38, 40, 42, 47, 48, 39, 38, 38, 37, + 39, 39, 40, 42, 45, 49, 50, 54, 41, 40, 39, 38, 40, 40, + 41, 43, 46, 50, 52, 55, 57, 44, 42, 42, 41, 42, 42, 42, + 44, 47, 52, 54, 58, 60, 63, 47, 45, 45, 44, 44, 45, 45, + 47, 50, 55, 56, 60, 62, 66, 69, 48, 46, 45, 44, 45, 45, + 46, 47, 51, 55, 57, 61, 63, 67, 70, 71, 54, 51, 50, 49, + 49, 50, 49, 51, 54, 59, 60, 65, 67, 71, 75, 76, 82, 56, + 53, 52, 51, 51, 51, 51, 53, 56, 60, 61, 66, 69, 73, 77, + 78, 84, 86, 59, 56, 55, 54, 54, 54, 53, 55, 58, 62, 64, + 69, 71, 75, 79, 80, 87, 89, 92, 64, 61, 60, 58, 58, 58, + 57, 59, 62, 66, 67, 72, 75, 79, 83, 84, 91, 93, 97, 102, + 65, 62, 61, 59, 59, 59, 58, 60, 63, 67, 68, 73, 75, 79, + 84, 85, 92, 94, 98, 103, 105, 71, 68, 67, 65, 64, 64, 63, + 65, 68, 72, 73, 78, 80, 84, 89, 90, 97, 100, 103, 109, 111, + 117, 74, 71, 69, 68, 67, 67, 65, 67, 70, 74, 75, 80, 83, + 86, 91, 93, 100, 102, 106, 112, 114, 120, 123, 80, 76, 74, 72, + 71, 71, 69, 71, 74, 78, 79, 84, 86, 90, 95, 96, 104, 106, + 110, 116, 118, 125, 128, 134, 82, 78, 76, 74, 73, 73, 71, 73, + 76, 79, 80, 86, 88, 92, 97, 98, 106, 108, 112, 118, 120, 127, + 131, 136, 139, 83, 78, 77, 75, 74, 74, 72, 73, 76, 80, 81, + 86, 89, 92, 97, 99, 106, 109, 113, 119, 121, 128, 131, 137, 139, + 140, 87, 83, 81, 79, 78, 78, 75, 77, 80, 83, 85, 90, 92, + 96, 100, 102, 110, 112, 117, 122, 125, 133, 135, 142, 144, 145, 150, + 90, 85, 84, 81, 80, 80, 78, 78, 82, 84, 87, 91, 93, 98, + 99, 106, 108, 113, 118, 121, 129, 130, 137, 141, 147, 150, 151, 156, + 
92, 88, 87, 84, 83, 82, 80, 80, 84, 85, 90, 91, 95, 98, + 102, 106, 109, 115, 117, 125, 126, 134, 137, 142, 148, 152, 155, 156, + 162, 95, 90, 89, 86, 85, 84, 83, 82, 85, 87, 91, 92, 97, + 98, 105, 105, 112, 114, 121, 123, 129, 133, 138, 143, 147, 155, 158, + 161, 162, 168, 97, 92, 92, 88, 88, 86, 86, 84, 85, 90, 91, + 95, 97, 101, 104, 108, 112, 116, 121, 125, 130, 133, 140, 143, 150, + 152, 162, 164, 168, 168, 174, 100, 95, 95, 90, 90, 89, 89, 86, + 86, 92, 92, 97, 98, 104, 104, 111, 111, 119, 119, 128, 129, 137, + 137, 147, 148, 157, 158, 169, 170, 174, 175, 181}, + {32, 31, 31, 31, 31, 31, 30, 31, 31, 32, 33, 34, 34, 34, 37, + 33, 34, 35, 35, 38, 39, 36, 38, 39, 40, 42, 43, 47, 38, 40, + 40, 41, 43, 44, 47, 47, 41, 42, 42, 42, 44, 45, 47, 48, 48, + 47, 46, 46, 45, 46, 47, 47, 48, 50, 52, 49, 47, 47, 46, 47, + 47, 48, 49, 50, 52, 53, 48, 47, 46, 45, 46, 46, 46, 48, 49, + 52, 53, 54, 49, 47, 46, 45, 46, 46, 46, 47, 49, 52, 53, 55, + 55, 49, 47, 46, 45, 45, 45, 45, 47, 49, 52, 53, 55, 57, 58, + 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59, 61, + 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 58, 60, 61, + 61, 52, 50, 49, 47, 47, 47, 47, 48, 50, 53, 54, 57, 59, 61, + 63, 63, 66, 53, 50, 50, 48, 48, 48, 47, 49, 51, 54, 55, 58, + 59, 62, 64, 64, 67, 68, 54, 52, 51, 49, 49, 49, 48, 49, 52, + 55, 55, 58, 60, 62, 64, 65, 68, 69, 71, 56, 54, 53, 51, 51, + 51, 49, 51, 53, 55, 56, 59, 61, 63, 66, 66, 70, 71, 73, 75, + 57, 54, 53, 52, 51, 51, 50, 51, 53, 56, 56, 60, 61, 63, 66, + 67, 70, 71, 73, 76, 76, 60, 57, 56, 54, 53, 53, 52, 53, 55, + 58, 58, 61, 63, 65, 68, 68, 72, 73, 75, 78, 79, 82, 61, 58, + 57, 55, 55, 54, 53, 54, 56, 58, 59, 62, 64, 66, 69, 69, 73, + 74, 76, 79, 80, 83, 84, 63, 60, 59, 57, 56, 56, 54, 55, 57, + 60, 60, 63, 65, 67, 70, 71, 75, 76, 78, 81, 82, 85, 86, 89, + 64, 61, 60, 58, 57, 57, 55, 56, 58, 60, 61, 64, 66, 68, 70, + 71, 75, 77, 79, 82, 82, 86, 87, 90, 91, 65, 61, 60, 58, 57, + 57, 55, 56, 58, 61, 61, 64, 66, 68, 71, 71, 75, 77, 79, 82, + 83, 86, 88, 90, 91, 91, 67, 63, 62, 60, 59, 59, 57, 58, 60, + 62, 63, 66, 67, 69, 72, 73, 77, 78, 80, 83, 84, 88, 89, 92, + 93, 93, 95, 67, 64, 63, 61, 60, 60, 58, 58, 61, 61, 63, 65, + 67, 70, 70, 74, 75, 78, 80, 81, 85, 86, 89, 91, 93, 94, 95, + 97, 68, 65, 64, 62, 61, 60, 59, 58, 61, 61, 64, 65, 67, 69, + 71, 73, 75, 78, 79, 83, 83, 87, 88, 91, 93, 95, 96, 97, 99, + 69, 65, 65, 62, 62, 61, 60, 59, 61, 62, 64, 65, 68, 68, 72, + 72, 76, 76, 80, 81, 84, 86, 88, 90, 92, 95, 96, 98, 98, 100, + 70, 66, 66, 63, 63, 62, 61, 60, 60, 63, 64, 66, 67, 69, 71, + 73, 75, 77, 79, 81, 84, 85, 88, 89, 93, 93, 97, 98, 100, 100, + 102, 71, 67, 67, 64, 64, 62, 62, 60, 60, 64, 64, 67, 67, 70, + 70, 74, 74, 78, 78, 82, 82, 86, 86, 91, 91, 95, 95, 100, 100, + 101, 101, 104}}, + {{32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + 32, 31, 32, 32, 32, 33, 33, 32, 32, 32, 32, 33, 33, 34, + 32, 32, 32, 32, 33, 34, 35, 35, 33, 33, 33, 33, 34, 35, + 36, 36, 38, 34, 34, 34, 33, 34, 35, 36, 37, 39, 39, 36, + 35, 35, 34, 35, 36, 37, 38, 42, 42, 48, 36, 35, 35, 34, + 35, 36, 38, 38, 42, 43, 48, 49, 39, 38, 38, 37, 38, 39, + 40, 40, 44, 45, 50, 51, 54, 41, 39, 39, 38, 39, 40, 40, + 41, 45, 46, 51, 52, 55, 56, 44, 42, 42, 41, 41, 42, 42, + 42, 46, 47, 54, 54, 58, 59, 63, 46, 44, 44, 42, 43, 44, + 44, 44, 48, 49, 55, 55, 59, 61, 65, 67, 48, 46, 46, 44, + 45, 45, 45, 46, 50, 51, 57, 57, 61, 63, 67, 69, 71, 52, + 50, 49, 48, 48, 48, 48, 48, 52, 53, 59, 59, 64, 65, 70, + 72, 74, 78, 54, 51, 51, 49, 49, 50, 49, 49, 53, 54, 60, + 60, 65, 67, 71, 74, 76, 
80, 82, 58, 56, 55, 53, 53, 53, + 53, 53, 57, 58, 63, 64, 68, 70, 75, 77, 80, 84, 86, 91, + 59, 56, 56, 54, 54, 54, 53, 53, 57, 58, 64, 64, 69, 70, + 75, 78, 80, 85, 87, 91, 92, 65, 62, 61, 59, 59, 59, 58, + 58, 62, 63, 68, 68, 73, 75, 79, 82, 85, 90, 92, 97, 98, + 105, 66, 63, 63, 60, 60, 60, 59, 59, 63, 64, 69, 69, 74, + 76, 80, 83, 86, 91, 93, 98, 99, 106, 107, 71, 68, 67, 65, + 65, 64, 63, 63, 67, 68, 73, 73, 78, 80, 84, 87, 90, 95, + 97, 103, 103, 111, 112, 117, 74, 71, 70, 68, 67, 67, 66, 65, + 69, 70, 75, 75, 80, 82, 86, 89, 93, 97, 100, 105, 106, 114, + 115, 120, 123, 80, 76, 75, 72, 72, 71, 70, 69, 73, 74, 79, + 79, 84, 86, 90, 93, 96, 101, 104, 110, 110, 118, 119, 125, 128, + 134, 81, 77, 77, 74, 73, 73, 71, 71, 74, 75, 80, 80, 85, + 87, 91, 94, 98, 103, 105, 111, 112, 120, 121, 127, 130, 136, 137, + 83, 78, 78, 75, 74, 74, 72, 72, 75, 76, 81, 81, 86, 88, + 92, 95, 99, 104, 106, 112, 113, 121, 122, 128, 131, 137, 139, 140, + 86, 82, 81, 78, 77, 77, 75, 74, 78, 79, 84, 84, 89, 91, + 95, 98, 101, 106, 109, 115, 116, 124, 125, 131, 135, 140, 142, 144, + 147, 89, 84, 84, 80, 80, 79, 78, 77, 79, 81, 85, 86, 91, + 92, 97, 98, 104, 106, 112, 114, 119, 123, 128, 132, 135, 142, 145, + 148, 149, 153, 91, 86, 86, 82, 82, 81, 80, 79, 80, 84, 85, + 88, 91, 94, 97, 100, 104, 107, 112, 115, 120, 123, 129, 132, 138, + 140, 148, 150, 153, 154, 159, 93, 88, 88, 84, 84, 83, 83, 80, + 81, 86, 86, 91, 91, 96, 97, 103, 103, 110, 110, 118, 119, 126, + 126, 135, 136, 144, 144, 155, 155, 159, 159, 164}, + {32, 31, 31, 31, 31, 31, 30, 31, 31, 32, 31, 32, 32, 33, 34, 33, 34, + 35, 35, 37, 39, 35, 37, 37, 38, 39, 41, 44, 36, 38, 39, 40, 41, 43, + 46, 47, 40, 41, 41, 42, 43, 44, 46, 47, 48, 41, 42, 42, 42, 43, 45, + 46, 47, 48, 48, 49, 47, 47, 46, 46, 47, 47, 48, 50, 50, 53, 49, 47, + 47, 46, 46, 47, 47, 47, 49, 50, 53, 53, 48, 47, 47, 45, 46, 46, 46, + 46, 49, 49, 53, 53, 54, 48, 47, 46, 45, 45, 46, 46, 46, 49, 49, 53, + 53, 54, 55, 49, 47, 46, 45, 45, 45, 45, 45, 48, 49, 53, 54, 55, 56, + 58, 50, 47, 47, 45, 46, 46, 46, 46, 49, 49, 54, 54, 56, 57, 59, 60, + 50, 48, 48, 46, 46, 46, 46, 46, 49, 50, 54, 54, 56, 57, 60, 60, 61, + 52, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, 61, 62, 63, + 65, 52, 50, 49, 47, 47, 47, 47, 47, 49, 50, 54, 54, 57, 58, 61, 62, + 63, 65, 66, 54, 52, 51, 49, 49, 49, 48, 48, 51, 52, 55, 55, 58, 59, + 62, 63, 65, 67, 68, 70, 54, 52, 51, 49, 49, 49, 48, 48, 51, 52, 55, + 56, 58, 60, 62, 64, 65, 67, 68, 70, 71, 57, 54, 54, 52, 51, 51, 50, + 50, 52, 53, 56, 57, 60, 61, 63, 65, 67, 69, 70, 73, 73, 76, 57, 55, + 54, 52, 52, 51, 51, 50, 53, 53, 57, 57, 60, 61, 64, 65, 67, 70, 71, + 73, 74, 77, 77, 60, 57, 56, 54, 54, 53, 52, 52, 54, 55, 58, 59, 61, + 63, 65, 67, 68, 71, 72, 75, 75, 79, 79, 82, 61, 58, 57, 55, 55, 54, + 53, 53, 55, 56, 59, 59, 62, 63, 66, 68, 69, 72, 73, 76, 76, 80, 80, + 83, 84, 63, 60, 59, 57, 57, 56, 55, 54, 57, 57, 60, 61, 63, 65, 67, + 69, 71, 73, 75, 78, 78, 82, 82, 85, 86, 89, 64, 61, 60, 58, 57, 57, + 56, 55, 57, 58, 61, 61, 64, 65, 68, 69, 71, 74, 75, 78, 78, 82, 83, + 86, 87, 89, 90, 65, 61, 61, 58, 58, 57, 56, 55, 58, 58, 61, 62, 64, + 65, 68, 70, 71, 74, 75, 78, 79, 83, 83, 86, 88, 90, 91, 91, 66, 63, + 62, 60, 59, 58, 57, 56, 59, 59, 62, 63, 65, 66, 69, 70, 72, 75, 76, + 79, 80, 84, 84, 87, 89, 91, 92, 93, 94, 67, 64, 63, 61, 60, 59, 58, + 57, 59, 60, 62, 63, 66, 66, 70, 70, 73, 74, 77, 78, 81, 83, 85, 87, + 89, 92, 93, 94, 94, 96, 68, 64, 64, 61, 61, 60, 59, 58, 59, 61, 62, + 64, 65, 67, 69, 71, 72, 74, 77, 78, 81, 82, 85, 86, 89, 90, 94, 94, + 96, 96, 
98, 69, 65, 65, 62, 62, 61, 61, 58, 59, 62, 62, 65, 65, 68, + 68, 71, 71, 75, 75, 79, 79, 83, 83, 87, 87, 91, 91, 96, 96, 97, 97, + 99}}, + {{32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + 32, 31, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 34, 34, 35, 32, 32, 32, 32, 32, 34, + 34, 35, 35, 34, 34, 34, 33, 33, 35, 35, 37, 37, 39, 34, + 34, 34, 33, 33, 35, 35, 37, 37, 39, 39, 36, 35, 35, 34, + 34, 36, 36, 38, 38, 42, 42, 48, 36, 35, 35, 34, 34, 36, + 36, 38, 38, 42, 42, 48, 48, 39, 38, 38, 37, 37, 39, 39, + 40, 40, 45, 45, 50, 50, 54, 39, 38, 38, 37, 37, 39, 39, + 40, 40, 45, 45, 50, 50, 54, 54, 44, 42, 42, 41, 41, 42, + 42, 42, 42, 47, 47, 54, 54, 58, 58, 63, 44, 42, 42, 41, + 41, 42, 42, 42, 42, 47, 47, 54, 54, 58, 58, 63, 63, 48, + 46, 46, 44, 44, 45, 45, 46, 46, 51, 51, 57, 57, 61, 61, + 67, 67, 71, 48, 46, 46, 44, 44, 45, 45, 46, 46, 51, 51, + 57, 57, 61, 61, 67, 67, 71, 71, 54, 51, 51, 49, 49, 50, + 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, + 54, 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, + 65, 71, 71, 76, 76, 82, 82, 59, 56, 56, 54, 54, 54, 54, + 53, 53, 58, 58, 64, 64, 69, 69, 75, 75, 80, 80, 87, 87, + 92, 59, 56, 56, 54, 54, 54, 54, 53, 53, 58, 58, 64, 64, + 69, 69, 75, 75, 80, 80, 87, 87, 92, 92, 65, 62, 62, 59, + 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, + 85, 92, 92, 98, 98, 105, 65, 62, 62, 59, 59, 59, 59, 58, + 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, + 98, 105, 105, 71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68, + 73, 73, 78, 78, 84, 84, 90, 90, 97, 97, 103, 103, 111, 111, + 117, 71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68, 73, 73, + 78, 78, 84, 84, 90, 90, 97, 97, 103, 103, 111, 111, 117, 117, + 80, 76, 76, 72, 72, 71, 71, 69, 69, 74, 74, 79, 79, 84, + 84, 90, 90, 96, 96, 104, 104, 110, 110, 118, 118, 125, 125, 134, + 80, 76, 76, 72, 72, 71, 71, 69, 69, 74, 74, 79, 79, 84, + 84, 90, 90, 96, 96, 104, 104, 110, 110, 118, 118, 125, 125, 134, + 134, 83, 78, 78, 75, 75, 74, 74, 72, 72, 76, 76, 81, 81, + 86, 86, 92, 92, 99, 99, 106, 106, 113, 113, 121, 121, 128, 128, + 137, 137, 140, 83, 78, 78, 75, 75, 74, 74, 72, 72, 76, 76, + 81, 81, 86, 86, 92, 92, 99, 99, 106, 106, 113, 113, 121, 121, + 128, 128, 137, 137, 140, 140, 87, 83, 83, 79, 79, 77, 77, 75, + 75, 80, 80, 84, 84, 90, 90, 96, 96, 102, 102, 109, 109, 116, + 116, 124, 124, 132, 132, 141, 141, 144, 144, 149}, + {32, 31, 31, 31, 31, 31, 30, 31, 31, 32, 30, 31, 31, 32, 32, 33, 34, + 34, 35, 35, 39, 33, 34, 34, 35, 35, 39, 39, 36, 38, 38, 40, 40, 43, + 43, 47, 36, 38, 38, 40, 40, 43, 43, 47, 47, 41, 42, 42, 42, 42, 45, + 45, 47, 47, 48, 41, 42, 42, 42, 42, 45, 45, 47, 47, 48, 48, 49, 47, + 47, 46, 46, 47, 47, 48, 48, 50, 50, 53, 49, 47, 47, 46, 46, 47, 47, + 48, 48, 50, 50, 53, 53, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49, + 53, 53, 54, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, + 54, 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53, 53, 55, 55, 58, + 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53, 53, 55, 55, 58, 58, + 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, 56, 60, 60, + 61, 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, 56, 60, + 60, 61, 61, 52, 50, 50, 47, 47, 47, 47, 47, 47, 50, 50, 54, 54, 57, + 57, 61, 61, 63, 63, 66, 52, 50, 50, 47, 47, 47, 47, 47, 47, 50, 50, + 54, 54, 57, 57, 61, 61, 63, 63, 66, 66, 54, 52, 52, 49, 49, 49, 49, + 48, 48, 52, 52, 55, 55, 58, 58, 62, 62, 65, 65, 68, 68, 71, 54, 52, + 52, 49, 49, 49, 49, 48, 48, 52, 52, 55, 55, 58, 58, 62, 62, 65, 65, + 68, 68, 71, 71, 57, 54, 54, 52, 
52, 51, 51, 50, 50, 53, 53, 56, 56, + 60, 60, 63, 63, 67, 67, 70, 70, 73, 73, 76, 57, 54, 54, 52, 52, 51, + 51, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 67, 67, 70, 70, 73, 73, + 76, 76, 60, 57, 57, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 61, 61, + 65, 65, 68, 68, 72, 72, 75, 75, 79, 79, 82, 60, 57, 57, 54, 54, 53, + 53, 52, 52, 55, 55, 58, 58, 61, 61, 65, 65, 68, 68, 72, 72, 75, 75, + 79, 79, 82, 82, 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, + 63, 63, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 63, 60, + 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 63, 63, 67, 67, 71, 71, + 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 65, 61, 61, 58, 58, 57, 57, + 55, 55, 58, 58, 61, 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, + 83, 86, 86, 90, 90, 91, 65, 61, 61, 58, 58, 57, 57, 55, 55, 58, 58, + 61, 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 86, 86, 90, + 90, 91, 91, 67, 63, 63, 60, 60, 59, 59, 57, 57, 60, 60, 62, 62, 66, + 66, 69, 69, 72, 72, 76, 76, 80, 80, 84, 84, 88, 88, 92, 92, 93, 93, + 95}}, + {{32, 31, 31, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + 32, 31, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 33, 33, 34, 32, 32, 32, 32, 32, 33, + 34, 34, 35, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 34, + 34, 34, 33, 33, 34, 35, 35, 37, 37, 39, 34, 34, 34, 33, + 33, 34, 35, 35, 37, 37, 39, 39, 35, 35, 35, 34, 34, 35, + 36, 36, 38, 38, 42, 42, 46, 36, 35, 35, 34, 34, 35, 36, + 37, 38, 38, 42, 42, 47, 48, 38, 37, 37, 36, 36, 37, 38, + 38, 39, 40, 44, 44, 48, 50, 51, 39, 38, 38, 38, 37, 38, + 39, 39, 40, 41, 45, 45, 49, 50, 52, 54, 41, 40, 40, 39, + 38, 39, 40, 40, 41, 41, 46, 46, 50, 52, 54, 55, 57, 44, + 42, 42, 41, 41, 41, 42, 42, 42, 43, 47, 47, 52, 54, 56, + 58, 60, 63, 45, 43, 43, 42, 41, 42, 42, 43, 43, 43, 48, + 48, 53, 54, 57, 58, 60, 64, 65, 48, 46, 46, 45, 44, 45, + 45, 45, 46, 46, 51, 51, 55, 57, 59, 61, 63, 67, 68, 71, + 48, 46, 46, 45, 44, 45, 45, 45, 46, 46, 51, 51, 55, 57, + 59, 61, 63, 67, 68, 71, 71, 53, 51, 51, 49, 49, 49, 49, + 49, 49, 49, 54, 54, 58, 59, 62, 64, 67, 71, 72, 75, 75, + 81, 54, 52, 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59, + 60, 63, 65, 67, 71, 72, 76, 76, 81, 82, 57, 55, 55, 53, + 52, 52, 52, 52, 52, 52, 57, 57, 61, 62, 65, 67, 70, 74, + 75, 79, 79, 85, 85, 89, 59, 56, 56, 54, 54, 54, 54, 54, + 53, 54, 58, 58, 62, 64, 67, 69, 71, 75, 76, 80, 80, 86, + 87, 90, 92, 62, 59, 59, 57, 56, 56, 56, 56, 55, 56, 60, + 60, 64, 66, 69, 71, 73, 77, 78, 83, 83, 89, 89, 93, 95, + 98, 65, 62, 62, 60, 59, 59, 59, 59, 58, 58, 63, 63, 67, + 68, 71, 73, 75, 79, 81, 85, 85, 91, 92, 96, 98, 101, 105, + 67, 64, 64, 62, 61, 61, 60, 60, 59, 60, 64, 64, 68, 69, + 72, 74, 77, 81, 82, 87, 87, 93, 94, 98, 99, 103, 106, 108, + 71, 68, 68, 66, 65, 64, 64, 64, 63, 63, 68, 68, 72, 73, + 76, 78, 80, 84, 85, 90, 90, 97, 97, 102, 103, 107, 111, 113, + 117, 72, 69, 69, 66, 65, 65, 65, 64, 63, 64, 68, 68, 72, + 73, 76, 78, 81, 85, 86, 91, 91, 97, 98, 102, 104, 108, 111, + 113, 118, 119, 80, 76, 76, 73, 72, 72, 71, 70, 69, 70, 74, + 74, 78, 79, 82, 84, 86, 90, 91, 96, 96, 103, 104, 108, 110, + 114, 118, 120, 125, 126, 134, 80, 76, 76, 73, 72, 72, 71, 70, + 69, 70, 74, 74, 78, 79, 82, 84, 86, 90, 91, 96, 96, 103, + 104, 108, 110, 114, 118, 120, 125, 126, 134, 134}, + {32, 31, 31, 31, 31, 31, 30, 31, 31, 31, 30, 31, 31, 31, 32, 32, 32, + 33, 33, 33, 35, 33, 34, 34, 35, 35, 37, 39, 34, 35, 35, 36, 36, 38, + 40, 41, 36, 38, 38, 39, 40, 41, 43, 44, 47, 37, 38, 39, 40, 40, 42, + 43, 44, 47, 47, 41, 42, 42, 42, 42, 43, 45, 45, 47, 47, 48, 41, 42, + 42, 42, 42, 43, 
45, 45, 47, 47, 48, 48, 47, 46, 46, 46, 45, 46, 47, + 47, 47, 48, 50, 50, 52, 49, 48, 47, 47, 46, 47, 47, 47, 48, 48, 50, + 50, 52, 53, 49, 47, 47, 46, 46, 46, 46, 47, 47, 47, 50, 50, 52, 53, + 53, 48, 47, 47, 46, 45, 46, 46, 46, 46, 47, 49, 49, 52, 53, 54, 54, + 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 49, 49, 52, 53, 54, 55, 55, + 49, 47, 47, 45, 45, 45, 45, 45, 45, 45, 49, 49, 52, 53, 55, 55, 57, + 58, 49, 47, 47, 46, 45, 45, 45, 45, 45, 46, 49, 49, 52, 53, 55, 56, + 57, 59, 59, 50, 48, 48, 47, 46, 46, 46, 46, 46, 46, 50, 50, 53, 54, + 55, 56, 58, 60, 60, 61, 50, 48, 48, 47, 46, 46, 46, 46, 46, 46, 50, + 50, 53, 54, 55, 56, 58, 60, 60, 61, 61, 52, 50, 49, 48, 47, 47, 47, + 47, 46, 47, 50, 50, 53, 54, 56, 57, 59, 61, 61, 63, 63, 66, 52, 50, + 50, 48, 47, 47, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 61, + 63, 63, 66, 66, 54, 51, 51, 50, 49, 49, 49, 48, 48, 48, 51, 51, 54, + 55, 57, 58, 60, 62, 62, 65, 65, 67, 68, 69, 54, 52, 52, 50, 49, 49, + 49, 49, 48, 48, 52, 52, 55, 55, 57, 58, 60, 62, 63, 65, 65, 68, 68, + 70, 71, 56, 53, 53, 51, 51, 50, 50, 50, 49, 49, 52, 52, 55, 56, 58, + 59, 61, 63, 63, 66, 66, 69, 69, 71, 72, 73, 57, 54, 54, 52, 52, 51, + 51, 51, 50, 50, 53, 53, 56, 56, 58, 60, 61, 63, 64, 67, 67, 70, 70, + 72, 73, 75, 76, 58, 55, 55, 53, 52, 52, 52, 51, 50, 51, 54, 54, 56, + 57, 59, 60, 62, 64, 65, 67, 67, 71, 71, 73, 74, 75, 77, 78, 60, 57, + 57, 55, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 60, 61, 63, 65, 66, + 68, 68, 72, 72, 74, 75, 77, 79, 80, 82, 60, 57, 57, 55, 54, 54, 54, + 53, 52, 52, 55, 55, 58, 58, 60, 62, 63, 65, 66, 69, 69, 72, 73, 75, + 76, 77, 79, 80, 82, 82, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, + 57, 60, 60, 62, 63, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, + 85, 85, 89, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57, 60, 60, + 62, 63, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85, 89, + 89}}, + {{32, 31, 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 33, 31, 32, + 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 34, + 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 32, 32, 32, 32, 32, + 32, 33, 34, 34, 35, 35, 33, 33, 33, 33, 33, 33, 34, 35, 35, + 36, 36, 38, 34, 34, 34, 34, 33, 33, 35, 35, 36, 37, 37, 39, + 39, 34, 34, 34, 34, 34, 34, 35, 36, 36, 37, 37, 40, 41, 42, + 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42, 45, 48, + 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42, 45, 48, + 48, 38, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46, + 50, 50, 52, 39, 38, 38, 38, 37, 37, 39, 39, 39, 40, 40, 44, + 45, 47, 50, 50, 53, 54, 41, 40, 40, 39, 38, 38, 40, 40, 40, + 41, 41, 45, 46, 48, 52, 52, 54, 55, 57, 44, 42, 42, 42, 41, + 41, 42, 42, 42, 42, 42, 46, 47, 50, 54, 54, 57, 58, 60, 63, + 44, 42, 42, 42, 41, 41, 42, 42, 42, 42, 42, 46, 47, 50, 54, + 54, 57, 58, 60, 63, 63, 47, 46, 45, 45, 44, 44, 44, 45, 45, + 45, 45, 49, 50, 52, 56, 56, 59, 60, 62, 66, 66, 69, 48, 47, + 46, 45, 44, 44, 45, 45, 45, 46, 46, 50, 51, 53, 57, 57, 60, + 61, 63, 67, 67, 70, 71, 50, 49, 48, 47, 46, 46, 47, 47, 47, + 47, 47, 51, 52, 54, 58, 58, 61, 62, 65, 68, 68, 72, 73, 75, + 54, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53, 54, 56, 60, + 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 54, 52, 51, 50, 49, + 49, 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, + 71, 75, 76, 78, 82, 82, 58, 56, 55, 54, 53, 53, 53, 53, 53, + 52, 52, 56, 57, 59, 63, 63, 67, 68, 70, 74, 74, 78, 79, 82, + 86, 86, 90, 59, 57, 56, 55, 54, 54, 54, 54, 54, 53, 53, 57, + 58, 60, 64, 64, 68, 69, 71, 75, 75, 79, 80, 83, 87, 87, 
91, + 92, 61, 59, 58, 57, 56, 56, 56, 56, 55, 55, 55, 59, 60, 62, + 65, 65, 69, 70, 73, 77, 77, 81, 82, 85, 89, 89, 93, 94, 97, + 65, 63, 62, 61, 59, 59, 59, 59, 59, 58, 58, 62, 63, 65, 68, + 68, 72, 73, 75, 79, 79, 84, 85, 88, 92, 92, 97, 98, 101, 105, + 65, 63, 62, 61, 59, 59, 59, 59, 59, 58, 58, 62, 63, 65, 68, + 68, 72, 73, 75, 79, 79, 84, 85, 88, 92, 92, 97, 98, 101, 105, + 105, 70, 67, 67, 65, 64, 64, 63, 63, 63, 62, 62, 66, 67, 69, + 72, 72, 76, 77, 79, 83, 83, 88, 89, 92, 96, 96, 101, 102, 105, + 109, 109, 114}, + {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 32, 30, 31, + 31, 31, 32, 32, 33, 33, 34, 34, 34, 34, 37, 33, 34, 34, 35, 35, 35, + 38, 39, 34, 36, 36, 36, 37, 37, 40, 40, 42, 36, 38, 38, 39, 40, 40, + 42, 43, 45, 47, 36, 38, 38, 39, 40, 40, 42, 43, 45, 47, 47, 40, 41, + 41, 41, 42, 42, 44, 44, 45, 47, 47, 48, 41, 42, 42, 42, 42, 42, 44, + 45, 46, 47, 47, 48, 48, 44, 44, 44, 44, 44, 44, 45, 46, 46, 47, 47, + 49, 49, 50, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51, + 53, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51, 53, 53, + 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53, 54, + 48, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, 53, 53, 54, + 54, 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, 53, 53, + 54, 55, 55, 49, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 48, 49, 51, + 53, 53, 55, 55, 57, 58, 49, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, + 48, 49, 51, 53, 53, 55, 55, 57, 58, 58, 50, 48, 48, 47, 46, 46, 46, + 46, 46, 46, 46, 49, 50, 51, 54, 54, 56, 56, 57, 59, 59, 61, 50, 49, + 48, 47, 46, 46, 46, 46, 46, 46, 46, 49, 50, 51, 54, 54, 56, 56, 58, + 60, 60, 61, 61, 51, 49, 49, 48, 47, 47, 47, 47, 47, 46, 46, 49, 50, + 51, 54, 54, 56, 57, 58, 60, 60, 62, 62, 63, 52, 50, 50, 49, 47, 47, + 47, 47, 47, 47, 47, 49, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 63, + 65, 66, 52, 50, 50, 49, 47, 47, 47, 47, 47, 47, 47, 49, 50, 52, 54, + 54, 57, 57, 59, 61, 61, 63, 63, 65, 66, 66, 54, 52, 51, 50, 49, 49, + 49, 49, 48, 48, 48, 51, 51, 53, 55, 55, 58, 58, 60, 62, 62, 64, 65, + 66, 68, 68, 70, 54, 52, 52, 51, 49, 49, 49, 49, 49, 48, 48, 51, 52, + 53, 55, 55, 58, 58, 60, 62, 62, 64, 65, 66, 68, 68, 70, 71, 55, 53, + 53, 52, 50, 50, 50, 50, 49, 49, 49, 51, 52, 54, 56, 56, 58, 59, 60, + 63, 63, 65, 66, 67, 69, 69, 71, 72, 73, 57, 55, 54, 53, 52, 52, 51, + 51, 50, 50, 50, 52, 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68, + 70, 70, 73, 73, 74, 76, 57, 55, 54, 53, 52, 52, 51, 51, 50, 50, 50, + 52, 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68, 70, 70, 73, 73, + 74, 76, 76, 59, 57, 56, 55, 54, 54, 53, 53, 52, 51, 51, 54, 55, 56, + 58, 58, 60, 61, 63, 65, 65, 67, 68, 70, 72, 72, 74, 75, 76, 78, 78, + 80}}, + {{32, 31, 31, 31, 31, 32, 31, 31, 32, 32, 31, 32, 32, 32, 32, 31, 32, + 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, + 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 32, 32, + 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 32, 33, 33, 33, 33, 33, 33, + 34, 34, 35, 36, 36, 36, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 37, + 37, 38, 39, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 37, 37, 38, 39, + 39, 35, 34, 34, 34, 34, 34, 34, 35, 36, 36, 37, 37, 39, 41, 41, 43, + 36, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, 48, + 36, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, 48, + 48, 38, 37, 37, 37, 36, 36, 36, 38, 38, 38, 39, 39, 41, 44, 44, 47, + 50, 50, 51, 39, 39, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 42, 45, + 45, 
47, 50, 50, 52, 54, 39, 39, 38, 38, 37, 37, 38, 39, 39, 39, 40, + 40, 42, 45, 45, 47, 50, 50, 52, 54, 54, 42, 41, 41, 41, 40, 40, 40, + 41, 41, 41, 42, 42, 44, 47, 47, 49, 53, 53, 55, 56, 56, 60, 44, 43, + 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 47, 47, 50, 54, 54, 56, + 58, 58, 61, 63, 44, 43, 43, 42, 41, 41, 41, 42, 42, 42, 43, 43, 45, + 48, 48, 51, 54, 54, 56, 58, 58, 62, 64, 64, 47, 46, 45, 45, 44, 44, + 44, 44, 45, 45, 45, 45, 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, + 66, 69, 48, 47, 46, 46, 45, 44, 45, 45, 45, 45, 46, 46, 47, 51, 51, + 53, 57, 57, 59, 61, 61, 65, 67, 67, 70, 71, 49, 48, 47, 47, 46, 45, + 45, 46, 46, 46, 46, 46, 48, 51, 51, 54, 57, 57, 60, 62, 62, 66, 68, + 68, 71, 72, 73, 53, 51, 51, 51, 49, 49, 49, 49, 49, 49, 49, 49, 51, + 54, 54, 57, 59, 59, 62, 64, 64, 69, 71, 71, 74, 75, 77, 81, 54, 52, + 51, 51, 50, 49, 49, 50, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, + 65, 65, 69, 71, 72, 75, 76, 77, 81, 82, 55, 53, 53, 52, 51, 50, 50, + 51, 51, 51, 50, 50, 52, 55, 55, 58, 61, 61, 64, 66, 66, 70, 72, 73, + 76, 77, 78, 83, 83, 85, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, + 53, 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, + 87, 88, 92, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53, 55, 58, + 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88, 92, + 92}, + {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 30, 31, + 31, 31, 31, 32, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, + 35, 38, 33, 34, 34, 34, 35, 35, 36, 38, 39, 34, 35, 35, 36, 36, 36, + 37, 40, 40, 41, 36, 38, 38, 38, 39, 40, 40, 43, 43, 44, 47, 36, 38, + 38, 38, 39, 40, 40, 43, 43, 44, 47, 47, 38, 39, 40, 40, 41, 41, 41, + 43, 44, 45, 47, 47, 47, 41, 42, 42, 42, 42, 42, 43, 44, 45, 45, 47, + 47, 48, 48, 41, 42, 42, 42, 42, 42, 43, 44, 45, 45, 47, 47, 48, 48, + 48, 45, 45, 45, 45, 44, 44, 44, 46, 46, 46, 47, 47, 48, 49, 49, 50, + 49, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50, 50, 51, 53, + 49, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50, 50, 51, 53, + 53, 49, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 51, + 53, 53, 53, 48, 47, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 48, 49, + 49, 51, 53, 53, 54, 54, 48, 47, 47, 47, 46, 45, 45, 46, 46, 46, 46, + 46, 48, 49, 49, 51, 53, 53, 54, 54, 54, 49, 47, 47, 47, 45, 45, 45, + 45, 45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 54, 55, 55, 57, 49, 47, + 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, + 55, 55, 57, 58, 49, 47, 47, 47, 45, 45, 45, 45, 45, 45, 45, 45, 47, + 49, 49, 51, 53, 53, 55, 56, 56, 58, 58, 59, 50, 49, 48, 48, 46, 46, + 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, + 59, 61, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, + 52, 54, 54, 55, 56, 56, 59, 60, 60, 61, 61, 51, 49, 48, 48, 47, 46, + 46, 47, 47, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 59, 60, + 60, 61, 62, 62, 52, 50, 49, 49, 48, 47, 47, 47, 47, 47, 46, 46, 48, + 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 63, 64, 66, 52, 50, + 50, 49, 48, 47, 47, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, + 57, 57, 60, 61, 61, 63, 63, 64, 66, 66, 53, 51, 50, 50, 48, 48, 48, + 48, 48, 48, 47, 47, 48, 51, 51, 52, 54, 54, 56, 58, 58, 60, 61, 62, + 63, 64, 64, 67, 67, 68, 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48, + 48, 49, 52, 52, 53, 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68, + 68, 69, 71, 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48, 48, 49, 52, + 52, 53, 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68, 68, 69, 71, + 71}}, + {{32, 31, 31, 31, 31, 32, 31, 31, 32, 32, 
31, 31, 32, 32, 32, 31, 31, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, + 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 34, 34, 35, 35, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, + 35, 35, 35, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36, + 36, 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36, 37, 37, 38, 39, + 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36, 37, 37, 38, 39, 39, + 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 38, 40, 40, + 41, 35, 35, 35, 35, 34, 34, 34, 34, 36, 36, 36, 37, 38, 38, 39, 42, + 42, 43, 46, 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, + 40, 42, 42, 44, 47, 48, 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, + 37, 38, 38, 40, 42, 42, 44, 47, 48, 48, 38, 37, 37, 37, 36, 36, 36, + 36, 37, 38, 38, 39, 39, 39, 41, 44, 44, 45, 48, 50, 50, 51, 39, 39, + 38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 40, 42, 45, 45, 46, 49, + 50, 50, 52, 54, 39, 39, 38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, + 40, 42, 45, 45, 46, 49, 50, 50, 52, 54, 54, 41, 40, 40, 40, 39, 38, + 38, 39, 40, 40, 40, 41, 41, 41, 43, 46, 46, 47, 50, 52, 52, 54, 55, + 55, 57, 44, 43, 42, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44, + 47, 47, 49, 52, 54, 54, 56, 58, 58, 60, 63, 44, 43, 42, 42, 42, 41, + 41, 41, 42, 42, 42, 42, 42, 42, 44, 47, 47, 49, 52, 54, 54, 56, 58, + 58, 60, 63, 63, 45, 44, 43, 43, 42, 41, 41, 42, 42, 42, 42, 43, 43, + 43, 45, 48, 48, 49, 53, 54, 54, 57, 58, 58, 60, 64, 64, 65, 47, 46, + 45, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55, + 56, 56, 58, 60, 60, 62, 66, 66, 67, 69, 48, 47, 46, 46, 45, 44, 44, + 45, 45, 45, 45, 45, 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61, + 63, 67, 67, 68, 70, 71, 48, 47, 46, 46, 45, 44, 44, 45, 45, 45, 45, + 45, 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61, 63, 67, 67, 68, + 70, 71, 71, 51, 50, 49, 49, 48, 47, 47, 47, 48, 48, 48, 48, 48, 48, + 50, 53, 53, 54, 57, 58, 58, 61, 63, 63, 66, 69, 69, 70, 73, 74, 74, + 77}, + {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, + 31, 31, 31, 32, 30, 31, 31, 31, 31, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 37, 33, 34, 34, 34, 35, 35, + 35, 36, 38, 39, 33, 34, 34, 34, 35, 35, 35, 36, 38, 39, 39, 35, 36, + 37, 37, 37, 38, 38, 38, 41, 41, 41, 44, 36, 37, 38, 38, 39, 40, 40, + 40, 42, 43, 43, 46, 47, 36, 37, 38, 38, 39, 40, 40, 40, 42, 43, 43, + 46, 47, 47, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47, + 47, 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46, 47, 47, 48, 48, + 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46, 47, 47, 48, 48, 48, + 43, 43, 43, 43, 43, 43, 43, 43, 45, 45, 45, 46, 47, 47, 48, 49, 49, + 49, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 47, 47, 48, 50, + 50, 50, 52, 49, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 48, 48, + 49, 50, 50, 51, 52, 53, 49, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, + 47, 48, 48, 49, 50, 50, 51, 52, 53, 53, 49, 48, 47, 47, 46, 46, 46, + 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, 48, 47, + 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 48, 49, 49, 50, 52, + 53, 53, 54, 54, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, + 46, 48, 49, 49, 50, 52, 53, 53, 54, 54, 54, 49, 47, 47, 47, 46, 45, + 45, 45, 46, 46, 46, 46, 46, 46, 47, 49, 49, 50, 52, 53, 53, 54, 55, + 55, 55, 49, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, + 49, 49, 50, 52, 
53, 53, 55, 55, 55, 57, 58, 49, 47, 47, 47, 46, 45, + 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 50, 52, 53, 53, 55, 55, + 55, 57, 58, 58, 49, 48, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, + 45, 47, 49, 49, 50, 52, 53, 53, 55, 56, 56, 57, 59, 59, 59, 50, 49, + 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, + 54, 54, 55, 56, 56, 57, 59, 59, 60, 61, 50, 49, 48, 48, 47, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, + 58, 60, 60, 60, 61, 61, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 58, 60, 60, 60, + 61, 61, 61, 51, 50, 49, 49, 48, 47, 47, 47, 47, 47, 47, 47, 46, 46, + 48, 50, 50, 51, 53, 54, 54, 56, 57, 57, 58, 60, 60, 61, 62, 63, 63, + 64}}, + {{32, 31, 31, 31, 31, 32, 31, 31, 32, 32, 31, 31, 32, 32, 32, 31, 31, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, + 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 36, 36, 36, + 37, 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, + 37, 38, 39, 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, + 37, 37, 37, 38, 39, 39, 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, + 35, 35, 36, 37, 37, 37, 38, 39, 39, 39, 35, 34, 34, 34, 34, 34, 34, + 34, 34, 35, 36, 36, 36, 36, 37, 37, 37, 39, 41, 41, 41, 43, 36, 35, + 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, + 42, 42, 45, 48, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, + 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 36, 35, 35, 35, 35, 35, + 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, + 48, 48, 37, 37, 37, 37, 37, 36, 36, 36, 36, 37, 38, 38, 38, 38, 39, + 39, 39, 41, 44, 44, 44, 46, 49, 49, 49, 51, 39, 39, 38, 38, 38, 38, + 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47, 50, + 50, 50, 52, 54, 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, + 40, 40, 40, 40, 42, 45, 45, 45, 47, 50, 50, 50, 52, 54, 54, 39, 39, + 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, + 45, 45, 47, 50, 50, 50, 52, 54, 54, 54, 41, 41, 40, 40, 40, 39, 39, + 39, 39, 40, 40, 40, 40, 41, 41, 41, 41, 44, 46, 46, 46, 49, 52, 52, + 52, 54, 56, 56, 56, 58, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, + 42, 42, 42, 42, 42, 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58, + 58, 60, 63, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, + 42, 42, 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, + 63}, + {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 32, 30, 31, 31, 31, 31, 31, + 32, 32, 30, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 33, 35, 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 39, 33, 34, + 34, 34, 34, 35, 35, 35, 35, 37, 39, 39, 33, 34, 34, 34, 34, 35, 35, + 35, 35, 37, 39, 39, 39, 35, 35, 36, 36, 36, 37, 37, 37, 37, 39, 41, + 41, 41, 43, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, + 47, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, + 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 
45, 47, 47, 47, + 39, 39, 40, 40, 40, 41, 41, 41, 41, 42, 44, 44, 44, 45, 47, 47, 47, + 47, 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47, + 47, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, + 47, 47, 47, 48, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, + 45, 45, 46, 47, 47, 47, 48, 48, 48, 48, 45, 45, 45, 45, 45, 44, 44, + 44, 44, 45, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 50, 49, 48, + 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, + 50, 50, 51, 53, 49, 48, 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, + 47, 48, 48, 48, 49, 50, 50, 50, 51, 53, 53, 49, 48, 47, 47, 47, 47, + 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 51, 53, + 53, 53, 49, 48, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, + 47, 47, 48, 50, 50, 50, 51, 53, 53, 53, 53, 48, 48, 47, 47, 47, 46, + 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49, 49, 49, 51, 53, + 53, 53, 53, 54, 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, + 46, 46, 46, 46, 48, 49, 49, 49, 51, 53, 53, 53, 53, 54, 54, 48, 48, + 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49, + 49, 49, 51, 53, 53, 53, 53, 54, 54, 54, 49, 48, 47, 47, 47, 46, 45, + 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 47, 49, 49, 49, 51, 53, 53, + 53, 54, 55, 55, 55, 56, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, + 55, 57, 58, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, 55, 57, 58, + 58}}, + {{32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 32, 32, 31, 31, + 31, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, + 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, + 34, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, + 34, 34, 34, 35, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 32, 32, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 36, + 36, 36, 37, 38, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35, + 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39, 34, 34, 34, 34, 34, 34, + 34, 33, 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, + 39, 39, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35, 35, 35, + 35, 36, 36, 37, 37, 37, 38, 39, 39, 39, 39, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 37, 37, 37, 38, 40, + 41, 41, 41, 42, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 35, 36, + 36, 36, 36, 37, 37, 38, 38, 38, 39, 41, 42, 42, 42, 44, 46, 36, 35, + 35, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, + 38, 38, 40, 42, 42, 42, 42, 45, 47, 48, 36, 35, 35, 35, 35, 35, 35, + 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, + 42, 42, 45, 47, 48, 48, 36, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, + 35, 36, 36, 36, 36, 37, 
38, 38, 38, 38, 40, 42, 42, 42, 42, 45, 47, + 48, 48, 48, 37, 37, 36, 36, 36, 36, 36, 35, 35, 35, 35, 36, 37, 37, + 37, 37, 38, 39, 39, 39, 39, 41, 42, 43, 43, 43, 45, 48, 49, 49, 49, + 50}, + {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, + 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 32, 30, 30, 31, 31, 31, 31, + 31, 31, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 33, 33, 33, 34, 34, 34, 34, + 34, 34, 34, 34, 36, 37, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, + 37, 38, 39, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, + 39, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, 39, 39, + 34, 35, 36, 36, 36, 36, 36, 37, 37, 37, 37, 38, 40, 40, 40, 40, 42, + 36, 36, 37, 37, 37, 37, 38, 38, 39, 39, 39, 40, 41, 42, 42, 42, 44, + 46, 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, + 45, 46, 47, 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, + 43, 43, 45, 46, 47, 47, 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, + 41, 42, 43, 43, 43, 45, 46, 47, 47, 47, 38, 39, 39, 40, 40, 40, 40, + 41, 41, 41, 41, 42, 43, 44, 44, 44, 45, 47, 47, 47, 47, 47, 40, 41, + 41, 41, 41, 41, 41, 42, 42, 42, 42, 43, 44, 44, 44, 44, 45, 47, 47, + 47, 47, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, + 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48, 41, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, + 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, + 45, 46, 47, 47, 47, 47, 48, 48, 48, 48, 48, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 45, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49, + 49, 49, 49, 50, 47, 47, 46, 46, 46, 46, 46, 46, 45, 45, 45, 46, 46, + 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 49, 48, + 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, + 48, 48, 49, 50, 50, 50, 50, 51, 52, 53, 49, 48, 48, 47, 47, 47, 47, + 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, + 50, 50, 51, 52, 53, 53, 49, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, + 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 51, 52, + 53, 53, 53, 49, 48, 47, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, + 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, + 53}}, + {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31, + 31, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, + 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, + 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, + 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, + 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, + 35, 35, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 32, 32, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, + 34, 34, 35, 35, 35, 36, 36, 36, 36, 36, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, + 36, 36, 36, 36, 37, 38, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, + 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, + 38, 38, 39, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, + 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 39, + 39}, + {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 30, 30, + 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 30, 30, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 35, + 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, + 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, + 39, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, + 38, 39, 39, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, + 36, 37, 38, 39, 39, 39, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, + 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 34, 35, 35, 35, 35, 35, 35, + 36, 36, 36, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 35, 36, + 36, 36, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 39, 41, 41, 41, + 41, 41, 42, 44, 36, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, + 39, 40, 41, 42, 43, 43, 43, 43, 44, 45, 46, 36, 37, 37, 38, 38, 38, + 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46, + 47, 47, 36, 37, 37, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, + 41, 42, 43, 43, 43, 43, 44, 46, 47, 47, 47, 36, 37, 37, 38, 38, 38, + 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46, + 47, 47, 47, 47, 37, 37, 38, 38, 39, 39, 39, 39, 39, 40, 40, 40, 40, + 40, 41, 42, 43, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 38, 39, + 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 43, 44, 44, + 44, 44, 45, 46, 47, 47, 47, 47, 47, 47, 40, 40, 40, 41, 41, 41, 41, + 41, 41, 41, 42, 42, 42, 42, 42, 43, 44, 44, 44, 44, 44, 45, 46, 47, + 47, 47, 47, 47, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 43, 43, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, + 48, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 43, 43, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48, 48, + 48}}, + {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 31, 31, 31, 31, 31, 32, 32, 31, 31, 31, 31, 31, 32, + 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 
31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33}, + {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 34, 34, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 33, 33, 33, 33, 33, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, + 37, 37, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 33, 33, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, + 37, 38, 39, 39, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 33, 33, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, + 
35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 33, 33, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, + 38, 39, 39, 39, 39, 39, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, + 39, 39, 39, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, + 36, 36, 36, 36, 36, 36, 36, 37, 37, 38, 39, 40, 40, 40, 40, 40, 40, + 40}}, + {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32}, + {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 
31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, + 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 32}}}; diff --git a/src/quantizer_test.cc b/src/quantizer_test.cc new file mode 100644 index 0000000..618d247 --- /dev/null +++ b/src/quantizer_test.cc @@ -0,0 +1,168 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/quantizer.h" + +#include + +#include "gtest/gtest.h" +#include "src/obu_parser.h" +#include "src/utils/constants.h" +#include "src/utils/types.h" + +namespace libgav1 { +namespace { + +TEST(QuantizerTest, GetQIndex) { + const int kBaseQIndex = 40; + const int kDelta = 10; + const int kOutOfRangeIndex = 200; + Segmentation segmentation = {}; + + EXPECT_EQ(GetQIndex(segmentation, 0, kBaseQIndex), kBaseQIndex); + EXPECT_EQ(GetQIndex(segmentation, kOutOfRangeIndex, kBaseQIndex), + kBaseQIndex); + + segmentation.enabled = true; + EXPECT_EQ(GetQIndex(segmentation, 0, kBaseQIndex), kBaseQIndex); + EXPECT_EQ(GetQIndex(segmentation, kOutOfRangeIndex, kBaseQIndex), + kBaseQIndex); + + segmentation.feature_enabled[1][kSegmentFeatureQuantizer] = true; + segmentation.feature_data[1][kSegmentFeatureQuantizer] = kDelta; + EXPECT_EQ(GetQIndex(segmentation, 1, kBaseQIndex), kBaseQIndex + kDelta); + EXPECT_EQ(GetQIndex(segmentation, kOutOfRangeIndex, kBaseQIndex), + kBaseQIndex); + + segmentation.enabled = false; + EXPECT_EQ(GetQIndex(segmentation, 1, kBaseQIndex), kBaseQIndex); + EXPECT_EQ(GetQIndex(segmentation, kOutOfRangeIndex, kBaseQIndex), + kBaseQIndex); +} + +TEST(QuantizerTest, GetDcValue) { + QuantizerParameters params = {}; + params.delta_dc[kPlaneY] = 1; + params.delta_dc[kPlaneU] = 2; + params.delta_dc[kPlaneV] = 3; + + // Test lookups of Dc_Qlookup[0][0], Dc_Qlookup[0][11], Dc_Qlookup[0][12], + // and Dc_Qlookup[0][255] in the spec, including the clipping of qindex. 
+  {
+    Quantizer quantizer(8, &params);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneY, -2), 4);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneY, -1), 4);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 10), 16);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 11), 17);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 254), 1336);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 255), 1336);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneU, -3), 4);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneU, -2), 4);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 9), 16);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 10), 17);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 253), 1336);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 254), 1336);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneV, -4), 4);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneV, -3), 4);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 8), 16);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 9), 17);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 252), 1336);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 253), 1336);
+  }
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  // Test lookups of Dc_Qlookup[1][0], Dc_Qlookup[1][11], Dc_Qlookup[1][12],
+  // and Dc_Qlookup[1][255] in the spec, including the clipping of qindex.
+  {
+    Quantizer quantizer(10, &params);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneY, -2), 4);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneY, -1), 4);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 10), 34);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 11), 37);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 254), 5347);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneY, 255), 5347);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneU, -3), 4);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneU, -2), 4);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 9), 34);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 10), 37);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 253), 5347);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneU, 254), 5347);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneV, -4), 4);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneV, -3), 4);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 8), 34);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 9), 37);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 254), 5347);
+    EXPECT_EQ(quantizer.GetDcValue(kPlaneV, 253), 5347);
+  }
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+}
+
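// A minimal sketch of the clamping behavior these expectations pin down. The
// helper below is illustrative only (not the API under test; requires
// <algorithm>): the effective index is qindex plus the per-plane delta,
// clipped to [0, 255], which is why GetDcValue(kPlaneY, 254) and
// GetDcValue(kPlaneY, 255) above agree when delta_dc[kPlaneY] == 1.
inline int16_t LookupClamped(const int16_t table[256], int qindex, int delta) {
  const int index = std::min(std::max(qindex + delta, 0), 255);  // Clip3.
  return table[index];
}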
+TEST(QuantizerTest, GetAcValue) {
+  QuantizerParameters params = {};
+  params.delta_ac[kPlaneU] = 1;
+  params.delta_ac[kPlaneV] = 2;
+
+  // Test lookups of Ac_Qlookup[0][0], Ac_Qlookup[0][11], Ac_Qlookup[0][12],
+  // and Ac_Qlookup[0][255] in the spec, including the clipping of qindex.
+  {
+    Quantizer quantizer(8, &params);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneY, -1), 4);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 0), 4);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 11), 18);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 12), 19);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 255), 1828);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 256), 1828);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneU, -2), 4);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneU, -1), 4);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 10), 18);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 11), 19);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 254), 1828);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 255), 1828);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneV, -3), 4);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneV, -2), 4);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 9), 18);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 10), 19);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 253), 1828);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 254), 1828);
+  }
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  // Test lookups of Ac_Qlookup[1][0], Ac_Qlookup[1][11], Ac_Qlookup[1][12],
+  // and Ac_Qlookup[1][255] in the spec, including the clipping of qindex.
+  {
+    Quantizer quantizer(10, &params);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneY, -1), 4);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 0), 4);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 11), 37);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 12), 40);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 255), 7312);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneY, 256), 7312);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneU, -2), 4);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneU, -1), 4);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 10), 37);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 11), 40);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 254), 7312);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneU, 255), 7312);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneV, -3), 4);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneV, -2), 4);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 9), 37);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 10), 40);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 253), 7312);
+    EXPECT_EQ(quantizer.GetAcValue(kPlaneV, 254), 7312);
+  }
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+}
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/reconstruction.cc b/src/reconstruction.cc
new file mode 100644
index 0000000..bf48137
--- /dev/null
+++ b/src/reconstruction.cc
@@ -0,0 +1,190 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/reconstruction.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace {
+
+// Maps TransformType to dsp::Transform1d for the row transforms.
+constexpr dsp::Transform1d kRowTransform[kNumTransformTypes] = {
+    dsp::kTransform1dDct,      dsp::kTransform1dAdst,
+    dsp::kTransform1dDct,      dsp::kTransform1dAdst,
+    dsp::kTransform1dAdst,     dsp::kTransform1dDct,
+    dsp::kTransform1dAdst,     dsp::kTransform1dAdst,
+    dsp::kTransform1dAdst,     dsp::kTransform1dIdentity,
+    dsp::kTransform1dIdentity, dsp::kTransform1dDct,
+    dsp::kTransform1dIdentity, dsp::kTransform1dAdst,
+    dsp::kTransform1dIdentity, dsp::kTransform1dAdst};
+
+// Maps TransformType to dsp::Transform1d for the column transforms.
+constexpr dsp::Transform1d kColumnTransform[kNumTransformTypes] = {
+    dsp::kTransform1dDct,      dsp::kTransform1dDct,
+    dsp::kTransform1dAdst,     dsp::kTransform1dAdst,
+    dsp::kTransform1dDct,      dsp::kTransform1dAdst,
+    dsp::kTransform1dAdst,     dsp::kTransform1dAdst,
+    dsp::kTransform1dAdst,     dsp::kTransform1dIdentity,
+    dsp::kTransform1dDct,      dsp::kTransform1dIdentity,
+    dsp::kTransform1dAdst,     dsp::kTransform1dIdentity,
+    dsp::kTransform1dAdst,     dsp::kTransform1dIdentity};
+
+dsp::Transform1dSize GetTransform1dSize(int size_log2) {
+  return static_cast<dsp::Transform1dSize>(size_log2 - 2);
+}
+
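// To make the dispatch above concrete: both tables are indexed with the same
// TransformType to select the 1D kernel for each pass, and GetTransform1dSize()
// maps a log2 block dimension straight onto the dsp size enum. The checks
// below assume dsp::Transform1dSize is ordered 4, 8, 16, 32, 64 starting at
// zero (an assumption of this sketch, not something spelled out here). For a
// 16x8 block, kTransformWidthLog2 gives 4, so the row pass uses
// 4 - 2 == dsp::kTransform1dSize16 and the column pass uses
// 3 - 2 == dsp::kTransform1dSize8.
static_assert(static_cast<int>(dsp::kTransform1dSize8) == 1, "");
static_assert(static_cast<int>(dsp::kTransform1dSize16) == 2, "");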
+// Returns the number of rows to process based on |non_zero_coeff_count|. The
+// transform loops process either 4 or a multiple of 8 rows. Use the
+// TransformClass derived from |tx_type| to determine the scan order.
+template <int tx_width>
+int GetNumRows(TransformType tx_type, int tx_height,
+               int non_zero_coeff_count) {
+  const TransformClass tx_class = GetTransformClass(tx_type);
+
+  switch (tx_class) {
+    case kTransformClass2D:
+      if (tx_width == 4) {
+        if (non_zero_coeff_count <= 13) return 4;
+        if (non_zero_coeff_count <= 29) return 8;
+      }
+      if (tx_width == 8) {
+        if (non_zero_coeff_count <= 10) return 4;
+        if ((non_zero_coeff_count <= 14) & (tx_height > 8)) return 4;
+        if (non_zero_coeff_count <= 43) return 8;
+        if ((non_zero_coeff_count <= 107) & (tx_height > 16)) return 16;
+        if ((non_zero_coeff_count <= 171) & (tx_height > 16)) return 24;
+      }
+      if (tx_width == 16) {
+        if (non_zero_coeff_count <= 10) return 4;
+        if ((non_zero_coeff_count <= 14) & (tx_height > 16)) return 4;
+        if (non_zero_coeff_count <= 36) return 8;
+        if ((non_zero_coeff_count <= 44) & (tx_height > 16)) return 8;
+        if ((non_zero_coeff_count <= 151) & (tx_height > 16)) return 16;
+        if ((non_zero_coeff_count <= 279) & (tx_height > 16)) return 24;
+      }
+      if (tx_width == 32) {
+        if (non_zero_coeff_count <= 10) return 4;
+        if (non_zero_coeff_count <= 36) return 8;
+        if ((non_zero_coeff_count <= 136) & (tx_height > 16)) return 16;
+        if ((non_zero_coeff_count <= 300) & (tx_height > 16)) return 24;
+      }
+      break;
+
+    case kTransformClassHorizontal:
+      if (non_zero_coeff_count <= 4) return 4;
+      if (non_zero_coeff_count <= 8) return 8;
+      if ((non_zero_coeff_count <= 16) & (tx_height > 16)) return 16;
+      if ((non_zero_coeff_count <= 24) & (tx_height > 16)) return 24;
+      break;
+
+    default:
+      assert(tx_class == kTransformClassVertical);
+      if (tx_width == 4) {
+        if (non_zero_coeff_count <= 16) return 4;
+        if (non_zero_coeff_count <= 32) return 8;
+      }
+      if (tx_width == 8) {
+        if (non_zero_coeff_count <= 32) return 4;
+        if (non_zero_coeff_count <= 64) return 8;
+        // There's no need to check tx_height since the maximum values for
+        // smaller sizes are: 8x8: 63, 8x16: 127.
+        if (non_zero_coeff_count <= 128) return 16;
+        if (non_zero_coeff_count <= 192) return 24;
+      }
+      if (tx_width == 16) {
+        if (non_zero_coeff_count <= 64) return 4;
+        if (non_zero_coeff_count <= 128) return 8;
+        // There's no need to check tx_height since the maximum values for
+        // smaller sizes are: 16x8: 127, 16x16: 255.
+        if (non_zero_coeff_count <= 256) return 16;
+        if (non_zero_coeff_count <= 384) return 24;
+      }
+      if (tx_width == 32) {
+        if (non_zero_coeff_count <= 128) return 4;
+        if (non_zero_coeff_count <= 256) return 8;
+        // There's no need to check tx_height since the maximum values for
+        // smaller sizes are: 32x8 is 255, 32x16 is 511.
+        if (non_zero_coeff_count <= 512) return 16;
+        if (non_zero_coeff_count <= 768) return 24;
+      }
+      break;
+  }
+  return (tx_width >= 16) ? std::min(tx_height, 32) : tx_height;
+}
+
+}  // namespace
+
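// A worked pass through the thresholds above, as a reading aid (GetNumRows()
// is internal to this translation unit, so the numbers are illustrative):
// for tx_width = 16, kTransformClass2D, tx_height = 32 and
// non_zero_coeff_count = 100:
//   100 > 10, 100 > 14, 100 > 36, 100 > 44  -> no 4- or 8-row early out;
//   100 <= 151 and tx_height > 16           -> only 16 of the 32 rows run.
// When every branch fails (e.g. all 512 coefficients are present), the
// fallthrough returns std::min(tx_height, 32) == 32 for this width.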
+template <typename Residual, typename Pixel>
+void Reconstruct(const dsp::Dsp& dsp, TransformType tx_type,
+                 TransformSize tx_size, bool lossless, Residual* const buffer,
+                 int start_x, int start_y, Array2DView<Pixel>* frame,
+                 int non_zero_coeff_count) {
+  static_assert(sizeof(Residual) == 2 || sizeof(Residual) == 4, "");
+  const int tx_width_log2 = kTransformWidthLog2[tx_size];
+  const int tx_height_log2 = kTransformHeightLog2[tx_size];
+
+  int tx_height = (non_zero_coeff_count == 1) ? 1 : kTransformHeight[tx_size];
+  if (tx_height > 4) {
+    static constexpr int (*kGetNumRows[])(TransformType tx_type, int tx_height,
+                                          int non_zero_coeff_count) = {
+        &GetNumRows<4>, &GetNumRows<8>, &GetNumRows<16>, &GetNumRows<32>,
+        &GetNumRows<32>};
+    tx_height = kGetNumRows[tx_width_log2 - 2](tx_type, tx_height,
+                                               non_zero_coeff_count);
+  }
+  assert(tx_height <= 32);
+
+  // Row transform.
+  const dsp::Transform1dSize row_transform_size =
+      GetTransform1dSize(tx_width_log2);
+  const dsp::Transform1d row_transform =
+      lossless ? dsp::kTransform1dWht : kRowTransform[tx_type];
+  const dsp::InverseTransformAddFunc row_transform_func =
+      dsp.inverse_transforms[row_transform][row_transform_size][dsp::kRow];
+  assert(row_transform_func != nullptr);
+
+  row_transform_func(tx_type, tx_size, tx_height, buffer, start_x, start_y,
+                     frame);
+
+  // Column transform.
+  const dsp::Transform1dSize column_transform_size =
+      GetTransform1dSize(tx_height_log2);
+  const dsp::Transform1d column_transform =
+      lossless ? dsp::kTransform1dWht : kColumnTransform[tx_type];
+  const dsp::InverseTransformAddFunc column_transform_func =
+      dsp.inverse_transforms[column_transform][column_transform_size]
+                            [dsp::kColumn];
+  assert(column_transform_func != nullptr);
+
+  column_transform_func(tx_type, tx_size, tx_height, buffer, start_x, start_y,
+                        frame);
+}
+
+template void Reconstruct(const dsp::Dsp& dsp, TransformType tx_type,
+                          TransformSize tx_size, bool lossless,
+                          int16_t* buffer, int start_x, int start_y,
+                          Array2DView<uint8_t>* frame,
+                          int non_zero_coeff_count);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template void Reconstruct(const dsp::Dsp& dsp, TransformType tx_type,
+                          TransformSize tx_size, bool lossless,
+                          int32_t* buffer, int start_x, int start_y,
+                          Array2DView<uint16_t>* frame,
+                          int non_zero_coeff_count);
+#endif
+
+}  // namespace libgav1
diff --git a/src/reconstruction.h b/src/reconstruction.h
new file mode 100644
index 0000000..6d5b115
--- /dev/null
+++ b/src/reconstruction.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_RECONSTRUCTION_H_
+#define LIBGAV1_SRC_RECONSTRUCTION_H_
+
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+// Steps 2 and 3 of section 7.12.3 (contains the implementation of section
+// 7.13.3).
+// Apply the inverse transforms and add the residual to the frame for the
+// transform block size |tx_size| starting at position |start_x| and |start_y|.
+template <typename Residual, typename Pixel>
+void Reconstruct(const dsp::Dsp& dsp, TransformType tx_type,
+                 TransformSize tx_size, bool lossless, Residual* buffer,
+                 int start_x, int start_y, Array2DView<Pixel>* frame,
+                 int non_zero_coeff_count);
+
+extern template void Reconstruct(const dsp::Dsp& dsp, TransformType tx_type,
+                                 TransformSize tx_size, bool lossless,
+                                 int16_t* buffer, int start_x, int start_y,
+                                 Array2DView<uint8_t>* frame,
+                                 int non_zero_coeff_count);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+extern template void Reconstruct(const dsp::Dsp& dsp, TransformType tx_type,
+                                 TransformSize tx_size, bool lossless,
+                                 int32_t* buffer, int start_x, int start_y,
+                                 Array2DView<uint16_t>* frame,
+                                 int non_zero_coeff_count);
+#endif
+
+}  // namespace libgav1
+#endif  // LIBGAV1_SRC_RECONSTRUCTION_H_
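A hedged caller sketch for the interface above; the wrapper name, block size, and coefficient count are illustrative, not prescribed by the header:

// Adds the inverse-transformed 4x4 DCT_DCT residual into an 8-bit frame view
// in place. |residual| holds dequantized coefficients; the dsp table must
// already be initialized for 8-bit decoding.
void AddBlock(const libgav1::dsp::Dsp& dsp,
              libgav1::Array2DView<uint8_t>* frame, int16_t* residual) {
  libgav1::Reconstruct(dsp, libgav1::kTransformTypeDctDct,
                       libgav1::kTransformSize4x4, /*lossless=*/false,
                       residual, /*start_x=*/0, /*start_y=*/0, frame,
                       /*non_zero_coeff_count=*/16);
}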
diff --git a/src/reconstruction_test.cc b/src/reconstruction_test.cc
new file mode 100644
index 0000000..fd780b3
--- /dev/null
+++ b/src/reconstruction_test.cc
@@ -0,0 +1,294 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/reconstruction.h"
+
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <vector>
+
+#include "absl/strings/match.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/inverse_transform.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/memory.h"
+#include "tests/block_utils.h"
+#include "tests/utils.h"
+
+namespace libgav1 {
+namespace {
+
+// Import the scan tables in the anonymous namespace.
+#include "src/scan_tables.inc"
+
+constexpr int kTestTransformSize = 4;
+constexpr int8_t kTestBitdepth = 8;
+
+using testing::ElementsAreArray;
+
+// The 'int' parameter is unused but required to allow for instantiations of C,
+// NEON, etc.
+class ReconstructionTest : public testing::TestWithParam<int> {
+ public:
+  ReconstructionTest() = default;
+  ReconstructionTest(const ReconstructionTest&) = delete;
+  ReconstructionTest& operator=(const ReconstructionTest&) = delete;
+  ~ReconstructionTest() override = default;
+
+ protected:
+  void SetUp() override {
+    test_utils::ResetDspTable(kTestBitdepth);
+    dsp::InverseTransformInit_C();
+    dsp_ = dsp::GetDspTable(kTestBitdepth);
+    ASSERT_NE(dsp_, nullptr);
+    const testing::TestInfo* const test_info =
+        testing::UnitTest::GetInstance()->current_test_info();
+    if (test_info->value_param() != nullptr) {
+      const char* const test_case = test_info->test_suite_name();
+      if (absl::StartsWith(test_case, "C/")) {
+      } else if (absl::StartsWith(test_case, "SSE41/")) {
+        if ((GetCpuInfo() & kSSE4_1) != 0) {
+          dsp::InverseTransformInit_SSE4_1();
+        }
+      } else if (absl::StartsWith(test_case, "NEON/")) {
+        dsp::InverseTransformInit_NEON();
+      } else {
+        FAIL() << "Unrecognized architecture prefix in test case name: "
+               << test_case;
+      }
+    }
+    InitBuffers();
+  }
+
+  void InitBuffers(int width = kTestTransformSize,
+                   int height = kTestTransformSize) {
+    const int size = width * height;
+    buffer_.clear();
+    buffer_.resize(size);
+    residual_buffer_.clear();
+    residual_buffer_.resize(size);
+    for (int i = 0; i < size; ++i) {
+      buffer_[i] = residual_buffer_[i] = i % 256;
+    }
+    frame_buffer_.Reset(height, width, buffer_.data());
+  }
+
+  template <int bitdepth>
+  void TestWht();
+
+  std::vector<uint8_t> buffer_;
+  std::vector<int16_t> residual_buffer_;
+  // |frame_buffer_| is just a 2D array view into the |buffer_|.
+  Array2DView<uint8_t> frame_buffer_;
+  const dsp::Dsp* dsp_;
+};
+
+template <int bitdepth>
+void ReconstructionTest::TestWht() {
+  static_assert(bitdepth == kBitdepth8 || bitdepth == kBitdepth10, "");
+  for (const auto transform :
+       dsp_->inverse_transforms[dsp::kTransform1dWht][dsp::kTransform1dSize4]) {
+    if (transform == nullptr) {
+      GTEST_SKIP() << "No function available for dsp::kTransform1dWht";
+    }
+  }
+  constexpr int max = 16 << bitdepth;
+  constexpr int min = -max;
+  static constexpr int16_t residual_inputs[][16]{
+      {64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+      {69, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+      {0, 0, 0, 0, 0, max - 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+      {0, 0, 0, 0, 0, min - 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+      // Note these are unrealistic inputs, but serve to test each position in
+      // the array and match extremes in some commercial test vectors.
+      {max, max, max, max, max, max, max, max, max, max, max, max, max, max,
+       max, max},
+      {min, min, min, min, min, min, min, min, min, min, min, min, min, min,
+       min, min}};
+  // Before the Reconstruct() call, the frame buffer is filled with all 127.
+  // After the Reconstruct() call, the frame buffer is expected to have the
+  // following values.
+ static constexpr uint8_t frame_outputs[][16]{ + {131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, + 131, 131}, + {132, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, + 131, 131}, + {255, 255, 0, 0, 255, 255, 0, 0, 0, 0, 255, 255, 0, 0, 255, 255}, + {0, 0, 255, 255, 0, 0, 255, 255, 255, 255, 0, 0, 255, 255, 0, 0}, + {255, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127}, + {0, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, + 127}, + }; + + const TransformSize tx_size = kTransformSize4x4; + const TransformType tx_type = kTransformTypeDctDct; + const int tx_width = kTransformWidth[tx_size]; + const int tx_height = kTransformHeight[tx_size]; + const uint16_t* const scan = kScan[GetTransformClass(tx_type)][tx_size]; + + InitBuffers(tx_width, tx_height); + + const int num_tests = sizeof(residual_inputs) / sizeof(residual_inputs[0]); + for (int i = 0; i < num_tests; ++i) { + int16_t eob; // Also known as non_zero_coeff_count. + for (eob = 15; eob >= 0; --eob) { + if (residual_inputs[i][scan[eob]] != 0) break; + } + ++eob; + memcpy(residual_buffer_.data(), residual_inputs[i], + sizeof(residual_inputs[i])); + memset(buffer_.data(), 127, sizeof(frame_outputs[i])); + Reconstruct(*dsp_, tx_type, tx_size, /*lossless=*/true, + residual_buffer_.data(), 0, 0, &frame_buffer_, eob); + + EXPECT_TRUE(test_utils::CompareBlocks(buffer_.data(), frame_outputs[i], + tx_width, tx_height, tx_width, + tx_width, false, true)) + << "Mismatch WHT test case " << i; + } +} + +TEST_P(ReconstructionTest, ReconstructionSimple) { + for (const auto transform : + dsp_->inverse_transforms[dsp::kTransform1dIdentity] + [dsp::kTransform1dSize4]) { + if (transform == nullptr) GTEST_SKIP(); + } + Reconstruct(*dsp_, kTransformTypeIdentityIdentity, kTransformSize4x4, false, + residual_buffer_.data(), 0, 0, &frame_buffer_, 16); + // clang-format off + static constexpr uint8_t expected_output_buffer[] = { + 0, 1, 2, 3, + 5, 6, 7, 8, + 9, 10, 11, 12, + 14, 15, 16, 17 + }; + // clang-format on + EXPECT_THAT(buffer_, ElementsAreArray(expected_output_buffer)); +} + +TEST_P(ReconstructionTest, ReconstructionFlipY) { + for (const auto transform : + dsp_->inverse_transforms[dsp::kTransform1dIdentity] + [dsp::kTransform1dSize4]) { + if (transform == nullptr) GTEST_SKIP(); + } + Reconstruct(*dsp_, kTransformTypeIdentityFlipadst, kTransformSize4x4, false, + residual_buffer_.data(), 0, 0, &frame_buffer_, 16); + // clang-format off + static constexpr uint8_t expected_buffer[] = { + 0, 1, 2, 3, + 4, 5, 6, 7, + 7, 8, 9, 10, + 14, 15, 16, 17 + }; + // clang-format on + EXPECT_THAT(buffer_, ElementsAreArray(expected_buffer)); +} + +TEST_P(ReconstructionTest, ReconstructionFlipX) { + for (const auto transform : + dsp_->inverse_transforms[dsp::kTransform1dIdentity] + [dsp::kTransform1dSize4]) { + if (transform == nullptr) GTEST_SKIP(); + } + Reconstruct(*dsp_, kTransformTypeFlipadstIdentity, kTransformSize4x4, false, + residual_buffer_.data(), 0, 0, &frame_buffer_, 16); + // clang-format off + static constexpr uint8_t expected_buffer[] = { + 0, 1, 2, 3, + 4, 5, 6, 8, + 8, 10, 10, 13, + 12, 14, 14, 18 + }; + // clang-format on + EXPECT_THAT(buffer_, ElementsAreArray(expected_buffer)); +} + +TEST_P(ReconstructionTest, ReconstructionFlipXAndFlipY) { + for (const auto transform : + dsp_->inverse_transforms[dsp::kTransform1dIdentity] + [dsp::kTransform1dSize4]) { + if (transform == nullptr) GTEST_SKIP(); + } + Reconstruct(*dsp_, kTransformTypeFlipadstFlipadst, 
+              kTransformSize4x4, false,
+              residual_buffer_.data(), 0, 0, &frame_buffer_, 16);
+  // clang-format off
+  static constexpr uint8_t expected_buffer[] = {
+       0,  1,  2,  3,
+       4,  5,  6,  8,
+       8,  8, 10,  9,
+      12, 14, 14, 19
+  };
+  // clang-format on
+  EXPECT_THAT(buffer_, ElementsAreArray(expected_buffer));
+}
+
+TEST_P(ReconstructionTest, ReconstructionNonZeroStart) {
+  uint8_t buffer[64] = {};
+  Array2DView<uint8_t> frame_buffer(8, 8, buffer);
+  int k = 0;
+  for (int i = 0; i < kTestTransformSize; ++i) {
+    for (int j = 0; j < kTestTransformSize; ++j) {
+      frame_buffer[i + 4][j + 4] = k++;
+    }
+  }
+  for (const auto transform :
+       dsp_->inverse_transforms[dsp::kTransform1dIdentity]
+                               [dsp::kTransform1dSize4]) {
+    if (transform == nullptr) GTEST_SKIP();
+  }
+  Reconstruct(*dsp_, kTransformTypeIdentityIdentity, kTransformSize4x4, false,
+              residual_buffer_.data(), 4, 4, &frame_buffer, 64);
+  // clang-format off
+  static constexpr uint8_t expected_buffer[] = {
+      0, 0, 0, 0,  0,  0,  0,  0,
+      0, 0, 0, 0,  0,  0,  0,  0,
+      0, 0, 0, 0,  0,  0,  0,  0,
+      0, 0, 0, 0,  0,  0,  0,  0,
+      0, 0, 0, 0,  0,  1,  2,  3,
+      0, 0, 0, 0,  5,  6,  7,  8,
+      0, 0, 0, 0,  9, 10, 11, 12,
+      0, 0, 0, 0, 14, 15, 16, 17
+  };
+  // clang-format on
+  EXPECT_THAT(buffer, ElementsAreArray(expected_buffer));
+}
+
+TEST_P(ReconstructionTest, Wht8bit) { TestWht<kBitdepth8>(); }
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+TEST_P(ReconstructionTest, Wht10bit) { TestWht<kBitdepth10>(); }
+#endif
+
+INSTANTIATE_TEST_SUITE_P(C, ReconstructionTest, testing::Values(0));
+
+#if LIBGAV1_ENABLE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE41, ReconstructionTest, testing::Values(0));
+#endif
+
+#if LIBGAV1_ENABLE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, ReconstructionTest, testing::Values(0));
+#endif
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/residual_buffer_pool.cc b/src/residual_buffer_pool.cc
new file mode 100644
index 0000000..44a842c
--- /dev/null
+++ b/src/residual_buffer_pool.cc
@@ -0,0 +1,143 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/residual_buffer_pool.h"
+
+#include <mutex>  // NOLINT (unapproved c++11 header)
+#include <utility>
+
+namespace libgav1 {
+namespace {
+
+// The maximum queue size is derived using the following formula:
+// ((sb_size * sb_size) / 16) + (2 * (((sb_size / x) * (sb_size / y)) / 16)).
+// Where:
+//   sb_size is the superblock size (64 or 128).
+//   16 is 4*4 which is kMinTransformWidth * kMinTransformHeight.
+//   x is subsampling_x + 1.
+//   y is subsampling_y + 1.
+// The first component is for the Y plane and the second component is for the U
+// and V planes.
+// For example, for 128x128 superblocks with 422 subsampling the size is:
+//   ((128 * 128) / 16) + (2 * (((128 / 2) * (128 / 1)) / 16)) = 2048.
+//
+// First dimension: use_128x128_superblock.
+// Second dimension: subsampling_x.
+// Third dimension: subsampling_y.
+constexpr int kMaxQueueSize[2][2][2] = {
+    // 64x64 superblocks.
+    {
+        {768, 512},
+        {512, 384},
+    },
+    // 128x128 superblocks.
+    {
+        {3072, 2048},
+        {2048, 1536},
+    },
+};
+
+}  // namespace
+
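// The table entries follow directly from the formula above. A hypothetical
// compile-time spot check (the helper is illustrative, not part of the file):
constexpr int MaxQueueSize(int sb_size, int subsampling_x, int subsampling_y) {
  return (sb_size * sb_size) / 16 +
         2 * (((sb_size >> subsampling_x) * (sb_size >> subsampling_y)) / 16);
}
// 64x64 superblocks with 4:2:0 subsampling: 256 + 2 * 64 = 384.
static_assert(MaxQueueSize(64, 1, 1) == kMaxQueueSize[0][1][1], "");
// 128x128 superblocks with 4:4:4 subsampling: 1024 + 2 * 1024 = 3072.
static_assert(MaxQueueSize(128, 0, 0) == kMaxQueueSize[1][0][0], "");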
+ResidualBufferStack::~ResidualBufferStack() {
+  while (top_ != nullptr) {
+    ResidualBuffer* top = top_;
+    top_ = top_->next_;
+    delete top;
+  }
+}
+
+void ResidualBufferStack::Push(std::unique_ptr<ResidualBuffer> buffer) {
+  buffer->next_ = top_;
+  top_ = buffer.release();
+  ++num_buffers_;
+}
+
+std::unique_ptr<ResidualBuffer> ResidualBufferStack::Pop() {
+  std::unique_ptr<ResidualBuffer> top;
+  if (top_ != nullptr) {
+    top.reset(top_);
+    top_ = top_->next_;
+    top->next_ = nullptr;
+    --num_buffers_;
+  }
+  return top;
+}
+
+void ResidualBufferStack::Swap(ResidualBufferStack* other) {
+  std::swap(top_, other->top_);
+  std::swap(num_buffers_, other->num_buffers_);
+}
+
+ResidualBufferPool::ResidualBufferPool(bool use_128x128_superblock,
+                                       int subsampling_x, int subsampling_y,
+                                       size_t residual_size)
+    : buffer_size_(GetResidualBufferSize(
+          use_128x128_superblock ? 128 : 64, use_128x128_superblock ? 128 : 64,
+          subsampling_x, subsampling_y, residual_size)),
+      queue_size_(kMaxQueueSize[static_cast<int>(use_128x128_superblock)]
+                               [subsampling_x][subsampling_y]) {}
+
+void ResidualBufferPool::Reset(bool use_128x128_superblock, int subsampling_x,
+                               int subsampling_y, size_t residual_size) {
+  const size_t buffer_size = GetResidualBufferSize(
+      use_128x128_superblock ? 128 : 64, use_128x128_superblock ? 128 : 64,
+      subsampling_x, subsampling_y, residual_size);
+  const int queue_size =
+      kMaxQueueSize[static_cast<int>(use_128x128_superblock)][subsampling_x]
+                   [subsampling_y];
+  if (buffer_size == buffer_size_ && queue_size == queue_size_) {
+    // The existing buffers (if any) are still valid, so don't do anything.
+    return;
+  }
+  buffer_size_ = buffer_size;
+  queue_size_ = queue_size;
+  // The existing buffers (if any) are no longer valid since the buffer size or
+  // the queue size has changed. Clear the stack.
+  ResidualBufferStack buffers;
+  {
+    std::lock_guard<std::mutex> lock(mutex_);
+    // Move the buffers in the stack to the local variable |buffers| and clear
+    // the stack.
+    buffers.Swap(&buffers_);
+    // Release mutex_ before freeing the buffers.
+  }
+  // As the local variable |buffers| goes out of scope, its destructor frees
+  // the buffers that were in the stack.
+}
+
+std::unique_ptr<ResidualBuffer> ResidualBufferPool::Get() {
+  std::unique_ptr<ResidualBuffer> buffer = nullptr;
+  {
+    std::lock_guard<std::mutex> lock(mutex_);
+    buffer = buffers_.Pop();
+  }
+  if (buffer == nullptr) {
+    buffer = ResidualBuffer::Create(buffer_size_, queue_size_);
+  }
+  return buffer;
+}
+
+void ResidualBufferPool::Release(std::unique_ptr<ResidualBuffer> buffer) {
+  buffer->transform_parameters()->Clear();
+  buffer->partition_tree_order()->Clear();
+  std::lock_guard<std::mutex> lock(mutex_);
+  buffers_.Push(std::move(buffer));
+}
+
+size_t ResidualBufferPool::Size() const {
+  std::lock_guard<std::mutex> lock(mutex_);
+  return buffers_.Size();
+}
+
+}  // namespace libgav1
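Reset() above deliberately keeps its critical section small: the stack is swapped out in constant time under the lock, and the buffers are freed only when the local stack is destroyed after the lock is released. The same idiom in isolation (names are illustrative):

// Discards all pooled buffers without holding the mutex during deletion, so
// concurrent Get()/Release() callers are not blocked by the teardown.
void DiscardAllBuffers(std::mutex& mutex, ResidualBufferStack* stack) {
  ResidualBufferStack retired;
  {
    std::lock_guard<std::mutex> lock(mutex);
    retired.Swap(stack);  // O(1) pointer swap while locked.
  }
  // |retired| goes out of scope here and its destructor deletes the buffers.
}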
diff --git a/src/residual_buffer_pool.h b/src/residual_buffer_pool.h
new file mode 100644
index 0000000..75924db
--- /dev/null
+++ b/src/residual_buffer_pool.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_RESIDUAL_BUFFER_POOL_H_
+#define LIBGAV1_SRC_RESIDUAL_BUFFER_POOL_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <mutex>  // NOLINT (unapproved c++11 header)
+#include <new>
+
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/queue.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+// This class is used for parsing and decoding a superblock. Members of this
+// class are populated in the "parse" step and consumed in the "decode" step.
+class ResidualBuffer : public Allocable {
+ public:
+  static std::unique_ptr<ResidualBuffer> Create(size_t buffer_size,
+                                                int queue_size) {
+    std::unique_ptr<ResidualBuffer> buffer(new (std::nothrow) ResidualBuffer);
+    if (buffer != nullptr) {
+      buffer->buffer_ = MakeAlignedUniquePtr<uint8_t>(32, buffer_size);
+      if (buffer->buffer_ == nullptr ||
+          !buffer->transform_parameters_.Init(queue_size) ||
+          !buffer->partition_tree_order_.Init(queue_size)) {
+        buffer = nullptr;
+      }
+    }
+    return buffer;
+  }
+
+  // Move only.
+  ResidualBuffer(ResidualBuffer&& other) = default;
+  ResidualBuffer& operator=(ResidualBuffer&& other) = default;
+
+  // Buffer used to store the residual values.
+  uint8_t* buffer() { return buffer_.get(); }
+  // Queue used to store the transform parameters.
+  Queue<TransformParameters>* transform_parameters() {
+    return &transform_parameters_;
+  }
+  // Queue used to store the block ordering in the partition tree of the
+  // superblocks.
+  Queue<PartitionTreeNode>* partition_tree_order() {
+    return &partition_tree_order_;
+  }
+
+ private:
+  friend class ResidualBufferStack;
+
+  ResidualBuffer() = default;
+
+  AlignedUniquePtr<uint8_t> buffer_;
+  Queue<TransformParameters> transform_parameters_;
+  Queue<PartitionTreeNode> partition_tree_order_;
+  // Used by ResidualBufferStack to form a chain of ResidualBuffers.
+  ResidualBuffer* next_ = nullptr;
+};
+
+// A LIFO stack of ResidualBuffers. Owns the buffers in the stack.
+class ResidualBufferStack {
+ public:
+  ResidualBufferStack() = default;
+
+  // Not copyable or movable.
+  ResidualBufferStack(const ResidualBufferStack&) = delete;
+  ResidualBufferStack& operator=(const ResidualBufferStack&) = delete;
+
+  ~ResidualBufferStack();
+
+  // Pushes |buffer| to the top of the stack.
+  void Push(std::unique_ptr<ResidualBuffer> buffer);
+
+  // If the stack is non-empty, returns the buffer at the top of the stack and
+  // removes it from the stack. If the stack is empty, returns nullptr.
+  std::unique_ptr<ResidualBuffer> Pop();
+
+  // Swaps the contents of this stack and |other|.
+  void Swap(ResidualBufferStack* other);
+
+  // Returns the number of buffers in the stack.
+  size_t Size() const { return num_buffers_; }
+
+ private:
+  // A singly-linked list of ResidualBuffers, chained together using the next_
+  // field of ResidualBuffer.
+  ResidualBuffer* top_ = nullptr;
+  size_t num_buffers_ = 0;
+};
+
+// Utility class used to manage the residual buffers (and the transform
+// parameters) used for multi-threaded decoding. This class uses a stack to
+// store the buffers for better cache locality, since buffers used more
+// recently are more likely to be in the cache. All functions in this class
+// are thread-safe.
+class ResidualBufferPool : public Allocable {
+ public:
+  ResidualBufferPool(bool use_128x128_superblock, int subsampling_x,
+                     int subsampling_y, size_t residual_size);
+
+  // Recomputes |buffer_size_| and invalidates the existing buffers if
+  // necessary.
+  void Reset(bool use_128x128_superblock, int subsampling_x, int subsampling_y,
+             size_t residual_size);
+  // Gets a residual buffer. The buffer is guaranteed to be large enough to
+  // store the residual values for one superblock whose parameters are the same
+  // as those passed to the constructor or to the last call to Reset(). If
+  // there are free buffers in the stack, it returns one from the stack;
+  // otherwise a new buffer is allocated.
+  std::unique_ptr<ResidualBuffer> Get();
+  // Returns the |buffer| back to the pool (by appending it to the stack).
+  // Subsequent calls to Get() may re-use this buffer.
+  void Release(std::unique_ptr<ResidualBuffer> buffer);
+
+  // Used only in the tests. Returns the number of buffers in the stack.
+  size_t Size() const;
+
+ private:
+  mutable std::mutex mutex_;
+  ResidualBufferStack buffers_ LIBGAV1_GUARDED_BY(mutex_);
+  size_t buffer_size_;
+  int queue_size_;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_RESIDUAL_BUFFER_POOL_H_
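To make the Get()/Release() contract concrete, here is a hedged usage sketch. DecodeOneSuperblock and the elided parse/decode steps are invented for illustration; only the pool calls mirror the header above:

    #include <memory>
    #include <utility>

    #include "src/residual_buffer_pool.h"

    void DecodeOneSuperblock(libgav1::ResidualBufferPool* pool) {
      // Parse step: take a buffer, reusing one from the stack when available.
      std::unique_ptr<libgav1::ResidualBuffer> residual = pool->Get();
      if (residual == nullptr) return;  // Allocation failure.
      // ... write residual values into residual->buffer() and push matching
      // entries onto residual->transform_parameters() ...
      // Decode step: drain the queues and apply the inverse transforms.
      // Finally hand the buffer back; Release() clears both queues before
      // pushing the buffer onto the stack for the next superblock.
      pool->Release(std::move(residual));
    }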
diff --git a/src/residual_buffer_pool_test.cc b/src/residual_buffer_pool_test.cc
new file mode 100644
index 0000000..84bc747
--- /dev/null
+++ b/src/residual_buffer_pool_test.cc
@@ -0,0 +1,201 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/residual_buffer_pool.h"
+
+#include <cstdint>
+#include <memory>
+#include <utility>
+
+#include "gtest/gtest.h"
+#include "src/utils/constants.h"
+#include "src/utils/queue.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace {
+
+TEST(ResidualBufferTest, TestUsage) {
+  ResidualBufferPool pool(true, 1, 1, sizeof(int16_t));
+  EXPECT_EQ(pool.Size(), 0);
+  // Get one buffer.
+  std::unique_ptr<ResidualBuffer> buffer1 = pool.Get();
+  uint8_t* const buffer1_ptr = buffer1->buffer();
+  ASSERT_NE(buffer1_ptr, nullptr);
+  // Get another buffer (while holding on to the first one).
+  std::unique_ptr<ResidualBuffer> buffer2 = pool.Get();
+  uint8_t* const buffer2_ptr = buffer2->buffer();
+  ASSERT_NE(buffer2_ptr, nullptr);
+  EXPECT_NE(buffer1_ptr, buffer2_ptr);
+  // Return the second buffer.
+  pool.Release(std::move(buffer2));
+  EXPECT_EQ(pool.Size(), 1);
+  // Get another buffer (this one should be the same as buffer2).
+  std::unique_ptr<ResidualBuffer> buffer3 = pool.Get();
+  uint8_t* const buffer3_ptr = buffer3->buffer();
+  ASSERT_NE(buffer3_ptr, nullptr);
+  EXPECT_EQ(buffer3_ptr, buffer2_ptr);
+  EXPECT_EQ(pool.Size(), 0);
+  // Get another buffer (this one will be a new buffer).
+  std::unique_ptr<ResidualBuffer> buffer4 = pool.Get();
+  uint8_t* const buffer4_ptr = buffer4->buffer();
+  ASSERT_NE(buffer4_ptr, nullptr);
+  EXPECT_NE(buffer4_ptr, buffer1_ptr);
+  EXPECT_NE(buffer4_ptr, buffer3_ptr);
+  EXPECT_EQ(pool.Size(), 0);
+  // Return all the buffers.
+  pool.Release(std::move(buffer1));
+  EXPECT_EQ(pool.Size(), 1);
+  pool.Release(std::move(buffer3));
+  EXPECT_EQ(pool.Size(), 2);
+  pool.Release(std::move(buffer4));
+  EXPECT_EQ(pool.Size(), 3);
+  // Reset the buffer with same parameters.
+  pool.Reset(true, 1, 1, sizeof(int16_t));
+  EXPECT_EQ(pool.Size(), 3);
+  // Reset the buffer size with different parameters.
+  pool.Reset(true, 0, 1, sizeof(int32_t));
+  // The existing buffers should now have been invalidated.
+  EXPECT_EQ(pool.Size(), 0);
+  // Get and return a buffer.
+  std::unique_ptr<ResidualBuffer> buffer5 = pool.Get();
+  uint8_t* const buffer5_ptr = buffer5->buffer();
+  ASSERT_NE(buffer5_ptr, nullptr);
+  pool.Release(std::move(buffer5));
+  EXPECT_EQ(pool.Size(), 1);
+  // Reset the buffer with a different value for use_128x128_superblock.
+  pool.Reset(false, 0, 1, sizeof(int32_t));
+  // The existing buffers should now have been invalidated.
+  EXPECT_EQ(pool.Size(), 0);
+}
+
+TEST(ResidualBufferTest, TestQueue) {
+  ResidualBufferPool pool(true, 1, 1, sizeof(int16_t));
+  EXPECT_EQ(pool.Size(), 0);
+  // Get one buffer.
+  std::unique_ptr<ResidualBuffer> buffer1 = pool.Get();
+  uint8_t* const buffer1_ptr = buffer1->buffer();
+  ASSERT_NE(buffer1_ptr, nullptr);
+  auto* queue1 = buffer1->transform_parameters();
+  queue1->Push(TransformParameters(kTransformTypeAdstAdst, 10));
+  EXPECT_EQ(queue1->Size(), 1);
+  EXPECT_EQ(queue1->Front().type, kTransformTypeAdstAdst);
+  EXPECT_EQ(queue1->Front().non_zero_coeff_count, 10);
+  queue1->Push(TransformParameters(kTransformTypeDctDct, 20));
+  EXPECT_EQ(queue1->Size(), 2);
+  EXPECT_EQ(queue1->Front().type, kTransformTypeAdstAdst);
+  EXPECT_EQ(queue1->Front().non_zero_coeff_count, 10);
+  queue1->Pop();
+  EXPECT_EQ(queue1->Size(), 1);
+  EXPECT_EQ(queue1->Front().type, kTransformTypeDctDct);
+  EXPECT_EQ(queue1->Front().non_zero_coeff_count, 20);
+  // Return the buffer.
+  pool.Release(std::move(buffer1));
+  EXPECT_EQ(pool.Size(), 1);
+  // Get another buffer (should be the same as buffer1).
+  std::unique_ptr<ResidualBuffer> buffer2 = pool.Get();
+  uint8_t* const buffer2_ptr = buffer2->buffer();
+  ASSERT_NE(buffer2_ptr, nullptr);
+  EXPECT_EQ(buffer1_ptr, buffer2_ptr);
+  // Releasing the buffer should've cleared the queue.
+  EXPECT_EQ(buffer2->transform_parameters()->Size(), 0);
+}
+
+TEST(ResidualBufferTest, TestStackPushPop) {
+  ResidualBufferStack buffers;
+  EXPECT_EQ(buffers.Size(), 0);
+  EXPECT_EQ(buffers.Pop(), nullptr);
+
+  std::unique_ptr<ResidualBuffer> buffer0 = ResidualBuffer::Create(128, 128);
+  ResidualBuffer* const buffer0_ptr = buffer0.get();
+  EXPECT_NE(buffer0_ptr, nullptr);
+  std::unique_ptr<ResidualBuffer> buffer1 = ResidualBuffer::Create(128, 128);
+  ResidualBuffer* const buffer1_ptr = buffer1.get();
+  EXPECT_NE(buffer1_ptr, nullptr);
+  std::unique_ptr<ResidualBuffer> buffer2 = ResidualBuffer::Create(128, 128);
+  ResidualBuffer* const buffer2_ptr = buffer2.get();
+  EXPECT_NE(buffer2_ptr, nullptr);
+
+  // Push two buffers onto the stack.
+  buffers.Push(std::move(buffer0));
+  EXPECT_EQ(buffers.Size(), 1);
+  buffers.Push(std::move(buffer1));
+  EXPECT_EQ(buffers.Size(), 2);
+
+  // Pop one buffer off the stack.
+  std::unique_ptr<ResidualBuffer> top = buffers.Pop();
+  EXPECT_EQ(buffers.Size(), 1);
+  EXPECT_EQ(top.get(), buffer1_ptr);
+
+  // Push one buffer onto the stack.
+  buffers.Push(std::move(buffer2));
+  EXPECT_EQ(buffers.Size(), 2);
+
+  // Pop two buffers off the stack.
+  top = buffers.Pop();
+  EXPECT_EQ(buffers.Size(), 1);
+  EXPECT_EQ(top.get(), buffer2_ptr);
+  top = buffers.Pop();
+  EXPECT_EQ(buffers.Size(), 0);
+  EXPECT_EQ(top.get(), buffer0_ptr);
+
+  // Try to pop a buffer off an empty stack.
+  top = buffers.Pop();
+  EXPECT_EQ(buffers.Size(), 0);
+  EXPECT_EQ(top, nullptr);
+}
+
+TEST(ResidualBufferTest, TestStackSwap) {
+  ResidualBufferStack buffers;
+  EXPECT_EQ(buffers.Size(), 0);
+  EXPECT_EQ(buffers.Pop(), nullptr);
+
+  std::unique_ptr<ResidualBuffer> buffer0 = ResidualBuffer::Create(128, 128);
+  ResidualBuffer* const buffer0_ptr = buffer0.get();
+  EXPECT_NE(buffer0_ptr, nullptr);
+  std::unique_ptr<ResidualBuffer> buffer1 = ResidualBuffer::Create(128, 128);
+  ResidualBuffer* const buffer1_ptr = buffer1.get();
+  EXPECT_NE(buffer1_ptr, nullptr);
+  std::unique_ptr<ResidualBuffer> buffer2 = ResidualBuffer::Create(128, 128);
+  ResidualBuffer* const buffer2_ptr = buffer2.get();
+  EXPECT_NE(buffer2_ptr, nullptr);
+
+  // Push three buffers onto the stack.
+  buffers.Push(std::move(buffer0));
+  EXPECT_EQ(buffers.Size(), 1);
+  buffers.Push(std::move(buffer1));
+  EXPECT_EQ(buffers.Size(), 2);
+  buffers.Push(std::move(buffer2));
+  EXPECT_EQ(buffers.Size(), 3);
+
+  // Swap the contents of the stacks.
+  ResidualBufferStack swapped;
+  swapped.Swap(&buffers);
+  EXPECT_EQ(buffers.Size(), 0);
+  EXPECT_EQ(swapped.Size(), 3);
+
+  // Pop three buffers off the swapped stack.
+  std::unique_ptr<ResidualBuffer> top = swapped.Pop();
+  EXPECT_EQ(swapped.Size(), 2);
+  EXPECT_EQ(top.get(), buffer2_ptr);
+  top = swapped.Pop();
+  EXPECT_EQ(swapped.Size(), 1);
+  EXPECT_EQ(top.get(), buffer1_ptr);
+  top = swapped.Pop();
+  EXPECT_EQ(swapped.Size(), 0);
+  EXPECT_EQ(top.get(), buffer0_ptr);
+}
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/scan_tables.inc b/src/scan_tables.inc
new file mode 100644
index 0000000..f7c9231
--- /dev/null
+++ b/src/scan_tables.inc
@@ -0,0 +1,440 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file contains all the scan order tables.
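Each table that follows maps a scan position i to a coefficient index within the transform block, with indices laid out row-major (index = row * width + column). A small illustrative sketch of walking a 4x4 block in the default (zig-zag) scan order, assuming that layout; PrintScanOrder4x4 is an invented helper:

    #include <cstdint>
    #include <cstdio>

    // Illustrative only: prints the (row, column) visit order implied by a
    // 4x4 scan table such as kDefaultScan4x4 below.
    void PrintScanOrder4x4(const uint16_t scan[16]) {
      for (int i = 0; i < 16; ++i) {
        const int row = scan[i] / 4;     // index = row * width + column,
        const int column = scan[i] % 4;  // with width == 4 here.
        std::printf("scan position %2d -> coefficient (%d, %d)\n", i, row,
                    column);
      }
    }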
+ +constexpr uint16_t kDefaultScan4x4[16] = {0, 1, 4, 8, 5, 2, 3, 6, + 9, 12, 13, 10, 7, 11, 14, 15}; + +constexpr uint16_t kColumnScan4x4[16] = {0, 4, 8, 12, 1, 5, 9, 13, + 2, 6, 10, 14, 3, 7, 11, 15}; + +constexpr uint16_t kRowScan4x4[16] = {0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15}; + +constexpr uint16_t kDefaultScan4x8[32] = { + 0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14, + 17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 27, 30, 31}; + +constexpr uint16_t kColumnScan4x8[32] = { + 0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29, + 2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31}; + +constexpr uint16_t kRowScan4x8[32] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; + +constexpr uint16_t kDefaultScan8x4[32] = { + 0, 8, 1, 16, 9, 2, 24, 17, 10, 3, 25, 18, 11, 4, 26, 19, + 12, 5, 27, 20, 13, 6, 28, 21, 14, 7, 29, 22, 15, 30, 23, 31}; + +constexpr uint16_t kColumnScan8x4[32] = { + 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, + 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31}; + +constexpr uint16_t kRowScan8x4[32] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; + +constexpr uint16_t kDefaultScan8x8[64] = { + 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, + 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, + 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51, + 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63}; + +constexpr uint16_t kColumnScan8x8[64] = { + 0, 8, 16, 24, 32, 40, 48, 56, 1, 9, 17, 25, 33, 41, 49, 57, + 2, 10, 18, 26, 34, 42, 50, 58, 3, 11, 19, 27, 35, 43, 51, 59, + 4, 12, 20, 28, 36, 44, 52, 60, 5, 13, 21, 29, 37, 45, 53, 61, + 6, 14, 22, 30, 38, 46, 54, 62, 7, 15, 23, 31, 39, 47, 55, 63}; + +constexpr uint16_t kRowScan8x8[64] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}; + +constexpr uint16_t kDefaultScan8x16[128] = { + 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, + 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14, + 21, 28, 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23, + 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80, + 39, 46, 53, 60, 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89, + 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98, + 105, 112, 71, 78, 85, 92, 99, 106, 113, 120, 79, 86, 93, 100, 107, + 114, 121, 87, 94, 101, 108, 115, 122, 95, 102, 109, 116, 123, 103, 110, + 117, 124, 111, 118, 125, 119, 126, 127}; + +constexpr uint16_t kColumnScan8x16[128] = { + 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, + 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121, + 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122, + 3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123, + 4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124, + 5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, + 6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, + 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127}; + +constexpr uint16_t kRowScan8x16[128] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 
30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127}; + +constexpr uint16_t kDefaultScan16x8[128] = { + 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 49, 34, 19, 4, 80, + 65, 50, 35, 20, 5, 96, 81, 66, 51, 36, 21, 6, 112, 97, 82, 67, + 52, 37, 22, 7, 113, 98, 83, 68, 53, 38, 23, 8, 114, 99, 84, 69, + 54, 39, 24, 9, 115, 100, 85, 70, 55, 40, 25, 10, 116, 101, 86, 71, + 56, 41, 26, 11, 117, 102, 87, 72, 57, 42, 27, 12, 118, 103, 88, 73, + 58, 43, 28, 13, 119, 104, 89, 74, 59, 44, 29, 14, 120, 105, 90, 75, + 60, 45, 30, 15, 121, 106, 91, 76, 61, 46, 31, 122, 107, 92, 77, 62, + 47, 123, 108, 93, 78, 63, 124, 109, 94, 79, 125, 110, 95, 126, 111, 127}; + +constexpr uint16_t kColumnScan16x8[128] = { + 0, 16, 32, 48, 64, 80, 96, 112, 1, 17, 33, 49, 65, 81, 97, 113, + 2, 18, 34, 50, 66, 82, 98, 114, 3, 19, 35, 51, 67, 83, 99, 115, + 4, 20, 36, 52, 68, 84, 100, 116, 5, 21, 37, 53, 69, 85, 101, 117, + 6, 22, 38, 54, 70, 86, 102, 118, 7, 23, 39, 55, 71, 87, 103, 119, + 8, 24, 40, 56, 72, 88, 104, 120, 9, 25, 41, 57, 73, 89, 105, 121, + 10, 26, 42, 58, 74, 90, 106, 122, 11, 27, 43, 59, 75, 91, 107, 123, + 12, 28, 44, 60, 76, 92, 108, 124, 13, 29, 45, 61, 77, 93, 109, 125, + 14, 30, 46, 62, 78, 94, 110, 126, 15, 31, 47, 63, 79, 95, 111, 127}; + +constexpr uint16_t kRowScan16x8[128] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127}; + +constexpr uint16_t kDefaultScan16x16[256] = { + 0, 1, 16, 32, 17, 2, 3, 18, 33, 48, 64, 49, 34, 19, 4, + 5, 20, 35, 50, 65, 80, 96, 81, 66, 51, 36, 21, 6, 7, 22, + 37, 52, 67, 82, 97, 112, 128, 113, 98, 83, 68, 53, 38, 23, 8, + 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 160, 145, 130, 115, 100, + 85, 70, 55, 40, 25, 10, 11, 26, 41, 56, 71, 86, 101, 116, 131, + 146, 161, 176, 192, 177, 162, 147, 132, 117, 102, 87, 72, 57, 42, 27, + 12, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 163, 178, 193, 208, + 224, 209, 194, 179, 164, 149, 134, 119, 104, 89, 74, 59, 44, 29, 14, + 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225, + 240, 241, 226, 211, 196, 181, 166, 151, 136, 121, 106, 91, 76, 61, 46, + 31, 47, 62, 77, 92, 107, 122, 137, 152, 167, 182, 197, 212, 227, 242, + 243, 228, 213, 198, 183, 168, 153, 138, 123, 108, 93, 78, 63, 79, 94, + 109, 124, 139, 154, 169, 184, 199, 214, 229, 244, 245, 230, 215, 200, 185, + 170, 155, 140, 125, 110, 95, 111, 126, 141, 156, 171, 186, 201, 216, 231, + 246, 247, 232, 217, 202, 187, 172, 157, 142, 127, 143, 158, 173, 188, 203, + 218, 233, 248, 249, 234, 219, 204, 189, 174, 159, 175, 190, 205, 220, 235, + 250, 251, 236, 221, 206, 191, 207, 222, 237, 252, 253, 238, 223, 239, 254, + 255}; + +constexpr uint16_t kColumnScan16x16[256] = { + 0, 16, 32, 48, 64, 80, 96, 112, 
128, 144, 160, 176, 192, 208, 224, 240, + 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241, + 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242, + 3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243, + 4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244, + 5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, + 6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, + 7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247, + 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248, + 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249, + 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250, + 11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251, + 12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, + 13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, + 14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, + 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255}; + +constexpr uint16_t kRowScan16x16[256] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255}; + +constexpr uint16_t kDefaultScan16x32[512] = { + 0, 1, 16, 2, 17, 32, 3, 18, 33, 48, 4, 19, 34, 49, 64, + 5, 20, 35, 50, 65, 80, 6, 21, 36, 51, 66, 81, 96, 7, 22, + 37, 52, 67, 82, 97, 112, 8, 23, 38, 53, 68, 83, 98, 113, 128, + 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 10, 25, 40, 55, 70, + 85, 100, 115, 130, 145, 160, 11, 26, 41, 56, 71, 86, 101, 116, 131, + 146, 161, 176, 12, 27, 42, 57, 72, 87, 102, 117, 132, 147, 162, 177, + 192, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 163, 178, 193, 208, + 14, 29, 44, 59, 74, 89, 104, 119, 134, 149, 164, 179, 194, 209, 224, + 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225, + 240, 31, 46, 61, 76, 91, 106, 121, 136, 151, 166, 181, 196, 211, 226, + 241, 256, 47, 62, 77, 92, 107, 122, 137, 152, 167, 182, 197, 212, 227, + 242, 257, 272, 63, 78, 93, 108, 123, 138, 153, 168, 183, 198, 213, 228, + 243, 258, 273, 288, 79, 94, 109, 124, 139, 154, 169, 184, 199, 214, 229, + 244, 259, 274, 289, 304, 95, 110, 125, 140, 155, 170, 185, 200, 215, 230, + 245, 260, 275, 290, 305, 320, 111, 126, 141, 156, 171, 186, 201, 216, 231, + 246, 261, 276, 291, 306, 321, 336, 
127, 142, 157, 172, 187, 202, 217, 232, + 247, 262, 277, 292, 307, 322, 337, 352, 143, 158, 173, 188, 203, 218, 233, + 248, 263, 278, 293, 308, 323, 338, 353, 368, 159, 174, 189, 204, 219, 234, + 249, 264, 279, 294, 309, 324, 339, 354, 369, 384, 175, 190, 205, 220, 235, + 250, 265, 280, 295, 310, 325, 340, 355, 370, 385, 400, 191, 206, 221, 236, + 251, 266, 281, 296, 311, 326, 341, 356, 371, 386, 401, 416, 207, 222, 237, + 252, 267, 282, 297, 312, 327, 342, 357, 372, 387, 402, 417, 432, 223, 238, + 253, 268, 283, 298, 313, 328, 343, 358, 373, 388, 403, 418, 433, 448, 239, + 254, 269, 284, 299, 314, 329, 344, 359, 374, 389, 404, 419, 434, 449, 464, + 255, 270, 285, 300, 315, 330, 345, 360, 375, 390, 405, 420, 435, 450, 465, + 480, 271, 286, 301, 316, 331, 346, 361, 376, 391, 406, 421, 436, 451, 466, + 481, 496, 287, 302, 317, 332, 347, 362, 377, 392, 407, 422, 437, 452, 467, + 482, 497, 303, 318, 333, 348, 363, 378, 393, 408, 423, 438, 453, 468, 483, + 498, 319, 334, 349, 364, 379, 394, 409, 424, 439, 454, 469, 484, 499, 335, + 350, 365, 380, 395, 410, 425, 440, 455, 470, 485, 500, 351, 366, 381, 396, + 411, 426, 441, 456, 471, 486, 501, 367, 382, 397, 412, 427, 442, 457, 472, + 487, 502, 383, 398, 413, 428, 443, 458, 473, 488, 503, 399, 414, 429, 444, + 459, 474, 489, 504, 415, 430, 445, 460, 475, 490, 505, 431, 446, 461, 476, + 491, 506, 447, 462, 477, 492, 507, 463, 478, 493, 508, 479, 494, 509, 495, + 510, 511}; + +constexpr uint16_t kDefaultScan32x16[512] = { + 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4, + 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193, + 162, 131, 100, 69, 38, 7, 256, 225, 194, 163, 132, 101, 70, 39, 8, + 288, 257, 226, 195, 164, 133, 102, 71, 40, 9, 320, 289, 258, 227, 196, + 165, 134, 103, 72, 41, 10, 352, 321, 290, 259, 228, 197, 166, 135, 104, + 73, 42, 11, 384, 353, 322, 291, 260, 229, 198, 167, 136, 105, 74, 43, + 12, 416, 385, 354, 323, 292, 261, 230, 199, 168, 137, 106, 75, 44, 13, + 448, 417, 386, 355, 324, 293, 262, 231, 200, 169, 138, 107, 76, 45, 14, + 480, 449, 418, 387, 356, 325, 294, 263, 232, 201, 170, 139, 108, 77, 46, + 15, 481, 450, 419, 388, 357, 326, 295, 264, 233, 202, 171, 140, 109, 78, + 47, 16, 482, 451, 420, 389, 358, 327, 296, 265, 234, 203, 172, 141, 110, + 79, 48, 17, 483, 452, 421, 390, 359, 328, 297, 266, 235, 204, 173, 142, + 111, 80, 49, 18, 484, 453, 422, 391, 360, 329, 298, 267, 236, 205, 174, + 143, 112, 81, 50, 19, 485, 454, 423, 392, 361, 330, 299, 268, 237, 206, + 175, 144, 113, 82, 51, 20, 486, 455, 424, 393, 362, 331, 300, 269, 238, + 207, 176, 145, 114, 83, 52, 21, 487, 456, 425, 394, 363, 332, 301, 270, + 239, 208, 177, 146, 115, 84, 53, 22, 488, 457, 426, 395, 364, 333, 302, + 271, 240, 209, 178, 147, 116, 85, 54, 23, 489, 458, 427, 396, 365, 334, + 303, 272, 241, 210, 179, 148, 117, 86, 55, 24, 490, 459, 428, 397, 366, + 335, 304, 273, 242, 211, 180, 149, 118, 87, 56, 25, 491, 460, 429, 398, + 367, 336, 305, 274, 243, 212, 181, 150, 119, 88, 57, 26, 492, 461, 430, + 399, 368, 337, 306, 275, 244, 213, 182, 151, 120, 89, 58, 27, 493, 462, + 431, 400, 369, 338, 307, 276, 245, 214, 183, 152, 121, 90, 59, 28, 494, + 463, 432, 401, 370, 339, 308, 277, 246, 215, 184, 153, 122, 91, 60, 29, + 495, 464, 433, 402, 371, 340, 309, 278, 247, 216, 185, 154, 123, 92, 61, + 30, 496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124, 93, + 62, 31, 497, 466, 435, 404, 373, 342, 311, 280, 249, 218, 187, 156, 125, + 94, 63, 498, 467, 436, 405, 374, 343, 312, 281, 250, 219, 188, 157, 126, + 95, 499, 468, 
437, 406, 375, 344, 313, 282, 251, 220, 189, 158, 127, 500, + 469, 438, 407, 376, 345, 314, 283, 252, 221, 190, 159, 501, 470, 439, 408, + 377, 346, 315, 284, 253, 222, 191, 502, 471, 440, 409, 378, 347, 316, 285, + 254, 223, 503, 472, 441, 410, 379, 348, 317, 286, 255, 504, 473, 442, 411, + 380, 349, 318, 287, 505, 474, 443, 412, 381, 350, 319, 506, 475, 444, 413, + 382, 351, 507, 476, 445, 414, 383, 508, 477, 446, 415, 509, 478, 447, 510, + 479, 511}; + +constexpr uint16_t kDefaultScan32x32[1024] = { + 0, 1, 32, 64, 33, 2, 3, 34, 65, 96, 128, 97, 66, + 35, 4, 5, 36, 67, 98, 129, 160, 192, 161, 130, 99, 68, + 37, 6, 7, 38, 69, 100, 131, 162, 193, 224, 256, 225, 194, + 163, 132, 101, 70, 39, 8, 9, 40, 71, 102, 133, 164, 195, + 226, 257, 288, 320, 289, 258, 227, 196, 165, 134, 103, 72, 41, + 10, 11, 42, 73, 104, 135, 166, 197, 228, 259, 290, 321, 352, + 384, 353, 322, 291, 260, 229, 198, 167, 136, 105, 74, 43, 12, + 13, 44, 75, 106, 137, 168, 199, 230, 261, 292, 323, 354, 385, + 416, 448, 417, 386, 355, 324, 293, 262, 231, 200, 169, 138, 107, + 76, 45, 14, 15, 46, 77, 108, 139, 170, 201, 232, 263, 294, + 325, 356, 387, 418, 449, 480, 512, 481, 450, 419, 388, 357, 326, + 295, 264, 233, 202, 171, 140, 109, 78, 47, 16, 17, 48, 79, + 110, 141, 172, 203, 234, 265, 296, 327, 358, 389, 420, 451, 482, + 513, 544, 576, 545, 514, 483, 452, 421, 390, 359, 328, 297, 266, + 235, 204, 173, 142, 111, 80, 49, 18, 19, 50, 81, 112, 143, + 174, 205, 236, 267, 298, 329, 360, 391, 422, 453, 484, 515, 546, + 577, 608, 640, 609, 578, 547, 516, 485, 454, 423, 392, 361, 330, + 299, 268, 237, 206, 175, 144, 113, 82, 51, 20, 21, 52, 83, + 114, 145, 176, 207, 238, 269, 300, 331, 362, 393, 424, 455, 486, + 517, 548, 579, 610, 641, 672, 704, 673, 642, 611, 580, 549, 518, + 487, 456, 425, 394, 363, 332, 301, 270, 239, 208, 177, 146, 115, + 84, 53, 22, 23, 54, 85, 116, 147, 178, 209, 240, 271, 302, + 333, 364, 395, 426, 457, 488, 519, 550, 581, 612, 643, 674, 705, + 736, 768, 737, 706, 675, 644, 613, 582, 551, 520, 489, 458, 427, + 396, 365, 334, 303, 272, 241, 210, 179, 148, 117, 86, 55, 24, + 25, 56, 87, 118, 149, 180, 211, 242, 273, 304, 335, 366, 397, + 428, 459, 490, 521, 552, 583, 614, 645, 676, 707, 738, 769, 800, + 832, 801, 770, 739, 708, 677, 646, 615, 584, 553, 522, 491, 460, + 429, 398, 367, 336, 305, 274, 243, 212, 181, 150, 119, 88, 57, + 26, 27, 58, 89, 120, 151, 182, 213, 244, 275, 306, 337, 368, + 399, 430, 461, 492, 523, 554, 585, 616, 647, 678, 709, 740, 771, + 802, 833, 864, 896, 865, 834, 803, 772, 741, 710, 679, 648, 617, + 586, 555, 524, 493, 462, 431, 400, 369, 338, 307, 276, 245, 214, + 183, 152, 121, 90, 59, 28, 29, 60, 91, 122, 153, 184, 215, + 246, 277, 308, 339, 370, 401, 432, 463, 494, 525, 556, 587, 618, + 649, 680, 711, 742, 773, 804, 835, 866, 897, 928, 960, 929, 898, + 867, 836, 805, 774, 743, 712, 681, 650, 619, 588, 557, 526, 495, + 464, 433, 402, 371, 340, 309, 278, 247, 216, 185, 154, 123, 92, + 61, 30, 31, 62, 93, 124, 155, 186, 217, 248, 279, 310, 341, + 372, 403, 434, 465, 496, 527, 558, 589, 620, 651, 682, 713, 744, + 775, 806, 837, 868, 899, 930, 961, 992, 993, 962, 931, 900, 869, + 838, 807, 776, 745, 714, 683, 652, 621, 590, 559, 528, 497, 466, + 435, 404, 373, 342, 311, 280, 249, 218, 187, 156, 125, 94, 63, + 95, 126, 157, 188, 219, 250, 281, 312, 343, 374, 405, 436, 467, + 498, 529, 560, 591, 622, 653, 684, 715, 746, 777, 808, 839, 870, + 901, 932, 963, 994, 995, 964, 933, 902, 871, 840, 809, 778, 747, + 716, 685, 654, 623, 592, 561, 530, 499, 468, 437, 406, 375, 344, + 313, 
282, 251, 220, 189, 158, 127, 159, 190, 221, 252, 283, 314, + 345, 376, 407, 438, 469, 500, 531, 562, 593, 624, 655, 686, 717, + 748, 779, 810, 841, 872, 903, 934, 965, 996, 997, 966, 935, 904, + 873, 842, 811, 780, 749, 718, 687, 656, 625, 594, 563, 532, 501, + 470, 439, 408, 377, 346, 315, 284, 253, 222, 191, 223, 254, 285, + 316, 347, 378, 409, 440, 471, 502, 533, 564, 595, 626, 657, 688, + 719, 750, 781, 812, 843, 874, 905, 936, 967, 998, 999, 968, 937, + 906, 875, 844, 813, 782, 751, 720, 689, 658, 627, 596, 565, 534, + 503, 472, 441, 410, 379, 348, 317, 286, 255, 287, 318, 349, 380, + 411, 442, 473, 504, 535, 566, 597, 628, 659, 690, 721, 752, 783, + 814, 845, 876, 907, 938, 969, 1000, 1001, 970, 939, 908, 877, 846, + 815, 784, 753, 722, 691, 660, 629, 598, 567, 536, 505, 474, 443, + 412, 381, 350, 319, 351, 382, 413, 444, 475, 506, 537, 568, 599, + 630, 661, 692, 723, 754, 785, 816, 847, 878, 909, 940, 971, 1002, + 1003, 972, 941, 910, 879, 848, 817, 786, 755, 724, 693, 662, 631, + 600, 569, 538, 507, 476, 445, 414, 383, 415, 446, 477, 508, 539, + 570, 601, 632, 663, 694, 725, 756, 787, 818, 849, 880, 911, 942, + 973, 1004, 1005, 974, 943, 912, 881, 850, 819, 788, 757, 726, 695, + 664, 633, 602, 571, 540, 509, 478, 447, 479, 510, 541, 572, 603, + 634, 665, 696, 727, 758, 789, 820, 851, 882, 913, 944, 975, 1006, + 1007, 976, 945, 914, 883, 852, 821, 790, 759, 728, 697, 666, 635, + 604, 573, 542, 511, 543, 574, 605, 636, 667, 698, 729, 760, 791, + 822, 853, 884, 915, 946, 977, 1008, 1009, 978, 947, 916, 885, 854, + 823, 792, 761, 730, 699, 668, 637, 606, 575, 607, 638, 669, 700, + 731, 762, 793, 824, 855, 886, 917, 948, 979, 1010, 1011, 980, 949, + 918, 887, 856, 825, 794, 763, 732, 701, 670, 639, 671, 702, 733, + 764, 795, 826, 857, 888, 919, 950, 981, 1012, 1013, 982, 951, 920, + 889, 858, 827, 796, 765, 734, 703, 735, 766, 797, 828, 859, 890, + 921, 952, 983, 1014, 1015, 984, 953, 922, 891, 860, 829, 798, 767, + 799, 830, 861, 892, 923, 954, 985, 1016, 1017, 986, 955, 924, 893, + 862, 831, 863, 894, 925, 956, 987, 1018, 1019, 988, 957, 926, 895, + 927, 958, 989, 1020, 1021, 990, 959, 991, 1022, 1023}; + +constexpr uint16_t kDefaultScan4x16[64] = { + 0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14, + 17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 32, 27, 30, + 33, 36, 31, 34, 37, 40, 35, 38, 41, 44, 39, 42, 45, 48, 43, 46, + 49, 52, 47, 50, 53, 56, 51, 54, 57, 60, 55, 58, 61, 59, 62, 63}; + +constexpr uint16_t kColumnScan4x16[64] = { + 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, + 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61, + 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62, + 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63}; + +constexpr uint16_t kRowScan4x16[64] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}; + +constexpr uint16_t kDefaultScan16x4[64] = { + 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 49, 34, 19, 4, 50, 35, + 20, 5, 51, 36, 21, 6, 52, 37, 22, 7, 53, 38, 23, 8, 54, 39, + 24, 9, 55, 40, 25, 10, 56, 41, 26, 11, 57, 42, 27, 12, 58, 43, + 28, 13, 59, 44, 29, 14, 60, 45, 30, 15, 61, 46, 31, 62, 47, 63}; + +constexpr uint16_t kColumnScan16x4[64] = { + 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51, + 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55, + 8, 24, 40, 56, 9, 25, 41, 57, 
10, 26, 42, 58, 11, 27, 43, 59, + 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63}; + +constexpr uint16_t kRowScan16x4[64] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}; + +constexpr uint16_t kDefaultScan8x32[256] = { + 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, + 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14, + 21, 28, 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23, + 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80, + 39, 46, 53, 60, 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89, + 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98, + 105, 112, 71, 78, 85, 92, 99, 106, 113, 120, 79, 86, 93, 100, 107, + 114, 121, 128, 87, 94, 101, 108, 115, 122, 129, 136, 95, 102, 109, 116, + 123, 130, 137, 144, 103, 110, 117, 124, 131, 138, 145, 152, 111, 118, 125, + 132, 139, 146, 153, 160, 119, 126, 133, 140, 147, 154, 161, 168, 127, 134, + 141, 148, 155, 162, 169, 176, 135, 142, 149, 156, 163, 170, 177, 184, 143, + 150, 157, 164, 171, 178, 185, 192, 151, 158, 165, 172, 179, 186, 193, 200, + 159, 166, 173, 180, 187, 194, 201, 208, 167, 174, 181, 188, 195, 202, 209, + 216, 175, 182, 189, 196, 203, 210, 217, 224, 183, 190, 197, 204, 211, 218, + 225, 232, 191, 198, 205, 212, 219, 226, 233, 240, 199, 206, 213, 220, 227, + 234, 241, 248, 207, 214, 221, 228, 235, 242, 249, 215, 222, 229, 236, 243, + 250, 223, 230, 237, 244, 251, 231, 238, 245, 252, 239, 246, 253, 247, 254, + 255}; + +constexpr uint16_t kDefaultScan32x8[256] = { + 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4, + 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193, + 162, 131, 100, 69, 38, 7, 225, 194, 163, 132, 101, 70, 39, 8, 226, + 195, 164, 133, 102, 71, 40, 9, 227, 196, 165, 134, 103, 72, 41, 10, + 228, 197, 166, 135, 104, 73, 42, 11, 229, 198, 167, 136, 105, 74, 43, + 12, 230, 199, 168, 137, 106, 75, 44, 13, 231, 200, 169, 138, 107, 76, + 45, 14, 232, 201, 170, 139, 108, 77, 46, 15, 233, 202, 171, 140, 109, + 78, 47, 16, 234, 203, 172, 141, 110, 79, 48, 17, 235, 204, 173, 142, + 111, 80, 49, 18, 236, 205, 174, 143, 112, 81, 50, 19, 237, 206, 175, + 144, 113, 82, 51, 20, 238, 207, 176, 145, 114, 83, 52, 21, 239, 208, + 177, 146, 115, 84, 53, 22, 240, 209, 178, 147, 116, 85, 54, 23, 241, + 210, 179, 148, 117, 86, 55, 24, 242, 211, 180, 149, 118, 87, 56, 25, + 243, 212, 181, 150, 119, 88, 57, 26, 244, 213, 182, 151, 120, 89, 58, + 27, 245, 214, 183, 152, 121, 90, 59, 28, 246, 215, 184, 153, 122, 91, + 60, 29, 247, 216, 185, 154, 123, 92, 61, 30, 248, 217, 186, 155, 124, + 93, 62, 31, 249, 218, 187, 156, 125, 94, 63, 250, 219, 188, 157, 126, + 95, 251, 220, 189, 158, 127, 252, 221, 190, 159, 253, 222, 191, 254, 223, + 255}; + +// 5.11.41 (implemented as a simple look up of transform class and transform +// size). 
+const uint16_t* kScan[3][kNumTransformSizes] = {
+    // kTransformClass2D
+    {kDefaultScan4x4, kDefaultScan4x8, kDefaultScan4x16, kDefaultScan8x4,
+     kDefaultScan8x8, kDefaultScan8x16, kDefaultScan8x32, kDefaultScan16x4,
+     kDefaultScan16x8, kDefaultScan16x16, kDefaultScan16x32, kDefaultScan16x32,
+     kDefaultScan32x8, kDefaultScan32x16, kDefaultScan32x32, kDefaultScan32x32,
+     kDefaultScan32x16, kDefaultScan32x32, kDefaultScan32x32},
+    // kTransformClassHorizontal
+    {kColumnScan4x4, kColumnScan4x8, kColumnScan4x16, kColumnScan8x4,
+     kColumnScan8x8, kColumnScan8x16, kColumnScan16x4, kColumnScan16x4,
+     kColumnScan16x8, kColumnScan16x16, kColumnScan16x4, kDefaultScan16x32,
+     kColumnScan16x4, kColumnScan16x4, kColumnScan16x4, kDefaultScan32x32,
+     kDefaultScan32x16, kDefaultScan32x32, kDefaultScan32x32},
+    // kTransformClassVertical
+    {kRowScan4x4, kRowScan4x8, kRowScan4x16, kRowScan8x4, kRowScan8x8,
+     kRowScan8x16, kRowScan16x4, kRowScan16x4, kRowScan16x8, kRowScan16x16,
+     kRowScan16x4, kDefaultScan16x32, kRowScan16x4, kRowScan16x4, kRowScan16x4,
+     kDefaultScan32x32, kDefaultScan32x16, kDefaultScan32x32,
+     kDefaultScan32x32}};
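The comment above kScan cites section 5.11.41: the scan order is a plain two-level lookup. A hedged sketch of that lookup, assuming the row order of kScan matches the TransformClass enum (2D, then horizontal, then vertical), as the row comments indicate; GetScanOrder is an invented helper name:

    // Illustrative only: selecting a scan order from transform class and size.
    const uint16_t* GetScanOrder(TransformClass tx_class,
                                 TransformSize tx_size) {
      // kScan rows are ordered kTransformClass2D, kTransformClassHorizontal,
      // kTransformClassVertical, per the comments in the table above.
      return kScan[tx_class][tx_size];
    }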
diff --git a/src/scan_test.cc b/src/scan_test.cc
new file mode 100644
index 0000000..065ca03
--- /dev/null
+++ b/src/scan_test.cc
@@ -0,0 +1,85 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <cstdint>
+#include <tuple>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace {
+
+// Import all the constants in the anonymous namespace.
+#include "src/scan_tables.inc"
+
+class ScanOrderTest
+    : public testing::TestWithParam<
+          std::tuple<TransformClass, TransformSize>> {
+ public:
+  ScanOrderTest() = default;
+  ScanOrderTest(const ScanOrderTest&) = delete;
+  ScanOrderTest& operator=(const ScanOrderTest&) = delete;
+  ~ScanOrderTest() override = default;
+
+ protected:
+  TransformClass tx_class_ = std::get<0>(GetParam());
+  TransformSize tx_size_ = std::get<1>(GetParam());
+};
+
+TEST_P(ScanOrderTest, AllIndicesAreScannedExactlyOnce) {
+  const int tx_width = kTransformWidth[tx_size_];
+  const int tx_height = kTransformHeight[tx_size_];
+  int num_indices;
+  if (tx_class_ == kTransformClass2D || std::max(tx_width, tx_height) == 64) {
+    const int clamped_tx_width = std::min(32, tx_width);
+    const int clamped_tx_height = std::min(32, tx_height);
+    num_indices = clamped_tx_width * clamped_tx_height;
+  } else {
+    num_indices =
+        (std::max(tx_width, tx_height) > 16) ? 64 : tx_width * tx_height;
+  }
+  const uint16_t* const scan = kScan[tx_class_][tx_size_];
+  ASSERT_NE(scan, nullptr);
+  // Ensure that all the indices are scanned exactly once.
+  std::vector<int> scanned;
+  scanned.resize(num_indices);
+  for (int i = 0; i < num_indices; ++i) {
+    scanned[scan[i]]++;
+  }
+  EXPECT_THAT(scanned, testing::Each(1));
+}
+
+constexpr TransformClass kTestTransformClasses[] = {
+    kTransformClass2D, kTransformClassVertical, kTransformClassHorizontal};
+
+constexpr TransformSize kTestTransformSizes[] = {
+    kTransformSize4x4,   kTransformSize4x8,   kTransformSize4x16,
+    kTransformSize8x4,   kTransformSize8x8,   kTransformSize8x16,
+    kTransformSize8x32,  kTransformSize16x4,  kTransformSize16x8,
+    kTransformSize16x16, kTransformSize16x32, kTransformSize16x64,
+    kTransformSize32x8,  kTransformSize32x16, kTransformSize32x32,
+    kTransformSize32x64, kTransformSize64x16, kTransformSize64x32,
+    kTransformSize64x64};
+
+INSTANTIATE_TEST_SUITE_P(
+    C, ScanOrderTest,
+    testing::Combine(testing::ValuesIn(kTestTransformClasses),
+                     testing::ValuesIn(kTestTransformSizes)));
+
+}  // namespace
+}  // namespace libgav1
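status_code.cc follows. Libgav1GetErrorString is part of the public C API, so a brief usage sketch may help; ReportStatus is an invented helper and the installed-header path is an assumption, while the call itself matches the definition below:

    /* Illustrative only: map a status code to its message via the C API. */
    #include <stdio.h>

    #include "gav1/status_code.h" /* Assumed installed-header path. */

    static void ReportStatus(Libgav1StatusCode status) {
      if (status != kLibgav1StatusOk) {
        fprintf(stderr, "libgav1: %s\n", Libgav1GetErrorString(status));
      }
    }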
diff --git a/src/status_code.cc b/src/status_code.cc
new file mode 100644
index 0000000..34def08
--- /dev/null
+++ b/src/status_code.cc
@@ -0,0 +1,57 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/gav1/status_code.h"
+
+extern "C" {
+
+const char* Libgav1GetErrorString(Libgav1StatusCode status) {
+  switch (status) {
+    case kLibgav1StatusOk:
+      return "Success.";
+    case kLibgav1StatusUnknownError:
+      return "Unknown error.";
+    case kLibgav1StatusInvalidArgument:
+      return "Invalid function argument.";
+    case kLibgav1StatusOutOfMemory:
+      return "Memory allocation failure.";
+    case kLibgav1StatusResourceExhausted:
+      return "Ran out of a resource (other than memory).";
+    case kLibgav1StatusNotInitialized:
+      return "The object is not initialized.";
+    case kLibgav1StatusAlready:
+      return "An operation that can only be performed once has already been "
+             "performed.";
+    case kLibgav1StatusUnimplemented:
+      return "Not implemented.";
+    case kLibgav1StatusInternalError:
+      return "Internal error in libgav1.";
+    case kLibgav1StatusBitstreamError:
+      return "The bitstream is not encoded correctly or violates a bitstream "
+             "conformance requirement.";
+    case kLibgav1StatusTryAgain:
+      return "The operation is not allowed at the moment. Try again later.";
+    case kLibgav1StatusNothingToDequeue:
+      return "There are no enqueued frames, so there is nothing to dequeue. "
+             "Try enqueuing a frame before trying to dequeue again.";
+    // This switch statement does not have a default case. This way the compiler
+    // will warn if we neglect to update this function after adding a new value
+    // to the Libgav1StatusCode enum type.
+    case kLibgav1StatusReservedForFutureExpansionUseDefaultInSwitchInstead_:
+      break;
+  }
+  return "Unrecognized status code.";
+}
+
+}  // extern "C"
diff --git a/src/symbol_decoder_context.cc b/src/symbol_decoder_context.cc
new file mode 100644
index 0000000..26a281e
--- /dev/null
+++ b/src/symbol_decoder_context.cc
@@ -0,0 +1,322 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/symbol_decoder_context.h"
+
+#include <cassert>
+#include <cstring>
+#include <type_traits>
+
+namespace libgav1 {
+namespace {
+
+// Import all the constants in the anonymous namespace.
+#include "src/symbol_decoder_context_cdfs.inc"
+
+uint8_t GetQuantizerContext(int base_quantizer_index) {
+  if (base_quantizer_index <= 20) return 0;
+  if (base_quantizer_index <= 60) return 1;
+  if (base_quantizer_index <= 120) return 2;
+  return 3;
+}
+
+// Reset*Counters() are helper functions to reset the CDF arrays where the
+// counters are not in the last element of the innermost dimension.
+
+void ResetPartitionCounters(SymbolDecoderContext* const context) {
+  int block_size_log2 = k4x4WidthLog2[kBlock8x8];
+  for (auto& d1 : context->partition_cdf) {
+    const int cdf_size =
+        SymbolDecoderContext::PartitionCdfSize(block_size_log2++);
+    for (auto& d2 : d1) {
+      d2[cdf_size] = 0;
+    }
+  }
+}
+
+void ResetPaletteColorIndexCounters(SymbolDecoderContext* const context) {
+  for (auto& d1 : context->palette_color_index_cdf) {
+    int cdf_size = kMinPaletteSize;
+    for (auto& d2 : d1) {
+      for (auto& d3 : d2) {
+        d3[cdf_size] = 0;
+      }
+      ++cdf_size;
+    }
+  }
+}
+
+void ResetTxTypeCounters(SymbolDecoderContext* const context) {
+  int set_index = kTransformSetIntra1;
+  for (auto& d1 : context->intra_tx_type_cdf) {
+    const int cdf_size = kNumTransformTypesInSet[set_index++];
+    for (auto& d2 : d1) {
+      for (auto& d3 : d2) {
+        d3[cdf_size] = 0;
+      }
+    }
+  }
+  for (auto& d1 : context->inter_tx_type_cdf) {
+    const int cdf_size = kNumTransformTypesInSet[set_index++];
+    for (auto& d2 : d1) {
+      d2[cdf_size] = 0;
+    }
+  }
+}
+
+void ResetTxDepthCounters(SymbolDecoderContext* const context) {
+  int delta = 1;
+  for (auto& d1 : context->tx_depth_cdf) {
+    const int cdf_size = kMaxTxDepthSymbolCount - delta;
+    delta = 0;
+    for (auto& d2 : d1) {
+      d2[cdf_size] = 0;
+    }
+  }
+}
+
+void ResetUVModeCounters(SymbolDecoderContext* const context) {
+  int cdf_size = kIntraPredictionModesUV - 1;
+  for (auto& d1 : context->uv_mode_cdf) {
+    for (auto& d2 : d1) {
+      d2[cdf_size] = 0;
+    }
+    ++cdf_size;
+  }
+}
+
+}  // namespace
+
+#define CDF_COPY(source, destination)                       \
+  static_assert(sizeof(source) == sizeof(destination), ""); \
+  memcpy(destination, source, sizeof(source))
+
+void SymbolDecoderContext::Initialize(int base_quantizer_index) {
+  CDF_COPY(kDefaultPartitionCdf, partition_cdf);
+  CDF_COPY(kDefaultSkipCdf, skip_cdf);
+  CDF_COPY(kDefaultSkipModeCdf, skip_mode_cdf);
+  CDF_COPY(kDefaultSegmentIdCdf, segment_id_cdf);
+  CDF_COPY(kDefaultUsePredictedSegmentIdCdf, use_predicted_segment_id_cdf);
+  CDF_COPY(kDefaultDeltaQCdf,
delta_q_cdf); + CDF_COPY(kDefaultDeltaQCdf, delta_lf_cdf); + for (auto& delta_lf_multi_cdf_entry : delta_lf_multi_cdf) { + CDF_COPY(kDefaultDeltaQCdf, delta_lf_multi_cdf_entry); + } + CDF_COPY(kDefaultIntraBlockCopyCdf, intra_block_copy_cdf); + CDF_COPY(kDefaultIntraFrameYModeCdf, intra_frame_y_mode_cdf); + CDF_COPY(kDefaultYModeCdf, y_mode_cdf); + CDF_COPY(kDefaultAngleDeltaCdf, angle_delta_cdf); + CDF_COPY(kDefaultUVModeCdf, uv_mode_cdf); + CDF_COPY(kDefaultCflAlphaSignsCdf, cfl_alpha_signs_cdf); + CDF_COPY(kDefaultCflAlphaCdf, cfl_alpha_cdf); + CDF_COPY(kDefaultUseFilterIntraCdf, use_filter_intra_cdf); + CDF_COPY(kDefaultFilterIntraModeCdf, filter_intra_mode_cdf); + CDF_COPY(kDefaultTxDepthCdf, tx_depth_cdf); + CDF_COPY(kDefaultTxSplitCdf, tx_split_cdf); + CDF_COPY(kDefaultInterTxTypeCdf, inter_tx_type_cdf); + CDF_COPY(kDefaultIntraTxTypeCdf, intra_tx_type_cdf); + CDF_COPY(kDefaultRestorationTypeCdf, restoration_type_cdf); + CDF_COPY(kDefaultUseWienerCdf, use_wiener_cdf); + CDF_COPY(kDefaultUseSgrProjCdf, use_sgrproj_cdf); + CDF_COPY(kDefaultHasPaletteYCdf, has_palette_y_cdf); + CDF_COPY(kDefaultPaletteYSizeCdf, palette_y_size_cdf); + CDF_COPY(kDefaultHasPaletteUVCdf, has_palette_uv_cdf); + CDF_COPY(kDefaultPaletteUVSizeCdf, palette_uv_size_cdf); + CDF_COPY(kDefaultPaletteColorIndexCdf, palette_color_index_cdf); + CDF_COPY(kDefaultIsInterCdf, is_inter_cdf); + CDF_COPY(kDefaultUseCompoundReferenceCdf, use_compound_reference_cdf); + CDF_COPY(kDefaultCompoundReferenceTypeCdf, compound_reference_type_cdf); + CDF_COPY(kDefaultCompoundReferenceCdf, compound_reference_cdf); + CDF_COPY(kDefaultCompoundBackwardReferenceCdf, + compound_backward_reference_cdf); + CDF_COPY(kDefaultSingleReferenceCdf, single_reference_cdf); + CDF_COPY(kDefaultCompoundPredictionModeCdf, compound_prediction_mode_cdf); + CDF_COPY(kDefaultNewMvCdf, new_mv_cdf); + CDF_COPY(kDefaultZeroMvCdf, zero_mv_cdf); + CDF_COPY(kDefaultReferenceMvCdf, reference_mv_cdf); + CDF_COPY(kDefaultRefMvIndexCdf, ref_mv_index_cdf); + CDF_COPY(kDefaultIsInterIntraCdf, is_inter_intra_cdf); + CDF_COPY(kDefaultInterIntraModeCdf, inter_intra_mode_cdf); + CDF_COPY(kDefaultIsWedgeInterIntraCdf, is_wedge_inter_intra_cdf); + CDF_COPY(kDefaultWedgeIndexCdf, wedge_index_cdf); + CDF_COPY(kDefaultUseObmcCdf, use_obmc_cdf); + CDF_COPY(kDefaultMotionModeCdf, motion_mode_cdf); + CDF_COPY(kDefaultIsExplicitCompoundTypeCdf, is_explicit_compound_type_cdf); + CDF_COPY(kDefaultIsCompoundTypeAverageCdf, is_compound_type_average_cdf); + CDF_COPY(kDefaultCompoundTypeCdf, compound_type_cdf); + CDF_COPY(kDefaultInterpolationFilterCdf, interpolation_filter_cdf); + for (int i = 0; i < kMvContexts; ++i) { + CDF_COPY(kDefaultMvJointCdf, mv_joint_cdf[i]); + for (int j = 0; j < kNumMvComponents; ++j) { + CDF_COPY(kDefaultMvSignCdf, mv_sign_cdf[i][j]); + CDF_COPY(kDefaultMvClassCdf, mv_class_cdf[i][j]); + CDF_COPY(kDefaultMvClass0BitCdf, mv_class0_bit_cdf[i][j]); + CDF_COPY(kDefaultMvClass0FractionCdf, mv_class0_fraction_cdf[i][j]); + CDF_COPY(kDefaultMvClass0HighPrecisionCdf, + mv_class0_high_precision_cdf[i][j]); + CDF_COPY(kDefaultMvBitCdf, mv_bit_cdf[i][j]); + CDF_COPY(kDefaultMvFractionCdf, mv_fraction_cdf[i][j]); + CDF_COPY(kDefaultMvHighPrecisionCdf, mv_high_precision_cdf[i][j]); + } + } + const int quantizer_context = GetQuantizerContext(base_quantizer_index); + CDF_COPY(kDefaultAllZeroCdf[quantizer_context], all_zero_cdf); + CDF_COPY(kDefaultEobPt16Cdf[quantizer_context], eob_pt_16_cdf); + CDF_COPY(kDefaultEobPt32Cdf[quantizer_context], eob_pt_32_cdf); + 
CDF_COPY(kDefaultEobPt64Cdf[quantizer_context], eob_pt_64_cdf);
+  CDF_COPY(kDefaultEobPt128Cdf[quantizer_context], eob_pt_128_cdf);
+  CDF_COPY(kDefaultEobPt256Cdf[quantizer_context], eob_pt_256_cdf);
+  CDF_COPY(kDefaultEobPt512Cdf[quantizer_context], eob_pt_512_cdf);
+  CDF_COPY(kDefaultEobPt1024Cdf[quantizer_context], eob_pt_1024_cdf);
+  CDF_COPY(kDefaultEobExtraCdf[quantizer_context], eob_extra_cdf);
+  CDF_COPY(kDefaultCoeffBaseEobCdf[quantizer_context], coeff_base_eob_cdf);
+  CDF_COPY(kDefaultCoeffBaseCdf[quantizer_context], coeff_base_cdf);
+  CDF_COPY(kDefaultCoeffBaseRangeCdf[quantizer_context], coeff_base_range_cdf);
+  CDF_COPY(kDefaultDcSignCdf[quantizer_context], dc_sign_cdf);
+}
+
+void SymbolDecoderContext::ResetIntraFrameYModeCdf() {
+  CDF_COPY(kDefaultIntraFrameYModeCdf, intra_frame_y_mode_cdf);
+}
+
+#undef CDF_COPY
+
+// These macros set the last element in the innermost dimension of the array
+// to zero.
+#define RESET_COUNTER_1D(array)                              \
+  do {                                                       \
+    (array)[std::extent<decltype(array), 0>::value - 1] = 0; \
+  } while (false)
+
+#define RESET_COUNTER_2D(array)                           \
+  do {                                                    \
+    for (auto& d1 : (array)) {                            \
+      d1[std::extent<decltype(array), 1>::value - 1] = 0; \
+    }                                                     \
+  } while (false)
+
+#define RESET_COUNTER_3D(array)                             \
+  do {                                                      \
+    for (auto& d1 : (array)) {                              \
+      for (auto& d2 : d1) {                                 \
+        d2[std::extent<decltype(array), 2>::value - 1] = 0; \
+      }                                                     \
+    }                                                       \
+  } while (false)
+
+#define RESET_COUNTER_4D(array)                               \
+  do {                                                        \
+    for (auto& d1 : (array)) {                                \
+      for (auto& d2 : d1) {                                   \
+        for (auto& d3 : d2) {                                 \
+          d3[std::extent<decltype(array), 3>::value - 1] = 0; \
+        }                                                     \
+      }                                                       \
+    }                                                         \
+  } while (false)
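The macros above locate the counter slot at compile time with std::extent, so no dimension size is hard-coded. A standalone sketch of the mechanism; example_cdf and ResetExampleCounters are made-up stand-ins for a real CDF table:

    #include <cstdint>
    #include <type_traits>

    // std::extent<T, N>::value is the size of dimension N of array type T, so
    // "value - 1" indexes the trailing counter slot of the innermost dimension.
    uint16_t example_cdf[4][7];  // 4 contexts, 6 CDF entries + 1 counter each.
    static_assert(std::extent<decltype(example_cdf), 1>::value == 7, "");

    void ResetExampleCounters() {
      for (auto& context : example_cdf) {
        // Zeroes example_cdf[i][6] in every context, leaving the CDF intact.
        context[std::extent<decltype(example_cdf), 1>::value - 1] = 0;
      }
    }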
+
+void SymbolDecoderContext::ResetCounters() {
+  ResetPartitionCounters(this);
+  RESET_COUNTER_2D(segment_id_cdf);
+  RESET_COUNTER_2D(use_predicted_segment_id_cdf);
+  RESET_COUNTER_2D(skip_cdf);
+  RESET_COUNTER_2D(skip_mode_cdf);
+  RESET_COUNTER_1D(delta_q_cdf);
+  RESET_COUNTER_1D(delta_lf_cdf);
+  RESET_COUNTER_2D(delta_lf_multi_cdf);
+  RESET_COUNTER_1D(intra_block_copy_cdf);
+  RESET_COUNTER_3D(intra_frame_y_mode_cdf);
+  RESET_COUNTER_2D(y_mode_cdf);
+  RESET_COUNTER_2D(angle_delta_cdf);
+  ResetUVModeCounters(this);
+  RESET_COUNTER_1D(cfl_alpha_signs_cdf);
+  RESET_COUNTER_2D(cfl_alpha_cdf);
+  RESET_COUNTER_2D(use_filter_intra_cdf);
+  RESET_COUNTER_1D(filter_intra_mode_cdf);
+  ResetTxDepthCounters(this);
+  RESET_COUNTER_2D(tx_split_cdf);
+  RESET_COUNTER_3D(all_zero_cdf);
+  ResetTxTypeCounters(this);
+  RESET_COUNTER_3D(eob_pt_16_cdf);
+  RESET_COUNTER_3D(eob_pt_32_cdf);
+  RESET_COUNTER_3D(eob_pt_64_cdf);
+  RESET_COUNTER_3D(eob_pt_128_cdf);
+  RESET_COUNTER_3D(eob_pt_256_cdf);
+  RESET_COUNTER_2D(eob_pt_512_cdf);
+  RESET_COUNTER_2D(eob_pt_1024_cdf);
+  RESET_COUNTER_4D(eob_extra_cdf);
+  RESET_COUNTER_4D(coeff_base_eob_cdf);
+  RESET_COUNTER_4D(coeff_base_cdf);
+  RESET_COUNTER_4D(coeff_base_range_cdf);
+  RESET_COUNTER_3D(dc_sign_cdf);
+  RESET_COUNTER_1D(restoration_type_cdf);
+  RESET_COUNTER_1D(use_wiener_cdf);
+  RESET_COUNTER_1D(use_sgrproj_cdf);
+  RESET_COUNTER_3D(has_palette_y_cdf);
+  RESET_COUNTER_2D(palette_y_size_cdf);
+  RESET_COUNTER_2D(has_palette_uv_cdf);
+  RESET_COUNTER_2D(palette_uv_size_cdf);
+  ResetPaletteColorIndexCounters(this);
+  RESET_COUNTER_2D(is_inter_cdf);
+  RESET_COUNTER_2D(use_compound_reference_cdf);
+  RESET_COUNTER_2D(compound_reference_type_cdf);
+  RESET_COUNTER_4D(compound_reference_cdf);
+  RESET_COUNTER_3D(compound_backward_reference_cdf);
+  RESET_COUNTER_3D(single_reference_cdf);
+  RESET_COUNTER_2D(compound_prediction_mode_cdf);
+  RESET_COUNTER_2D(new_mv_cdf);
+  RESET_COUNTER_2D(zero_mv_cdf);
+  RESET_COUNTER_2D(reference_mv_cdf);
+  RESET_COUNTER_2D(ref_mv_index_cdf);
+  RESET_COUNTER_2D(is_inter_intra_cdf);
+  RESET_COUNTER_2D(inter_intra_mode_cdf);
+  RESET_COUNTER_2D(is_wedge_inter_intra_cdf);
+  RESET_COUNTER_2D(wedge_index_cdf);
+  RESET_COUNTER_2D(use_obmc_cdf);
+  RESET_COUNTER_2D(motion_mode_cdf);
+  RESET_COUNTER_2D(is_explicit_compound_type_cdf);
+  RESET_COUNTER_2D(is_compound_type_average_cdf);
+  RESET_COUNTER_2D(compound_type_cdf);
+  RESET_COUNTER_2D(interpolation_filter_cdf);
+  RESET_COUNTER_2D(mv_joint_cdf);
+  RESET_COUNTER_3D(mv_sign_cdf);
+  RESET_COUNTER_3D(mv_class_cdf);
+  RESET_COUNTER_3D(mv_class0_bit_cdf);
+  RESET_COUNTER_4D(mv_class0_fraction_cdf);
+  RESET_COUNTER_3D(mv_class0_high_precision_cdf);
+  RESET_COUNTER_4D(mv_bit_cdf);
+  RESET_COUNTER_3D(mv_fraction_cdf);
+  RESET_COUNTER_3D(mv_high_precision_cdf);
+}
+
+#undef RESET_COUNTER_1D
+#undef RESET_COUNTER_2D
+#undef RESET_COUNTER_3D
+#undef RESET_COUNTER_4D
+
+int SymbolDecoderContext::PartitionCdfSize(int block_size_log2) {
+  assert(block_size_log2 > 0);
+  assert(block_size_log2 < 6);
+
+  switch (block_size_log2) {
+    case 1:
+      return kPartitionSplit + 1;
+    case 5:
+      return kPartitionVerticalWithRightSplit + 1;
+    default:
+      return kMaxPartitionTypes;
+  }
+}
+
+}  // namespace libgav1
diff --git a/src/symbol_decoder_context.h b/src/symbol_decoder_context.h
new file mode 100644
index 0000000..1bea76c
--- /dev/null
+++ b/src/symbol_decoder_context.h
@@ -0,0 +1,301 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_SYMBOL_DECODER_CONTEXT_H_
+#define LIBGAV1_SRC_SYMBOL_DECODER_CONTEXT_H_
+
+#include <cassert>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+
+enum {
+  kPartitionContexts = 4,
+  kSegmentIdContexts = 3,
+  kUsePredictedSegmentIdContexts = 3,
+  kSkipContexts = 3,
+  kSkipModeContexts = 3,
+  kBooleanFieldCdfSize = 3,
+  kDeltaSymbolCount = 4,  // Used for both delta_q and delta_lf.
+  kIntraModeContexts = 5,
+  kYModeContexts = 4,
+  kAngleDeltaSymbolCount = 2 * kMaxAngleDelta + 1,
+  kCflAlphaSignsSymbolCount = 8,
+  kCflAlphaContexts = 6,
+  kCflAlphaSymbolCount = 16,
+  kTxDepthContexts = 3,
+  kMaxTxDepthSymbolCount = 3,
+  kTxSplitContexts = 21,
+  kCoefficientQuantizerContexts = 4,
+  kNumSquareTransformSizes = 5,
+  kAllZeroContexts = 13,
+  kNumExtendedTransformSizes = 4,
+  kEobPtContexts = 2,
+  kEobPt16SymbolCount = 5,
+  kEobPt32SymbolCount = 6,
+  kEobPt64SymbolCount = 7,
+  kEobPt128SymbolCount = 8,
+  kEobPt256SymbolCount = 9,
+  kEobPt512SymbolCount = 10,
+  kEobPt1024SymbolCount = 11,
+  kEobExtraContexts = 9,
+  kCoeffBaseEobContexts = 4,
+  kCoeffBaseEobSymbolCount = 3,
+  kCoeffBaseContexts = 42,
+  kCoeffBaseSymbolCount = 4,
+  kCoeffBaseRangeContexts = 21,
+  kCoeffBaseRangeSymbolCount = 4,
+  kDcSignContexts = 3,
+  kPaletteBlockSizeContexts = 7,
+  kPaletteYModeContexts = 3,
+  kPaletteUVModeContexts = 2,
+  kPaletteSizeSymbolCount = 7,
+  kPaletteColorIndexContexts = 5,
+  kPaletteColorIndexSymbolCount = 8,
+  kIsInterContexts = 4,
+  kUseCompoundReferenceContexts = 5,
+  kCompoundReferenceTypeContexts = 5,
+  kReferenceContexts = 3,
+  kCompoundPredictionModeContexts = 8,
+  kNewMvContexts = 6,
+  kZeroMvContexts = 2,
+  kReferenceMvContexts = 6,
+  kRefMvIndexContexts = 3,
+  kInterIntraContexts = 3,
+  kWedgeIndexSymbolCount = 16,
+  kIsExplicitCompoundTypeContexts = 6,
+  kIsCompoundTypeAverageContexts = 6,
+  kInterpolationFilterContexts = 16,
+  kMvContexts = 2,
+  kMvClassSymbolCount = 11,
+  kMvFractionSymbolCount = 4,
+  kMvBitSymbolCount = 10,
+  kNumMvComponents = 2,
+};  // anonymous enum
+
+struct SymbolDecoderContext {
+  SymbolDecoderContext() = default;
+  explicit SymbolDecoderContext(int base_quantizer_index) {
+    Initialize(base_quantizer_index);
+  }
+
+  void Initialize(int base_quantizer_index);
+
+  // Partition related variables and functions.
+  static int PartitionCdfSize(int block_size_log2);
+
+  // Returns the cdf array index for inter_tx_type or intra_tx_type based on
+  // |tx_set|.
+  static int TxTypeIndex(TransformSet tx_set) {
+    assert(tx_set != kTransformSetDctOnly);
+    switch (tx_set) {
+      case kTransformSetInter1:
+      case kTransformSetIntra1:
+        return 0;
+      case kTransformSetInter2:
+      case kTransformSetIntra2:
+        return 1;
+      case kTransformSetInter3:
+        return 2;
+      default:
+        return -1;
+    }
+  }
+
+  // Resets the intra_frame_y_mode_cdf array to the default.
+  void ResetIntraFrameYModeCdf();
+
+  // Resets the symbol counters of all the CDF arrays to zero. The symbol
+  // counter is the last used element in the innermost dimension of each CDF
+  // array.
+  void ResetCounters();
+
+  // Note kMaxAlignment allows for aligned instructions to be used in the
+  // copies done in Initialize().
+ alignas(kMaxAlignment) uint16_t + partition_cdf[kBlockWidthCount][kPartitionContexts] + [kMaxPartitionTypes + 1]; + alignas(kMaxAlignment) uint16_t + segment_id_cdf[kSegmentIdContexts][kMaxSegments + 1]; + alignas(kMaxAlignment) uint16_t + use_predicted_segment_id_cdf[kUsePredictedSegmentIdContexts] + [kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t skip_cdf[kSkipContexts][kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + skip_mode_cdf[kSkipModeContexts][kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t delta_q_cdf[kDeltaSymbolCount + 1]; + alignas(kMaxAlignment) uint16_t delta_lf_cdf[kDeltaSymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + delta_lf_multi_cdf[kFrameLfCount][kDeltaSymbolCount + 1]; + alignas(kMaxAlignment) uint16_t intra_block_copy_cdf[kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + intra_frame_y_mode_cdf[kIntraModeContexts][kIntraModeContexts] + [kIntraPredictionModesY + 1]; + alignas(kMaxAlignment) uint16_t + y_mode_cdf[kYModeContexts][kIntraPredictionModesY + 1]; + alignas(kMaxAlignment) uint16_t + angle_delta_cdf[kDirectionalIntraModes][kAngleDeltaSymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + uv_mode_cdf[kBooleanSymbolCount][kIntraPredictionModesY] + [kIntraPredictionModesUV + 1]; + alignas(kMaxAlignment) uint16_t + cfl_alpha_signs_cdf[kCflAlphaSignsSymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + cfl_alpha_cdf[kCflAlphaContexts][kCflAlphaSymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + use_filter_intra_cdf[kMaxBlockSizes][kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + filter_intra_mode_cdf[kNumFilterIntraPredictors + 1]; + alignas(kMaxAlignment) uint16_t + tx_depth_cdf[4][kTxDepthContexts][kMaxTxDepthSymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + tx_split_cdf[kTxSplitContexts][kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + all_zero_cdf[kNumSquareTransformSizes][kAllZeroContexts] + [kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + inter_tx_type_cdf[3][kNumExtendedTransformSizes][kNumTransformTypes + 1]; + alignas(kMaxAlignment) uint16_t + intra_tx_type_cdf[2][kNumExtendedTransformSizes][kIntraPredictionModesY] + [kNumTransformTypes + 1]; + alignas(kMaxAlignment) uint16_t + eob_pt_16_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt16SymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + eob_pt_32_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt32SymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + eob_pt_64_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt64SymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + eob_pt_128_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt128SymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + eob_pt_256_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt256SymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + eob_pt_512_cdf[kNumPlaneTypes][kEobPt512SymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + eob_pt_1024_cdf[kNumPlaneTypes][kEobPt1024SymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + eob_extra_cdf[kNumSquareTransformSizes][kNumPlaneTypes][kEobExtraContexts] + [kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + coeff_base_eob_cdf[kNumSquareTransformSizes][kNumPlaneTypes] + [kCoeffBaseEobContexts][kCoeffBaseEobSymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + coeff_base_cdf[kNumSquareTransformSizes][kNumPlaneTypes] + [kCoeffBaseContexts][kCoeffBaseSymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + coeff_base_range_cdf[kNumSquareTransformSizes][kNumPlaneTypes] + [kCoeffBaseRangeContexts] + [kCoeffBaseRangeSymbolCount + 1]; + alignas(kMaxAlignment) 
uint16_t + dc_sign_cdf[kNumPlaneTypes][kDcSignContexts][kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + restoration_type_cdf[kRestorationTypeSymbolCount + 1]; + alignas(kMaxAlignment) uint16_t use_wiener_cdf[kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t use_sgrproj_cdf[kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + has_palette_y_cdf[kPaletteBlockSizeContexts][kPaletteYModeContexts] + [kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + palette_y_size_cdf[kPaletteBlockSizeContexts] + [kPaletteSizeSymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + has_palette_uv_cdf[kPaletteUVModeContexts][kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + palette_uv_size_cdf[kPaletteBlockSizeContexts] + [kPaletteSizeSymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + palette_color_index_cdf[kNumPlaneTypes][kPaletteSizeSymbolCount] + [kPaletteColorIndexContexts] + [kPaletteColorIndexSymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + is_inter_cdf[kIsInterContexts][kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + use_compound_reference_cdf[kUseCompoundReferenceContexts] + [kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + compound_reference_type_cdf[kCompoundReferenceTypeContexts] + [kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + compound_reference_cdf[kNumCompoundReferenceTypes][kReferenceContexts][3] + [kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + compound_backward_reference_cdf[kReferenceContexts][2] + [kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + single_reference_cdf[kReferenceContexts][6][kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + compound_prediction_mode_cdf[kCompoundPredictionModeContexts] + [kNumCompoundInterPredictionModes + 1]; + alignas(kMaxAlignment) uint16_t + new_mv_cdf[kNewMvContexts][kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + zero_mv_cdf[kZeroMvContexts][kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + reference_mv_cdf[kReferenceMvContexts][kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + ref_mv_index_cdf[kRefMvIndexContexts][kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + is_inter_intra_cdf[kInterIntraContexts][kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + inter_intra_mode_cdf[kInterIntraContexts][kNumInterIntraModes + 1]; + alignas(kMaxAlignment) uint16_t + is_wedge_inter_intra_cdf[kMaxBlockSizes][kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + wedge_index_cdf[kMaxBlockSizes][kWedgeIndexSymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + use_obmc_cdf[kMaxBlockSizes][kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + motion_mode_cdf[kMaxBlockSizes][kNumMotionModes + 1]; + alignas(kMaxAlignment) uint16_t + is_explicit_compound_type_cdf[kIsExplicitCompoundTypeContexts] + [kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + is_compound_type_average_cdf[kIsCompoundTypeAverageContexts] + [kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + compound_type_cdf[kMaxBlockSizes] + [kNumExplicitCompoundPredictionTypes + 1]; + alignas(kMaxAlignment) uint16_t + interpolation_filter_cdf[kInterpolationFilterContexts] + [kNumExplicitInterpolationFilters + 1]; + alignas(kMaxAlignment) uint16_t + mv_joint_cdf[kMvContexts][kNumMvJointTypes + 1]; + alignas(kMaxAlignment) uint16_t + mv_sign_cdf[kMvContexts][kNumMvComponents][kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + mv_class_cdf[kMvContexts][kNumMvComponents][kMvClassSymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + 
mv_class0_bit_cdf[kMvContexts][kNumMvComponents][kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + mv_class0_fraction_cdf[kMvContexts][kNumMvComponents][kBooleanSymbolCount] + [kMvFractionSymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + mv_class0_high_precision_cdf[kMvContexts][kNumMvComponents] + [kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + mv_bit_cdf[kMvContexts][kNumMvComponents][kMvBitSymbolCount] + [kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t mv_fraction_cdf[kMvContexts][kNumMvComponents] + [kMvFractionSymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + mv_high_precision_cdf[kMvContexts][kNumMvComponents] + [kBooleanFieldCdfSize]; +}; + +} // namespace libgav1 +#endif // LIBGAV1_SRC_SYMBOL_DECODER_CONTEXT_H_ diff --git a/src/symbol_decoder_context_cdfs.inc b/src/symbol_decoder_context_cdfs.inc new file mode 100644 index 0000000..509286f --- /dev/null +++ b/src/symbol_decoder_context_cdfs.inc @@ -0,0 +1,2509 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This file is just a convenience to separate out all the CDF constant +// definitions from the symbol decoder context functions. + +alignas(kMaxAlignment) constexpr uint16_t kDefaultPartitionCdf + [kBlockWidthCount][kPartitionContexts][kMaxPartitionTypes + 1] = { + // width 8 + {{13636, 7258, 2376, 0, 0}, + {18840, 12913, 4228, 0, 0}, + {20246, 9089, 4139, 0, 0}, + {22872, 13985, 6915, 0, 0}}, + // width 16 + {{17171, 11839, 8197, 6062, 5104, 3947, 3167, 2197, 866, 0, 0}, + {24843, 21725, 15983, 10298, 8797, 7725, 6117, 4067, 2934, 0, 0}, + {27354, 19499, 17657, 12280, 10408, 8268, 7231, 6432, 651, 0, 0}, + {30106, 26406, 24154, 11908, 9715, 7990, 6332, 4939, 1597, 0, 0}}, + // width 32 + {{14306, 11848, 9644, 5121, 4541, 3719, 3249, 2590, 1224, 0, 0}, + {25079, 23708, 20712, 7776, 7108, 6586, 5817, 4727, 3716, 0, 0}, + {26753, 23759, 22706, 8224, 7359, 6223, 5697, 5242, 721, 0, 0}, + {31374, 30560, 29972, 4154, 3707, 3302, 2928, 2583, 869, 0, 0}}, + // width 64 + {{12631, 11221, 9690, 3202, 2931, 2507, 2244, 1876, 1044, 0, 0}, + {26036, 25278, 23271, 4824, 4518, 4253, 3799, 3138, 2664, 0, 0}, + {26823, 25105, 24420, 4085, 3651, 3019, 2704, 2470, 530, 0, 0}, + {31898, 31556, 31281, 1570, 1374, 1194, 1025, 887, 436, 0, 0}}, + // width 128 + {{4869, 4549, 4239, 284, 229, 149, 129, 0, 0}, + {26161, 25778, 24500, 708, 549, 430, 397, 0, 0}, + {27339, 26092, 25646, 741, 541, 237, 186, 0, 0}, + {32057, 31802, 31596, 320, 230, 151, 104, 0, 0}}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultSegmentIdCdf[kSegmentIdContexts][kMaxSegments + 1] = { + {27146, 24875, 16675, 14535, 4959, 4395, 235, 0, 0}, + {18494, 14538, 10211, 7833, 2788, 1917, 424, 0, 0}, + {5241, 4281, 4045, 3878, 371, 121, 89, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultUsePredictedSegmentIdCdf[kUsePredictedSegmentIdContexts] + [kBooleanFieldCdfSize] = {{16384, 0, 0}, + {16384, 0, 0}, + {16384, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + 
kDefaultSkipCdf[kSkipContexts][kBooleanFieldCdfSize] = { + {1097, 0, 0}, {16253, 0, 0}, {28192, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultSkipModeCdf[kSkipModeContexts][kBooleanFieldCdfSize] = { + {147, 0, 0}, {12060, 0, 0}, {24641, 0, 0}}; + +// This constant is also used for DeltaLf and DeltaLfMulti. +alignas(kMaxAlignment) constexpr uint16_t + kDefaultDeltaQCdf[kDeltaSymbolCount + 1] = {4608, 648, 91, 0, 0}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultIntraBlockCopyCdf[kBooleanFieldCdfSize] = {2237, 0, 0}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultIntraFrameYModeCdf[kIntraModeContexts][kIntraModeContexts] + [kIntraPredictionModesY + 1] = { + {{17180, 15741, 13430, 12550, 12086, 11658, + 10943, 9524, 8579, 4603, 3675, 2302, 0, 0}, + {20752, 14702, 13252, 12465, 12049, 11324, + 10880, 9736, 8334, 4110, 2596, 1359, 0, 0}, + {22716, 21997, 10472, 9980, 9713, 9529, 8635, + 7148, 6608, 3432, 2839, 1201, 0, 0}, + {18677, 17362, 16326, 13960, 13632, 13222, + 12770, 10672, 8022, 3183, 1810, 306, 0, 0}, + {20646, 19503, 17165, 16267, 14159, 12735, + 10377, 7185, 6331, 2507, 1695, 293, 0, 0}}, + {{22745, 13183, 11920, 11328, 10936, 10008, + 9679, 8745, 7387, 3754, 2286, 1332, 0, 0}, + {26785, 8669, 8208, 7882, 7702, 6973, 6855, + 6345, 5158, 2863, 1492, 974, 0, 0}, + {25324, 19987, 12591, 12040, 11691, 11161, + 10598, 9363, 8299, 4853, 3678, 2276, 0, 0}, + {24231, 18079, 17336, 15681, 15360, 14596, + 14360, 12943, 8119, 3615, 1672, 558, 0, 0}, + {25225, 18537, 17272, 16573, 14863, 12051, + 10784, 8252, 6767, 3093, 1787, 774, 0, 0}}, + {{20155, 19177, 11385, 10764, 10456, 10191, + 9367, 7713, 7039, 3230, 2463, 691, 0, 0}, + {23081, 19298, 14262, 13538, 13164, 12621, + 12073, 10706, 9549, 5025, 3557, 1861, 0, 0}, + {26585, 26263, 6744, 6516, 6402, 6334, 5686, + 4414, 4213, 2301, 1974, 682, 0, 0}, + {22050, 21034, 17814, 15544, 15203, 14844, + 14207, 11245, 8890, 3793, 2481, 516, 0, 0}, + {23574, 22910, 16267, 15505, 14344, 13597, + 11205, 6807, 6207, 2696, 2031, 305, 0, 0}}, + {{20166, 18369, 17280, 14387, 13990, 13453, + 13044, 11349, 7708, 3072, 1851, 359, 0, 0}, + {24565, 18947, 18244, 15663, 15329, 14637, + 14364, 13300, 7543, 3283, 1610, 426, 0, 0}, + {24317, 23037, 17764, 15125, 14756, 14343, + 13698, 11230, 8163, 3650, 2690, 750, 0, 0}, + {25054, 23720, 23252, 16101, 15951, 15774, + 15615, 14001, 6025, 2379, 1232, 240, 0, 0}, + {23925, 22488, 21272, 17451, 16116, 14825, + 13660, 10050, 6999, 2815, 1785, 283, 0, 0}}, + {{20190, 19097, 16789, 15934, 13693, 11855, + 9779, 7319, 6549, 2554, 1618, 291, 0, 0}, + {23205, 19142, 17688, 16876, 15012, 11905, + 10561, 8532, 7388, 3115, 1625, 491, 0, 0}, + {24412, 23867, 15152, 14512, 13418, 12662, + 10170, 6821, 6302, 2868, 2245, 507, 0, 0}, + {21933, 20953, 19644, 16726, 15750, 14729, + 13821, 10015, 8153, 3279, 1885, 286, 0, 0}, + {25150, 24480, 22909, 22259, 17382, 14111, + 9865, 3992, 3588, 1413, 966, 175, 0, 0}}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultYModeCdf[kYModeContexts][kIntraPredictionModesY + 1] = { + {9967, 9279, 8475, 8012, 7167, 6645, 6162, 5350, 4823, 3540, 3083, 2419, + 0, 0}, + {14095, 12923, 10137, 9450, 8818, 8119, 7241, 5404, 4616, 3067, 2784, + 1916, 0, 0}, + {12998, 11789, 9372, 8829, 8527, 8114, 7632, 5695, 4938, 3408, 3038, + 2109, 0, 0}, + {12613, 11467, 9930, 9590, 9507, 9235, 9065, 7964, 7416, 6193, 5752, + 4719, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultAngleDeltaCdf[kDirectionalIntraModes][kAngleDeltaSymbolCount + 1] = + {{30588, 27736, 
25201, 9992, 5779, 2551, 0, 0}, + {30467, 27160, 23967, 9281, 5794, 2438, 0, 0}, + {28988, 21750, 19069, 13414, 9685, 1482, 0, 0}, + {28187, 21542, 17621, 15630, 10934, 4371, 0, 0}, + {31031, 21841, 18259, 13180, 10023, 3945, 0, 0}, + {30104, 22592, 20283, 15118, 11168, 2273, 0, 0}, + {30528, 21672, 17315, 12427, 10207, 3851, 0, 0}, + {29163, 22340, 20309, 15092, 11524, 2113, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultUVModeCdf[kBooleanSymbolCount][kIntraPredictionModesY] + [kIntraPredictionModesUV + 1] = { + // CFL not allowed. + {{10137, 8616, 7390, 7107, 6782, 6248, 5713, 4845, + 4524, 2709, 1827, 807, 0, 0}, + {23255, 5887, 5795, 5722, 5650, 5104, 5029, 4944, + 4409, 3263, 2968, 972, 0, 0}, + {22923, 22853, 4105, 4064, 4011, 3988, 3570, 2946, + 2914, 2004, 991, 739, 0, 0}, + {19129, 18871, 18597, 7437, 7162, 7041, 6815, 5620, + 4191, 2156, 1413, 275, 0, 0}, + {23004, 22933, 22838, 22814, 7382, 5715, 4810, 4620, + 4525, 1667, 1024, 405, 0, 0}, + {20943, 19179, 19091, 19048, 17720, 3555, 3467, 3310, + 3057, 1607, 1327, 218, 0, 0}, + {18593, 18369, 16160, 15947, 15050, 14993, 4217, 2568, + 2523, 931, 426, 101, 0, 0}, + {19883, 19730, 17790, 17178, 17095, 17020, 16592, + 3640, 3501, 2125, 807, 307, 0, 0}, + {20742, 19107, 18894, 17463, 17278, 17042, 16773, + 16495, 4325, 2380, 2001, 352, 0, 0}, + {13716, 12928, 12189, 11852, 11618, 11301, 10883, + 10049, 9594, 3907, 2389, 593, 0, 0}, + {14141, 13119, 11794, 11549, 11276, 10952, 10569, + 9649, 9241, 5715, 1371, 620, 0, 0}, + {15742, 13764, 12771, 12429, 12182, 11665, 11419, + 10861, 10286, 6872, 6227, 949, 0, 0}, + {20644, 19009, 17809, 17776, 17761, 17717, 17690, + 17602, 17513, 17015, 16729, 16162, 0, 0}}, + // CFL allowed. + {{22361, 21560, 19868, 19587, 18945, 18593, 17869, + 17112, 16782, 12682, 11773, 10313, 8556, 0, 0}, + {28236, 12988, 12711, 12553, 12340, 11697, 11569, + 11317, 10669, 8540, 8075, 5736, 3296, 0, 0}, + {27495, 27389, 12591, 12498, 12383, 12329, 11819, + 11073, 10994, 9630, 8512, 8065, 6089, 0, 0}, + {26028, 25601, 25106, 18616, 18232, 17983, 17734, + 16027, 14397, 11248, 10562, 9379, 8586, 0, 0}, + {27781, 27400, 26840, 26700, 13654, 12453, 10911, + 10515, 10357, 7857, 7388, 6741, 6392, 0, 0}, + {27398, 25879, 25521, 25375, 23270, 11654, 11366, + 11015, 10787, 7988, 7382, 6251, 5592, 0, 0}, + {27952, 27807, 25564, 25442, 24003, 23838, 12599, + 12086, 11965, 9580, 9005, 8313, 7828, 0, 0}, + {26160, 26028, 24239, 23719, 23511, 23412, 23033, + 13941, 13709, 10432, 9564, 8804, 7975, 0, 0}, + {26770, 25349, 24987, 23835, 23513, 23219, 23015, + 22351, 13870, 10274, 9629, 8004, 6779, 0, 0}, + {22108, 21470, 20218, 19811, 19446, 19144, 18728, + 17764, 17234, 12054, 10979, 9325, 7907, 0, 0}, + {22246, 21238, 20216, 19805, 19390, 18989, 18523, + 17533, 16866, 12666, 10072, 8994, 6930, 0, 0}, + {22669, 22077, 20129, 19719, 19382, 19103, 18643, + 17605, 17132, 13092, 12294, 9249, 7560, 0, 0}, + {29624, 27681, 25386, 25264, 25175, 25078, 24967, + 24704, 24536, 23520, 22893, 22247, 3720, 0, 0}}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultCflAlphaSignsCdf[kCflAlphaSignsSymbolCount + 1] = { + 31350, 30645, 19428, 14363, 5796, 4425, 474, 0, 0}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultCflAlphaCdf[kCflAlphaContexts][kCflAlphaSymbolCount + 1] = { + {25131, 12049, 1367, 287, 111, 80, 76, 72, 68, 64, 60, 56, 52, 48, 44, + 0, 0}, + {18403, 9165, 4633, 1600, 601, 373, 281, 195, 148, 121, 100, 96, 92, 88, + 84, 0, 0}, + {21236, 10388, 4323, 1408, 419, 245, 184, 119, 95, 91, 87, 83, 79, 75, + 
71, 0, 0}, + {5778, 1366, 486, 197, 76, 72, 68, 64, 60, 56, 52, 48, 44, 40, 36, 0, + 0}, + {15520, 6710, 3864, 2160, 1463, 891, 642, 447, 374, 304, 252, 208, 192, + 175, 146, 0, 0}, + {18030, 11090, 6989, 4867, 3744, 2466, 1788, 925, 624, 355, 248, 174, + 146, 112, 108, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultUseFilterIntraCdf[kMaxBlockSizes][kBooleanFieldCdfSize] = { + {28147, 0, 0}, {26025, 0, 0}, {19998, 0, 0}, {26875, 0, 0}, + {24902, 0, 0}, {20217, 0, 0}, {12539, 0, 0}, {22400, 0, 0}, + {23374, 0, 0}, {20360, 0, 0}, {18467, 0, 0}, {16384, 0, 0}, + {14667, 0, 0}, {20012, 0, 0}, {10425, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultFilterIntraModeCdf[kNumFilterIntraPredictors + 1] = { + 23819, 19992, 15557, 3210, 0, 0}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultTxDepthCdf[4][kTxDepthContexts][kMaxTxDepthSymbolCount + 1] = { + {{12800, 0, 0}, {12800, 0, 0}, {8448, 0, 0}}, + {{20496, 2596, 0, 0}, {20496, 2596, 0, 0}, {14091, 1920, 0, 0}}, + {{19782, 17588, 0, 0}, {19782, 17588, 0, 0}, {8466, 7166, 0, 0}}, + {{26986, 21293, 0, 0}, {26986, 21293, 0, 0}, {15965, 10009, 0, 0}}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultTxSplitCdf[kTxSplitContexts][kBooleanFieldCdfSize] = { + {4187, 0, 0}, {8922, 0, 0}, {11921, 0, 0}, {8453, 0, 0}, + {14572, 0, 0}, {20635, 0, 0}, {13977, 0, 0}, {21881, 0, 0}, + {21763, 0, 0}, {5589, 0, 0}, {12764, 0, 0}, {21487, 0, 0}, + {6219, 0, 0}, {13460, 0, 0}, {18544, 0, 0}, {4753, 0, 0}, + {11222, 0, 0}, {18368, 0, 0}, {4603, 0, 0}, {10367, 0, 0}, + {16680, 0, 0}}; + +/* clang-format off */ +alignas(kMaxAlignment) constexpr uint16_t kDefaultAllZeroCdf[kCoefficientQuantizerContexts] + [kNumSquareTransformSizes][kAllZeroContexts] + [kBooleanFieldCdfSize] = { + { + {{919, 0, 0}, {26876, 0, 0}, {20656, 0, 0}, {10833, 0, 0}, {12479, 0, 0}, + {5295, 0, 0}, {281, 0, 0}, {25114, 0, 0}, {13295, 0, 0}, {2784, 0, 0}, + {22807, 0, 0}, {2526, 0, 0}, {651, 0, 0}}, + {{1220, 0, 0}, {31219, 0, 0}, {22638, 0, 0}, {16112, 0, 0}, {14177, 0, 0}, + {6460, 0, 0}, {231, 0, 0}, {27365, 0, 0}, {14672, 0, 0}, {2765, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}}, + {{2811, 0, 0}, {27377, 0, 0}, {14729, 0, 0}, {9202, 0, 0}, {10337, 0, 0}, + {6946, 0, 0}, {571, 0, 0}, {28990, 0, 0}, {17432, 0, 0}, {3787, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}}, + {{14848, 0, 0}, {30950, 0, 0}, {25486, 0, 0}, {7495, 0, 0}, {21845, 0, 0}, + {1214, 0, 0}, {144, 0, 0}, {31402, 0, 0}, {17140, 0, 0}, {2306, 0, 0}, + {32622, 0, 0}, {27636, 0, 0}, {1111, 0, 0}}, + {{26460, 0, 0}, {32651, 0, 0}, {31130, 0, 0}, {30607, 0, 0}, {16384, 0, 0}, + {21845, 0, 0}, {2521, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}} + }, + { + {{2397, 0, 0}, {25198, 0, 0}, {19613, 0, 0}, {12017, 0, 0}, {11799, 0, 0}, + {5701, 0, 0}, {755, 0, 0}, {27273, 0, 0}, {14826, 0, 0}, {4488, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}}, + {{986, 0, 0}, {30932, 0, 0}, {22079, 0, 0}, {15164, 0, 0}, {11146, 0, 0}, + {5250, 0, 0}, {369, 0, 0}, {28349, 0, 0}, {16474, 0, 0}, {4423, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}}, + {{867, 0, 0}, {22457, 0, 0}, {14721, 0, 0}, {7962, 0, 0}, {9480, 0, 0}, + {4854, 0, 0}, {472, 0, 0}, {28553, 0, 0}, {17012, 0, 0}, {4427, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}}, + {{6042, 0, 0}, {31723, 0, 0}, {21065, 0, 0}, {12178, 0, 0}, {14214, 0, 0}, + 
{6798, 0, 0}, {830, 0, 0}, {27185, 0, 0}, {11455, 0, 0}, {3378, 0, 0}, + {32127, 0, 0}, {10503, 0, 0}, {1316, 0, 0}}, + {{6184, 0, 0}, {32580, 0, 0}, {23921, 0, 0}, {8249, 0, 0}, {9830, 0, 0}, + {2185, 0, 0}, {160, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}} + }, + { + {{3154, 0, 0}, {23700, 0, 0}, {19844, 0, 0}, {13230, 0, 0}, {15031, 0, 0}, + {8149, 0, 0}, {2126, 0, 0}, {28649, 0, 0}, {16742, 0, 0}, {7111, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}}, + {{811, 0, 0}, {29538, 0, 0}, {21615, 0, 0}, {14645, 0, 0}, {12625, 0, 0}, + {6232, 0, 0}, {782, 0, 0}, {29718, 0, 0}, {18165, 0, 0}, {7613, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}}, + {{405, 0, 0}, {22076, 0, 0}, {13678, 0, 0}, {8411, 0, 0}, {8326, 0, 0}, + {4456, 0, 0}, {599, 0, 0}, {29120, 0, 0}, {17078, 0, 0}, {5953, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}}, + {{2099, 0, 0}, {28936, 0, 0}, {21105, 0, 0}, {13879, 0, 0}, {12986, 0, 0}, + {9455, 0, 0}, {1438, 0, 0}, {27644, 0, 0}, {14049, 0, 0}, {4300, 0, 0}, + {29686, 0, 0}, {11786, 0, 0}, {3325, 0, 0}}, + {{4195, 0, 0}, {29585, 0, 0}, {14966, 0, 0}, {6791, 0, 0}, {6091, 0, 0}, + {4936, 0, 0}, {381, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}} + }, + { + {{5881, 0, 0}, {26039, 0, 0}, {22407, 0, 0}, {15326, 0, 0}, {17723, 0, 0}, + {10290, 0, 0}, {3696, 0, 0}, {30055, 0, 0}, {20907, 0, 0}, {11995, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}}, + {{865, 0, 0}, {30724, 0, 0}, {25240, 0, 0}, {18150, 0, 0}, {16586, 0, 0}, + {8600, 0, 0}, {1731, 0, 0}, {29982, 0, 0}, {21574, 0, 0}, {12613, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}}, + {{258, 0, 0}, {24338, 0, 0}, {15450, 0, 0}, {8614, 0, 0}, {9094, 0, 0}, + {3979, 0, 0}, {629, 0, 0}, {29328, 0, 0}, {19651, 0, 0}, {10066, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}}, + {{1097, 0, 0}, {30712, 0, 0}, {21022, 0, 0}, {15916, 0, 0}, {14133, 0, 0}, + {8053, 0, 0}, {1284, 0, 0}, {28112, 0, 0}, {16694, 0, 0}, {8064, 0, 0}, + {30962, 0, 0}, {18123, 0, 0}, {7432, 0, 0}}, + {{1229, 0, 0}, {24335, 0, 0}, {12192, 0, 0}, {4864, 0, 0}, {4916, 0, 0}, + {2742, 0, 0}, {327, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}} + } +}; +/* clang-format on */ + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultInterTxTypeCdf[3][kNumExtendedTransformSizes][kNumTransformTypes + + 1] = { + {{28310, 27208, 25073, 23059, 19438, 17979, 15231, 12502, 11264, 9920, + 8834, 7294, 5041, 3853, 2137, 0, 0}, + {31123, 30195, 27990, 27057, 24961, 24146, 22246, 17411, 15094, 12360, + 10251, 7758, 5652, 3912, 2019, 0, 0}, + {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288, + 10240, 8192, 6144, 4096, 2048, 0, 0}, + {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288, + 10240, 8192, 6144, 4096, 2048, 0, 0}}, + {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + // Only 16x16 is used in this case. 
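+     // (Rows shorter than the declared innermost dimension, like the 16x16
+     // row below, rely on constexpr aggregate initialization zero-filling
+     // the remaining entries, including the trailing counter slot.)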
+ {31998, 30347, 27543, 19861, 16949, 13841, 11207, 8679, 6173, 4242, + 2239, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, + {{16384, 0, 0}, {28601, 0, 0}, {30770, 0, 0}, {32020, 0, 0}}}; + +alignas(kMaxAlignment) constexpr uint16_t kDefaultIntraTxTypeCdf + [2][kNumExtendedTransformSizes][kIntraPredictionModesY] + [kNumTransformTypes + 1] = { + {{{31233, 24733, 23307, 20017, 9301, 4943, 0, 0}, + {32204, 29433, 23059, 21898, 14625, 4674, 0, 0}, + {32096, 29521, 29092, 20786, 13353, 9641, 0, 0}, + {27489, 18883, 17281, 14724, 9241, 2516, 0, 0}, + {28345, 26694, 24783, 22352, 7075, 3470, 0, 0}, + {31282, 28527, 23308, 22106, 16312, 5074, 0, 0}, + {32329, 29930, 29246, 26031, 14710, 9014, 0, 0}, + {31578, 28535, 27913, 21098, 12487, 8391, 0, 0}, + {31723, 28456, 24121, 22609, 14124, 3433, 0, 0}, + {32566, 29034, 28021, 25470, 15641, 8752, 0, 0}, + {32321, 28456, 25949, 23884, 16758, 8910, 0, 0}, + {32491, 28399, 27513, 23863, 16303, 10497, 0, 0}, + {29359, 27332, 22169, 17169, 13081, 8728, 0, 0}}, + {{30898, 19026, 18238, 16270, 8998, 5070, 0, 0}, + {32442, 23972, 18136, 17689, 13496, 5282, 0, 0}, + {32284, 25192, 25056, 18325, 13609, 10177, 0, 0}, + {31642, 17428, 16873, 15745, 11872, 2489, 0, 0}, + {32113, 27914, 27519, 26855, 10669, 5630, 0, 0}, + {31469, 26310, 23883, 23478, 17917, 7271, 0, 0}, + {32457, 27473, 27216, 25883, 16661, 10096, 0, 0}, + {31885, 24709, 24498, 21510, 15479, 11219, 0, 0}, + {32027, 25188, 23450, 22423, 16080, 3722, 0, 0}, + {32658, 25362, 24853, 23573, 16727, 9439, 0, 0}, + {32405, 24794, 23411, 22095, 17139, 8294, 0, 0}, + {32615, 25121, 24656, 22832, 17461, 12772, 0, 0}, + {29257, 26436, 21603, 17433, 13445, 9174, 0, 0}}}, + {{{26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}}, + {{26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}}, + {{31641, 19954, 9996, 5285, 0, 0}, + {32623, 26007, 20788, 6101, 0, 0}, + {32406, 26881, 21090, 16043, 0, 0}, + {32383, 17555, 14181, 2075, 0, 0}, + {32743, 29854, 9634, 4865, 0, 0}, + {32708, 28298, 21019, 8777, 0, 0}, + {32731, 29436, 18257, 11320, 0, 0}, + {32611, 26448, 19732, 15329, 0, 0}, + {32649, 26049, 19862, 3372, 0, 0}, + {32721, 27231, 20192, 11269, 0, 0}, + {32499, 26692, 21510, 9653, 0, 0}, + {32685, 27153, 20767, 15540, 0, 0}, + {30800, 27212, 20745, 14221, 0, 0}}}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultEobPt16Cdf[kCoefficientQuantizerContexts][kNumPlaneTypes] + [kEobPtContexts][kEobPt16SymbolCount + 1] = { + {{{31928, 31729, 30788, 27873, 0, 0}, + {32398, 32097, 30885, 28297, 0, 0}}, + {{29521, 27818, 23080, 18205, 0, 0}, + {30864, 29414, 25005, 18121, 0, 0}}}, + {{{30643, 30217, 27603, 23822, 0, 0}, + {32255, 32003, 30909, 26429, 0, 0}}, + {{25131, 23270, 
18509, 13660, 0, 0}, + {30271, 28672, 23902, 15775, 0, 0}}}, + {{{28752, 27871, 23887, 17800, 0, 0}, + {32052, 31663, 30122, 22712, 0, 0}}, + {{21629, 19498, 14527, 9202, 0, 0}, + {29576, 27736, 22471, 13013, 0, 0}}}, + {{{26060, 23810, 18022, 10635, 0, 0}, + {31546, 30694, 27985, 17358, 0, 0}}, + {{13193, 11002, 6724, 3059, 0, 0}, + {25471, 22001, 13495, 4574, 0, 0}}}}; +alignas(kMaxAlignment) constexpr uint16_t + kDefaultEobPt32Cdf[kCoefficientQuantizerContexts][kNumPlaneTypes] + [kEobPtContexts][kEobPt32SymbolCount + 1] = { + {{{32368, 32248, 31791, 30666, 26226, 0, 0}, + {32558, 32363, 31453, 29442, 25231, 0, 0}}, + {{30132, 28495, 25180, 20974, 12367, 0, 0}, + {30982, 29589, 25866, 21411, 13714, 0, 0}}}, + {{{31779, 31519, 30749, 28617, 21983, 0, 0}, + {32455, 32327, 31669, 29851, 24206, 0, 0}}, + {{24374, 22416, 18836, 13913, 6754, 0, 0}, + {30190, 28644, 24587, 19098, 8534, 0, 0}}}, + {{{30253, 29765, 28316, 24606, 16727, 0, 0}, + {32194, 31947, 30932, 27679, 19640, 0, 0}}, + {{19300, 16465, 12407, 7663, 3487, 0, 0}, + {29226, 27266, 22353, 16008, 7124, 0, 0}}}, + {{{28151, 27059, 24322, 19184, 9633, 0, 0}, + {31612, 31066, 29093, 23494, 12229, 0, 0}}, + {{10682, 8486, 5758, 2998, 1025, 0, 0}, + {25069, 21871, 11877, 5842, 1140, 0, 0}}}}; +alignas(kMaxAlignment) constexpr uint16_t + kDefaultEobPt64Cdf[kCoefficientQuantizerContexts][kNumPlaneTypes] + [kEobPtContexts][kEobPt64SymbolCount + 1] = { + {{{32439, 32270, 31667, 30984, 29503, 25010, 0, 0}, + {32433, 32038, 31309, 27274, 24013, 19771, 0, 0}}, + {{29263, 27464, 22682, 18954, 15084, 9398, 0, 0}, + {31205, 30068, 27892, 21857, 18062, 10288, 0, 0}}}, + {{{31508, 31322, 30515, 29056, 26116, 19399, 0, 0}, + {32367, 32163, 31739, 30205, 26923, 20142, 0, 0}}, + {{24159, 22156, 18144, 14054, 10154, 3744, 0, 0}, + {30845, 29641, 26901, 23065, 18491, 5668, 0, 0}}}, + {{{30394, 29996, 28185, 25492, 20480, 13062, 0, 0}, + {32271, 31958, 31453, 29768, 25764, 17127, 0, 0}}, + {{17718, 15642, 11358, 7882, 4612, 2042, 0, 0}, + {28734, 26478, 22533, 17786, 11554, 4277, 0, 0}}}, + {{{26461, 25227, 20708, 16410, 10215, 4903, 0, 0}, + {31479, 30448, 28797, 24842, 18615, 8477, 0, 0}}, + {{8556, 7060, 4500, 2733, 1461, 719, 0, 0}, + {24042, 20390, 13359, 6318, 2730, 306, 0, 0}}}}; +alignas(kMaxAlignment) constexpr uint16_t kDefaultEobPt128Cdf + [kCoefficientQuantizerContexts][kNumPlaneTypes][kEobPtContexts] + [kEobPt128SymbolCount + 1] = { + {{{32549, 32286, 31628, 30677, 29088, 26740, 20182, 0, 0}, + {32397, 32069, 31514, 27938, 23289, 20206, 15271, 0, 0}}, + {{27523, 25312, 19888, 16916, 12735, 8836, 5160, 0, 0}, + {30714, 29296, 26899, 18536, 14526, 12178, 6016, 0, 0}}}, + {{{32083, 31835, 31280, 30054, 28002, 24206, 13514, 0, 0}, + {32551, 32416, 32150, 30465, 27507, 22799, 15296, 0, 0}}, + {{24723, 21568, 17271, 13173, 8820, 5360, 1830, 0, 0}, + {30458, 28608, 25297, 17771, 14837, 12000, 2528, 0, 0}}}, + {{{31402, 31030, 30241, 27752, 23413, 16971, 8125, 0, 0}, + {32414, 32210, 31824, 30008, 25481, 18731, 10989, 0, 0}}, + {{19141, 16522, 12595, 8339, 4820, 2353, 905, 0, 0}, + {26493, 22879, 17999, 9604, 4780, 2275, 496, 0, 0}}}, + {{{29296, 27883, 25279, 20287, 14251, 8232, 3133, 0, 0}, + {31882, 31037, 29497, 24299, 17199, 10642, 4385, 0, 0}}, + {{8455, 6706, 4383, 2661, 1551, 870, 423, 0, 0}, + {23603, 19486, 11618, 2482, 874, 197, 56, 0, 0}}}}; + +alignas(kMaxAlignment) constexpr uint16_t kDefaultEobPt256Cdf + [kCoefficientQuantizerContexts][kNumPlaneTypes][kEobPtContexts] + [kEobPt256SymbolCount + 1] = { + {{{32458, 32184, 30881, 29179, 
26600, 24157, 21416, 17116, 0, 0}, + {31770, 30918, 29770, 27164, 15427, 12880, 9869, 7185, 0, 0}}, + {{30248, 29528, 26816, 23898, 20191, 15210, 12814, 8600, 0, 0}, + {30565, 28638, 25333, 22029, 12116, 9087, 7159, 5507, 0, 0}}}, + {{{31320, 30659, 28617, 26505, 23439, 19508, 14824, 9468, 0, 0}, + {32369, 31749, 31019, 29730, 22324, 17222, 10029, 5474, 0, 0}}, + {{26366, 24620, 20145, 17696, 14040, 9921, 6321, 3391, 0, 0}, + {31094, 29516, 27034, 22609, 10371, 8966, 7947, 1828, 0, 0}}}, + {{{29679, 28848, 26730, 23308, 18502, 12887, 7002, 3592, 0, 0}, + {31684, 30410, 29280, 27646, 21285, 14665, 6745, 2969, 0, 0}}, + {{21254, 18974, 15288, 12014, 8407, 5390, 3276, 1491, 0, 0}, + {26197, 23158, 17252, 10942, 3676, 1939, 926, 60, 0, 0}}}, + {{{27420, 25655, 20948, 16844, 10662, 5991, 2434, 1011, 0, 0}, + {30315, 28294, 26461, 23991, 16294, 9793, 3768, 1221, 0, 0}}, + {{9658, 8171, 5628, 3874, 2601, 1841, 1376, 674, 0, 0}, + {22770, 15107, 7590, 4671, 1460, 730, 365, 73, 0, 0}}}}; + +alignas(kMaxAlignment) constexpr uint16_t kDefaultEobPt512Cdf + [kCoefficientQuantizerContexts][kNumPlaneTypes][kEobPt512SymbolCount + 1] = + {{{32127, 31785, 29061, 27338, 22534, 17810, 13980, 9356, 6707, 0, 0}, + {27673, 26322, 22772, 19414, 16751, 14782, 11849, 6639, 3628, 0, 0}}, + {{31538, 30490, 27733, 24992, 20897, 17422, 13178, 8184, 4019, 0, 0}, + {25503, 22789, 16949, 13518, 10988, 8922, 6290, 4372, 957, 0, 0}}, + {{30144, 28832, 26288, 23082, 18789, 15042, 9501, 4358, 1690, 0, 0}, + {20753, 17999, 13180, 10716, 8546, 6956, 5468, 3549, 654, 0, 0}}, + {{26841, 24959, 21845, 18171, 13329, 8633, 4312, 1626, 708, 0, 0}, + {11675, 9725, 7026, 5110, 3671, 3052, 2695, 1948, 812, 0, 0}}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultEobPt1024Cdf[kCoefficientQuantizerContexts][kNumPlaneTypes] + [kEobPt1024SymbolCount + 1] = { + {{32375, 32347, 32017, 31145, 29608, 26416, 19423, + 14721, 10197, 6938, 0, 0}, + {30903, 30780, 29838, 28526, 22235, 16230, 11414, + 5513, 4222, 984, 0, 0}}, + {{32072, 31820, 29623, 27066, 23062, 19551, 14917, + 10912, 7076, 4734, 0, 0}, + {30096, 29177, 23438, 15684, 10043, 8484, 6241, + 4741, 4391, 1892, 0, 0}}, + {{29984, 28937, 25727, 22247, 17921, 13924, 9613, + 6086, 3539, 1723, 0, 0}, + {23191, 20302, 15029, 12018, 10707, 9553, 8167, + 7285, 6925, 712, 0, 0}}, + {{26070, 24434, 20807, 17006, 12582, 8906, 5334, + 3442, 1686, 718, 0, 0}, + {12199, 10342, 7199, 5909, 4715, 3855, 3282, 3044, + 2961, 198, 0, 0}}}; + +/* clang-format off */ +alignas(kMaxAlignment) constexpr uint16_t kDefaultEobExtraCdf[kCoefficientQuantizerContexts] + [kNumSquareTransformSizes][kNumPlaneTypes] + [kEobExtraContexts][kBooleanFieldCdfSize] = { + { + { + {{15807, 0, 0}, {15545, 0, 0}, {25147, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}}, + {{13699, 0, 0}, {10243, 0, 0}, {19391, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}} + }, + { + {{12367, 0, 0}, {15743, 0, 0}, {19923, 0, 0}, {19895, 0, 0}, + {18674, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}}, + {{12087, 0, 0}, {12067, 0, 0}, {17518, 0, 0}, {17751, 0, 0}, + {17840, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}} + }, + { + {{8863, 0, 0}, {15574, 0, 0}, {16598, 0, 0}, {15073, 0, 0}, + {18942, 0, 0}, {16958, 0, 0}, {20732, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}}, + {{8809, 0, 0}, {11969, 0, 0}, {13747, 0, 0}, {16565, 0, 0}, + {14882, 0, 0}, {18624, 0, 0}, {20758, 0, 0}, {16384, 0, 0}, + 
{16384, 0, 0}} + }, + { + {{5369, 0, 0}, {16441, 0, 0}, {14697, 0, 0}, {13184, 0, 0}, + {12047, 0, 0}, {14336, 0, 0}, {13208, 0, 0}, {22618, 0, 0}, + {23963, 0, 0}}, + {{7836, 0, 0}, {11935, 0, 0}, {20741, 0, 0}, {16098, 0, 0}, + {12854, 0, 0}, {17662, 0, 0}, {15106, 0, 0}, {18985, 0, 0}, + {4012, 0, 0}} + }, + { + {{9362, 0, 0}, {10923, 0, 0}, {14336, 0, 0}, {16384, 0, 0}, + {15672, 0, 0}, {20207, 0, 0}, {15448, 0, 0}, {10373, 0, 0}, + {11398, 0, 0}}, + {{16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}} + } + }, + { + { + {{15297, 0, 0}, {12545, 0, 0}, {21411, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}}, + {{12433, 0, 0}, {11101, 0, 0}, {17950, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}} + }, + { + {{12338, 0, 0}, {12106, 0, 0}, {17401, 0, 0}, {15798, 0, 0}, + {18111, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}}, + {{10651, 0, 0}, {10740, 0, 0}, {14118, 0, 0}, {16726, 0, 0}, + {16883, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}} + }, + { + {{10359, 0, 0}, {11756, 0, 0}, {17118, 0, 0}, {15373, 0, 0}, + {17299, 0, 0}, {12563, 0, 0}, {13257, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}}, + {{8548, 0, 0}, {10288, 0, 0}, {15031, 0, 0}, {13852, 0, 0}, + {13500, 0, 0}, {14356, 0, 0}, {13924, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}} + }, + { + {{6777, 0, 0}, {12454, 0, 0}, {15037, 0, 0}, {13090, 0, 0}, + {14119, 0, 0}, {15461, 0, 0}, {10970, 0, 0}, {15219, 0, 0}, + {17138, 0, 0}}, + {{6183, 0, 0}, {11299, 0, 0}, {12336, 0, 0}, {15033, 0, 0}, + {13488, 0, 0}, {17533, 0, 0}, {12471, 0, 0}, {10297, 0, 0}, + {3771, 0, 0}} + }, + { + {{6163, 0, 0}, {21464, 0, 0}, {16042, 0, 0}, {16208, 0, 0}, + {11902, 0, 0}, {9244, 0, 0}, {12890, 0, 0}, {19299, 0, 0}, + {9684, 0, 0}}, + {{16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}} + } + }, + { + { + {{13785, 0, 0}, {12256, 0, 0}, {17883, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}}, + {{12678, 0, 0}, {13324, 0, 0}, {15482, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}} + }, + { + {{13629, 0, 0}, {11281, 0, 0}, {13809, 0, 0}, {11858, 0, 0}, + {13679, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}}, + {{12232, 0, 0}, {12104, 0, 0}, {12143, 0, 0}, {13645, 0, 0}, + {17906, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}} + }, + { + {{12935, 0, 0}, {11266, 0, 0}, {15283, 0, 0}, {12501, 0, 0}, + {14415, 0, 0}, {9439, 0, 0}, {11290, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}}, + {{10727, 0, 0}, {9334, 0, 0}, {12767, 0, 0}, {12214, 0, 0}, + {11817, 0, 0}, {12623, 0, 0}, {17206, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}} + }, + { + {{9456, 0, 0}, {11161, 0, 0}, {16242, 0, 0}, {13811, 0, 0}, + {14734, 0, 0}, {13834, 0, 0}, {8521, 0, 0}, {15847, 0, 0}, + {15688, 0, 0}}, + {{6189, 0, 0}, {7858, 0, 0}, {14131, 0, 0}, {12968, 0, 0}, + {12380, 0, 0}, {22881, 0, 0}, {17126, 0, 0}, {2570, 0, 0}, + {8047, 0, 0}} + }, + { + {{5770, 0, 0}, {16031, 0, 0}, {14930, 0, 0}, {13846, 0, 0}, + {13253, 0, 0}, {14132, 0, 0}, {15435, 0, 0}, {16992, 0, 0}, + {10110, 0, 0}}, + {{16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}} + } + }, + { + 
{ + {{12591, 0, 0}, {11979, 0, 0}, {12506, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}}, + {{11352, 0, 0}, {11913, 0, 0}, {9358, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}} + }, + { + {{12530, 0, 0}, {11711, 0, 0}, {13609, 0, 0}, {10431, 0, 0}, + {12609, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}}, + {{12643, 0, 0}, {12209, 0, 0}, {11061, 0, 0}, {10472, 0, 0}, + {15435, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}} + }, + { + {{12827, 0, 0}, {12241, 0, 0}, {11298, 0, 0}, {10281, 0, 0}, + {13210, 0, 0}, {10414, 0, 0}, {12437, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}}, + {{10016, 0, 0}, {7762, 0, 0}, {10693, 0, 0}, {11192, 0, 0}, + {15028, 0, 0}, {11078, 0, 0}, {13557, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}} + }, + { + {{11326, 0, 0}, {10410, 0, 0}, {14265, 0, 0}, {12477, 0, 0}, + {12823, 0, 0}, {11474, 0, 0}, {11590, 0, 0}, {13368, 0, 0}, + {22212, 0, 0}}, + {{8120, 0, 0}, {7819, 0, 0}, {12060, 0, 0}, {8863, 0, 0}, + {12267, 0, 0}, {23210, 0, 0}, {23345, 0, 0}, {2403, 0, 0}, + {13515, 0, 0}} + }, + { + {{6704, 0, 0}, {10670, 0, 0}, {13155, 0, 0}, {12243, 0, 0}, + {15173, 0, 0}, {16150, 0, 0}, {12271, 0, 0}, {13779, 0, 0}, + {17255, 0, 0}}, + {{16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}} + } + } +}; + +alignas(kMaxAlignment) constexpr uint16_t kDefaultCoeffBaseEobCdf[kCoefficientQuantizerContexts] + [kNumSquareTransformSizes][kNumPlaneTypes] + [kCoeffBaseEobContexts] + [kCoeffBaseEobSymbolCount + 1] = { + { + { + {{14931, 3713, 0, 0}, {3168, 1322, 0, 0}, {1924, 890, 0, 0}, + {7842, 3820, 0, 0}}, + {{11403, 2742, 0, 0}, {2256, 345, 0, 0}, {1110, 147, 0, 0}, + {3138, 887, 0, 0}} + }, + { + {{27051, 6291, 0, 0}, {2277, 1065, 0, 0}, {1218, 610, 0, 0}, + {3120, 1277, 0, 0}}, + {{20160, 4948, 0, 0}, {2088, 543, 0, 0}, {1959, 433, 0, 0}, + {1469, 345, 0, 0}} + }, + { + {{30982, 20156, 0, 0}, {2105, 1143, 0, 0}, {429, 300, 0, 0}, + {1620, 935, 0, 0}}, + {{13911, 8903, 0, 0}, {1340, 340, 0, 0}, {1024, 395, 0, 0}, + {993, 242, 0, 0}} + }, + { + {{30981, 30236, 0, 0}, {1936, 1106, 0, 0}, {944, 86, 0, 0}, + {635, 199, 0, 0}}, + {{19017, 10533, 0, 0}, {679, 359, 0, 0}, {5684, 4848, 0, 0}, + {3477, 174, 0, 0}} + }, + { + {{31043, 29319, 0, 0}, {1666, 833, 0, 0}, {311, 155, 0, 0}, + {356, 119, 0, 0}}, + {{21845, 10923, 0, 0}, {21845, 10923, 0, 0}, {21845, 10923, 0, 0}, + {21845, 10923, 0, 0}} + } + }, + { + { + {{15208, 2880, 0, 0}, {3097, 1219, 0, 0}, {1761, 712, 0, 0}, + {5482, 2762, 0, 0}}, + {{6174, 1556, 0, 0}, {1560, 186, 0, 0}, {933, 131, 0, 0}, + {2173, 562, 0, 0}} + }, + { + {{17529, 2836, 0, 0}, {1453, 673, 0, 0}, {638, 334, 0, 0}, + {1904, 772, 0, 0}}, + {{6489, 1800, 0, 0}, {1626, 273, 0, 0}, {1055, 228, 0, 0}, + {839, 174, 0, 0}} + }, + { + {{30124, 7570, 0, 0}, {730, 317, 0, 0}, {129, 73, 0, 0}, + {602, 250, 0, 0}}, + {{15581, 5100, 0, 0}, {1054, 218, 0, 0}, {485, 90, 0, 0}, + {838, 205, 0, 0}} + }, + { + {{31724, 30511, 0, 0}, {2013, 845, 0, 0}, {560, 75, 0, 0}, + {524, 153, 0, 0}}, + {{11451, 6561, 0, 0}, {3635, 1900, 0, 0}, {3457, 1537, 0, 0}, + {3111, 1681, 0, 0}} + }, + { + {{32290, 30934, 0, 0}, {1763, 781, 0, 0}, {451, 44, 0, 0}, + {1903, 120, 0, 0}}, + {{21845, 10923, 0, 0}, {21845, 10923, 0, 0}, {21845, 10923, 0, 0}, + {21845, 10923, 0, 0}} + } + }, + { + { + {{12676, 1994, 0, 0}, {2073, 748, 0, 0}, {1637, 665, 0, 0}, + {4102, 1898, 0, 
0}}, + {{5510, 1673, 0, 0}, {964, 145, 0, 0}, {1005, 240, 0, 0}, + {1330, 262, 0, 0}} + }, + { + {{14719, 2279, 0, 0}, {1062, 482, 0, 0}, {605, 295, 0, 0}, + {1218, 584, 0, 0}}, + {{5652, 1926, 0, 0}, {797, 170, 0, 0}, {680, 192, 0, 0}, + {701, 104, 0, 0}} + }, + { + {{19914, 3675, 0, 0}, {496, 210, 0, 0}, {101, 39, 0, 0}, + {462, 183, 0, 0}}, + {{7292, 2402, 0, 0}, {599, 81, 0, 0}, {289, 79, 0, 0}, + {1095, 134, 0, 0}} + }, + { + {{29959, 13467, 0, 0}, {563, 146, 0, 0}, {430, 38, 0, 0}, + {982, 152, 0, 0}}, + {{10031, 3663, 0, 0}, {1958, 406, 0, 0}, {2754, 141, 0, 0}, + {2240, 194, 0, 0}} + }, + { + {{31833, 29386, 0, 0}, {1979, 859, 0, 0}, {302, 12, 0, 0}, + {1908, 255, 0, 0}}, + {{21845, 10923, 0, 0}, {21845, 10923, 0, 0}, {21845, 10923, 0, 0}, + {21845, 10923, 0, 0}} + } + }, + { + { + {{10271, 1570, 0, 0}, {1053, 273, 0, 0}, {1162, 431, 0, 0}, + {2380, 778, 0, 0}}, + {{4891, 1184, 0, 0}, {598, 40, 0, 0}, {613, 80, 0, 0}, + {549, 66, 0, 0}} + }, + { + {{11311, 1725, 0, 0}, {817, 285, 0, 0}, {615, 206, 0, 0}, + {1295, 553, 0, 0}}, + {{5210, 1617, 0, 0}, {748, 128, 0, 0}, {671, 193, 0, 0}, + {526, 49, 0, 0}} + }, + { + {{12788, 2177, 0, 0}, {549, 171, 0, 0}, {187, 62, 0, 0}, + {965, 481, 0, 0}}, + {{6295, 2261, 0, 0}, {337, 45, 0, 0}, {572, 157, 0, 0}, + {1180, 240, 0, 0}} + }, + { + {{8121, 2305, 0, 0}, {356, 73, 0, 0}, {300, 48, 0, 0}, + {1499, 245, 0, 0}}, + {{4286, 1263, 0, 0}, {616, 67, 0, 0}, {1036, 170, 0, 0}, + {1001, 56, 0, 0}} + }, + { + {{20410, 7791, 0, 0}, {1437, 383, 0, 0}, {134, 12, 0, 0}, + {2357, 220, 0, 0}}, + {{21845, 10923, 0, 0}, {21845, 10923, 0, 0}, {21845, 10923, 0, 0}, + {21845, 10923, 0, 0}} + } + } +}; +/* clang-format on */ + +alignas(kMaxAlignment) constexpr uint16_t kDefaultCoeffBaseCdf + [kCoefficientQuantizerContexts][kNumSquareTransformSizes][kNumPlaneTypes] + [kCoeffBaseContexts][kCoeffBaseSymbolCount + 1] = { + {{{{28734, 23838, 20041, 0, 0}, {14686, 3027, 891, 0, 0}, + {20172, 6644, 2275, 0, 0}, {23322, 11650, 5763, 0, 0}, + {26460, 17627, 11489, 0, 0}, {30305, 26411, 22985, 0, 0}, + {12101, 2222, 839, 0, 0}, {19725, 6645, 2634, 0, 0}, + {24617, 14011, 7990, 0, 0}, {27513, 19929, 14136, 0, 0}, + {29948, 25562, 21607, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {17032, 5215, 2164, 0, 0}, + {21558, 8974, 3981, 0, 0}, {26821, 18894, 13067, 0, 0}, + {28553, 23445, 18877, 0, 0}, {29935, 26306, 22709, 0, 0}, + {13163, 2375, 1186, 0, 0}, {19245, 6516, 2520, 0, 0}, + {24322, 14146, 8256, 0, 0}, {28950, 22425, 16794, 0, 0}, + {31287, 28651, 25972, 0, 0}, {10119, 1466, 578, 0, 0}, + {17939, 5641, 2319, 0, 0}, {24455, 15066, 9464, 0, 0}, + {29746, 24467, 19982, 0, 0}, {31232, 28356, 25584, 0, 0}, + {10414, 2994, 1396, 0, 0}, {18045, 7296, 3554, 0, 0}, + {26095, 19023, 14106, 0, 0}, {30700, 27002, 23446, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{26466, 16324, 11007, 0, 0}, {9728, 1230, 293, 0, 0}, + {17572, 4316, 1272, 0, 0}, {22748, 9822, 4254, 0, 0}, + {26235, 15906, 9267, 0, 0}, {29230, 22952, 17692, 0, 0}, + {8324, 893, 243, 0, 0}, {16887, 3844, 1133, 0, 0}, + {22846, 9895, 4302, 0, 0}, {26241, 15802, 9077, 0, 0}, + {28654, 21465, 15548, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 
0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {12567, 1998, 559, 0, 0}, + {18014, 4697, 1510, 0, 0}, {24390, 12582, 6251, 0, 0}, + {26852, 17469, 10790, 0, 0}, {28500, 21185, 14867, 0, 0}, + {8407, 743, 187, 0, 0}, {14095, 2663, 825, 0, 0}, + {22572, 10524, 5192, 0, 0}, {27273, 18419, 12351, 0, 0}, + {30092, 25353, 21270, 0, 0}, {8090, 810, 183, 0, 0}, + {14139, 2862, 937, 0, 0}, {23404, 12044, 6453, 0, 0}, + {28127, 20450, 14674, 0, 0}, {30010, 25381, 21189, 0, 0}, + {7335, 926, 299, 0, 0}, {13973, 3479, 1357, 0, 0}, + {25124, 15184, 9176, 0, 0}, {29360, 23754, 17721, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}, + {{{28232, 22696, 18767, 0, 0}, {7309, 1352, 562, 0, 0}, + {16163, 4720, 1950, 0, 0}, {21760, 9911, 5049, 0, 0}, + {25853, 16500, 10453, 0, 0}, {30143, 25956, 22231, 0, 0}, + {8511, 980, 269, 0, 0}, {15888, 3314, 889, 0, 0}, + {20810, 7714, 2990, 0, 0}, {24852, 14050, 7684, 0, 0}, + {29385, 23991, 19322, 0, 0}, {10048, 1165, 375, 0, 0}, + {17808, 4643, 1433, 0, 0}, {23037, 10558, 4840, 0, 0}, + {26464, 16936, 10491, 0, 0}, {29858, 24950, 20602, 0, 0}, + {12393, 2141, 637, 0, 0}, {18864, 5484, 1881, 0, 0}, + {23400, 11210, 5624, 0, 0}, {26831, 17802, 11649, 0, 0}, + {30101, 25543, 21449, 0, 0}, {8798, 1298, 390, 0, 0}, + {15595, 3034, 750, 0, 0}, {19973, 7327, 2803, 0, 0}, + {23787, 13088, 6875, 0, 0}, {28040, 21396, 15866, 0, 0}, + {8481, 971, 329, 0, 0}, {16065, 3623, 1072, 0, 0}, + {21935, 9214, 4043, 0, 0}, {26300, 16202, 9711, 0, 0}, + {30353, 26206, 22490, 0, 0}, {6158, 373, 109, 0, 0}, + {14178, 2270, 651, 0, 0}, {20348, 7012, 2818, 0, 0}, + {25129, 14022, 8058, 0, 0}, {29767, 24682, 20421, 0, 0}, + {7692, 704, 188, 0, 0}, {14822, 2640, 740, 0, 0}, + {20744, 7783, 3390, 0, 0}, {25251, 14378, 8464, 0, 0}, + {29525, 23987, 19437, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{26731, 15997, 10811, 0, 0}, {7994, 1064, 342, 0, 0}, + {15938, 4179, 1712, 0, 0}, {22166, 9940, 5008, 0, 0}, + {26035, 15939, 9697, 0, 0}, {29518, 23854, 19212, 0, 0}, + {7186, 548, 100, 0, 0}, {14109, 2426, 545, 0, 0}, + {20222, 6619, 2253, 0, 0}, {24348, 12317, 5967, 0, 0}, + {28132, 20348, 14424, 0, 0}, {5187, 406, 129, 0, 0}, + {13781, 2685, 790, 0, 0}, {21441, 8520, 3684, 0, 0}, + {25504, 15049, 8648, 0, 0}, {28773, 22000, 16599, 0, 0}, + {6875, 937, 281, 0, 0}, {16191, 4181, 1389, 0, 0}, + {22579, 10020, 4586, 0, 0}, {25936, 15674, 9212, 0, 0}, + {29060, 22658, 17434, 0, 0}, {6864, 486, 112, 0, 0}, + {13047, 1976, 492, 0, 0}, {19949, 6525, 2357, 0, 0}, + {24196, 12154, 5877, 0, 0}, {27404, 18709, 12301, 0, 0}, + {6188, 330, 91, 0, 0}, {11916, 1543, 428, 0, 0}, + {20333, 7068, 2801, 0, 0}, {24077, 11943, 5792, 0, 0}, + {28322, 20559, 15499, 0, 0}, {5418, 339, 72, 0, 0}, + {11396, 1791, 496, 0, 0}, {20095, 7498, 2915, 0, 0}, + {23560, 11843, 6128, 0, 0}, {27750, 19417, 14036, 0, 0}, + {5417, 289, 55, 0, 0}, {11370, 1559, 381, 0, 0}, + {20606, 7721, 2926, 0, 0}, {24872, 14077, 7449, 0, 0}, + {28098, 19886, 13887, 0, 0}, {24576, 16384, 8192, 0, 0}}}, + {{{27281, 22308, 19060, 0, 0}, {11171, 4465, 2094, 0, 0}, + {21731, 10815, 6292, 0, 0}, {24621, 14806, 9816, 0, 0}, + {27526, 19707, 14236, 0, 0}, {30879, 27560, 24586, 0, 0}, + {5994, 635, 178, 0, 0}, {14924, 3204, 1001, 0, 0}, + {21078, 8330, 3597, 0, 0}, {25226, 14553, 8309, 0, 0}, + {29775, 24718, 20449, 0, 0}, {4745, 440, 177, 0, 0}, + {14117, 2642, 814, 0, 0}, {20604, 7622, 3179, 0, 0}, + {25006, 14238, 7997, 0, 0}, {29276, 23585, 
18848, 0, 0}, + {5177, 760, 277, 0, 0}, {15619, 3915, 1258, 0, 0}, + {21283, 8765, 3908, 0, 0}, {25071, 14682, 8558, 0, 0}, + {29693, 24769, 20550, 0, 0}, {4500, 286, 114, 0, 0}, + {13137, 1717, 364, 0, 0}, {18908, 5508, 1748, 0, 0}, + {23163, 11155, 5174, 0, 0}, {27892, 20606, 14860, 0, 0}, + {5520, 452, 192, 0, 0}, {13813, 2311, 693, 0, 0}, + {20944, 8771, 3973, 0, 0}, {25422, 14572, 8121, 0, 0}, + {29365, 23521, 18657, 0, 0}, {3057, 113, 33, 0, 0}, + {11599, 1374, 351, 0, 0}, {19281, 5570, 1811, 0, 0}, + {23940, 11085, 5154, 0, 0}, {28498, 21317, 15730, 0, 0}, + {4060, 190, 37, 0, 0}, {12648, 1527, 286, 0, 0}, + {19076, 5218, 1447, 0, 0}, {23350, 10254, 4329, 0, 0}, + {27769, 19485, 13306, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{27095, 18466, 13057, 0, 0}, {6517, 2067, 934, 0, 0}, + {19986, 8985, 4965, 0, 0}, {23641, 12111, 6960, 0, 0}, + {26400, 16560, 11306, 0, 0}, {30303, 25591, 21946, 0, 0}, + {2807, 205, 49, 0, 0}, {14450, 2877, 819, 0, 0}, + {21407, 8254, 3411, 0, 0}, {24868, 13165, 7161, 0, 0}, + {28766, 22178, 17222, 0, 0}, {3131, 458, 173, 0, 0}, + {14472, 2855, 959, 0, 0}, {22624, 11253, 5897, 0, 0}, + {27410, 18446, 12374, 0, 0}, {29701, 24406, 19422, 0, 0}, + {4116, 298, 92, 0, 0}, {15230, 1997, 559, 0, 0}, + {18844, 5886, 2274, 0, 0}, {22272, 9931, 4899, 0, 0}, + {25532, 16372, 11147, 0, 0}, {2025, 81, 22, 0, 0}, + {9762, 1092, 279, 0, 0}, {18274, 4940, 1648, 0, 0}, + {22594, 9967, 4416, 0, 0}, {26526, 17487, 11725, 0, 0}, + {6951, 525, 48, 0, 0}, {14150, 1401, 443, 0, 0}, + {18771, 4450, 890, 0, 0}, {20513, 6234, 1385, 0, 0}, + {23207, 11180, 4318, 0, 0}, {4580, 133, 44, 0, 0}, + {10708, 403, 40, 0, 0}, {14666, 2078, 240, 0, 0}, + {18572, 3904, 769, 0, 0}, {20506, 6976, 1903, 0, 0}, + {8592, 659, 140, 0, 0}, {14488, 3087, 805, 0, 0}, + {22563, 9065, 3104, 0, 0}, {24879, 12743, 5092, 0, 0}, + {26708, 16025, 8798, 0, 0}, {24576, 16384, 8192, 0, 0}}}, + {{{27627, 25672, 24508, 0, 0}, {5582, 3746, 2979, 0, 0}, + {26100, 20200, 17086, 0, 0}, {30596, 26587, 24130, 0, 0}, + {31642, 29389, 28237, 0, 0}, {32325, 31407, 30514, 0, 0}, + {6685, 1615, 332, 0, 0}, {19282, 8165, 4285, 0, 0}, + {26260, 17928, 12858, 0, 0}, {29382, 23968, 19482, 0, 0}, + {31238, 28446, 25714, 0, 0}, {3129, 688, 220, 0, 0}, + {16871, 5216, 2478, 0, 0}, {24180, 12721, 7385, 0, 0}, + {27879, 19429, 13499, 0, 0}, {30528, 25897, 22270, 0, 0}, + {4603, 571, 251, 0, 0}, {12033, 2341, 1200, 0, 0}, + {18443, 8097, 5076, 0, 0}, {27649, 20214, 14963, 0, 0}, + {30958, 27327, 24507, 0, 0}, {1556, 44, 20, 0, 0}, + {9416, 1002, 223, 0, 0}, {18099, 5198, 1709, 0, 0}, + {24276, 11874, 5496, 0, 0}, {29124, 22574, 17564, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{30307, 25755, 23397, 0, 0}, {8019, 3168, 1782, 0, 0}, + {23302, 13731, 10351, 0, 0}, {29184, 23488, 18368, 0, 0}, + {31263, 28839, 27335, 0, 0}, {32091, 31268, 30032, 0, 0}, + {8781, 2066, 651, 0, 0}, {19214, 8197, 3505, 0, 0}, + {26557, 18212, 11613, 0, 0}, {29633, 21796, 17143, 0, 0}, + {30333, 25641, 21341, 0, 0}, {1468, 236, 218, 0, 0}, + {18011, 2403, 814, 0, 0}, {28363, 21156, 14215, 0, 0}, + {32188, 28636, 25446, 0, 0}, {31073, 22599, 
18644, 0, 0}, + {2760, 486, 177, 0, 0}, {13524, 2660, 1020, 0, 0}, + {21588, 8610, 3213, 0, 0}, {27118, 17796, 13559, 0, 0}, + {30654, 27659, 24312, 0, 0}, {912, 52, 20, 0, 0}, + {9756, 1104, 196, 0, 0}, {19074, 6112, 2132, 0, 0}, + {24626, 13260, 6675, 0, 0}, {28515, 21813, 16044, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}, + {{{32167, 31785, 31457, 0, 0}, {14043, 9362, 4681, 0, 0}, + {27307, 24576, 21845, 0, 0}, {28987, 17644, 11343, 0, 0}, + {30181, 25007, 20696, 0, 0}, {32662, 32310, 31958, 0, 0}, + {10486, 3058, 874, 0, 0}, {24260, 11842, 6784, 0, 0}, + {29042, 20055, 14685, 0, 0}, {31148, 25656, 21875, 0, 0}, + {32039, 30532, 29273, 0, 0}, {2605, 294, 84, 0, 0}, + {14464, 2304, 768, 0, 0}, {21325, 6242, 3121, 0, 0}, + {26761, 17476, 11469, 0, 0}, {30534, 26065, 23831, 0, 0}, + {1814, 591, 197, 0, 0}, {15405, 3206, 1692, 0, 0}, + {23082, 10304, 5358, 0, 0}, {24576, 16384, 11378, 0, 0}, + {31013, 24722, 21504, 0, 0}, {1600, 34, 20, 0, 0}, + {10282, 1327, 297, 0, 0}, {19935, 7141, 3030, 0, 0}, + {25788, 15389, 9646, 0, 0}, {29657, 23881, 19289, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}}, + {{{{26727, 20914, 16841, 0, 0}, {12442, 1863, 517, 0, 0}, + {18604, 5937, 2043, 0, 0}, {23008, 12121, 6183, 0, 0}, + {26352, 17815, 11549, 0, 0}, {29802, 25617, 21877, 0, 0}, + {9201, 1394, 514, 0, 0}, {17790, 5352, 1822, 0, 0}, + {23334, 12543, 6514, 0, 0}, {26110, 18210, 12233, 0, 0}, + {28852, 24091, 19779, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 
8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {14680, 3223, 1181, 0, 0}, + {19706, 6925, 2695, 0, 0}, {23828, 15941, 10517, 0, 0}, + {25114, 19548, 14795, 0, 0}, {27035, 22452, 18312, 0, 0}, + {9889, 1380, 654, 0, 0}, {17553, 4775, 1813, 0, 0}, + {23371, 13323, 7790, 0, 0}, {29326, 22955, 17424, 0, 0}, + {31400, 28832, 26236, 0, 0}, {7274, 735, 362, 0, 0}, + {15996, 4805, 2050, 0, 0}, {23349, 14603, 9508, 0, 0}, + {30091, 25267, 20971, 0, 0}, {31252, 28424, 25598, 0, 0}, + {6212, 1314, 667, 0, 0}, {15640, 5733, 2660, 0, 0}, + {24444, 17424, 12519, 0, 0}, {30865, 27072, 23299, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{24313, 13765, 8400, 0, 0}, {9205, 747, 164, 0, 0}, + {16531, 3322, 833, 0, 0}, {22044, 8769, 3410, 0, 0}, + {26043, 15240, 8352, 0, 0}, {28841, 21841, 15943, 0, 0}, + {6455, 480, 134, 0, 0}, {15338, 2673, 673, 0, 0}, + {21652, 8162, 3089, 0, 0}, {25573, 14384, 7499, 0, 0}, + {28042, 19916, 13453, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {9946, 1120, 285, 0, 0}, + {16044, 3135, 839, 0, 0}, {22507, 9735, 4043, 0, 0}, + {25739, 14928, 8240, 0, 0}, {27901, 18882, 11266, 0, 0}, + {7470, 876, 277, 0, 0}, {14959, 3438, 1256, 0, 0}, + {23100, 11439, 6189, 0, 0}, {27994, 19812, 13792, 0, 0}, + {30446, 25738, 21228, 0, 0}, {7296, 848, 225, 0, 0}, + {14811, 3381, 1136, 0, 0}, {23572, 12175, 6368, 0, 0}, + {28088, 20063, 13566, 0, 0}, {29851, 24312, 19332, 0, 0}, + {6297, 709, 194, 0, 0}, {14310, 2985, 859, 0, 0}, + {24368, 13304, 6812, 0, 0}, {28956, 21795, 15562, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}, + {{{25989, 19025, 15090, 0, 0}, {7962, 971, 311, 0, 0}, + {15152, 3721, 1396, 0, 0}, {21705, 9593, 4765, 0, 0}, + {26247, 16658, 10444, 0, 0}, {30004, 25264, 21114, 0, 0}, + {7502, 401, 131, 0, 0}, {13714, 2215, 593, 0, 0}, + {20629, 7556, 2961, 0, 0}, {25457, 14606, 8064, 0, 0}, + {29371, 23604, 18694, 0, 0}, {6780, 560, 246, 0, 0}, + {16515, 3856, 1242, 0, 0}, {23617, 11381, 5396, 0, 0}, + {27080, 17853, 11272, 0, 0}, {30051, 25141, 20764, 0, 0}, + {9624, 913, 325, 0, 0}, {16698, 4277, 1443, 0, 0}, + {24066, 12301, 6251, 0, 0}, {27525, 18812, 12401, 0, 0}, + {30147, 25433, 21201, 0, 0}, {6132, 428, 138, 0, 0}, + {12778, 1718, 427, 0, 0}, {19525, 6663, 2453, 0, 0}, + {24180, 13247, 6850, 0, 0}, {28051, 21183, 15464, 0, 0}, + {6924, 476, 186, 0, 0}, {13678, 2133, 671, 0, 0}, + {20805, 8222, 3829, 0, 0}, {26550, 16681, 10414, 0, 0}, + {30428, 26160, 22342, 0, 0}, {4722, 192, 74, 0, 0}, + {11590, 1455, 472, 0, 0}, {19282, 6584, 2898, 0, 0}, + {25619, 14897, 9045, 0, 0}, {29935, 24810, 20509, 0, 0}, + {5058, 240, 82, 0, 0}, {12094, 1692, 500, 0, 0}, + {20355, 7813, 3525, 0, 0}, {26092, 15841, 9671, 0, 0}, + {29802, 24435, 19849, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{24129, 13429, 8339, 0, 0}, {8364, 931, 243, 0, 0}, + {15771, 3343, 984, 0, 0}, {21515, 8534, 3619, 0, 0}, + {26017, 15374, 8740, 0, 0}, {29278, 22938, 17577, 0, 0}, + {6485, 297, 54, 0, 0}, {13169, 1600, 326, 0, 0}, + {19622, 5814, 1875, 0, 0}, {24554, 12180, 5878, 0, 0}, + {28069, 19687, 13468, 0, 0}, {4556, 310, 99, 0, 0}, + 
{14174, 2452, 668, 0, 0}, {21549, 8360, 3534, 0, 0}, + {25903, 15112, 8619, 0, 0}, {29090, 22406, 16762, 0, 0}, + {6943, 632, 152, 0, 0}, {15455, 2915, 747, 0, 0}, + {21571, 8297, 3296, 0, 0}, {25821, 14987, 8363, 0, 0}, + {29000, 22108, 16507, 0, 0}, {5416, 268, 62, 0, 0}, + {11918, 1300, 299, 0, 0}, {18747, 5061, 1635, 0, 0}, + {23804, 11020, 4930, 0, 0}, {27331, 18103, 11581, 0, 0}, + {6464, 276, 70, 0, 0}, {12359, 1388, 383, 0, 0}, + {19086, 5546, 2136, 0, 0}, {23794, 11532, 6083, 0, 0}, + {28534, 21103, 15834, 0, 0}, {6495, 411, 57, 0, 0}, + {12096, 1526, 327, 0, 0}, {18596, 5514, 1866, 0, 0}, + {22898, 10870, 5493, 0, 0}, {27604, 19262, 13498, 0, 0}, + {6043, 309, 40, 0, 0}, {11777, 1326, 241, 0, 0}, + {19697, 6334, 1957, 0, 0}, {24584, 12678, 6026, 0, 0}, + {27965, 19513, 12873, 0, 0}, {24576, 16384, 8192, 0, 0}}}, + {{{25213, 17826, 14267, 0, 0}, {8358, 1590, 481, 0, 0}, + {18374, 6030, 2515, 0, 0}, {24355, 13214, 7573, 0, 0}, + {28002, 19844, 13983, 0, 0}, {30739, 26962, 23561, 0, 0}, + {5992, 404, 105, 0, 0}, {14036, 2801, 837, 0, 0}, + {21763, 8982, 3916, 0, 0}, {26302, 15859, 9258, 0, 0}, + {29724, 24130, 19349, 0, 0}, {3560, 186, 64, 0, 0}, + {12700, 1911, 560, 0, 0}, {20765, 7683, 3173, 0, 0}, + {25821, 15018, 8579, 0, 0}, {29523, 23665, 18761, 0, 0}, + {5409, 303, 99, 0, 0}, {13347, 2154, 594, 0, 0}, + {20853, 7758, 3189, 0, 0}, {25818, 15092, 8694, 0, 0}, + {29761, 24295, 19672, 0, 0}, {3766, 92, 33, 0, 0}, + {10666, 919, 192, 0, 0}, {18360, 4759, 1363, 0, 0}, + {23741, 11089, 4837, 0, 0}, {28074, 20090, 14020, 0, 0}, + {4552, 240, 86, 0, 0}, {11919, 1504, 450, 0, 0}, + {20012, 6953, 3017, 0, 0}, {25203, 13967, 7845, 0, 0}, + {29259, 23235, 18291, 0, 0}, {2635, 81, 29, 0, 0}, + {9705, 858, 253, 0, 0}, {18180, 4717, 1636, 0, 0}, + {23683, 11119, 5311, 0, 0}, {28507, 21114, 15504, 0, 0}, + {3250, 77, 20, 0, 0}, {10317, 809, 155, 0, 0}, + {17904, 4046, 1068, 0, 0}, {23073, 9804, 4052, 0, 0}, + {27836, 19410, 13266, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{26303, 15810, 11080, 0, 0}, {7569, 1254, 408, 0, 0}, + {17994, 5619, 2161, 0, 0}, {23511, 11330, 5796, 0, 0}, + {27045, 17585, 10886, 0, 0}, {29618, 23889, 19037, 0, 0}, + {5779, 506, 86, 0, 0}, {15372, 2831, 683, 0, 0}, + {21381, 7867, 2984, 0, 0}, {25479, 13947, 7220, 0, 0}, + {29034, 22191, 16682, 0, 0}, {3040, 267, 73, 0, 0}, + {15337, 3067, 865, 0, 0}, {22847, 9942, 4468, 0, 0}, + {26872, 17334, 10700, 0, 0}, {29338, 23122, 18011, 0, 0}, + {4154, 257, 63, 0, 0}, {13404, 2130, 505, 0, 0}, + {19639, 6514, 2366, 0, 0}, {24014, 12284, 6328, 0, 0}, + {28390, 21161, 15658, 0, 0}, {2476, 97, 24, 0, 0}, + {10988, 1165, 267, 0, 0}, {18454, 4939, 1477, 0, 0}, + {23157, 10441, 4505, 0, 0}, {27878, 19681, 13703, 0, 0}, + {6906, 201, 35, 0, 0}, {11974, 718, 201, 0, 0}, + {15525, 2143, 514, 0, 0}, {19485, 5140, 1294, 0, 0}, + {23099, 10236, 3850, 0, 0}, {5333, 71, 20, 0, 0}, + {7846, 378, 54, 0, 0}, {11319, 1264, 232, 0, 0}, + {16376, 3039, 936, 0, 0}, {21076, 7884, 3692, 0, 0}, + {8575, 478, 33, 0, 0}, {13859, 1664, 205, 0, 0}, + {20532, 5927, 1365, 0, 0}, {24597, 10928, 3686, 0, 0}, + {25544, 15488, 7493, 0, 0}, {24576, 16384, 8192, 0, 0}}}, + {{{29690, 25929, 22878, 0, 0}, {18931, 12318, 8289, 0, 0}, + {26854, 18546, 13440, 0, 0}, {28902, 22501, 18006, 0, 0}, + {30156, 25560, 21726, 0, 0}, {31701, 29777, 27992, 0, 0}, + {6951, 1122, 239, 0, 0}, {19060, 6430, 2383, 0, 0}, + {25440, 14183, 7898, 0, 0}, {28077, 19688, 13492, 0, 0}, + {30943, 27515, 24416, 0, 0}, {3382, 453, 144, 0, 0}, + {15608, 3767, 1408, 0, 0}, {23166, 10906, 5372, 
0, 0}, + {26853, 16996, 10620, 0, 0}, {29982, 24989, 20721, 0, 0}, + {3522, 318, 105, 0, 0}, {14072, 2839, 950, 0, 0}, + {22258, 9399, 4208, 0, 0}, {26539, 16269, 9643, 0, 0}, + {30160, 25320, 21063, 0, 0}, {2015, 58, 20, 0, 0}, + {11130, 1281, 265, 0, 0}, {19831, 5914, 1898, 0, 0}, + {24586, 12172, 5798, 0, 0}, {29131, 22499, 17271, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{27524, 20618, 15862, 0, 0}, {12282, 5910, 3067, 0, 0}, + {25012, 14451, 9033, 0, 0}, {29316, 23512, 19622, 0, 0}, + {30748, 27562, 24539, 0, 0}, {30967, 27775, 24865, 0, 0}, + {5717, 910, 237, 0, 0}, {16780, 5237, 2149, 0, 0}, + {23580, 11284, 6049, 0, 0}, {26495, 15582, 8968, 0, 0}, + {29660, 23413, 18004, 0, 0}, {1692, 248, 88, 0, 0}, + {14649, 2731, 918, 0, 0}, {22524, 9799, 5296, 0, 0}, + {28076, 18691, 13495, 0, 0}, {29074, 21091, 15212, 0, 0}, + {2708, 187, 48, 0, 0}, {11757, 1993, 648, 0, 0}, + {20837, 7948, 3479, 0, 0}, {25649, 15106, 8412, 0, 0}, + {28935, 22062, 16464, 0, 0}, {814, 37, 20, 0, 0}, + {8855, 1044, 279, 0, 0}, {17248, 4708, 1482, 0, 0}, + {21251, 9760, 4197, 0, 0}, {26575, 18260, 12139, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}, + {{{31733, 29961, 28612, 0, 0}, {19606, 14630, 11829, 0, 0}, + {30072, 26135, 24013, 0, 0}, {31395, 28607, 25915, 0, 0}, + {31669, 30022, 28052, 0, 0}, {32428, 31747, 31169, 0, 0}, + {9942, 2349, 633, 0, 0}, {22373, 11006, 5826, 0, 0}, + {28042, 20361, 15407, 0, 0}, {30321, 25688, 22175, 0, 0}, + {31541, 29051, 26757, 0, 0}, {4612, 1344, 834, 0, 0}, + {15853, 5014, 2395, 0, 0}, {23620, 11778, 6337, 0, 0}, + {26818, 17253, 11620, 0, 0}, {30276, 25441, 21242, 0, 0}, + {2166, 291, 98, 0, 0}, {12742, 2813, 1200, 0, 0}, + {21548, 9140, 4663, 0, 0}, {26116, 15749, 9795, 0, 0}, + {29704, 24232, 19725, 0, 0}, {999, 44, 20, 0, 0}, + {10538, 1881, 395, 0, 0}, {20534, 7689, 3037, 0, 0}, + {25442, 13952, 7415, 0, 0}, {28835, 21861, 16152, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 
8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}}, + {{{{23872, 16541, 12138, 0, 0}, {9139, 986, 241, 0, 0}, + {17595, 5013, 1447, 0, 0}, {22610, 11535, 5386, 0, 0}, + {26348, 17911, 11210, 0, 0}, {29499, 24613, 20122, 0, 0}, + {7933, 759, 272, 0, 0}, {16259, 4347, 1189, 0, 0}, + {21811, 11254, 5350, 0, 0}, {24887, 16838, 10672, 0, 0}, + {27380, 21808, 16850, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {12023, 1995, 675, 0, 0}, + {17568, 5547, 1907, 0, 0}, {19736, 11895, 7101, 0, 0}, + {20483, 14105, 9274, 0, 0}, {21205, 15287, 11279, 0, 0}, + {6508, 786, 448, 0, 0}, {17371, 4685, 1668, 0, 0}, + {23026, 13551, 7944, 0, 0}, {29507, 23139, 17406, 0, 0}, + {31288, 28446, 25269, 0, 0}, {5169, 512, 308, 0, 0}, + {15911, 5109, 1994, 0, 0}, {23217, 14478, 9020, 0, 0}, + {29716, 23835, 18665, 0, 0}, {30747, 26858, 22981, 0, 0}, + {3763, 753, 376, 0, 0}, {15091, 5074, 1905, 0, 0}, + {23564, 15412, 9549, 0, 0}, {30365, 25252, 19954, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{21960, 10712, 5872, 0, 0}, {7029, 455, 92, 0, 0}, + {15480, 2565, 547, 0, 0}, {21409, 7890, 2872, 0, 0}, + {25819, 15001, 7875, 0, 0}, {28481, 20972, 14697, 0, 0}, + {4888, 247, 63, 0, 0}, {13730, 1764, 354, 0, 0}, + {20204, 6423, 2000, 0, 0}, {24499, 12821, 5989, 0, 0}, + {27094, 18111, 11094, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {7026, 449, 97, 0, 0}, + {13211, 1604, 314, 0, 0}, {19387, 6387, 2013, 0, 0}, + {22667, 11302, 6046, 0, 0}, {23559, 13118, 5943, 0, 0}, + {5661, 851, 336, 0, 0}, {14712, 3875, 1565, 0, 0}, + {22568, 11334, 6004, 0, 0}, {28108, 19855, 13266, 0, 0}, + {30400, 25838, 20264, 0, 0}, {5808, 610, 155, 0, 0}, + {14140, 2763, 737, 0, 0}, {22535, 10326, 4536, 0, 0}, + {27297, 18138, 11252, 0, 0}, {29533, 22001, 15659, 0, 0}, + {5072, 328, 76, 0, 0}, {12736, 1601, 330, 0, 0}, + {24068, 11427, 4326, 0, 0}, {27106, 17937, 10973, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}, + {{{23064, 15474, 11636, 0, 0}, {6006, 490, 135, 0, 0}, + {14386, 3148, 949, 0, 0}, {21877, 9293, 4045, 0, 0}, + {26410, 16185, 9459, 0, 0}, {29520, 23650, 18627, 0, 0}, + {5564, 195, 69, 0, 0}, {12950, 1944, 439, 0, 0}, + {20996, 7648, 2727, 0, 0}, {25773, 14735, 7729, 0, 
0}, + {29016, 22326, 16670, 0, 0}, {5546, 512, 209, 0, 0}, + {17412, 4369, 1293, 0, 0}, {23947, 12133, 5711, 0, 0}, + {27257, 18364, 11529, 0, 0}, {29833, 24546, 19717, 0, 0}, + {7893, 648, 239, 0, 0}, {17535, 4503, 1323, 0, 0}, + {24163, 12198, 5836, 0, 0}, {27337, 18355, 11572, 0, 0}, + {29774, 24427, 19545, 0, 0}, {4567, 164, 68, 0, 0}, + {11727, 1322, 312, 0, 0}, {19547, 6555, 2293, 0, 0}, + {24513, 13383, 6731, 0, 0}, {27838, 20183, 13938, 0, 0}, + {4000, 320, 141, 0, 0}, {13063, 2207, 747, 0, 0}, + {21196, 9179, 4548, 0, 0}, {27236, 17734, 11322, 0, 0}, + {30308, 25618, 21312, 0, 0}, {2894, 149, 69, 0, 0}, + {11147, 1697, 567, 0, 0}, {20257, 8021, 3776, 0, 0}, + {26487, 16373, 10020, 0, 0}, {29522, 23490, 18271, 0, 0}, + {3053, 143, 56, 0, 0}, {11810, 1757, 485, 0, 0}, + {21535, 9097, 3962, 0, 0}, {26756, 16640, 9900, 0, 0}, + {29341, 22917, 17354, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{21752, 10657, 5974, 0, 0}, {6822, 411, 91, 0, 0}, + {14878, 2316, 516, 0, 0}, {21090, 7626, 2952, 0, 0}, + {26048, 15234, 8184, 0, 0}, {28538, 21103, 14948, 0, 0}, + {4368, 145, 21, 0, 0}, {11604, 1100, 193, 0, 0}, + {19196, 5380, 1586, 0, 0}, {24534, 12018, 5410, 0, 0}, + {27703, 18713, 11871, 0, 0}, {3787, 221, 63, 0, 0}, + {14087, 2225, 529, 0, 0}, {21849, 8693, 3482, 0, 0}, + {26337, 15569, 8691, 0, 0}, {28949, 22304, 16150, 0, 0}, + {5898, 301, 75, 0, 0}, {13727, 1937, 421, 0, 0}, + {20974, 7557, 2752, 0, 0}, {25880, 14749, 7798, 0, 0}, + {28398, 20405, 13776, 0, 0}, {3190, 98, 24, 0, 0}, + {9609, 761, 155, 0, 0}, {17453, 4099, 1092, 0, 0}, + {23470, 10161, 3986, 0, 0}, {26624, 16855, 9800, 0, 0}, + {4658, 269, 99, 0, 0}, {11194, 1831, 753, 0, 0}, + {20009, 7950, 4041, 0, 0}, {26223, 16007, 9726, 0, 0}, + {29119, 22171, 15935, 0, 0}, {4605, 216, 40, 0, 0}, + {10667, 1299, 304, 0, 0}, {19608, 7296, 2625, 0, 0}, + {25465, 14084, 7300, 0, 0}, {27527, 18793, 11813, 0, 0}, + {4368, 137, 24, 0, 0}, {10664, 975, 165, 0, 0}, + {19211, 6197, 1922, 0, 0}, {25019, 12907, 6093, 0, 0}, + {27895, 18738, 11534, 0, 0}, {24576, 16384, 8192, 0, 0}}}, + {{{22968, 15133, 11695, 0, 0}, {6615, 883, 241, 0, 0}, + {17730, 4916, 1762, 0, 0}, {24050, 12204, 6282, 0, 0}, + {27640, 18692, 12254, 0, 0}, {30132, 25202, 20843, 0, 0}, + {5217, 264, 67, 0, 0}, {14458, 2714, 668, 0, 0}, + {22557, 9348, 3686, 0, 0}, {26546, 15892, 8852, 0, 0}, + {29306, 22814, 17270, 0, 0}, {2777, 135, 47, 0, 0}, + {12885, 2017, 567, 0, 0}, {21627, 8584, 3483, 0, 0}, + {26348, 15828, 8994, 0, 0}, {29376, 23015, 17650, 0, 0}, + {4303, 152, 56, 0, 0}, {12918, 2066, 524, 0, 0}, + {21785, 8744, 3545, 0, 0}, {26474, 15998, 9186, 0, 0}, + {29524, 23485, 18259, 0, 0}, {2745, 51, 20, 0, 0}, + {9828, 736, 142, 0, 0}, {18486, 4840, 1295, 0, 0}, + {24206, 11441, 4854, 0, 0}, {27922, 19375, 12849, 0, 0}, + {2787, 178, 73, 0, 0}, {12303, 1805, 602, 0, 0}, + {21289, 9189, 4573, 0, 0}, {26852, 17120, 10695, 0, 0}, + {29737, 24163, 19370, 0, 0}, {1622, 77, 29, 0, 0}, + {9662, 1044, 324, 0, 0}, {18985, 6030, 2329, 0, 0}, + {24916, 13300, 6961, 0, 0}, {28908, 21644, 15915, 0, 0}, + {1754, 44, 20, 0, 0}, {9139, 659, 140, 0, 0}, + {18021, 4653, 1365, 0, 0}, {24223, 11526, 5290, 0, 0}, + {28194, 19987, 13701, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{23583, 13074, 8080, 0, 0}, {6687, 783, 147, 0, 0}, + {16753, 3768, 981, 0, 0}, {22226, 9078, 3562, 0, 0}, + {26036, 14823, 8091, 0, 0}, {28852, 21729, 16046, 0, 0}, + {4544, 202, 24, 0, 0}, {13668, 1630, 283, 0, 0}, + {20240, 6148, 1889, 0, 0}, {25027, 12491, 5883, 0, 0}, + {28202, 19923, 13778, 0, 0}, {2835, 175, 50, 0, 
0}, + {15098, 2435, 613, 0, 0}, {22383, 9168, 3859, 0, 0}, + {26525, 16532, 10361, 0, 0}, {28792, 22379, 16751, 0, 0}, + {4391, 207, 30, 0, 0}, {13402, 1593, 286, 0, 0}, + {19441, 5593, 1674, 0, 0}, {24510, 11999, 5625, 0, 0}, + {28065, 19570, 13241, 0, 0}, {1682, 62, 20, 0, 0}, + {9915, 866, 185, 0, 0}, {18009, 4582, 1349, 0, 0}, + {23484, 10386, 4420, 0, 0}, {27183, 17576, 10900, 0, 0}, + {4477, 116, 22, 0, 0}, {12919, 661, 197, 0, 0}, + {17934, 5950, 3554, 0, 0}, {22462, 10174, 4096, 0, 0}, + {26153, 15384, 9384, 0, 0}, {3821, 164, 23, 0, 0}, + {7143, 479, 122, 0, 0}, {14010, 4096, 1365, 0, 0}, + {22751, 9338, 4245, 0, 0}, {25906, 17499, 10637, 0, 0}, + {8835, 259, 29, 0, 0}, {12841, 1273, 137, 0, 0}, + {20865, 6745, 2147, 0, 0}, {25742, 12674, 5516, 0, 0}, + {26770, 14662, 8331, 0, 0}, {24576, 16384, 8192, 0, 0}}}, + {{{28312, 21494, 17235, 0, 0}, {11549, 3689, 1152, 0, 0}, + {21595, 8994, 4201, 0, 0}, {25486, 14475, 8505, 0, 0}, + {27878, 19482, 13653, 0, 0}, {30878, 27260, 24109, 0, 0}, + {6117, 632, 121, 0, 0}, {18138, 4514, 1313, 0, 0}, + {24052, 11481, 5373, 0, 0}, {27153, 17437, 10760, 0, 0}, + {30093, 25068, 20618, 0, 0}, {2814, 242, 78, 0, 0}, + {16642, 3786, 1135, 0, 0}, {23738, 11407, 5416, 0, 0}, + {27357, 17975, 11497, 0, 0}, {29825, 24346, 19605, 0, 0}, + {3229, 167, 38, 0, 0}, {14643, 2383, 567, 0, 0}, + {22346, 8678, 3300, 0, 0}, {26300, 15281, 8330, 0, 0}, + {29798, 24115, 19237, 0, 0}, {1856, 53, 20, 0, 0}, + {12102, 1395, 271, 0, 0}, {20259, 6128, 1851, 0, 0}, + {24710, 12139, 5478, 0, 0}, {28537, 20762, 14716, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{22566, 12135, 7284, 0, 0}, {5432, 1323, 416, 0, 0}, + {20348, 8384, 4216, 0, 0}, {25120, 14653, 8912, 0, 0}, + {27106, 18427, 12866, 0, 0}, {29157, 22440, 17378, 0, 0}, + {1823, 152, 32, 0, 0}, {14086, 2263, 515, 0, 0}, + {21255, 7432, 2565, 0, 0}, {25319, 13316, 6620, 0, 0}, + {28286, 19717, 13882, 0, 0}, {746, 78, 21, 0, 0}, + {14190, 2267, 622, 0, 0}, {21519, 9400, 4137, 0, 0}, + {27123, 15810, 10610, 0, 0}, {27759, 21324, 16131, 0, 0}, + {1411, 58, 20, 0, 0}, {11216, 1274, 264, 0, 0}, + {18877, 5091, 1428, 0, 0}, {23717, 10670, 4596, 0, 0}, + {27578, 19391, 13282, 0, 0}, {404, 28, 20, 0, 0}, + {7929, 861, 217, 0, 0}, {15608, 3989, 1072, 0, 0}, + {20316, 8631, 3166, 0, 0}, {26603, 17379, 10291, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}, + {{{30193, 25487, 21691, 0, 0}, {18766, 11902, 7366, 0, 0}, + {26425, 17712, 13110, 0, 0}, {28294, 20910, 15727, 0, 0}, + {29903, 24469, 20234, 0, 0}, {31424, 28819, 26377, 0, 0}, + {8048, 1529, 309, 0, 0}, {20183, 7412, 2800, 0, 0}, + {25587, 14522, 8324, 0, 0}, {27743, 19101, 12883, 0, 0}, + {30247, 25464, 21163, 0, 0}, {2860, 516, 
184, 0, 0}, + {15347, 3612, 1193, 0, 0}, {22879, 10580, 4986, 0, 0}, + {26890, 17121, 10645, 0, 0}, {29954, 24103, 19445, 0, 0}, + {2585, 200, 55, 0, 0}, {14240, 2573, 719, 0, 0}, + {21786, 8162, 3111, 0, 0}, {25811, 14603, 7537, 0, 0}, + {29260, 22650, 17300, 0, 0}, {1007, 32, 20, 0, 0}, + {11727, 1440, 222, 0, 0}, {20200, 6036, 1602, 0, 0}, + {24716, 12048, 5035, 0, 0}, {28432, 20576, 14372, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}}, + {{{{25706, 16296, 10449, 0, 0}, {8230, 507, 94, 0, 0}, + {19093, 4727, 989, 0, 0}, {24178, 12094, 5137, 0, 0}, + {27083, 18093, 10755, 0, 0}, {29113, 22870, 17037, 0, 0}, + {6275, 350, 110, 0, 0}, {16392, 3426, 678, 0, 0}, + {22174, 10119, 3798, 0, 0}, {24592, 15598, 8465, 0, 0}, + {27163, 20074, 13629, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {8880, 866, 226, 0, 0}, + {14156, 3081, 781, 0, 0}, {16523, 7916, 3519, 0, 0}, + {17003, 10160, 5209, 0, 0}, {12873, 8069, 5258, 0, 0}, + {4367, 556, 311, 0, 0}, {17494, 4943, 1788, 0, 0}, + {23404, 14640, 8436, 0, 0}, {30485, 24575, 17686, 0, 0}, + {31540, 28796, 24887, 0, 0}, {3313, 299, 148, 0, 0}, + {14787, 4523, 1380, 0, 0}, {21847, 12670, 6528, 0, 0}, + {29025, 20939, 14111, 0, 0}, {30394, 23175, 17053, 0, 0}, + {1700, 302, 133, 0, 0}, {12447, 3196, 797, 0, 0}, + {21997, 12513, 5649, 0, 0}, {29973, 22358, 15407, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{23448, 10666, 4928, 0, 0}, {5711, 304, 44, 0, 0}, + {16437, 2500, 459, 0, 0}, {22449, 8833, 3048, 0, 0}, + {26579, 16320, 8662, 0, 0}, {29179, 21884, 13960, 0, 0}, + {3742, 144, 20, 0, 0}, {13542, 1261, 181, 0, 0}, + {20076, 5847, 1565, 0, 0}, {25719, 13236, 5133, 
0, 0}, + {25041, 17099, 9516, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {4712, 143, 20, 0, 0}, + {10385, 693, 99, 0, 0}, {17351, 5670, 1019, 0, 0}, + {14641, 6275, 5578, 0, 0}, {27307, 16384, 10923, 0, 0}, + {4786, 677, 184, 0, 0}, {13723, 2900, 796, 0, 0}, + {22371, 10502, 4836, 0, 0}, {26778, 19071, 11268, 0, 0}, + {30976, 25856, 17664, 0, 0}, {4570, 267, 50, 0, 0}, + {11234, 1247, 199, 0, 0}, {21659, 7551, 2751, 0, 0}, + {27097, 17644, 6617, 0, 0}, {28087, 18725, 14043, 0, 0}, + {4080, 188, 27, 0, 0}, {10192, 689, 107, 0, 0}, + {22141, 10627, 4428, 0, 0}, {23406, 18725, 4681, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}, + {{{25014, 15820, 10626, 0, 0}, {7098, 438, 77, 0, 0}, + {17105, 3543, 774, 0, 0}, {22890, 9480, 3610, 0, 0}, + {26349, 15680, 8432, 0, 0}, {28909, 21765, 15729, 0, 0}, + {5206, 173, 43, 0, 0}, {15193, 2180, 369, 0, 0}, + {21949, 7930, 2459, 0, 0}, {25644, 14082, 6852, 0, 0}, + {28289, 20080, 13428, 0, 0}, {4383, 292, 95, 0, 0}, + {17462, 3763, 830, 0, 0}, {23831, 11153, 4446, 0, 0}, + {26786, 17165, 9982, 0, 0}, {29148, 22501, 16632, 0, 0}, + {5488, 304, 101, 0, 0}, {17161, 3608, 764, 0, 0}, + {23677, 10633, 4028, 0, 0}, {26536, 16136, 8748, 0, 0}, + {28721, 21391, 15096, 0, 0}, {3548, 138, 50, 0, 0}, + {13118, 1548, 306, 0, 0}, {19718, 6456, 1941, 0, 0}, + {23540, 11898, 5300, 0, 0}, {26622, 17619, 10797, 0, 0}, + {2599, 287, 145, 0, 0}, {15556, 3457, 1214, 0, 0}, + {22857, 11457, 5886, 0, 0}, {28281, 19454, 12396, 0, 0}, + {30198, 24996, 19879, 0, 0}, {1844, 155, 60, 0, 0}, + {13278, 2562, 661, 0, 0}, {21536, 8770, 3492, 0, 0}, + {25999, 14813, 7733, 0, 0}, {28370, 20145, 13554, 0, 0}, + {2159, 141, 46, 0, 0}, {13398, 2186, 481, 0, 0}, + {22311, 9149, 3359, 0, 0}, {26325, 15131, 7934, 0, 0}, + {28123, 19532, 12662, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{24142, 12497, 6552, 0, 0}, {6061, 362, 57, 0, 0}, + {15769, 2439, 482, 0, 0}, {21323, 7645, 2482, 0, 0}, + {26357, 13940, 7167, 0, 0}, {25967, 20310, 12520, 0, 0}, + {2850, 86, 20, 0, 0}, {12119, 1029, 150, 0, 0}, + {19889, 4995, 1187, 0, 0}, {24872, 11017, 4524, 0, 0}, + {27508, 17898, 9070, 0, 0}, {3516, 175, 37, 0, 0}, + {15696, 2308, 474, 0, 0}, {22115, 8625, 3403, 0, 0}, + {26232, 15278, 8785, 0, 0}, {27839, 19598, 12683, 0, 0}, + {4631, 250, 53, 0, 0}, {14597, 1984, 361, 0, 0}, + {21331, 7332, 2309, 0, 0}, {25516, 14234, 6592, 0, 0}, + {28642, 19415, 11790, 0, 0}, {1606, 42, 20, 0, 0}, + {9751, 546, 67, 0, 0}, {17139, 3535, 722, 0, 0}, + {23381, 10147, 3288, 0, 0}, {25846, 15152, 7758, 0, 0}, + {3930, 503, 154, 0, 0}, {13067, 2562, 848, 0, 0}, + {21554, 10358, 4835, 0, 0}, {27448, 18591, 9734, 0, 0}, + {27719, 19887, 14941, 0, 0}, {5284, 297, 34, 0, 0}, + {11692, 1242, 207, 0, 0}, {20061, 6465, 1557, 0, 0}, + {24599, 11046, 4549, 0, 0}, {26723, 13362, 5726, 0, 0}, + {5015, 196, 23, 0, 0}, {11936, 890, 115, 0, 0}, + {19518, 5412, 1094, 0, 0}, {25050, 11260, 2910, 0, 0}, + {25559, 14418, 7209, 0, 0}, {24576, 16384, 8192, 0, 0}}}, + {{{24892, 15867, 11027, 0, 0}, {8767, 870, 143, 0, 0}, + {18239, 4809, 1317, 0, 0}, {24495, 11950, 5510, 0, 0}, + {27490, 18095, 11258, 0, 0}, {29785, 23925, 18729, 0, 0}, + {4752, 194, 36, 0, 0}, {15297, 2462, 467, 0, 0}, + {22544, 8705, 3040, 0, 0}, {26166, 14814, 7716, 0, 0}, + {28766, 21183, 15009, 0, 0}, 
{2578, 134, 29, 0, 0}, + {15271, 2486, 498, 0, 0}, {22539, 9039, 3230, 0, 0}, + {26424, 15557, 8328, 0, 0}, {28919, 21579, 15660, 0, 0}, + {4198, 185, 42, 0, 0}, {15247, 2607, 530, 0, 0}, + {22615, 9203, 3390, 0, 0}, {26313, 15427, 8325, 0, 0}, + {28861, 21726, 15744, 0, 0}, {2079, 53, 20, 0, 0}, + {11222, 928, 158, 0, 0}, {19221, 5187, 1309, 0, 0}, + {23856, 11011, 4459, 0, 0}, {27220, 17688, 10722, 0, 0}, + {1985, 228, 83, 0, 0}, {15228, 3240, 1100, 0, 0}, + {22608, 11300, 5985, 0, 0}, {28044, 19375, 12714, 0, 0}, + {30066, 24594, 19666, 0, 0}, {1120, 82, 26, 0, 0}, + {11814, 1674, 431, 0, 0}, {20348, 7070, 2589, 0, 0}, + {25464, 13448, 6520, 0, 0}, {28402, 20507, 13904, 0, 0}, + {1187, 45, 20, 0, 0}, {11395, 1182, 243, 0, 0}, + {20024, 6143, 1883, 0, 0}, {25337, 12446, 5818, 0, 0}, + {28076, 19445, 12657, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{24935, 14399, 8673, 0, 0}, {6118, 495, 66, 0, 0}, + {16397, 2807, 577, 0, 0}, {21713, 8686, 3139, 0, 0}, + {25876, 14124, 7368, 0, 0}, {27762, 19711, 13528, 0, 0}, + {2934, 102, 20, 0, 0}, {13191, 1433, 198, 0, 0}, + {20515, 6259, 1646, 0, 0}, {24777, 11996, 5057, 0, 0}, + {27091, 16858, 9709, 0, 0}, {2659, 236, 48, 0, 0}, + {16021, 2602, 516, 0, 0}, {22634, 9226, 3584, 0, 0}, + {26977, 16592, 9212, 0, 0}, {28406, 22354, 15484, 0, 0}, + {3276, 142, 20, 0, 0}, {12874, 1366, 243, 0, 0}, + {19826, 5697, 1899, 0, 0}, {24422, 11552, 5363, 0, 0}, + {26196, 15681, 8909, 0, 0}, {733, 33, 20, 0, 0}, + {9811, 930, 150, 0, 0}, {18044, 4196, 996, 0, 0}, + {22404, 8769, 3215, 0, 0}, {25764, 14335, 7113, 0, 0}, + {5240, 491, 87, 0, 0}, {15809, 1597, 672, 0, 0}, + {22282, 9175, 4806, 0, 0}, {24576, 16384, 9557, 0, 0}, + {23831, 14895, 11916, 0, 0}, {5053, 766, 153, 0, 0}, + {17695, 3277, 1092, 0, 0}, {21504, 8192, 4096, 0, 0}, + {30427, 14043, 9362, 0, 0}, {25486, 14564, 7282, 0, 0}, + {4221, 555, 111, 0, 0}, {11980, 2995, 529, 0, 0}, + {25988, 11299, 2260, 0, 0}, {26810, 17873, 8937, 0, 0}, + {16384, 10923, 5461, 0, 0}, {24576, 16384, 8192, 0, 0}}}, + {{{26776, 18464, 13003, 0, 0}, {10156, 1530, 312, 0, 0}, + {19312, 5606, 1681, 0, 0}, {24767, 12706, 6264, 0, 0}, + {27600, 18663, 12004, 0, 0}, {30136, 24997, 20383, 0, 0}, + {5734, 424, 59, 0, 0}, {16918, 3353, 771, 0, 0}, + {23274, 9992, 3927, 0, 0}, {26617, 15938, 8799, 0, 0}, + {29307, 22729, 17046, 0, 0}, {2634, 199, 37, 0, 0}, + {17130, 3346, 823, 0, 0}, {23618, 10903, 4550, 0, 0}, + {27121, 17049, 10092, 0, 0}, {29366, 22996, 17291, 0, 0}, + {4238, 182, 33, 0, 0}, {15629, 2470, 476, 0, 0}, + {22568, 8729, 3083, 0, 0}, {26349, 15094, 7982, 0, 0}, + {29224, 22543, 16944, 0, 0}, {1435, 42, 20, 0, 0}, + {12150, 1281, 224, 0, 0}, {19867, 5551, 1536, 0, 0}, + {24144, 11034, 4597, 0, 0}, {27664, 18577, 12020, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{21562, 11678, 6207, 0, 0}, {4009, 489, 97, 0, 0}, + {18597, 4816, 1199, 0, 0}, {23025, 9861, 3627, 0, 0}, + {25897, 14882, 7900, 0, 0}, {27808, 19616, 13453, 0, 0}, + {1691, 107, 20, 0, 0}, {13368, 1573, 253, 0, 0}, + {20016, 5910, 1728, 0, 0}, {24398, 10670, 4177, 0, 0}, + {27311, 17395, 10470, 0, 0}, {1071, 62, 20, 0, 0}, + {14908, 2111, 
435, 0, 0}, {20258, 7956, 3507, 0, 0}, + {26588, 13644, 8046, 0, 0}, {27727, 19220, 14809, 0, 0}, + {1216, 52, 20, 0, 0}, {10860, 999, 145, 0, 0}, + {18298, 4567, 1203, 0, 0}, {23275, 9786, 4160, 0, 0}, + {25910, 15528, 8631, 0, 0}, {225, 16, 12, 0, 0}, + {8482, 671, 102, 0, 0}, {16810, 3551, 744, 0, 0}, + {22561, 8534, 2810, 0, 0}, {25839, 14463, 7116, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}, + {{{28631, 21921, 17086, 0, 0}, {14944, 5767, 2710, 0, 0}, + {22564, 9972, 4477, 0, 0}, {26692, 16833, 10643, 0, 0}, + {28916, 21831, 15952, 0, 0}, {30516, 26444, 22637, 0, 0}, + {6928, 752, 106, 0, 0}, {17659, 4500, 1237, 0, 0}, + {23383, 10537, 4428, 0, 0}, {26686, 16096, 9289, 0, 0}, + {29450, 23341, 18087, 0, 0}, {2174, 194, 50, 0, 0}, + {15932, 3216, 909, 0, 0}, {23212, 10226, 4412, 0, 0}, + {26463, 16043, 9228, 0, 0}, {29392, 22873, 17584, 0, 0}, + {3385, 151, 23, 0, 0}, {13877, 1959, 367, 0, 0}, + {21080, 6826, 2081, 0, 0}, {25300, 13299, 6117, 0, 0}, + {28859, 21410, 15756, 0, 0}, {1204, 32, 20, 0, 0}, + {11862, 1157, 168, 0, 0}, {19577, 5147, 1231, 0, 0}, + {24000, 10739, 4092, 0, 0}, {27689, 18659, 11862, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}}}; + +alignas(kMaxAlignment) constexpr uint16_t kDefaultCoeffBaseRangeCdf + [kCoefficientQuantizerContexts][kNumSquareTransformSizes][kNumPlaneTypes] + [kCoeffBaseRangeContexts][kCoeffBaseRangeSymbolCount + 1] = { + {{{{18470, 12050, 8594, 0, 0}, {20232, 13167, 8979, 0, 0}, + {24056, 17717, 13265, 0, 0}, 
{26598, 21441, 17334, 0, 0}, + {28026, 23842, 20230, 0, 0}, {28965, 25451, 22222, 0, 0}, + {31072, 29451, 27897, 0, 0}, {18376, 12817, 10012, 0, 0}, + {16790, 9550, 5950, 0, 0}, {20581, 13294, 8879, 0, 0}, + {23592, 17128, 12509, 0, 0}, {25700, 20113, 15740, 0, 0}, + {27112, 22326, 18296, 0, 0}, {30188, 27776, 25524, 0, 0}, + {20632, 14719, 11342, 0, 0}, {18984, 12047, 8287, 0, 0}, + {21932, 15147, 10868, 0, 0}, {24396, 18324, 13921, 0, 0}, + {26245, 20989, 16768, 0, 0}, {27431, 22870, 19008, 0, 0}, + {29734, 26908, 24306, 0, 0}}, + {{16801, 9863, 6482, 0, 0}, {19234, 12114, 8189, 0, 0}, + {23264, 16676, 12233, 0, 0}, {25793, 20200, 15865, 0, 0}, + {27404, 22677, 18748, 0, 0}, {28411, 24398, 20911, 0, 0}, + {30262, 27834, 25550, 0, 0}, {9736, 3953, 1832, 0, 0}, + {13228, 6064, 3049, 0, 0}, {17610, 9799, 5671, 0, 0}, + {21360, 13903, 9118, 0, 0}, {23883, 17320, 12518, 0, 0}, + {25660, 19915, 15352, 0, 0}, {28537, 24727, 21288, 0, 0}, + {12945, 6278, 3612, 0, 0}, {13878, 6839, 3836, 0, 0}, + {17108, 9277, 5335, 0, 0}, {20621, 12992, 8280, 0, 0}, + {23040, 15994, 11119, 0, 0}, {24849, 18491, 13702, 0, 0}, + {27328, 22598, 18583, 0, 0}}}, + {{{18362, 11906, 8354, 0, 0}, {20944, 13861, 9659, 0, 0}, + {24511, 18375, 13965, 0, 0}, {26908, 22021, 17990, 0, 0}, + {28293, 24282, 20784, 0, 0}, {29162, 25814, 22725, 0, 0}, + {31032, 29358, 27720, 0, 0}, {18338, 12722, 9886, 0, 0}, + {17175, 9869, 6059, 0, 0}, {20666, 13400, 8957, 0, 0}, + {23709, 17184, 12506, 0, 0}, {25769, 20165, 15720, 0, 0}, + {27084, 22271, 18215, 0, 0}, {29946, 27330, 24906, 0, 0}, + {16983, 11183, 8409, 0, 0}, {14421, 7539, 4502, 0, 0}, + {17794, 10281, 6379, 0, 0}, {21345, 14087, 9497, 0, 0}, + {23905, 17418, 12760, 0, 0}, {25615, 19916, 15490, 0, 0}, + {29061, 25732, 22786, 0, 0}}, + {{17308, 11072, 7299, 0, 0}, {20598, 13519, 9577, 0, 0}, + {24045, 17741, 13436, 0, 0}, {26340, 21064, 16894, 0, 0}, + {27846, 23476, 19716, 0, 0}, {28629, 25073, 21758, 0, 0}, + {30477, 28260, 26170, 0, 0}, {12912, 5848, 2940, 0, 0}, + {14845, 7479, 3976, 0, 0}, {18490, 10800, 6471, 0, 0}, + {21858, 14632, 9818, 0, 0}, {24345, 17953, 13141, 0, 0}, + {25997, 20485, 15994, 0, 0}, {28694, 25018, 21687, 0, 0}, + {12916, 6694, 4096, 0, 0}, {13397, 6658, 3779, 0, 0}, + {16503, 8895, 5105, 0, 0}, {20010, 12390, 7816, 0, 0}, + {22673, 15670, 10807, 0, 0}, {24518, 18140, 13317, 0, 0}, + {27563, 23023, 19146, 0, 0}}}, + {{{22205, 16535, 13005, 0, 0}, {22974, 16746, 12964, 0, 0}, + {26018, 20823, 17009, 0, 0}, {27805, 23582, 20016, 0, 0}, + {28923, 25333, 22141, 0, 0}, {29717, 26683, 23934, 0, 0}, + {31457, 30172, 28938, 0, 0}, {21522, 16364, 13079, 0, 0}, + {20453, 13857, 10037, 0, 0}, {22211, 15673, 11479, 0, 0}, + {24632, 18762, 14519, 0, 0}, {26420, 21294, 17203, 0, 0}, + {27572, 23113, 19368, 0, 0}, {30419, 28242, 26181, 0, 0}, + {19431, 14038, 11199, 0, 0}, {13462, 6697, 3886, 0, 0}, + {16816, 9228, 5514, 0, 0}, {20359, 12834, 8338, 0, 0}, + {23008, 16062, 11379, 0, 0}, {24764, 18548, 13950, 0, 0}, + {28630, 24974, 21807, 0, 0}}, + {{21898, 16084, 11819, 0, 0}, {23104, 17538, 14088, 0, 0}, + {25882, 20659, 17360, 0, 0}, {27943, 23868, 20463, 0, 0}, + {29138, 25606, 22454, 0, 0}, {29732, 26339, 23381, 0, 0}, + {31097, 29472, 27828, 0, 0}, {18949, 13609, 9742, 0, 0}, + {20784, 13660, 9648, 0, 0}, {22078, 15558, 11105, 0, 0}, + {24784, 18614, 14435, 0, 0}, {25900, 20474, 16644, 0, 0}, + {27494, 23774, 19900, 0, 0}, {29780, 26997, 24344, 0, 0}, + {13032, 6121, 3627, 0, 0}, {13835, 6698, 3784, 0, 0}, + {16989, 9720, 5568, 0, 0}, {20130, 12707, 8236, 0, 
0}, + {22076, 15223, 10548, 0, 0}, {23551, 17517, 12714, 0, 0}, + {27690, 23484, 20174, 0, 0}}}, + {{{30437, 29106, 27524, 0, 0}, {29877, 27997, 26623, 0, 0}, + {28170, 25145, 23039, 0, 0}, {29248, 25923, 23569, 0, 0}, + {29351, 26649, 23444, 0, 0}, {30167, 27356, 25383, 0, 0}, + {32168, 31595, 31024, 0, 0}, {25096, 19482, 15299, 0, 0}, + {28536, 24976, 21975, 0, 0}, {29853, 27451, 25371, 0, 0}, + {30450, 28412, 26616, 0, 0}, {30641, 28768, 27214, 0, 0}, + {30918, 29290, 27493, 0, 0}, {31791, 30835, 29925, 0, 0}, + {14488, 8381, 4779, 0, 0}, {16916, 10097, 6583, 0, 0}, + {18923, 11817, 7979, 0, 0}, {21713, 14802, 10639, 0, 0}, + {23630, 17346, 12967, 0, 0}, {25314, 19623, 15312, 0, 0}, + {29398, 26375, 23755, 0, 0}}, + {{26926, 23539, 21930, 0, 0}, {30455, 29277, 28492, 0, 0}, + {29770, 26664, 25272, 0, 0}, {30348, 25321, 22900, 0, 0}, + {29734, 24273, 21845, 0, 0}, {28692, 23831, 21793, 0, 0}, + {31682, 30398, 29469, 0, 0}, {23054, 15514, 12324, 0, 0}, + {24225, 19070, 15645, 0, 0}, {27850, 23761, 20858, 0, 0}, + {28639, 25236, 22215, 0, 0}, {30404, 27235, 24710, 0, 0}, + {30934, 29222, 27205, 0, 0}, {31295, 29860, 28635, 0, 0}, + {17363, 11575, 7149, 0, 0}, {17077, 10816, 6207, 0, 0}, + {19806, 13574, 8603, 0, 0}, {22496, 14913, 10639, 0, 0}, + {24180, 17498, 12050, 0, 0}, {24086, 18099, 13268, 0, 0}, + {27898, 23132, 19563, 0, 0}}}, + {{{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}}, + {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}}}}, + {{{{17773, 11427, 8019, 0, 0}, {19610, 12479, 8167, 0, 0}, + {23827, 17442, 12892, 0, 0}, {26471, 21227, 16961, 0, 0}, + {27951, 23739, 19992, 0, 0}, {29037, 25495, 22141, 0, 0}, + {30921, 29151, 27414, 0, 0}, {18296, 13109, 10425, 0, 0}, + {15962, 8606, 5235, 0, 0}, {19868, 12364, 8055, 0, 0}, + {23357, 16656, 11971, 0, 0}, {25712, 20071, 15620, 0, 0}, + {27224, 22429, 18308, 0, 0}, {29814, 27064, 24449, 0, 0}, + {20304, 14697, 11414, 0, 0}, {17286, 10240, 6734, 0, 0}, + {20698, 13499, 9144, 0, 0}, {23815, 17362, 12662, 0, 0}, + {25741, 20038, 15548, 0, 0}, {26881, 21855, 17628, 0, 0}, + {28975, 25490, 22321, 0, 0}}, + {{17197, 10536, 7019, 0, 0}, {18262, 11193, 7394, 0, 0}, + {22579, 15679, 11199, 0, 0}, {25452, 19467, 14853, 0, 0}, + {26985, 21856, 17578, 0, 0}, {28008, 23613, 19680, 0, 0}, + {29775, 26802, 23994, 0, 0}, {9344, 3865, 1990, 0, 0}, + {11993, 5102, 2478, 0, 0}, {16294, 8358, 4469, 0, 0}, + {20297, 12588, 7781, 0, 0}, {23358, 16281, 
11329, 0, 0}, + {25232, 19154, 14239, 0, 0}, {27720, 23182, 19219, 0, 0}, + {11678, 5478, 3012, 0, 0}, {11972, 5366, 2742, 0, 0}, + {14949, 7283, 3799, 0, 0}, {18908, 10859, 6306, 0, 0}, + {21766, 14274, 9239, 0, 0}, {23815, 16839, 11871, 0, 0}, + {26320, 20850, 16314, 0, 0}}}, + {{{16769, 10560, 7319, 0, 0}, {19718, 12780, 8646, 0, 0}, + {24174, 17904, 13390, 0, 0}, {26735, 21689, 17530, 0, 0}, + {28214, 24085, 20421, 0, 0}, {29096, 25629, 22431, 0, 0}, + {30868, 28997, 27192, 0, 0}, {16980, 11428, 8819, 0, 0}, + {15943, 8533, 5010, 0, 0}, {19895, 12366, 7958, 0, 0}, + {23178, 16405, 11674, 0, 0}, {25416, 19559, 15035, 0, 0}, + {26808, 21779, 17584, 0, 0}, {29536, 26534, 23761, 0, 0}, + {17007, 12052, 9544, 0, 0}, {13450, 6779, 4009, 0, 0}, + {17239, 9674, 5839, 0, 0}, {21106, 13779, 9127, 0, 0}, + {23813, 17200, 12402, 0, 0}, {25487, 19662, 15060, 0, 0}, + {28520, 24709, 21328, 0, 0}}, + {{17869, 11551, 8265, 0, 0}, {19249, 12485, 8721, 0, 0}, + {23339, 16802, 12403, 0, 0}, {26068, 20413, 16116, 0, 0}, + {27680, 23064, 19052, 0, 0}, {28525, 24614, 21037, 0, 0}, + {30066, 27404, 24907, 0, 0}, {10023, 4380, 2314, 0, 0}, + {12533, 5622, 2846, 0, 0}, {16872, 9053, 5131, 0, 0}, + {20928, 13418, 8637, 0, 0}, {23646, 16836, 11888, 0, 0}, + {25280, 19187, 14406, 0, 0}, {27654, 23200, 19398, 0, 0}, + {11923, 6215, 3836, 0, 0}, {11787, 5396, 2884, 0, 0}, + {14987, 7433, 3983, 0, 0}, {19008, 11060, 6471, 0, 0}, + {21793, 14353, 9403, 0, 0}, {23723, 16979, 12082, 0, 0}, + {26638, 21569, 17345, 0, 0}}}, + {{{19219, 13044, 9610, 0, 0}, {20924, 14386, 10522, 0, 0}, + {24849, 19149, 14995, 0, 0}, {27282, 22625, 18822, 0, 0}, + {28602, 24785, 21444, 0, 0}, {29404, 26262, 23341, 0, 0}, + {31170, 29608, 28094, 0, 0}, {17487, 11789, 8987, 0, 0}, + {17829, 10649, 6816, 0, 0}, {21405, 14361, 9956, 0, 0}, + {24159, 17911, 13398, 0, 0}, {26031, 20584, 16288, 0, 0}, + {27262, 22505, 18506, 0, 0}, {29778, 26982, 24388, 0, 0}, + {12519, 7515, 5351, 0, 0}, {11698, 5250, 2767, 0, 0}, + {15914, 8299, 4694, 0, 0}, {19904, 12282, 7768, 0, 0}, + {22806, 15790, 10990, 0, 0}, {24694, 18430, 13720, 0, 0}, + {28274, 24289, 20862, 0, 0}}, + {{18808, 13151, 9939, 0, 0}, {21618, 15427, 11540, 0, 0}, + {25618, 19804, 15578, 0, 0}, {27437, 22766, 18901, 0, 0}, + {28601, 25024, 21711, 0, 0}, {29288, 26139, 23122, 0, 0}, + {30885, 28984, 27082, 0, 0}, {14016, 7108, 3856, 0, 0}, + {15800, 8182, 4738, 0, 0}, {19248, 11713, 7455, 0, 0}, + {22315, 15142, 10488, 0, 0}, {24382, 18263, 13652, 0, 0}, + {26026, 20173, 15760, 0, 0}, {28495, 24628, 21269, 0, 0}, + {10648, 4941, 2535, 0, 0}, {12205, 5410, 2873, 0, 0}, + {15692, 8124, 4615, 0, 0}, {19406, 11826, 7459, 0, 0}, + {21974, 14803, 10073, 0, 0}, {23754, 17116, 12449, 0, 0}, + {27060, 22256, 18271, 0, 0}}}, + {{{27063, 21838, 17043, 0, 0}, {24822, 20003, 16653, 0, 0}, + {25967, 20645, 16542, 0, 0}, {27306, 22633, 18568, 0, 0}, + {28579, 24757, 21261, 0, 0}, {29577, 26539, 23360, 0, 0}, + {31711, 30631, 29556, 0, 0}, {22750, 15701, 11277, 0, 0}, + {25388, 20186, 16315, 0, 0}, {26700, 21923, 18429, 0, 0}, + {27670, 23570, 20213, 0, 0}, {28456, 24758, 21649, 0, 0}, + {29068, 25802, 22987, 0, 0}, {31075, 29442, 27881, 0, 0}, + {14011, 7838, 4994, 0, 0}, {15120, 8172, 4951, 0, 0}, + {18061, 10716, 6742, 0, 0}, {21048, 13916, 9476, 0, 0}, + {23411, 16816, 12243, 0, 0}, {24958, 19015, 14558, 0, 0}, + {28889, 25435, 22440, 0, 0}}, + {{24490, 19526, 16846, 0, 0}, {22221, 16901, 13849, 0, 0}, + {23662, 16926, 12159, 0, 0}, {25935, 19761, 15550, 0, 0}, + {27957, 23056, 18845, 0, 0}, {28783, 
25416, 21640, 0, 0}, + {31080, 29310, 27506, 0, 0}, {19817, 10907, 6258, 0, 0}, + {22980, 16724, 12492, 0, 0}, {26459, 21524, 17898, 0, 0}, + {27585, 23419, 20202, 0, 0}, {28379, 24539, 21276, 0, 0}, + {29135, 25823, 22148, 0, 0}, {29168, 25921, 22861, 0, 0}, + {11020, 4631, 2513, 0, 0}, {13332, 6187, 3208, 0, 0}, + {16409, 8567, 4815, 0, 0}, {18807, 11075, 6897, 0, 0}, + {21224, 14082, 9446, 0, 0}, {23396, 16306, 11816, 0, 0}, + {26630, 21558, 17378, 0, 0}}}, + {{{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}}, + {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}}}}, + {{{{16630, 10545, 7259, 0, 0}, {17421, 10338, 6436, 0, 0}, + {23154, 16032, 11436, 0, 0}, {26168, 20493, 15861, 0, 0}, + {27957, 23344, 19221, 0, 0}, {29020, 24959, 21348, 0, 0}, + {30514, 28181, 25878, 0, 0}, {17572, 12484, 9591, 0, 0}, + {14451, 7299, 4317, 0, 0}, {18850, 11117, 6926, 0, 0}, + {22716, 15618, 10773, 0, 0}, {25269, 19138, 14181, 0, 0}, + {26610, 21351, 16765, 0, 0}, {28754, 24983, 21516, 0, 0}, + {17720, 11701, 8384, 0, 0}, {14566, 7422, 4215, 0, 0}, + {18466, 10749, 6412, 0, 0}, {21929, 14629, 9602, 0, 0}, + {24053, 17024, 11962, 0, 0}, {25232, 19192, 14224, 0, 0}, + {27355, 22433, 18270, 0, 0}}, + {{15374, 8267, 4873, 0, 0}, {16879, 9348, 5583, 0, 0}, + {21207, 13635, 8898, 0, 0}, {24483, 17956, 12924, 0, 0}, + {26272, 20725, 16218, 0, 0}, {27997, 23194, 19091, 0, 0}, + {29165, 25938, 22624, 0, 0}, {11112, 5064, 2568, 0, 0}, + {11444, 4853, 2257, 0, 0}, {15441, 7432, 3771, 0, 0}, + {19351, 11387, 6735, 0, 0}, {22636, 15343, 10430, 0, 0}, + {24188, 17752, 13135, 0, 0}, {27074, 21291, 16357, 0, 0}, + {8652, 2988, 1318, 0, 0}, {8915, 3073, 1177, 0, 0}, + {12683, 5154, 2340, 0, 0}, {17442, 8433, 4193, 0, 0}, + {20954, 13296, 7958, 0, 0}, {22547, 14157, 8001, 0, 0}, + {25079, 18210, 12447, 0, 0}}}, + {{{16554, 10388, 6998, 0, 0}, {18555, 11464, 7473, 0, 0}, + {23555, 16945, 12313, 0, 0}, {26373, 21010, 16629, 0, 0}, + {27989, 23581, 19702, 0, 0}, {28947, 25267, 21815, 0, 0}, + {30475, 28201, 25973, 0, 0}, {16909, 11485, 8948, 0, 0}, + {14364, 7166, 4042, 0, 0}, {18443, 10788, 6562, 0, 0}, + {22099, 14831, 10048, 0, 0}, {24471, 18126, 13321, 0, 0}, + {26022, 20379, 15875, 0, 0}, {28444, 24517, 20998, 0, 0}, + {16236, 11137, 8293, 0, 0}, {12101, 5618, 3100, 0, 0}, + {16040, 8258, 4593, 0, 0}, {19907, 12123, 7436, 0, 0}, + {22692, 15407, 10351, 0, 0}, {24373, 17828, 12805, 0, 0}, + {27037, 22085, 17856, 0, 0}}, + 
{{18335, 11613, 7830, 0, 0}, {18110, 11052, 7223, 0, 0}, + {22845, 15944, 11211, 0, 0}, {25786, 19716, 15047, 0, 0}, + {27349, 22265, 17718, 0, 0}, {27916, 23606, 19754, 0, 0}, + {29497, 26373, 23138, 0, 0}, {10558, 4935, 2659, 0, 0}, + {12018, 5400, 2947, 0, 0}, {15874, 7940, 4195, 0, 0}, + {19521, 11492, 7011, 0, 0}, {22730, 15503, 10205, 0, 0}, + {24181, 17821, 12441, 0, 0}, {27123, 21397, 17516, 0, 0}, + {10741, 5242, 3054, 0, 0}, {9670, 3622, 1547, 0, 0}, + {12882, 5427, 2496, 0, 0}, {17159, 9021, 4722, 0, 0}, + {20775, 12703, 7829, 0, 0}, {23131, 14501, 9097, 0, 0}, + {25143, 18967, 13624, 0, 0}}}, + {{{18330, 11970, 8679, 0, 0}, {20147, 13565, 9671, 0, 0}, + {24591, 18643, 14366, 0, 0}, {27094, 22267, 18312, 0, 0}, + {28532, 24529, 21035, 0, 0}, {29321, 26018, 22962, 0, 0}, + {30782, 28818, 26904, 0, 0}, {16560, 10669, 7838, 0, 0}, + {16231, 8743, 5183, 0, 0}, {19988, 12387, 7901, 0, 0}, + {23001, 16156, 11352, 0, 0}, {25082, 19030, 14370, 0, 0}, + {26435, 21154, 16804, 0, 0}, {28827, 25197, 21932, 0, 0}, + {9949, 5346, 3566, 0, 0}, {10544, 4254, 2047, 0, 0}, + {15108, 7335, 3855, 0, 0}, {19194, 11286, 6766, 0, 0}, + {22139, 14791, 9830, 0, 0}, {24156, 17470, 12503, 0, 0}, + {27161, 22277, 18172, 0, 0}}, + {{19199, 12968, 9562, 0, 0}, {19640, 12844, 8899, 0, 0}, + {24439, 17927, 13365, 0, 0}, {26638, 21792, 17711, 0, 0}, + {28086, 23929, 20250, 0, 0}, {29112, 25359, 22180, 0, 0}, + {30191, 27669, 25356, 0, 0}, {10341, 4084, 2183, 0, 0}, + {11855, 5018, 2629, 0, 0}, {16928, 8659, 4934, 0, 0}, + {20460, 12739, 8199, 0, 0}, {22552, 15983, 11310, 0, 0}, + {24459, 18565, 13655, 0, 0}, {26725, 21600, 17461, 0, 0}, + {9602, 3867, 1770, 0, 0}, {10869, 4363, 2017, 0, 0}, + {14355, 6677, 3325, 0, 0}, {17535, 9654, 5416, 0, 0}, + {20085, 12296, 7480, 0, 0}, {22066, 14509, 9359, 0, 0}, + {24643, 18304, 13542, 0, 0}}}, + {{{23728, 17982, 14408, 0, 0}, {22789, 17050, 13353, 0, 0}, + {24855, 18850, 14457, 0, 0}, {26909, 21879, 17584, 0, 0}, + {28175, 24091, 20258, 0, 0}, {28948, 25372, 21977, 0, 0}, + {31038, 29297, 27576, 0, 0}, {20965, 14403, 10059, 0, 0}, + {21349, 14710, 10543, 0, 0}, {23350, 16994, 12525, 0, 0}, + {25229, 19443, 15111, 0, 0}, {26535, 21451, 17384, 0, 0}, + {27631, 23112, 19223, 0, 0}, {29791, 26994, 24419, 0, 0}, + {11561, 5522, 3128, 0, 0}, {13221, 6190, 3271, 0, 0}, + {16599, 8897, 5078, 0, 0}, {19948, 12310, 7750, 0, 0}, + {22544, 15436, 10554, 0, 0}, {24242, 17720, 12884, 0, 0}, + {27731, 23358, 19650, 0, 0}}, + {{20429, 15439, 12628, 0, 0}, {19263, 12873, 9543, 0, 0}, + {22921, 15824, 11204, 0, 0}, {25488, 19512, 14420, 0, 0}, + {28056, 22759, 18314, 0, 0}, {28407, 24854, 20291, 0, 0}, + {29898, 27140, 24773, 0, 0}, {12707, 7264, 4242, 0, 0}, + {17533, 9890, 6623, 0, 0}, {19783, 12810, 8613, 0, 0}, + {22986, 16127, 11365, 0, 0}, {23312, 16408, 12008, 0, 0}, + {25913, 19828, 14211, 0, 0}, {27107, 22204, 17766, 0, 0}, + {7112, 2166, 874, 0, 0}, {10198, 3661, 1676, 0, 0}, + {13851, 6345, 3227, 0, 0}, {16828, 9119, 5014, 0, 0}, + {19965, 12187, 7549, 0, 0}, {21686, 14073, 9392, 0, 0}, + {24829, 18395, 13763, 0, 0}}}, + {{{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, 
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}}, + {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}}}}, + {{{{14453, 8479, 5217, 0, 0}, {15914, 8700, 4933, 0, 0}, + {22628, 14841, 9595, 0, 0}, {26046, 19786, 14501, 0, 0}, + {28107, 22942, 18062, 0, 0}, {28936, 24603, 20474, 0, 0}, + {29973, 26670, 23523, 0, 0}, {15623, 9442, 6096, 0, 0}, + {12035, 5088, 2460, 0, 0}, {16736, 8307, 4222, 0, 0}, + {21115, 12675, 7687, 0, 0}, {23478, 16339, 10682, 0, 0}, + {24972, 18170, 12786, 0, 0}, {26266, 20390, 15327, 0, 0}, + {11087, 5036, 2448, 0, 0}, {10379, 3724, 1507, 0, 0}, + {13741, 6037, 2681, 0, 0}, {18029, 9013, 4144, 0, 0}, + {21410, 11990, 7257, 0, 0}, {21773, 14695, 8578, 0, 0}, + {23606, 17778, 12151, 0, 0}}, + {{11343, 4816, 2380, 0, 0}, {14706, 6930, 3734, 0, 0}, + {20812, 12887, 7960, 0, 0}, {25050, 17768, 11788, 0, 0}, + {27066, 21514, 16625, 0, 0}, {27870, 23680, 15904, 0, 0}, + {29089, 25992, 20861, 0, 0}, {9474, 2608, 1105, 0, 0}, + {8371, 2872, 932, 0, 0}, {13523, 5640, 2175, 0, 0}, + {19566, 12943, 6364, 0, 0}, {21190, 13471, 8811, 0, 0}, + {24695, 19471, 11398, 0, 0}, {27307, 21845, 13023, 0, 0}, + {5401, 2247, 834, 0, 0}, {7864, 2097, 828, 0, 0}, + {9693, 4308, 1469, 0, 0}, {18368, 9110, 2351, 0, 0}, + {18883, 8886, 4443, 0, 0}, {18022, 9830, 4915, 0, 0}, + {27307, 16384, 5461, 0, 0}}}, + {{{14494, 7955, 4878, 0, 0}, {17231, 9619, 5765, 0, 0}, + {23319, 16028, 10941, 0, 0}, {26068, 20270, 15507, 0, 0}, + {27780, 22902, 18570, 0, 0}, {28532, 24621, 20866, 0, 0}, + {29901, 26908, 24114, 0, 0}, {15644, 9597, 6667, 0, 0}, + {12372, 5291, 2620, 0, 0}, {16195, 8139, 4276, 0, 0}, + {20019, 11922, 7094, 0, 0}, {22535, 14890, 9950, 0, 0}, + {24243, 17436, 12405, 0, 0}, {26485, 21136, 16513, 0, 0}, + {12302, 6257, 3482, 0, 0}, {9709, 3594, 1577, 0, 0}, + {13287, 5505, 2527, 0, 0}, {17310, 9137, 4631, 0, 0}, + {20352, 12160, 7075, 0, 0}, {22507, 14757, 9507, 0, 0}, + {24752, 18113, 13102, 0, 0}}, + {{15152, 8182, 4656, 0, 0}, {16959, 9469, 5613, 0, 0}, + {22001, 13878, 8975, 0, 0}, {25041, 18513, 13903, 0, 0}, + {26639, 20842, 15886, 0, 0}, {28286, 23064, 17907, 0, 0}, + {29491, 25316, 21246, 0, 0}, {9812, 4217, 2038, 0, 0}, + {10044, 3831, 1807, 0, 0}, {14301, 6444, 3188, 0, 0}, + {19534, 12055, 7119, 0, 0}, {21587, 15176, 10287, 0, 0}, + {24477, 14410, 8192, 0, 0}, {25200, 20887, 17784, 0, 0}, + {7820, 3767, 1621, 0, 0}, {7094, 2149, 617, 0, 0}, + {11927, 5975, 3165, 0, 0}, {18099, 8412, 4102, 0, 0}, + {21434, 9175, 4549, 0, 0}, {23846, 18006, 9895, 0, 0}, + {24467, 19224, 12233, 0, 0}}}, + {{{15655, 9035, 5687, 0, 0}, {18629, 11362, 7316, 0, 0}, + {24216, 17766, 12992, 0, 0}, {26897, 21648, 17390, 0, 0}, + {28313, 24152, 20515, 0, 0}, {29299, 25858, 22382, 0, 0}, + {30513, 28215, 25986, 0, 0}, {14544, 8392, 5715, 0, 0}, + {13478, 6058, 3154, 0, 0}, {17832, 9777, 5584, 0, 0}, + {21530, 13817, 9006, 0, 0}, {23982, 17151, 12180, 0, 0}, + {25451, 
19540, 14765, 0, 0}, {27667, 23256, 19275, 0, 0}, + {10129, 4546, 2558, 0, 0}, {9552, 3437, 1461, 0, 0}, + {13693, 6006, 2873, 0, 0}, {17754, 9655, 5311, 0, 0}, + {20830, 12911, 8016, 0, 0}, {22826, 15488, 10486, 0, 0}, + {25601, 19624, 15016, 0, 0}}, + {{16948, 10030, 6280, 0, 0}, {19238, 11883, 7552, 0, 0}, + {24373, 17238, 12316, 0, 0}, {26194, 20447, 16388, 0, 0}, + {27415, 22349, 18200, 0, 0}, {28155, 24322, 20387, 0, 0}, + {29328, 25610, 22865, 0, 0}, {8521, 3717, 1544, 0, 0}, + {10650, 4710, 2399, 0, 0}, {16270, 8000, 4379, 0, 0}, + {19848, 11593, 6631, 0, 0}, {22038, 14149, 7416, 0, 0}, + {22581, 16489, 9977, 0, 0}, {23458, 18137, 10641, 0, 0}, + {7798, 2210, 711, 0, 0}, {7967, 2826, 1070, 0, 0}, + {10336, 4315, 1913, 0, 0}, {13714, 7088, 3188, 0, 0}, + {18376, 9732, 4659, 0, 0}, {20273, 11821, 6118, 0, 0}, + {20326, 12442, 6554, 0, 0}}}, + {{{20606, 13983, 10120, 0, 0}, {20019, 13071, 8962, 0, 0}, + {24188, 17471, 12422, 0, 0}, {26599, 21019, 16225, 0, 0}, + {27932, 23377, 19320, 0, 0}, {28947, 25057, 21155, 0, 0}, + {30540, 28167, 25698, 0, 0}, {16449, 8043, 4488, 0, 0}, + {17070, 9491, 5600, 0, 0}, {20042, 12400, 7721, 0, 0}, + {22856, 15753, 10792, 0, 0}, {24880, 18548, 13589, 0, 0}, + {25991, 20484, 15750, 0, 0}, {28276, 24178, 20516, 0, 0}, + {9519, 3864, 1821, 0, 0}, {11718, 4860, 2256, 0, 0}, + {15328, 7428, 3819, 0, 0}, {18709, 10750, 6227, 0, 0}, + {21480, 13865, 8870, 0, 0}, {23357, 16426, 11340, 0, 0}, + {26490, 21180, 16824, 0, 0}}, + {{18787, 12701, 9542, 0, 0}, {15846, 9188, 5985, 0, 0}, + {21763, 13729, 8281, 0, 0}, {25379, 18550, 12970, 0, 0}, + {27170, 21263, 15562, 0, 0}, {26678, 21555, 17109, 0, 0}, + {28948, 25397, 22649, 0, 0}, {11686, 5843, 3093, 0, 0}, + {11506, 4141, 1640, 0, 0}, {14376, 6314, 2331, 0, 0}, + {17898, 9858, 5672, 0, 0}, {20148, 13284, 7860, 0, 0}, + {23478, 16215, 9966, 0, 0}, {26100, 18480, 12764, 0, 0}, + {5064, 1713, 819, 0, 0}, {8059, 2790, 980, 0, 0}, + {11100, 3504, 1111, 0, 0}, {14473, 5800, 2694, 0, 0}, + {16369, 8346, 3455, 0, 0}, {18421, 9742, 4664, 0, 0}, + {20398, 12962, 8291, 0, 0}}}, + {{{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}}, + {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}}}}}; + +/* clang-format off */ +alignas(kMaxAlignment) constexpr uint16_t kDefaultDcSignCdf[kCoefficientQuantizerContexts][kNumPlaneTypes] + [kDcSignContexts][kBooleanFieldCdfSize] = { + {{{16768, 0, 0}, {19712, 0, 0}, {13952, 0, 0}}, {{17536, 0, 0}, 
{19840, 0, 0}, + {15488, 0, 0}}}, + {{{16768, 0, 0}, {19712, 0, 0}, {13952, 0, 0}}, {{17536, 0, 0}, {19840, 0, 0}, + {15488, 0, 0}}}, + {{{16768, 0, 0}, {19712, 0, 0}, {13952, 0, 0}}, {{17536, 0, 0}, {19840, 0, 0}, + {15488, 0, 0}}}, + {{{16768, 0, 0}, {19712, 0, 0}, {13952, 0, 0}}, {{17536, 0, 0}, {19840, 0, 0}, + {15488, 0, 0}}} +}; +/* clang-format on */ +alignas(kMaxAlignment) constexpr uint16_t + kDefaultRestorationTypeCdf[kRestorationTypeSymbolCount + 1] = {23355, 10187, + 0, 0}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultUseWienerCdf[kBooleanFieldCdfSize] = {21198, 0, 0}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultUseSgrProjCdf[kBooleanFieldCdfSize] = {15913, 0, 0}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultHasPaletteYCdf[kPaletteBlockSizeContexts][kPaletteYModeContexts] + [kBooleanFieldCdfSize] = { + {{1092, 0, 0}, {29349, 0, 0}, {31507, 0, 0}}, + {{856, 0, 0}, {29909, 0, 0}, {31788, 0, 0}}, + {{945, 0, 0}, {29368, 0, 0}, {31987, 0, 0}}, + {{738, 0, 0}, {29207, 0, 0}, {31864, 0, 0}}, + {{459, 0, 0}, {25431, 0, 0}, {31306, 0, 0}}, + {{503, 0, 0}, {28753, 0, 0}, {31247, 0, 0}}, + {{318, 0, 0}, {24822, 0, 0}, {32639, 0, 0}}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultPaletteYSizeCdf[kPaletteBlockSizeContexts] + [kPaletteSizeSymbolCount + 1] = { + {24816, 19768, 14619, 11290, 7241, 3527, 0, 0}, + {25629, 21347, 16573, 13224, 9102, 4695, 0, 0}, + {24980, 20027, 15443, 12268, 8453, 4238, 0, 0}, + {24497, 18704, 14522, 11204, 7697, 4235, 0, 0}, + {20043, 13588, 10905, 7929, 5233, 2648, 0, 0}, + {23057, 17880, 15845, 11716, 7107, 4893, 0, 0}, + {17828, 11971, 11090, 8582, 5735, 3769, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultHasPaletteUVCdf[kPaletteUVModeContexts][kBooleanFieldCdfSize] = { + {307, 0, 0}, {11280, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultPaletteUVSizeCdf[kPaletteBlockSizeContexts] + [kPaletteSizeSymbolCount + 1] = { + {24055, 12789, 5640, 3159, 1437, 496, 0, 0}, + {26929, 17195, 9187, 5821, 2920, 1068, 0, 0}, + {28342, 21508, 14769, 11285, 6905, 3338, 0, 0}, + {29540, 23304, 17775, 14679, 10245, 5348, 0, 0}, + {29000, 23882, 19677, 14916, 10273, 5561, 0, 0}, + {30304, 24317, 19907, 11136, 7243, 4213, 0, 0}, + {31499, 27333, 22335, 13805, 11068, 6903, 0, + 0}}; + +alignas(kMaxAlignment) constexpr uint16_t kDefaultPaletteColorIndexCdf + [kNumPlaneTypes][kPaletteSizeSymbolCount][kPaletteColorIndexContexts] + [kPaletteColorIndexSymbolCount + 1] = { + {{{4058, 0, 0}, + {16384, 0, 0}, + {22215, 0, 0}, + {5732, 0, 0}, + {1165, 0, 0}}, + {{4891, 2278, 0, 0}, + {21236, 7071, 0, 0}, + {26224, 2534, 0, 0}, + {9750, 4696, 0, 0}, + {853, 383, 0, 0}}, + {{7196, 4722, 2723, 0, 0}, + {23290, 11178, 5512, 0, 0}, + {25520, 5931, 2944, 0, 0}, + {13601, 8282, 4419, 0, 0}, + {1368, 943, 518, 0, 0}}, + {{7989, 5813, 4192, 2486, 0, 0}, + {24099, 12404, 8695, 4675, 0, 0}, + {28513, 5203, 3391, 1701, 0, 0}, + {12904, 9094, 6052, 3238, 0, 0}, + {1122, 875, 621, 342, 0, 0}}, + {{9636, 7361, 5798, 4333, 2695, 0, 0}, + {25325, 15526, 12051, 8006, 4786, 0, 0}, + {26468, 7906, 5824, 3984, 2097, 0, 0}, + {13852, 9873, 7501, 5333, 3116, 0, 0}, + {1498, 1218, 960, 709, 415, 0, 0}}, + {{9663, 7569, 6304, 5084, 3837, 2450, 0, 0}, + {25818, 17321, 13816, 10087, 7201, 4205, 0, 0}, + {25208, 9294, 7278, 5565, 3847, 2060, 0, 0}, + {14224, 10395, 8311, 6573, 4649, 2723, 0, 0}, + {1570, 1317, 1098, 886, 645, 377, 0, 0}}, + {{11079, 8885, 7605, 6416, 5262, 3941, 2573, 0, 0}, + {25876, 17383, 14928, 11162, 8481, 6015, 3564, 0, 
0}, + {27117, 9586, 7726, 6250, 4786, 3376, 1868, 0, 0}, + {13419, 10190, 8350, 6774, 5244, 3737, 2320, 0, 0}, + {1740, 1498, 1264, 1063, 841, 615, 376, 0, 0}}}, + {{{3679, 0, 0}, + {16384, 0, 0}, + {24055, 0, 0}, + {3511, 0, 0}, + {1158, 0, 0}}, + {{7511, 3623, 0, 0}, + {20481, 5475, 0, 0}, + {25735, 4808, 0, 0}, + {12623, 7363, 0, 0}, + {2160, 1129, 0, 0}}, + {{8558, 5593, 2865, 0, 0}, + {22880, 10382, 5554, 0, 0}, + {26867, 6715, 3475, 0, 0}, + {14450, 10616, 4435, 0, 0}, + {2309, 1632, 842, 0, 0}}, + {{9788, 7289, 4987, 2782, 0, 0}, + {24355, 11360, 7909, 3894, 0, 0}, + {30511, 3319, 2174, 1170, 0, 0}, + {13579, 11566, 6853, 4148, 0, 0}, + {924, 724, 487, 250, 0, 0}}, + {{10551, 8201, 6131, 4085, 2220, 0, 0}, + {25461, 16362, 13132, 8136, 4344, 0, 0}, + {28327, 7704, 5889, 3826, 1849, 0, 0}, + {15558, 12240, 9449, 6018, 3186, 0, 0}, + {2094, 1815, 1372, 1033, 561, 0, 0}}, + {{11529, 9600, 7724, 5806, 4063, 2262, 0, 0}, + {26223, 17756, 14764, 10951, 7265, 4067, 0, 0}, + {29320, 6473, 5331, 4064, 2642, 1326, 0, 0}, + {16879, 14445, 11064, 8070, 5792, 3078, 0, 0}, + {1780, 1564, 1289, 1034, 785, 443, 0, 0}}, + {{11326, 9480, 8010, 6522, 5119, 3788, 2205, 0, 0}, + {26905, 17835, 15216, 12100, 9085, 6357, 3495, 0, 0}, + {29353, 6958, 5891, 4778, 3545, 2374, 1150, 0, 0}, + {14803, 12684, 10536, 8794, 6494, 4366, 2378, 0, 0}, + {1578, 1439, 1252, 1089, 943, 742, 446, 0, 0}}}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultIsInterCdf[kIsInterContexts][kBooleanFieldCdfSize] = { + {31962, 0, 0}, {16106, 0, 0}, {12582, 0, 0}, {6230, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultUseCompoundReferenceCdf[kUseCompoundReferenceContexts] + [kBooleanFieldCdfSize] = {{5940, 0, 0}, + {8733, 0, 0}, + {20737, 0, 0}, + {22128, 0, 0}, + {29867, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultCompoundReferenceTypeCdf[kCompoundReferenceTypeContexts] + [kBooleanFieldCdfSize] = {{31570, 0, 0}, + {30698, 0, 0}, + {23602, 0, 0}, + {25269, 0, 0}, + {10293, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t kDefaultCompoundReferenceCdf + [kNumCompoundReferenceTypes][kReferenceContexts][3][kBooleanFieldCdfSize] = + {{{{27484, 0, 0}, {28903, 0, 0}, {29640, 0, 0}}, + {{9616, 0, 0}, {18595, 0, 0}, {17498, 0, 0}}, + {{994, 0, 0}, {7648, 0, 0}, {6058, 0, 0}}}, + {{{27822, 0, 0}, {23300, 0, 0}, {31265, 0, 0}}, + {{12877, 0, 0}, {10327, 0, 0}, {17608, 0, 0}}, + {{2037, 0, 0}, {1709, 0, 0}, {5224, 0, 0}}}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultCompoundBackwardReferenceCdf[kReferenceContexts][2] + [kBooleanFieldCdfSize] = { + {{30533, 0, 0}, {31345, 0, 0}}, + {{15586, 0, 0}, {17593, 0, 0}}, + {{2162, 0, 0}, {2279, 0, 0}}}; + +/* clang-format off */ +alignas(kMaxAlignment) constexpr uint16_t kDefaultSingleReferenceCdf[kReferenceContexts][6] + [kBooleanFieldCdfSize] = { + {{27871, 0, 0}, {31213, 0, 0}, {28532, 0, 0}, {24118, 0, 0}, {31864, 0, 0}, + {31324, 0, 0}}, + {{15795, 0, 0}, {16017, 0, 0}, {13121, 0, 0}, {7995, 0, 0}, {21754, 0, 0}, + {17681, 0, 0}}, + {{3024, 0, 0}, {2489, 0, 0}, {1574, 0, 0}, {873, 0, 0}, {5893, 0, 0}, + {2464, 0, 0}}}; +/* clang-format on */ + +alignas(kMaxAlignment) constexpr uint16_t kDefaultCompoundPredictionModeCdf + [kCompoundPredictionModeContexts][kNumCompoundInterPredictionModes + 1] = { + {25008, 18945, 16960, 15127, 13612, 12102, 5877, 0, 0}, + {22038, 13316, 11623, 10019, 8729, 7637, 4044, 0, 0}, + {22104, 12547, 11180, 9862, 8473, 7381, 4332, 0, 0}, + {19470, 15784, 12297, 8586, 7701, 7032, 6346, 0, 0}, + {13864, 9443, 7526, 
5336, 4870, 4510, 2010, 0, 0}, + {22043, 15314, 12644, 9948, 8573, 7600, 6722, 0, 0}, + {15643, 8495, 6954, 5276, 4554, 4064, 2176, 0, 0}, + {19722, 9554, 8263, 6826, 5333, 4326, 3438, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultNewMvCdf[kNewMvContexts][kBooleanFieldCdfSize] = { + {8733, 0, 0}, {16138, 0, 0}, {17429, 0, 0}, + {24382, 0, 0}, {20546, 0, 0}, {28092, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultZeroMvCdf[kZeroMvContexts][kBooleanFieldCdfSize] = {{30593, 0, 0}, + {31714, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultReferenceMvCdf[kReferenceMvContexts][kBooleanFieldCdfSize] = { + {8794, 0, 0}, {8580, 0, 0}, {14920, 0, 0}, + {4146, 0, 0}, {8456, 0, 0}, {12845, 0, 0}}; + +// This is called drl_mode in the spec where DRL stands for Dynamic Reference +// List. +alignas(kMaxAlignment) constexpr uint16_t + kDefaultRefMvIndexCdf[kRefMvIndexContexts][kBooleanFieldCdfSize] = { + {19664, 0, 0}, {8208, 0, 0}, {13823, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultIsInterIntraCdf[kInterIntraContexts][kBooleanFieldCdfSize] = { + {5881, 0, 0}, {5171, 0, 0}, {2531, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultInterIntraModeCdf[kInterIntraContexts][kNumInterIntraModes + 1] = { + {30893, 21686, 5436, 0, 0}, + {30295, 22772, 6380, 0, 0}, + {28530, 21231, 6842, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultIsWedgeInterIntraCdf[kMaxBlockSizes][kBooleanFieldCdfSize] = { + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {12732, 0, 0}, {7811, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {6064, 0, 0}, {5238, 0, 0}, {3204, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {3324, 0, 0}, {5896, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultWedgeIndexCdf[kMaxBlockSizes][kWedgeIndexSymbolCount + 1] = { + {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288, + 10240, 8192, 6144, 4096, 2048, 0, 0}, + {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288, + 10240, 8192, 6144, 4096, 2048, 0, 0}, + {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288, + 10240, 8192, 6144, 4096, 2048, 0, 0}, + {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288, + 10240, 8192, 6144, 4096, 2048, 0, 0}, + {30330, 28328, 26169, 24105, 21763, 19894, 17017, 14674, 12409, 10406, + 8641, 7066, 5016, 3318, 1597, 0, 0}, + {31962, 29502, 26763, 26030, 25550, 25401, 24997, 18180, 16445, 15401, + 14316, 13346, 9929, 6641, 3139, 0, 0}, + {32614, 31781, 30843, 30717, 30680, 30657, 30617, 9735, 9065, 8484, + 7783, 7084, 5509, 3885, 1857, 0, 0}, + {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288, + 10240, 8192, 6144, 4096, 2048, 0, 0}, + {29989, 29030, 28085, 25555, 24993, 24751, 24113, 18411, 14829, 11436, + 8248, 5298, 3312, 2239, 1112, 0, 0}, + {31084, 29143, 27093, 25660, 23466, 21494, 18339, 15624, 13605, 11807, + 9884, 8297, 6049, 4054, 1891, 0, 0}, + {31626, 29277, 26491, 25454, 24679, 24413, 23745, 19144, 17399, 16038, + 14654, 13455, 10247, 6756, 3218, 0, 0}, + {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288, + 10240, 8192, 6144, 4096, 2048, 0, 0}, + {31633, 31446, 31275, 30133, 30072, 30031, 29998, 11752, 9833, 7711, + 5517, 3595, 2679, 1808, 835, 0, 0}, + {30026, 28573, 27041, 24733, 23788, 23432, 22622, 18644, 15498, 12235, + 9334, 6796, 4824, 3198, 1352, 0, 0}, + {31041, 28820, 26667, 24972, 22927, 
20424, 17002, 13824, 12130, 10730, + 8805, 7457, 5780, 4002, 1756, 0, 0}, + {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288, + 10240, 8192, 6144, 4096, 2048, 0, 0}, + {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288, + 10240, 8192, 6144, 4096, 2048, 0, 0}, + {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288, + 10240, 8192, 6144, 4096, 2048, 0, 0}, + {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288, + 10240, 8192, 6144, 4096, 2048, 0, 0}, + {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288, + 10240, 8192, 6144, 4096, 2048, 0, 0}, + {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288, + 10240, 8192, 6144, 4096, 2048, 0, 0}, + {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288, + 10240, 8192, 6144, 4096, 2048, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultUseObmcCdf[kMaxBlockSizes][kBooleanFieldCdfSize] = { + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {22331, 0, 0}, {23397, 0, 0}, {9104, 0, 0}, {16384, 0, 0}, + {23467, 0, 0}, {15336, 0, 0}, {18345, 0, 0}, {8760, 0, 0}, + {11867, 0, 0}, {17626, 0, 0}, {6951, 0, 0}, {9945, 0, 0}, + {5889, 0, 0}, {10685, 0, 0}, {2640, 0, 0}, {1754, 0, 0}, + {1208, 0, 0}, {130, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultMotionModeCdf[kMaxBlockSizes][kNumMotionModes + 1] = { + {21845, 10923, 0, 0}, {21845, 10923, 0, 0}, {21845, 10923, 0, 0}, + {21845, 10923, 0, 0}, {25117, 8008, 0, 0}, {28030, 8003, 0, 0}, + {3969, 1378, 0, 0}, {21845, 10923, 0, 0}, {27377, 7240, 0, 0}, + {13349, 5958, 0, 0}, {27645, 9162, 0, 0}, {3795, 1174, 0, 0}, + {6337, 1994, 0, 0}, {21162, 8460, 0, 0}, {6508, 3652, 0, 0}, + {12408, 4706, 0, 0}, {3026, 1565, 0, 0}, {11089, 5938, 0, 0}, + {3252, 2067, 0, 0}, {3870, 2371, 0, 0}, {1890, 1433, 0, 0}, + {261, 210, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultIsExplicitCompoundTypeCdf[kIsExplicitCompoundTypeContexts] + [kBooleanFieldCdfSize] = { + {6161, 0, 0}, {9877, 0, 0}, + {13928, 0, 0}, {8174, 0, 0}, + {12834, 0, 0}, {10094, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultIsCompoundTypeAverageCdf[kIsCompoundTypeAverageContexts] + [kBooleanFieldCdfSize] = { + {14524, 0, 0}, {19903, 0, 0}, + {25715, 0, 0}, {19509, 0, 0}, + {23434, 0, 0}, {28124, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultCompoundTypeCdf[kMaxBlockSizes] + [kNumExplicitCompoundPredictionTypes + 1] = { + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {9337, 0, 0}, {19597, 0, 0}, + {20948, 0, 0}, {16384, 0, 0}, {21298, 0, 0}, + {22998, 0, 0}, {23668, 0, 0}, {16384, 0, 0}, + {25067, 0, 0}, {24535, 0, 0}, {26596, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t kDefaultInterpolationFilterCdf + [kInterpolationFilterContexts][kNumExplicitInterpolationFilters + 1] = { + {833, 48, 0, 0}, {27200, 49, 0, 0}, {32346, 29830, 0, 0}, + {4524, 160, 0, 0}, {1562, 815, 0, 0}, {27906, 647, 0, 0}, + {31998, 31616, 0, 0}, {11879, 7131, 0, 0}, {858, 44, 0, 0}, + {28648, 56, 0, 0}, {32463, 30521, 0, 0}, {5365, 132, 0, 0}, + {1746, 759, 0, 0}, {29805, 675, 0, 0}, {32167, 31825, 0, 0}, + {17799, 11370, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultMvJointCdf[kNumMvJointTypes + 1] = {28672, 21504, 13440, 0, 0}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultMvSignCdf[kBooleanFieldCdfSize] = {16384, 0, 0}; + 
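+// Note on the encoding used by these tables: each CDF is stored inverted and
+// scaled to 32768, i.e. an entry holds 32768 minus the conventional
+// cumulative probability, so every row decreases to 0. The extra trailing
+// element (the "+ 1" in many of the array sizes) is the adaptation counter
+// used by the entropy decoder and starts at 0. As an illustration,
+// kDefaultMvSignCdf = {16384, 0, 0} above encodes an even split between the
+// two sign symbols plus a zero counter.
+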
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultMvClassCdf[kMvClassSymbolCount + 1] = {
+        4096, 1792, 910, 448, 217, 112, 28, 11, 6, 1, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultMvClass0BitCdf[kBooleanFieldCdfSize] = {5120, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultMvClass0FractionCdf[kBooleanSymbolCount][kMvFractionSymbolCount +
+                                                     1] = {
+        {16384, 8192, 6144, 0, 0}, {20480, 11520, 8640, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultMvClass0HighPrecisionCdf[kBooleanFieldCdfSize] = {12288, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultMvBitCdf[kMvBitSymbolCount][kBooleanFieldCdfSize] = {
+        {15360, 0, 0}, {14848, 0, 0}, {13824, 0, 0}, {12288, 0, 0},
+        {10240, 0, 0}, {8192, 0, 0}, {4096, 0, 0}, {2816, 0, 0},
+        {2816, 0, 0}, {2048, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultMvFractionCdf[kMvFractionSymbolCount + 1] = {24576, 15360, 11520, 0,
+                                                         0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultMvHighPrecisionCdf[kBooleanFieldCdfSize] = {16384, 0, 0};
diff --git a/src/symbol_decoder_context_test.cc b/src/symbol_decoder_context_test.cc
new file mode 100644
index 0000000..4a0de86
--- /dev/null
+++ b/src/symbol_decoder_context_test.cc
@@ -0,0 +1,264 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/symbol_decoder_context.h"
+
+#include <cstdint>
+#include <cstring>
+
+#include "gtest/gtest.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace {
+
+TEST(SymbolDecoderContextTest, ResetIntraFrameYModeCdf) {
+  // Note these are zero-initialized separately to avoid differences in padding
+  // values added to tables for alignment purposes when comparing the contexts
+  // with memcmp().
+  libgav1::SymbolDecoderContext gold_context = {};
+  libgav1::SymbolDecoderContext context = {};
+  gold_context.Initialize(0);
+  context.Initialize(0);
+  EXPECT_EQ(memcmp(&gold_context, &context, sizeof(gold_context)), 0);
+  EXPECT_EQ(context.intra_frame_y_mode_cdf[0][0][0], 32768 - 15588);
+  EXPECT_EQ(context.intra_frame_y_mode_cdf[0][0][1], 32768 - 17027);
+  ++context.intra_frame_y_mode_cdf[0][0][0];
+  --context.intra_frame_y_mode_cdf[0][0][1];
+  EXPECT_NE(memcmp(&gold_context, &context, sizeof(gold_context)), 0);
+  context.ResetIntraFrameYModeCdf();
+  EXPECT_EQ(memcmp(&gold_context, &context, sizeof(gold_context)), 0);
+}
+
+void ResetAndVerifyCounters(libgav1::SymbolDecoderContext* const context) {
+  libgav1::SymbolDecoderContext gold_context = {};
+  gold_context.Initialize(0);
+  EXPECT_NE(memcmp(&gold_context, context, sizeof(gold_context)), 0);
+  context->ResetCounters();
+  EXPECT_EQ(memcmp(&gold_context, context, sizeof(gold_context)), 0);
+}
+
+TEST(SymbolDecoderContextTest, ResetCounters1d) {
+  libgav1::SymbolDecoderContext context = {};
+  context.Initialize(0);
+  int value = 0;
+  context.delta_q_cdf[libgav1::kDeltaSymbolCount] = ++value;
+  context.delta_lf_cdf[libgav1::kDeltaSymbolCount] = ++value;
+  context.intra_block_copy_cdf[libgav1::kBooleanSymbolCount] = ++value;
+  context.cfl_alpha_signs_cdf[libgav1::kCflAlphaSignsSymbolCount] = ++value;
+  context.filter_intra_mode_cdf[libgav1::kNumFilterIntraPredictors] = ++value;
+  context.restoration_type_cdf[libgav1::kRestorationTypeSymbolCount] = ++value;
+  context.use_wiener_cdf[libgav1::kBooleanSymbolCount] = ++value;
+  context.use_sgrproj_cdf[libgav1::kBooleanSymbolCount] = ++value;
+  ResetAndVerifyCounters(&context);
+}
+
+void IncreasePartitionCounters(SymbolDecoderContext* symbol_context,
+                               int value) {
+  const int min_bsize_log2 = k4x4WidthLog2[kBlock8x8];
+  const int max_bsize_log2 = k4x4WidthLog2[kBlock128x128];
+  for (int block_size_log2 = min_bsize_log2; block_size_log2 <= max_bsize_log2;
+       ++block_size_log2) {
+    for (int context = 0; context < kPartitionContexts; ++context) {
+      const int cdf_size =
+          SymbolDecoderContext::PartitionCdfSize(block_size_log2);
+      symbol_context->partition_cdf[block_size_log2 - min_bsize_log2][context]
+                                   [cdf_size] += value;
+    }
+  }
+}
+
+void IncreasePaletteColorIndexCounters(SymbolDecoderContext* symbol_context,
+                                       int value) {
+  for (auto& palette_color_index_cdf_plane :
+       symbol_context->palette_color_index_cdf) {
+    for (int symbol_count = 0; symbol_count < kPaletteSizeSymbolCount;
+         ++symbol_count) {
+      const int cdf_size = symbol_count + kMinPaletteSize;
+      for (int context = 0; context < kPaletteColorIndexContexts; ++context) {
+        palette_color_index_cdf_plane[symbol_count][context][cdf_size] += value;
+      }
+    }
+  }
+}
+
+void IncreaseTxTypeCounters(SymbolDecoderContext* context, int value) {
+  for (int set_idx = kTransformSetIntra1; set_idx <= kTransformSetIntra2;
+       ++set_idx) {
+    auto tx_set = static_cast<TransformSet>(set_idx);
+    for (int tx_size = 0; tx_size < kNumExtendedTransformSizes; ++tx_size) {
+      for (int mode = 0; mode < kIntraPredictionModesY; ++mode) {
+        context->intra_tx_type_cdf[SymbolDecoderContext::TxTypeIndex(
+            tx_set)][tx_size][mode][kNumTransformTypesInSet[tx_set]] += value;
+      }
+    }
+  }
+
+  for (int set_idx = kTransformSetInter1; set_idx <= kTransformSetInter3;
+       ++set_idx) {
+    auto tx_set = static_cast<TransformSet>(set_idx);
+    for (int tx_size = 0; tx_size < kNumExtendedTransformSizes; ++tx_size) {
+      context->inter_tx_type_cdf[SymbolDecoderContext::TxTypeIndex(tx_set)] +
[tx_size][kNumTransformTypesInSet[tx_set]] += + value; + } + } +} + +void IncreaseTxDepthCounters(SymbolDecoderContext* symbol_context, int value) { + for (int context = 0; context < kTxDepthContexts; ++context) { + symbol_context->tx_depth_cdf[0][context][kMaxTxDepthSymbolCount - 1] += + value; + } + + for (int plane_category = 1; plane_category < 4; ++plane_category) { + for (int context = 0; context < kTxDepthContexts; ++context) { + symbol_context + ->tx_depth_cdf[plane_category][context][kMaxTxDepthSymbolCount] += + value; + } + } +} + +void IncreaseUVModeCounters(SymbolDecoderContext* symbol_context, int value) { + for (int cfl_allowed = 0; cfl_allowed < kBooleanSymbolCount; ++cfl_allowed) { + for (int mode = 0; mode < kIntraPredictionModesY; ++mode) { + symbol_context->uv_mode_cdf[cfl_allowed][mode][kIntraPredictionModesUV - + (1 - cfl_allowed)] += + value; + } + } +} + +#define ASSIGN_COUNTER_2D(array, offset) \ + do { \ + for (auto& d1 : context.array) { \ + d1[libgav1::offset] = ++value; \ + } \ + } while (false) + +TEST(SymbolDecoderContextTest, ResetCounters2d) { + libgav1::SymbolDecoderContext context = {}; + context.Initialize(0); + int value = 0; + ASSIGN_COUNTER_2D(segment_id_cdf, kMaxSegments); + ASSIGN_COUNTER_2D(use_predicted_segment_id_cdf, kBooleanSymbolCount); + ASSIGN_COUNTER_2D(skip_cdf, kBooleanSymbolCount); + ASSIGN_COUNTER_2D(skip_mode_cdf, kBooleanSymbolCount); + ASSIGN_COUNTER_2D(delta_lf_multi_cdf, kDeltaSymbolCount); + ASSIGN_COUNTER_2D(y_mode_cdf, kIntraPredictionModesY); + ASSIGN_COUNTER_2D(angle_delta_cdf, kAngleDeltaSymbolCount); + ASSIGN_COUNTER_2D(cfl_alpha_cdf, kCflAlphaSymbolCount); + ASSIGN_COUNTER_2D(use_filter_intra_cdf, kBooleanSymbolCount); + ASSIGN_COUNTER_2D(tx_split_cdf, kBooleanSymbolCount); + ASSIGN_COUNTER_2D(eob_pt_512_cdf, kEobPt512SymbolCount); + ASSIGN_COUNTER_2D(eob_pt_1024_cdf, kEobPt1024SymbolCount); + ASSIGN_COUNTER_2D(palette_y_size_cdf, kPaletteSizeSymbolCount); + ASSIGN_COUNTER_2D(has_palette_uv_cdf, kBooleanSymbolCount); + ASSIGN_COUNTER_2D(palette_uv_size_cdf, kPaletteSizeSymbolCount); + ASSIGN_COUNTER_2D(is_inter_cdf, kBooleanSymbolCount); + ASSIGN_COUNTER_2D(use_compound_reference_cdf, kBooleanSymbolCount); + ASSIGN_COUNTER_2D(compound_reference_type_cdf, kBooleanSymbolCount); + ASSIGN_COUNTER_2D(compound_prediction_mode_cdf, + kNumCompoundInterPredictionModes); + ASSIGN_COUNTER_2D(new_mv_cdf, kBooleanSymbolCount); + ASSIGN_COUNTER_2D(zero_mv_cdf, kBooleanSymbolCount); + ASSIGN_COUNTER_2D(reference_mv_cdf, kBooleanSymbolCount); + ASSIGN_COUNTER_2D(ref_mv_index_cdf, kBooleanSymbolCount); + ASSIGN_COUNTER_2D(is_inter_intra_cdf, kBooleanSymbolCount); + ASSIGN_COUNTER_2D(inter_intra_mode_cdf, kNumInterIntraModes); + ASSIGN_COUNTER_2D(is_wedge_inter_intra_cdf, kBooleanSymbolCount); + ASSIGN_COUNTER_2D(wedge_index_cdf, kWedgeIndexSymbolCount); + ASSIGN_COUNTER_2D(use_obmc_cdf, kBooleanSymbolCount); + ASSIGN_COUNTER_2D(motion_mode_cdf, kNumMotionModes); + ASSIGN_COUNTER_2D(is_explicit_compound_type_cdf, kBooleanSymbolCount); + ASSIGN_COUNTER_2D(is_compound_type_average_cdf, kBooleanSymbolCount); + ASSIGN_COUNTER_2D(compound_type_cdf, kNumExplicitCompoundPredictionTypes); + ASSIGN_COUNTER_2D(interpolation_filter_cdf, kNumExplicitInterpolationFilters); + ASSIGN_COUNTER_2D(mv_joint_cdf, kNumMvJointTypes); + ResetAndVerifyCounters(&context); +} + +#undef ASSIGN_COUNTER_2D + +#define ASSIGN_COUNTER_3D(array, offset) \ + do { \ + for (auto& d1 : context.array) { \ + for (auto& d2 : d1) { \ + d2[libgav1::offset] = ++value; \ + } \ + } \ + 
} while (false) + +TEST(SymbolDecoderContextTest, ResetCounters3d) { + libgav1::SymbolDecoderContext context = {}; + context.Initialize(0); + int value = 0; + ASSIGN_COUNTER_3D(intra_frame_y_mode_cdf, kIntraPredictionModesY); + ASSIGN_COUNTER_3D(all_zero_cdf, kBooleanSymbolCount); + ASSIGN_COUNTER_3D(eob_pt_16_cdf, kEobPt16SymbolCount); + ASSIGN_COUNTER_3D(eob_pt_32_cdf, kEobPt32SymbolCount); + ASSIGN_COUNTER_3D(eob_pt_64_cdf, kEobPt64SymbolCount); + ASSIGN_COUNTER_3D(eob_pt_128_cdf, kEobPt128SymbolCount); + ASSIGN_COUNTER_3D(eob_pt_256_cdf, kEobPt256SymbolCount); + ASSIGN_COUNTER_3D(dc_sign_cdf, kBooleanSymbolCount); + ASSIGN_COUNTER_3D(has_palette_y_cdf, kBooleanSymbolCount); + ASSIGN_COUNTER_3D(compound_backward_reference_cdf, kBooleanSymbolCount); + ASSIGN_COUNTER_3D(single_reference_cdf, kBooleanSymbolCount); + ASSIGN_COUNTER_3D(mv_sign_cdf, kBooleanSymbolCount); + ASSIGN_COUNTER_3D(mv_class_cdf, kMvClassSymbolCount); + ASSIGN_COUNTER_3D(mv_class0_bit_cdf, kBooleanSymbolCount); + ASSIGN_COUNTER_3D(mv_class0_high_precision_cdf, kBooleanSymbolCount); + ASSIGN_COUNTER_3D(mv_fraction_cdf, kMvFractionSymbolCount); + ASSIGN_COUNTER_3D(mv_high_precision_cdf, kBooleanSymbolCount); + IncreasePartitionCounters(&context, value); + IncreaseTxTypeCounters(&context, value); + IncreaseTxDepthCounters(&context, value); + IncreaseUVModeCounters(&context, value); + ResetAndVerifyCounters(&context); +} + +#undef ASSIGN_COUNTER_3D + +#define ASSIGN_COUNTER_4D(array, offset) \ + do { \ + for (auto& d1 : context.array) { \ + for (auto& d2 : d1) { \ + for (auto& d3 : d2) { \ + d3[libgav1::offset] = ++value; \ + } \ + } \ + } \ + } while (false) + +TEST(SymbolDecoderContextTest, ResetCounters4d) { + libgav1::SymbolDecoderContext context = {}; + context.Initialize(0); + int value = 0; + ASSIGN_COUNTER_4D(eob_extra_cdf, kBooleanSymbolCount); + ASSIGN_COUNTER_4D(coeff_base_eob_cdf, kCoeffBaseEobSymbolCount); + ASSIGN_COUNTER_4D(coeff_base_cdf, kCoeffBaseSymbolCount); + ASSIGN_COUNTER_4D(coeff_base_range_cdf, kCoeffBaseRangeSymbolCount); + ASSIGN_COUNTER_4D(compound_reference_cdf, kBooleanSymbolCount); + ASSIGN_COUNTER_4D(mv_class0_fraction_cdf, kMvFractionSymbolCount); + ASSIGN_COUNTER_4D(mv_bit_cdf, kBooleanSymbolCount); + IncreasePaletteColorIndexCounters(&context, value); + IncreaseTxTypeCounters(&context, value); + ResetAndVerifyCounters(&context); +} + +#undef ASSIGN_COUNTER_4D + +} // namespace +} // namespace libgav1 diff --git a/src/threading_strategy.cc b/src/threading_strategy.cc new file mode 100644 index 0000000..17ce18f --- /dev/null +++ b/src/threading_strategy.cc @@ -0,0 +1,223 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "src/threading_strategy.h" + +#include +#include +#include + +#include "src/frame_scratch_buffer.h" +#include "src/utils/constants.h" +#include "src/utils/logging.h" +#include "src/utils/vector.h" + +namespace libgav1 { +namespace { + +#if !defined(LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER) +constexpr int kFrameParallelThresholdMultiplier = 3; +#else +constexpr int kFrameParallelThresholdMultiplier = + LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER; +#endif + +// Computes the number of frame threads to be used based on the following +// heuristic: +// * If |thread_count| == 1, return 0. +// * If |thread_count| <= |tile_count| * kFrameParallelThresholdMultiplier, +// return 0. +// * Otherwise, return the largest value of i which satisfies the following +// condition: i + i * tile_columns <= thread_count. This ensures that there +// are at least |tile_columns| worker threads for each frame thread. +// * This function will never return 1 or a value > |thread_count|. +// +// This heuristic is based on empirical performance data. The in-frame +// threading model (combination of tile multithreading, superblock row +// multithreading and post filter multithreading) performs better than the +// frame parallel model until we reach the threshold of |thread_count| > +// |tile_count| * kFrameParallelThresholdMultiplier. +// +// It is a function of |tile_count| since tile threading and superblock row +// multithreading will scale only as a factor of |tile_count|. The threshold +// kFrameParallelThresholdMultiplier is arrived at based on empirical data. +// The general idea is that superblock row multithreading plateaus at 4 * +// |tile_count| because in most practical cases there aren't more than that +// many superblock rows and columns available to work on in parallel. +int ComputeFrameThreadCount(int thread_count, int tile_count, + int tile_columns) { + assert(thread_count > 0); + if (thread_count == 1) return 0; + return (thread_count <= tile_count * kFrameParallelThresholdMultiplier) + ? 0 + : std::max(2, thread_count / (1 + tile_columns)); +} + +} // namespace + +bool ThreadingStrategy::Reset(const ObuFrameHeader& frame_header, + int thread_count) { + assert(thread_count > 0); + frame_parallel_ = false; + + if (thread_count == 1) { + thread_pool_.reset(nullptr); + tile_thread_count_ = 0; + max_tile_index_for_row_threads_ = 0; + return true; + } + + // We do work in the current thread, so it is sufficient to create + // |thread_count|-1 threads in the threadpool. + thread_count = std::min(thread_count, static_cast(kMaxThreads)) - 1; + + if (thread_pool_ == nullptr || thread_pool_->num_threads() != thread_count) { + thread_pool_ = ThreadPool::Create("libgav1", thread_count); + if (thread_pool_ == nullptr) { + LIBGAV1_DLOG(ERROR, "Failed to create a thread pool with %d threads.", + thread_count); + tile_thread_count_ = 0; + max_tile_index_for_row_threads_ = 0; + return false; + } + } + + // Prefer tile threads first (but only if there is more than one tile). + const int tile_count = frame_header.tile_info.tile_count; + if (tile_count > 1) { + // We want 1 + tile_thread_count_ <= tile_count because the current thread + // is also used to decode tiles. This is equivalent to + // tile_thread_count_ <= tile_count - 1. 
+    tile_thread_count_ = std::min(thread_count, tile_count - 1);
+    thread_count -= tile_thread_count_;
+    if (thread_count == 0) {
+      max_tile_index_for_row_threads_ = 0;
+      return true;
+    }
+  } else {
+    tile_thread_count_ = 0;
+  }
+
+#if defined(__ANDROID__)
+  // Assign the remaining threads for each Tile. The heuristic used here is that
+  // we will assign two threads for each Tile. So for example, if |thread_count|
+  // is 2, for a stream with 2 tiles the first tile would get both the threads
+  // and the second tile would have row multi-threading turned off. This
+  // heuristic is based on the fact that row multi-threading is fast enough only
+  // when there are at least two threads to do the decoding (since one thread
+  // always does the parsing).
+  //
+  // This heuristic might stop working when SIMD optimizations make the decoding
+  // much faster and the parsing thread is only as fast as the decoding threads.
+  // So we will have to revisit this later to make sure that this is still
+  // optimal.
+  //
+  // Note that while this heuristic significantly improves performance on high
+  // end devices (like the Pixel 3), there are some performance regressions in
+  // some lower end devices (in some cases) and that needs to be revisited as we
+  // bring in more optimizations. Overall, the gains because of this heuristic
+  // seem to be much larger than the regressions.
+  for (int i = 0; i < tile_count; ++i) {
+    max_tile_index_for_row_threads_ = i + 1;
+    thread_count -= 2;
+    if (thread_count <= 0) break;
+  }
+#else   // !defined(__ANDROID__)
+  // Assign the remaining threads to each Tile.
+  for (int i = 0; i < tile_count; ++i) {
+    const int count = thread_count / tile_count +
+                      static_cast<int>(i < thread_count % tile_count);
+    if (count == 0) {
+      // Once we see a 0 value, all subsequent values will be 0 since it is
+      // supposed to be assigned in a round-robin fashion.
+      break;
+    }
+    max_tile_index_for_row_threads_ = i + 1;
+  }
+#endif  // defined(__ANDROID__)
+  return true;
+}
+
+bool ThreadingStrategy::Reset(int thread_count) {
+  assert(thread_count > 0);
+  frame_parallel_ = true;
+
+  // In frame parallel mode, we simply access the underlying |thread_pool_|
+  // directly. So ensure all the other threadpool getter functions return
+  // nullptr. Also, superblock row multithreading is always disabled in frame
+  // parallel mode.
+  tile_thread_count_ = 0;
+  max_tile_index_for_row_threads_ = 0;
+
+  if (thread_pool_ == nullptr || thread_pool_->num_threads() != thread_count) {
+    thread_pool_ = ThreadPool::Create("libgav1-fp", thread_count);
+    if (thread_pool_ == nullptr) {
+      LIBGAV1_DLOG(ERROR, "Failed to create a thread pool with %d threads.",
+                   thread_count);
+      return false;
+    }
+  }
+  return true;
+}
+
+bool InitializeThreadPoolsForFrameParallel(
+    int thread_count, int tile_count, int tile_columns,
+    std::unique_ptr<ThreadPool>* const frame_thread_pool,
+    FrameScratchBufferPool* const frame_scratch_buffer_pool) {
+  assert(*frame_thread_pool == nullptr);
+  thread_count = std::min(thread_count, static_cast<int>(kMaxThreads));
+  const int frame_threads =
+      ComputeFrameThreadCount(thread_count, tile_count, tile_columns);
+  if (frame_threads == 0) return true;
+  *frame_thread_pool = ThreadPool::Create(frame_threads);
+  if (*frame_thread_pool == nullptr) {
+    LIBGAV1_DLOG(ERROR, "Failed to create frame thread pool with %d threads.",
+                 frame_threads);
+    return false;
+  }
+  int remaining_threads = thread_count - frame_threads;
+  if (remaining_threads == 0) return true;
+  int threads_per_frame = remaining_threads / frame_threads;
+  const int extra_threads = remaining_threads % frame_threads;
+  Vector<std::unique_ptr<FrameScratchBuffer>> frame_scratch_buffers;
+  if (!frame_scratch_buffers.reserve(frame_threads)) return false;
+  // Create the tile thread pools.
+  for (int i = 0; i < frame_threads && remaining_threads > 0; ++i) {
+    std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer =
+        frame_scratch_buffer_pool->Get();
+    if (frame_scratch_buffer == nullptr) {
+      return false;
+    }
+    // If the number of tile threads cannot be divided equally amongst all the
+    // frame threads, assign one extra thread to the first |extra_threads| frame
+    // threads.
+    const int current_frame_thread_count =
+        threads_per_frame + static_cast<int>(i < extra_threads);
+    if (!frame_scratch_buffer->threading_strategy.Reset(
+            current_frame_thread_count)) {
+      return false;
+    }
+    remaining_threads -= current_frame_thread_count;
+    frame_scratch_buffers.push_back_unchecked(std::move(frame_scratch_buffer));
+  }
+  // We release the frame scratch buffers in reverse order so that the extra
+  // threads are allocated to buffers at the top of the stack.
+  for (int i = static_cast<int>(frame_scratch_buffers.size()) - 1; i >= 0;
+       --i) {
+    frame_scratch_buffer_pool->Release(std::move(frame_scratch_buffers[i]));
+  }
+  return true;
+}
+
+}  // namespace libgav1
diff --git a/src/threading_strategy.h b/src/threading_strategy.h
new file mode 100644
index 0000000..84b3589
--- /dev/null
+++ b/src/threading_strategy.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_THREADING_STRATEGY_H_
+#define LIBGAV1_SRC_THREADING_STRATEGY_H_
+
+#include <memory>
+
+#include "src/obu_parser.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/threadpool.h"
+
+namespace libgav1 {
+
+class FrameScratchBufferPool;
+
+// This class allocates and manages the worker threads among thread pools used
+// for multi-threaded decoding.
+class ThreadingStrategy {
+ public:
+  ThreadingStrategy() = default;
+
+  // Not copyable or movable.
+  ThreadingStrategy(const ThreadingStrategy&) = delete;
+  ThreadingStrategy& operator=(const ThreadingStrategy&) = delete;
+
+  // Creates or re-allocates the thread pools based on the |frame_header| and
+  // |thread_count|. This function is used only in non frame-parallel mode. This
+  // function is idempotent if the |frame_header| and |thread_count| don't
+  // change between calls (it will only create new threads on the first call and
+  // do nothing on the subsequent calls). This function also starts the worker
+  // threads whenever it creates new thread pools.
+  // The following strategy is used to allocate threads:
+  //   * One thread is allocated for decoding each Tile.
+  //   * Any remaining threads are allocated for superblock row multi-threading
+  //     within each of the tiles in a round robin fashion.
+  // Note: During the lifetime of a ThreadingStrategy object, only one of the
+  // Reset() variants will be used.
+  LIBGAV1_MUST_USE_RESULT bool Reset(const ObuFrameHeader& frame_header,
+                                     int thread_count);
+
+  // Creates or re-allocates a thread pool with |thread_count| threads. This
+  // function is used only in frame parallel mode. This function is idempotent
+  // if the |thread_count| doesn't change between calls (it will only create new
+  // threads on the first call and do nothing on the subsequent calls).
+  // Note: During the lifetime of a ThreadingStrategy object, only one of the
+  // Reset() variants will be used.
+  LIBGAV1_MUST_USE_RESULT bool Reset(int thread_count);
+
+  // Returns a pointer to the ThreadPool that is to be used for Tile
+  // multi-threading.
+  ThreadPool* tile_thread_pool() const {
+    return (tile_thread_count_ != 0) ? thread_pool_.get() : nullptr;
+  }
+
+  int tile_thread_count() const { return tile_thread_count_; }
+
+  // Returns a pointer to the underlying ThreadPool.
+  // Note: Valid only when |frame_parallel_| is true. This is used for
+  // facilitating in-frame multi-threading in that case.
+  ThreadPool* thread_pool() const { return thread_pool_.get(); }
+
+  // Returns a pointer to the ThreadPool that is to be used within the Tile at
+  // index |tile_index| for superblock row multi-threading.
+  // Note: Valid only when |frame_parallel_| is false.
+  ThreadPool* row_thread_pool(int tile_index) const {
+    return tile_index < max_tile_index_for_row_threads_ ? thread_pool_.get()
+                                                        : nullptr;
+  }
+
+  // Returns a pointer to the ThreadPool that is to be used for post filter
+  // multi-threading.
+  // Note: Valid only when |frame_parallel_| is false.
+  ThreadPool* post_filter_thread_pool() const {
+    return frame_parallel_ ? nullptr : thread_pool_.get();
+  }
+
+  // Returns a pointer to the ThreadPool that is to be used for film grain
+  // synthesis and blending.
+  // Note: Valid only when |frame_parallel_| is false.
+  ThreadPool* film_grain_thread_pool() const { return thread_pool_.get(); }
+
+ private:
+  std::unique_ptr<ThreadPool> thread_pool_;
+  int tile_thread_count_ = 0;
+  int max_tile_index_for_row_threads_ = 0;
+  bool frame_parallel_ = false;
+};
+
+// Initializes the |frame_thread_pool| and the necessary worker threadpools (the
+// threading_strategy objects in each of the frame scratch buffers in
+// |frame_scratch_buffer_pool|) as follows:
+//  * frame_threads = ComputeFrameThreadCount();
+//  * For more details on how frame_threads is computed, see the function
+//    comment in ComputeFrameThreadCount().
+//  * |frame_thread_pool| is created with |frame_threads| threads.
+//  * divide the remaining number of threads among the frame threads and
+//    initialize a frame_scratch_buffer.threading_strategy for each frame
+//    thread.
+// When this function is called, |frame_scratch_buffer_pool| must be empty. If
+// this function returns true, it means the initialization was successful and
+// one of the following is true:
+//  * |frame_thread_pool| has been successfully initialized and
+//    |frame_scratch_buffer_pool| has been successfully populated with
+//    |frame_threads| buffers to be used by each frame thread. The total
+//    number of threads that this function creates will always be equal to
+//    |thread_count|.
+//  * |frame_thread_pool| is nullptr. |frame_scratch_buffer_pool| is not
+//    modified. This means that frame threading will not be used and the
+//    decoder will continue to operate normally in non frame parallel mode.
LIBGAV1_MUST_USE_RESULT bool InitializeThreadPoolsForFrameParallel(
+    int thread_count, int tile_count, int tile_columns,
+    std::unique_ptr<ThreadPool>* frame_thread_pool,
+    FrameScratchBufferPool* frame_scratch_buffer_pool);
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_THREADING_STRATEGY_H_
diff --git a/src/threading_strategy_test.cc b/src/threading_strategy_test.cc
new file mode 100644
index 0000000..2a7a781
--- /dev/null
+++ b/src/threading_strategy_test.cc
@@ -0,0 +1,281 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "src/threading_strategy.h" + +#include +#include +#include + +#include "absl/strings/str_cat.h" +#include "gtest/gtest.h" +#include "src/frame_scratch_buffer.h" +#include "src/obu_parser.h" +#include "src/utils/constants.h" +#include "src/utils/threadpool.h" +#include "src/utils/types.h" + +namespace libgav1 { +namespace { + +class ThreadingStrategyTest : public testing::Test { + protected: + ThreadingStrategy strategy_; + ObuFrameHeader frame_header_ = {}; +}; + +TEST_F(ThreadingStrategyTest, MaxThreadEnforced) { + frame_header_.tile_info.tile_count = 32; + ASSERT_TRUE(strategy_.Reset(frame_header_, 32)); + EXPECT_NE(strategy_.tile_thread_pool(), nullptr); + for (int i = 0; i < 32; ++i) { + EXPECT_EQ(strategy_.row_thread_pool(i), nullptr); + } + EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr); +} + +TEST_F(ThreadingStrategyTest, UseAllThreadsForTiles) { + frame_header_.tile_info.tile_count = 8; + ASSERT_TRUE(strategy_.Reset(frame_header_, 8)); + EXPECT_NE(strategy_.tile_thread_pool(), nullptr); + for (int i = 0; i < 8; ++i) { + EXPECT_EQ(strategy_.row_thread_pool(i), nullptr); + } + EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr); +} + +TEST_F(ThreadingStrategyTest, RowThreads) { + frame_header_.tile_info.tile_count = 2; + ASSERT_TRUE(strategy_.Reset(frame_header_, 8)); + EXPECT_NE(strategy_.tile_thread_pool(), nullptr); + // Each tile should get 3 threads each. + for (int i = 0; i < 2; ++i) { + EXPECT_NE(strategy_.row_thread_pool(i), nullptr); + } + EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr); +} + +TEST_F(ThreadingStrategyTest, RowThreadsUnequal) { + frame_header_.tile_info.tile_count = 2; + + ASSERT_TRUE(strategy_.Reset(frame_header_, 9)); + EXPECT_NE(strategy_.tile_thread_pool(), nullptr); + EXPECT_NE(strategy_.row_thread_pool(0), nullptr); + EXPECT_NE(strategy_.row_thread_pool(1), nullptr); + EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr); +} + +// Test a random combination of tile_count and thread_count. +TEST_F(ThreadingStrategyTest, MultipleCalls) { + frame_header_.tile_info.tile_count = 2; + ASSERT_TRUE(strategy_.Reset(frame_header_, 8)); + EXPECT_NE(strategy_.tile_thread_pool(), nullptr); + for (int i = 0; i < 2; ++i) { + EXPECT_NE(strategy_.row_thread_pool(i), nullptr); + } + EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr); + + frame_header_.tile_info.tile_count = 8; + ASSERT_TRUE(strategy_.Reset(frame_header_, 8)); + EXPECT_NE(strategy_.tile_thread_pool(), nullptr); + // Row threads must have been reset. + for (int i = 0; i < 8; ++i) { + EXPECT_EQ(strategy_.row_thread_pool(i), nullptr); + } + EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr); + + frame_header_.tile_info.tile_count = 8; + ASSERT_TRUE(strategy_.Reset(frame_header_, 16)); + EXPECT_NE(strategy_.tile_thread_pool(), nullptr); + for (int i = 0; i < 8; ++i) { + EXPECT_NE(strategy_.row_thread_pool(i), nullptr); + } + EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr); + + frame_header_.tile_info.tile_count = 4; + ASSERT_TRUE(strategy_.Reset(frame_header_, 16)); + EXPECT_NE(strategy_.tile_thread_pool(), nullptr); + for (int i = 0; i < 4; ++i) { + EXPECT_NE(strategy_.row_thread_pool(i), nullptr); + } + // All the other row threads must be reset. 
+ for (int i = 4; i < 8; ++i) { + EXPECT_EQ(strategy_.row_thread_pool(i), nullptr); + } + EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr); + + frame_header_.tile_info.tile_count = 4; + ASSERT_TRUE(strategy_.Reset(frame_header_, 6)); + EXPECT_NE(strategy_.tile_thread_pool(), nullptr); + // First two tiles will get 1 thread each. + for (int i = 0; i < 2; ++i) { + EXPECT_NE(strategy_.row_thread_pool(i), nullptr); + } + // All the other row threads must be reset. + for (int i = 2; i < 8; ++i) { + EXPECT_EQ(strategy_.row_thread_pool(i), nullptr); + } + EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr); + + ASSERT_TRUE(strategy_.Reset(frame_header_, 1)); + EXPECT_EQ(strategy_.tile_thread_pool(), nullptr); + for (int i = 0; i < 8; ++i) { + EXPECT_EQ(strategy_.row_thread_pool(i), nullptr); + } + EXPECT_EQ(strategy_.post_filter_thread_pool(), nullptr); +} + +// Tests the following order of calls (with thread count fixed at 4): +// * 1 Tile - 2 Tiles - 1 Tile. +TEST_F(ThreadingStrategyTest, MultipleCalls2) { + frame_header_.tile_info.tile_count = 1; + ASSERT_TRUE(strategy_.Reset(frame_header_, 4)); + // When there is only one tile, tile thread pool must be nullptr. + EXPECT_EQ(strategy_.tile_thread_pool(), nullptr); + EXPECT_NE(strategy_.row_thread_pool(0), nullptr); + for (int i = 1; i < 8; ++i) { + EXPECT_EQ(strategy_.row_thread_pool(i), nullptr); + } + EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr); + + frame_header_.tile_info.tile_count = 2; + ASSERT_TRUE(strategy_.Reset(frame_header_, 4)); + EXPECT_NE(strategy_.tile_thread_pool(), nullptr); + for (int i = 0; i < 2; ++i) { + EXPECT_NE(strategy_.row_thread_pool(i), nullptr); + } + for (int i = 2; i < 8; ++i) { + EXPECT_EQ(strategy_.row_thread_pool(i), nullptr); + } + EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr); + + frame_header_.tile_info.tile_count = 1; + ASSERT_TRUE(strategy_.Reset(frame_header_, 4)); + EXPECT_EQ(strategy_.tile_thread_pool(), nullptr); + EXPECT_NE(strategy_.row_thread_pool(0), nullptr); + for (int i = 1; i < 8; ++i) { + EXPECT_EQ(strategy_.row_thread_pool(i), nullptr); + } + EXPECT_NE(strategy_.post_filter_thread_pool(), nullptr); +} + +void VerifyFrameParallel(int thread_count, int tile_count, int tile_columns, + int expected_frame_threads, + const std::vector& expected_tile_threads) { + ASSERT_EQ(expected_frame_threads, expected_tile_threads.size()); + ASSERT_GT(thread_count, 1); + std::unique_ptr frame_thread_pool; + FrameScratchBufferPool frame_scratch_buffer_pool; + ASSERT_TRUE(InitializeThreadPoolsForFrameParallel( + thread_count, tile_count, tile_columns, &frame_thread_pool, + &frame_scratch_buffer_pool)); + if (expected_frame_threads == 0) { + EXPECT_EQ(frame_thread_pool, nullptr); + return; + } + EXPECT_NE(frame_thread_pool.get(), nullptr); + EXPECT_EQ(frame_thread_pool->num_threads(), expected_frame_threads); + std::vector> frame_scratch_buffers; + int actual_thread_count = frame_thread_pool->num_threads(); + for (int i = 0; i < expected_frame_threads; ++i) { + SCOPED_TRACE(absl::StrCat("i: ", i)); + frame_scratch_buffers.push_back(frame_scratch_buffer_pool.Get()); + ThreadPool* const thread_pool = + frame_scratch_buffers.back()->threading_strategy.thread_pool(); + if (expected_tile_threads[i] > 0) { + EXPECT_NE(thread_pool, nullptr); + EXPECT_EQ(thread_pool->num_threads(), expected_tile_threads[i]); + actual_thread_count += thread_pool->num_threads(); + } else { + EXPECT_EQ(thread_pool, nullptr); + } + } + EXPECT_EQ(thread_count, actual_thread_count); + for (auto& 
frame_scratch_buffer : frame_scratch_buffers) { + frame_scratch_buffer_pool.Release(std::move(frame_scratch_buffer)); + } +} + +TEST(FrameParallelStrategyTest, FrameParallel) { + // This loop has thread_count <= 3 * tile count. So there should be no frame + // threads irrespective of the number of tile columns. + for (int thread_count = 2; thread_count <= 6; ++thread_count) { + VerifyFrameParallel(thread_count, /*tile_count=*/2, /*tile_columns=*/1, + /*expected_frame_threads=*/0, + /*expected_tile_threads=*/{}); + VerifyFrameParallel(thread_count, /*tile_count=*/2, /*tile_columns=*/2, + /*expected_frame_threads=*/0, + /*expected_tile_threads=*/{}); + } + + // Equal number of tile threads for each frame thread. + VerifyFrameParallel( + /*thread_count=*/8, /*tile_count=*/1, /*tile_columns=*/1, + /*expected_frame_threads=*/4, /*expected_tile_threads=*/{1, 1, 1, 1}); + VerifyFrameParallel( + /*thread_count=*/12, /*tile_count=*/2, /*tile_columns=*/2, + /*expected_frame_threads=*/4, /*expected_tile_threads=*/{2, 2, 2, 2}); + VerifyFrameParallel( + /*thread_count=*/18, /*tile_count=*/2, /*tile_columns=*/2, + /*expected_frame_threads=*/6, + /*expected_tile_threads=*/{2, 2, 2, 2, 2, 2}); + VerifyFrameParallel( + /*thread_count=*/16, /*tile_count=*/3, /*tile_columns=*/3, + /*expected_frame_threads=*/4, /*expected_tile_threads=*/{3, 3, 3, 3}); + + // Unequal number of tile threads for each frame thread. + VerifyFrameParallel( + /*thread_count=*/7, /*tile_count=*/1, /*tile_columns=*/1, + /*expected_frame_threads=*/3, /*expected_tile_threads=*/{2, 1, 1}); + VerifyFrameParallel( + /*thread_count=*/14, /*tile_count=*/2, /*tile_columns=*/2, + /*expected_frame_threads=*/4, /*expected_tile_threads=*/{3, 3, 2, 2}); + VerifyFrameParallel( + /*thread_count=*/20, /*tile_count=*/2, /*tile_columns=*/2, + /*expected_frame_threads=*/6, + /*expected_tile_threads=*/{3, 3, 2, 2, 2, 2}); + VerifyFrameParallel( + /*thread_count=*/17, /*tile_count=*/3, /*tile_columns=*/3, + /*expected_frame_threads=*/4, /*expected_tile_threads=*/{4, 3, 3, 3}); +} + +TEST(FrameParallelStrategyTest, ThreadCountDoesNotExceedkMaxThreads) { + std::unique_ptr frame_thread_pool; + FrameScratchBufferPool frame_scratch_buffer_pool; + ASSERT_TRUE(InitializeThreadPoolsForFrameParallel( + /*thread_count=*/kMaxThreads + 10, /*tile_count=*/2, /*tile_columns=*/2, + &frame_thread_pool, &frame_scratch_buffer_pool)); + EXPECT_NE(frame_thread_pool.get(), nullptr); + std::vector> frame_scratch_buffers; + int actual_thread_count = frame_thread_pool->num_threads(); + for (int i = 0; i < frame_thread_pool->num_threads(); ++i) { + SCOPED_TRACE(absl::StrCat("i: ", i)); + frame_scratch_buffers.push_back(frame_scratch_buffer_pool.Get()); + ThreadPool* const thread_pool = + frame_scratch_buffers.back()->threading_strategy.thread_pool(); + if (thread_pool != nullptr) { + actual_thread_count += thread_pool->num_threads(); + } + } + // In this case, the exact number of frame threads and tile threads depend on + // the value of kMaxThreads. So simply ensure that the total number of threads + // does not exceed kMaxThreads. 
+  EXPECT_LE(actual_thread_count, kMaxThreads);
+  for (auto& frame_scratch_buffer : frame_scratch_buffers) {
+    frame_scratch_buffer_pool.Release(std::move(frame_scratch_buffer));
+  }
+}
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/tile.h b/src/tile.h
new file mode 100644
index 0000000..83c3423
--- /dev/null
+++ b/src/tile.h
@@ -0,0 +1,953 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_TILE_H_
+#define LIBGAV1_SRC_TILE_H_
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <condition_variable>  // NOLINT (unapproved c++11 header)
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <mutex>  // NOLINT (unapproved c++11 header)
+#include <vector>
+
+#include "src/buffer_pool.h"
+#include "src/decoder_state.h"
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/frame_scratch_buffer.h"
+#include "src/loop_restoration_info.h"
+#include "src/obu_parser.h"
+#include "src/post_filter.h"
+#include "src/quantizer.h"
+#include "src/residual_buffer_pool.h"
+#include "src/symbol_decoder_context.h"
+#include "src/tile_scratch_buffer.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/blocking_counter.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/entropy_decoder.h"
+#include "src/utils/memory.h"
+#include "src/utils/segmentation_map.h"
+#include "src/utils/threadpool.h"
+#include "src/utils/types.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+
+// Indicates what the ProcessSuperBlock() and TransformBlock() functions should
+// do. "Parse" refers to consuming the bitstream, reading the transform
+// coefficients and performing the dequantization. "Decode" refers to computing
+// the prediction, applying the inverse transforms and adding the residual.
+enum ProcessingMode {
+  kProcessingModeParseOnly,
+  kProcessingModeDecodeOnly,
+  kProcessingModeParseAndDecode,
+};
+
+// The alignment requirement is due to the SymbolDecoderContext member
+// symbol_decoder_context_.
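+// (The aligned allocation itself is expected to come from the
+// MaxAlignedAllocable base class, declared in "src/utils/memory.h" above.)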
+class Tile : public MaxAlignedAllocable {
+ public:
+  static std::unique_ptr<Tile> Create(
+      int tile_number, const uint8_t* const data, size_t size,
+      const ObuSequenceHeader& sequence_header,
+      const ObuFrameHeader& frame_header, RefCountedBuffer* const current_frame,
+      const DecoderState& state, FrameScratchBuffer* const frame_scratch_buffer,
+      const WedgeMaskArray& wedge_masks,
+      const QuantizerMatrix& quantizer_matrix,
+      SymbolDecoderContext* const saved_symbol_decoder_context,
+      const SegmentationMap* prev_segment_ids, PostFilter* const post_filter,
+      const dsp::Dsp* const dsp, ThreadPool* const thread_pool,
+      BlockingCounterWithStatus* const pending_tiles, bool frame_parallel,
+      bool use_intra_prediction_buffer) {
+    std::unique_ptr<Tile> tile(new (std::nothrow) Tile(
+        tile_number, data, size, sequence_header, frame_header, current_frame,
+        state, frame_scratch_buffer, wedge_masks, quantizer_matrix,
+        saved_symbol_decoder_context, prev_segment_ids, post_filter, dsp,
+        thread_pool, pending_tiles, frame_parallel,
+        use_intra_prediction_buffer));
+    return (tile != nullptr && tile->Init()) ? std::move(tile) : nullptr;
+  }
+
+  // Move only.
+  Tile(Tile&& tile) noexcept;
+  Tile& operator=(Tile&& tile) noexcept;
+  Tile(const Tile&) = delete;
+  Tile& operator=(const Tile&) = delete;
+
+  struct Block;  // Defined after this class.
+
+  // Parses the entire tile.
+  bool Parse();
+  // Decodes the entire tile. |superblock_row_progress| and
+  // |superblock_row_progress_condvar| are arrays of size equal to the number of
+  // superblock rows in the frame. Increments |superblock_row_progress[i]| after
+  // each superblock row at index |i| is decoded. If the count reaches the
+  // number of tile columns, then it notifies
+  // |superblock_row_progress_condvar[i]|.
+  bool Decode(std::mutex* mutex, int* superblock_row_progress,
+              std::condition_variable* superblock_row_progress_condvar);
+  // Parses and decodes the entire tile. Depending on the configuration of this
+  // Tile, this function may do multithreaded decoding.
+  bool ParseAndDecode();  // 5.11.2.
+  // Processes all the columns of the superblock row at |row4x4| that are within
+  // this Tile. If |save_symbol_decoder_context| is true, then
+  // SaveSymbolDecoderContext() is invoked for the last superblock row.
+  template <ProcessingMode processing_mode, bool save_symbol_decoder_context>
+  bool ProcessSuperBlockRow(int row4x4, TileScratchBuffer* scratch_buffer);
+
+  const ObuSequenceHeader& sequence_header() const { return sequence_header_; }
+  const ObuFrameHeader& frame_header() const { return frame_header_; }
+  const RefCountedBuffer& current_frame() const { return current_frame_; }
+  const TemporalMotionField& motion_field() const { return motion_field_; }
+  const std::array<bool, kNumReferenceFrameTypes>& reference_frame_sign_bias()
+      const {
+    return reference_frame_sign_bias_;
+  }
+
+  bool IsRow4x4Inside(int row4x4) const {
+    return row4x4 >= row4x4_start_ && row4x4 < row4x4_end_;
+  }
+
+  // 5.11.51.
+  bool IsInside(int row4x4, int column4x4) const {
+    return IsRow4x4Inside(row4x4) && column4x4 >= column4x4_start_ &&
+           column4x4 < column4x4_end_;
+  }
+
+  bool IsLeftInside(int column4x4) const {
+    // We use "larger than" as the condition. Don't pass in the left column
+    // offset column4x4 - 1.
+    assert(column4x4 <= column4x4_end_);
+    return column4x4 > column4x4_start_;
+  }
+
+  bool IsTopInside(int row4x4) const {
+    // We use "larger than" as the condition. Don't pass in the top row offset
+    // row4x4 - 1.
+ assert(row4x4 <= row4x4_end_); + return row4x4 > row4x4_start_; + } + + bool IsTopLeftInside(int row4x4, int column4x4) const { + // We use "larger than" as the condition. Don't pass in the top row offset + // row4x4 - 1 or the left column offset column4x4 - 1. + assert(row4x4 <= row4x4_end_); + assert(column4x4 <= column4x4_end_); + return row4x4 > row4x4_start_ && column4x4 > column4x4_start_; + } + + bool IsBottomRightInside(int row4x4, int column4x4) const { + assert(row4x4 >= row4x4_start_); + assert(column4x4 >= column4x4_start_); + return row4x4 < row4x4_end_ && column4x4 < column4x4_end_; + } + + BlockParameters** BlockParametersAddress(int row4x4, int column4x4) const { + return block_parameters_holder_.Address(row4x4, column4x4); + } + + int BlockParametersStride() const { + return block_parameters_holder_.columns4x4(); + } + + // Returns true if Parameters() can be called with |row| and |column| as + // inputs, false otherwise. + bool HasParameters(int row, int column) const { + return block_parameters_holder_.Find(row, column) != nullptr; + } + const BlockParameters& Parameters(int row, int column) const { + return *block_parameters_holder_.Find(row, column); + } + + int number() const { return number_; } + int superblock_rows() const { return superblock_rows_; } + int superblock_columns() const { return superblock_columns_; } + int row4x4_start() const { return row4x4_start_; } + int column4x4_start() const { return column4x4_start_; } + int column4x4_end() const { return column4x4_end_; } + + private: + // Stores the transform tree state when reading variable size transform trees + // and when applying the transform tree. When applying the transform tree, + // |depth| is not used. + struct TransformTreeNode { + // The default constructor is invoked by the Stack + // constructor. Stack<> does not use the default-constructed elements, so it + // is safe for the default constructor to not initialize the members. + TransformTreeNode() = default; + TransformTreeNode(int x, int y, TransformSize tx_size, int depth = -1) + : x(x), y(y), tx_size(tx_size), depth(depth) {} + + int x; + int y; + TransformSize tx_size; + int depth; + }; + + // Enum to track the processing state of a superblock. + enum SuperBlockState : uint8_t { + kSuperBlockStateNone, // Not yet parsed or decoded. + kSuperBlockStateParsed, // Parsed but not yet decoded. + kSuperBlockStateScheduled, // Scheduled for decoding. + kSuperBlockStateDecoded // Parsed and decoded. + }; + + // Parameters used to facilitate multi-threading within the Tile. + struct ThreadingParameters { + std::mutex mutex; + // 2d array of size |superblock_rows_| by |superblock_columns_| containing + // the processing state of each superblock. + Array2D sb_state LIBGAV1_GUARDED_BY(mutex); + // Variable used to indicate either parse or decode failure. + bool abort LIBGAV1_GUARDED_BY(mutex) = false; + int pending_jobs LIBGAV1_GUARDED_BY(mutex) = 0; + std::condition_variable pending_jobs_zero_condvar; + }; + + // The residual pointer is used to traverse the |residual_buffer_|. It is + // used in two different ways. + // If |split_parse_and_decode_| is true: + // The pointer points to the beginning of the |residual_buffer_| when the + // "parse" and "decode" steps begin. It is then moved forward tx_size in + // each iteration of the "parse" and the "decode" steps. In this case, the + // ResidualPtr variable passed into various functions starting from + // ProcessSuperBlock is used as an in/out parameter to keep track of the + // residual pointer. 
+
+  // The residual pointer is used to traverse the |residual_buffer_|. It is
+  // used in two different ways.
+  // If |split_parse_and_decode_| is true:
+  //    The pointer points to the beginning of the |residual_buffer_| when the
+  //    "parse" and "decode" steps begin. It is then moved forward by
+  //    |tx_size| in each iteration of the "parse" and the "decode" steps. In
+  //    this case, the ResidualPtr variable passed into various functions
+  //    starting from ProcessSuperBlock is used as an in/out parameter to keep
+  //    track of the residual pointer.
+  // If |split_parse_and_decode_| is false:
+  //    The pointer is reset to the beginning of the |residual_buffer_| for
+  //    every transform block.
+  using ResidualPtr = uint8_t*;
+
+  Tile(int tile_number, const uint8_t* data, size_t size,
+       const ObuSequenceHeader& sequence_header,
+       const ObuFrameHeader& frame_header, RefCountedBuffer* current_frame,
+       const DecoderState& state, FrameScratchBuffer* frame_scratch_buffer,
+       const WedgeMaskArray& wedge_masks,
+       const QuantizerMatrix& quantizer_matrix,
+       SymbolDecoderContext* saved_symbol_decoder_context,
+       const SegmentationMap* prev_segment_ids, PostFilter* post_filter,
+       const dsp::Dsp* dsp, ThreadPool* thread_pool,
+       BlockingCounterWithStatus* pending_tiles, bool frame_parallel,
+       bool use_intra_prediction_buffer);
+
+  // Performs member initializations that may fail. Helper function used by
+  // Create().
+  LIBGAV1_MUST_USE_RESULT bool Init();
+
+  // Saves the symbol decoder context of this tile into
+  // |saved_symbol_decoder_context_| if necessary.
+  void SaveSymbolDecoderContext();
+
+  // Entry point for multi-threaded decoding. This function performs the same
+  // functionality as ParseAndDecode(). The current thread does the "parse"
+  // step while the worker threads do the "decode" step.
+  bool ThreadedParseAndDecode();
+
+  // Returns whether or not the prerequisites for decoding the superblock at
+  // |row_index| and |column_index| are satisfied. |threading_.mutex| must be
+  // held when calling this function.
+  bool CanDecode(int row_index, int column_index) const;
+
+  // This function is run by the worker threads when multi-threaded decoding
+  // is enabled. Once a superblock is decoded, this function will set the
+  // corresponding |threading_.sb_state| entry to kSuperBlockStateDecoded. On
+  // failure, |threading_.abort| will be set to true. If at any point
+  // |threading_.abort| becomes true, this function will return as early as it
+  // can. If the decoding succeeds, this function will also schedule the
+  // decoding jobs for the superblock to the bottom-left and the superblock to
+  // the right of this superblock (if allowed).
+  void DecodeSuperBlock(int row_index, int column_index, int block_width4x4);
+
+  // If |use_intra_prediction_buffer_| is true, then this function copies the
+  // last row of the superblock row starting at |row4x4| into the
+  // |intra_prediction_buffer_| (which may be used by the intra prediction
+  // process for the next superblock row).
+  void PopulateIntraPredictionBuffer(int row4x4);
+
+  uint16_t* GetPartitionCdf(int row4x4, int column4x4, BlockSize block_size);
+  bool ReadPartition(int row4x4, int column4x4, BlockSize block_size,
+                     bool has_rows, bool has_columns, Partition* partition);
+  // Processes the Partition starting at |row4x4_start|, |column4x4_start|
+  // iteratively. It performs a DFS traversal over the partition tree to
+  // process the blocks in the right order.
+  bool ProcessPartition(
+      int row4x4_start, int column4x4_start, TileScratchBuffer* scratch_buffer,
+      ResidualPtr* residual);  // Iterative implementation of 5.11.4.
+  bool ProcessBlock(int row4x4, int column4x4, BlockSize block_size,
+                    TileScratchBuffer* scratch_buffer,
+                    ResidualPtr* residual);  // 5.11.5.
+  void ResetCdef(int row4x4, int column4x4);  // 5.11.55.
+
+  // This function is used to decode a superblock when the parsing has already
+  // been done for that superblock.
+  bool DecodeSuperBlock(int sb_row_index, int sb_column_index,
+                        TileScratchBuffer* scratch_buffer);
+  // Helper function used by DecodeSuperBlock(). Note that the decode_block()
+  // function in the spec is equivalent to ProcessBlock() in the code.
+  bool DecodeBlock(int row4x4, int column4x4, BlockSize block_size,
+                   TileScratchBuffer* scratch_buffer, ResidualPtr* residual);
+
+  void ClearBlockDecoded(TileScratchBuffer* scratch_buffer, int row4x4,
+                         int column4x4);  // 5.11.3.
+  bool ProcessSuperBlock(int row4x4, int column4x4,
+                         TileScratchBuffer* scratch_buffer,
+                         ProcessingMode mode);
+  void ResetLoopRestorationParams();
+  void ReadLoopRestorationCoefficients(int row4x4, int column4x4,
+                                       BlockSize block_size);  // 5.11.57.
+
+  // Helper functions for DecodeBlock.
+  bool ReadSegmentId(const Block& block);       // 5.11.9.
+  bool ReadIntraSegmentId(const Block& block);  // 5.11.8.
+  void ReadSkip(const Block& block);            // 5.11.11.
+  bool ReadSkipMode(const Block& block);        // 5.11.10.
+  void ReadCdef(const Block& block);            // 5.11.56.
+  // Returns the new value. |cdf| is an array of size kDeltaSymbolCount + 1.
+  int ReadAndClipDelta(uint16_t* cdf, int delta_small, int scale, int min_value,
+                       int max_value, int value);
+  void ReadQuantizerIndexDelta(const Block& block);  // 5.11.12.
+  void ReadLoopFilterDelta(const Block& block);      // 5.11.13.
+  // Populates |BlockParameters::deblock_filter_level| for the given |block|
+  // using |deblock_filter_levels_|.
+  void PopulateDeblockFilterLevel(const Block& block);
+  void PopulateCdefSkip(const Block& block);
+  void ReadPredictionModeY(const Block& block, bool intra_y_mode);
+  void ReadIntraAngleInfo(const Block& block,
+                          PlaneType plane_type);  // 5.11.42 and 5.11.43.
+  void ReadPredictionModeUV(const Block& block);
+  void ReadCflAlpha(const Block& block);  // 5.11.45.
+  int GetPaletteCache(const Block& block, PlaneType plane_type,
+                      uint16_t* cache);
+  void ReadPaletteColors(const Block& block, Plane plane);
+  void ReadPaletteModeInfo(const Block& block);      // 5.11.46.
+  void ReadFilterIntraModeInfo(const Block& block);  // 5.11.24.
+  int ReadMotionVectorComponent(const Block& block,
+                                int component);  // 5.11.32.
+  void ReadMotionVector(const Block& block, int index);  // 5.11.31.
+  bool DecodeIntraModeInfo(const Block& block);  // 5.11.7.
+  int8_t ComputePredictedSegmentId(const Block& block) const;  // 5.11.21.
+  bool ReadInterSegmentId(const Block& block, bool pre_skip);  // 5.11.19.
+  void ReadIsInter(const Block& block, bool skip_mode);  // 5.11.20.
+  bool ReadIntraBlockModeInfo(const Block& block,
+                              bool intra_y_mode);  // 5.11.22.
+  CompoundReferenceType ReadCompoundReferenceType(const Block& block);
+  template <bool is_single, int index>
+  uint16_t* GetReferenceCdf(const Block& block, CompoundReferenceType type =
+                                                    kNumCompoundReferenceTypes);
+  void ReadReferenceFrames(const Block& block, bool skip_mode);  // 5.11.25.
+  void ReadInterPredictionModeY(const Block& block,
+                                const MvContexts& mode_contexts,
+                                bool skip_mode);
+  void ReadRefMvIndex(const Block& block);
+  void ReadInterIntraMode(const Block& block, bool is_compound,
+                          bool skip_mode);  // 5.11.28.
+  bool IsScaled(ReferenceFrameType type) const {  // Part of 5.11.27.
+    const int index =
+        frame_header_.reference_frame_index[type - kReferenceFrameLast];
+    return reference_frames_[index]->upscaled_width() != frame_header_.width ||
+           reference_frames_[index]->frame_height() != frame_header_.height;
+  }
+  void ReadMotionMode(const Block& block, bool is_compound,
+                      bool skip_mode);  // 5.11.27.
+  uint16_t* GetIsExplicitCompoundTypeCdf(const Block& block);
+  uint16_t* GetIsCompoundTypeAverageCdf(const Block& block);
+  void ReadCompoundType(const Block& block, bool is_compound, bool skip_mode,
+                        bool* is_explicit_compound_type,
+                        bool* is_compound_type_average);  // 5.11.29.
+  uint16_t* GetInterpolationFilterCdf(const Block& block, int direction);
+  void ReadInterpolationFilter(const Block& block, bool skip_mode);
+  bool ReadInterBlockModeInfo(const Block& block, bool skip_mode);  // 5.11.23.
+  bool DecodeInterModeInfo(const Block& block);  // 5.11.18.
+  bool DecodeModeInfo(const Block& block);  // 5.11.6.
+  bool IsMvValid(const Block& block, bool is_compound) const;  // 6.10.25.
+  bool AssignInterMv(const Block& block, bool is_compound);  // 5.11.26.
+  bool AssignIntraMv(const Block& block);  // 5.11.26.
+  int GetTopTransformWidth(const Block& block, int row4x4, int column4x4,
+                           bool ignore_skip);
+  int GetLeftTransformHeight(const Block& block, int row4x4, int column4x4,
+                             bool ignore_skip);
+  TransformSize ReadFixedTransformSize(const Block& block);  // 5.11.15.
+  // Iterative implementation of 5.11.17.
+  void ReadVariableTransformTree(const Block& block, int row4x4, int column4x4,
+                                 TransformSize tx_size);
+  void DecodeTransformSize(const Block& block);  // 5.11.16.
+  bool ComputePrediction(const Block& block);  // 5.11.33.
+  // |x4| and |y4| are the column and row positions of the 4x4 block. |w4| and
+  // |h4| are the width and height in 4x4 units of |tx_size|.
+  int GetTransformAllZeroContext(const Block& block, Plane plane,
+                                 TransformSize tx_size, int x4, int y4, int w4,
+                                 int h4);
+  TransformSet GetTransformSet(TransformSize tx_size,
+                               bool is_inter) const;  // 5.11.48.
+  TransformType ComputeTransformType(const Block& block, Plane plane,
+                                     TransformSize tx_size, int block_x,
+                                     int block_y);  // 5.11.40.
+  void ReadTransformType(const Block& block, int x4, int y4,
+                         TransformSize tx_size);  // 5.11.47.
+  template <typename ResidualType>
+  void ReadCoeffBase2D(
+      const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+      int eob,
+      uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+      uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+                                   [kCoeffBaseRangeSymbolCount + 1],
+      ResidualType* quantized_buffer, uint8_t* level_buffer);
+  template <typename ResidualType>
+  void ReadCoeffBaseHorizontal(
+      const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+      int eob,
+      uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+      uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+                                   [kCoeffBaseRangeSymbolCount + 1],
+      ResidualType* quantized_buffer, uint8_t* level_buffer);
+  template <typename ResidualType>
+  void ReadCoeffBaseVertical(
+      const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+      int eob,
+      uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+      uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+                                   [kCoeffBaseRangeSymbolCount + 1],
+      ResidualType* quantized_buffer, uint8_t* level_buffer);
+  int GetDcSignContext(int x4, int y4, int w4, int h4, Plane plane);
+  void SetEntropyContexts(int x4, int y4, int w4, int h4, Plane plane,
+                          uint8_t coefficient_level, int8_t dc_category);
+  void InterIntraPrediction(
+      uint16_t* prediction_0, const uint8_t* prediction_mask,
+      ptrdiff_t prediction_mask_stride,
+      const PredictionParameters& prediction_parameters, int prediction_width,
+      int prediction_height, int subsampling_x, int subsampling_y,
+      uint8_t* dest,
+      ptrdiff_t dest_stride);  // Part of section 7.11.3.1 in the spec.
+  void CompoundInterPrediction(
+      const Block& block, const uint8_t* prediction_mask,
+      ptrdiff_t prediction_mask_stride, int prediction_width,
+      int prediction_height, int subsampling_x, int subsampling_y,
+      int candidate_row, int candidate_column, uint8_t* dest,
+      ptrdiff_t dest_stride);  // Part of section 7.11.3.1 in the spec.
+  GlobalMotion* GetWarpParams(const Block& block, Plane plane,
+                              int prediction_width, int prediction_height,
+                              const PredictionParameters& prediction_parameters,
+                              ReferenceFrameType reference_type,
+                              bool* is_local_valid,
+                              GlobalMotion* global_motion_params,
+                              GlobalMotion* local_warp_params)
+      const;  // Part of section 7.11.3.1 in the spec.
+  bool InterPrediction(const Block& block, Plane plane, int x, int y,
+                       int prediction_width, int prediction_height,
+                       int candidate_row, int candidate_column,
+                       bool* is_local_valid,
+                       GlobalMotion* local_warp_params);  // 7.11.3.1.
+  void ScaleMotionVector(const MotionVector& mv, Plane plane,
+                         int reference_frame_index, int x, int y, int* start_x,
+                         int* start_y, int* step_x, int* step_y);  // 7.11.3.3.
+  // If the method returns false, the caller only uses the output parameters
+  // *ref_block_start_x and *ref_block_start_y. If the method returns true,
+  // the caller uses all three output parameters.
+  static bool GetReferenceBlockPosition(
+      int reference_frame_index, bool is_scaled, int width, int height,
+      int ref_start_x, int ref_last_x, int ref_start_y, int ref_last_y,
+      int start_x, int start_y, int step_x, int step_y, int left_border,
+      int right_border, int top_border, int bottom_border,
+      int* ref_block_start_x, int* ref_block_start_y, int* ref_block_end_x);
+
+  template <typename Pixel>
+  void BuildConvolveBlock(Plane plane, int reference_frame_index,
+                          bool is_scaled, int height, int ref_start_x,
+                          int ref_last_x, int ref_start_y, int ref_last_y,
+                          int step_y, int ref_block_start_x,
+                          int ref_block_end_x, int ref_block_start_y,
+                          uint8_t* block_buffer,
+                          ptrdiff_t convolve_buffer_stride,
+                          ptrdiff_t block_extended_width);
+  bool BlockInterPrediction(const Block& block, Plane plane,
+                            int reference_frame_index, const MotionVector& mv,
+                            int x, int y, int width, int height,
+                            int candidate_row, int candidate_column,
+                            uint16_t* prediction, bool is_compound,
+                            bool is_inter_intra, uint8_t* dest,
+                            ptrdiff_t dest_stride);  // 7.11.3.4.
+  bool BlockWarpProcess(const Block& block, Plane plane, int index,
+                        int block_start_x, int block_start_y, int width,
+                        int height, GlobalMotion* warp_params, bool is_compound,
+                        bool is_inter_intra, uint8_t* dest,
+                        ptrdiff_t dest_stride);  // 7.11.3.5.
+  bool ObmcBlockPrediction(const Block& block, const MotionVector& mv,
+                           Plane plane, int reference_frame_index, int width,
+                           int height, int x, int y, int candidate_row,
+                           int candidate_column,
+                           ObmcDirection blending_direction);
+  bool ObmcPrediction(const Block& block, Plane plane, int width,
+                      int height);  // 7.11.3.9.
+  void DistanceWeightedPrediction(void* prediction_0, void* prediction_1,
+                                  int width, int height, int candidate_row,
+                                  int candidate_column, uint8_t* dest,
+                                  ptrdiff_t dest_stride);  // 7.11.3.15.
+  // This function specializes the parsing of DC coefficient by removing some
+  // of the branches when i == 0 (since scan[0] is always 0 and scan[i] is
+  // always non-zero for all other possible values of i). |dc_category| is an
+  // output parameter that is populated when |is_dc_coefficient| is true.
+  // |coefficient_level| is an output parameter which accumulates the
+  // coefficient level.
+  template <typename ResidualType, bool is_dc_coefficient>
+  LIBGAV1_ALWAYS_INLINE bool ReadSignAndApplyDequantization(
+      const uint16_t* scan, int i, int q_value, const uint8_t* quantizer_matrix,
+      int shift, int max_value, uint16_t* dc_sign_cdf, int8_t* dc_category,
+      int* coefficient_level,
+      ResidualType* residual_buffer);  // Part of 5.11.39.
+  int ReadCoeffBaseRange(uint16_t* cdf);  // Part of 5.11.39.
+  // Returns the number of non-zero coefficients that were read. |tx_type| is
+  // an output parameter that stores the computed transform type for the plane
+  // whose coefficients were read. Returns -1 on failure.
+  template <typename ResidualType>
+  int ReadTransformCoefficients(const Block& block, Plane plane, int start_x,
+                                int start_y, TransformSize tx_size,
+                                TransformType* tx_type);  // 5.11.39.
+  bool TransformBlock(const Block& block, Plane plane, int base_x, int base_y,
+                      TransformSize tx_size, int x, int y,
+                      ProcessingMode mode);  // 5.11.35.
+  // Iterative implementation of 5.11.36.
+  bool TransformTree(const Block& block, int start_x, int start_y,
+                     BlockSize plane_size, ProcessingMode mode);
+  void ReconstructBlock(const Block& block, Plane plane, int start_x,
+                        int start_y, TransformSize tx_size,
+                        TransformType tx_type,
+                        int non_zero_coeff_count);  // Part of 7.12.3.
+  bool Residual(const Block& block, ProcessingMode mode);  // 5.11.34.
+  // Part of 5.11.5 (reset_block_context() in the spec).
+  void ResetEntropyContext(const Block& block);
+  // Populates the |color_context| and |color_order| for the |i|th iteration
+  // with entries counting down from |start| to |end| (|start| > |end|).
+  void PopulatePaletteColorContexts(
+      const Block& block, PlaneType plane_type, int i, int start, int end,
+      uint8_t color_order[kMaxPaletteSquare][kMaxPaletteSize],
+      uint8_t color_context[kMaxPaletteSquare]);  // 5.11.50.
+  bool ReadPaletteTokens(const Block& block);  // 5.11.49.
+  template <typename Pixel>
+  void IntraPrediction(const Block& block, Plane plane, int x, int y,
+                       bool has_left, bool has_top, bool has_top_right,
+                       bool has_bottom_left, PredictionMode mode,
+                       TransformSize tx_size);
+  int GetIntraEdgeFilterType(const Block& block,
+                             Plane plane) const;  // 7.11.2.8.
+  template <typename Pixel>
+  void DirectionalPrediction(const Block& block, Plane plane, int x, int y,
+                             bool has_left, bool has_top, bool needs_left,
+                             bool needs_top, int prediction_angle, int width,
+                             int height, int max_x, int max_y,
+                             TransformSize tx_size, Pixel* top_row,
+                             Pixel* left_column);  // 7.11.2.4.
+  template <typename Pixel>
+  void PalettePrediction(const Block& block, Plane plane, int start_x,
+                         int start_y, int x, int y,
+                         TransformSize tx_size);  // 7.11.4.
+  template <typename Pixel>
+  void ChromaFromLumaPrediction(const Block& block, Plane plane, int start_x,
+                                int start_y,
+                                TransformSize tx_size);  // 7.11.5.
+  // Section 7.19. Applies some filtering and reordering to the motion vectors
+  // for the given |block| and stores them into |current_frame_|.
+  void StoreMotionFieldMvsIntoCurrentFrame(const Block& block);
+
+  // SetCdfContext*() functions will populate the |left_context_| and
+  // |top_context_| for the |block|.
+  void SetCdfContextUsePredictedSegmentId(const Block& block,
+                                          bool use_predicted_segment_id);
+  void SetCdfContextCompoundType(const Block& block,
+                                 bool is_explicit_compound_type,
+                                 bool is_compound_type_average);
+  void SetCdfContextSkipMode(const Block& block, bool skip_mode);
+  void SetCdfContextPaletteSize(const Block& block);
+  void SetCdfContextUVMode(const Block& block);
+
+  // Returns the zero-based index of the super block that contains |row4x4|
+  // relative to the start of this tile.
+  int SuperBlockRowIndex(int row4x4) const {
+    return (row4x4 - row4x4_start_) >>
+           (sequence_header_.use_128x128_superblock ? 5 : 4);
+  }
+
+  // Returns the zero-based index of the super block that contains |column4x4|
+  // relative to the start of this tile.
+  int SuperBlockColumnIndex(int column4x4) const {
+    return (column4x4 - column4x4_start_) >>
+           (sequence_header_.use_128x128_superblock ? 5 : 4);
+  }
+
+  // Returns the zero-based index of the block that starts at row4x4 or
+  // column4x4 relative to the start of the superblock that contains the
+  // block (equivalently, row_or_column4x4 & 31 for 128x128 superblocks and
+  // row_or_column4x4 & 15 for 64x64 superblocks). This is used to index into
+  // the members of |left_context_| and |top_context_|.
+  int CdfContextIndex(int row_or_column4x4) const {
+    return row_or_column4x4 -
+           (row_or_column4x4 &
+            (sequence_header_.use_128x128_superblock ? ~31 : ~15));
+  }
+
+  BlockSize SuperBlockSize() const {
+    return sequence_header_.use_128x128_superblock ? kBlock128x128
+                                                   : kBlock64x64;
+  }
+  int PlaneCount() const {
+    return sequence_header_.color_config.is_monochrome ? kMaxPlanesMonochrome
+                                                       : kMaxPlanes;
+  }
+
+  const int number_;
+  const int row_;
+  const int column_;
+  const uint8_t* const data_;
+  size_t size_;
+  int row4x4_start_;
+  int row4x4_end_;
+  int column4x4_start_;
+  int column4x4_end_;
+  int superblock_rows_;
+  int superblock_columns_;
+  bool read_deltas_;
+  const int8_t subsampling_x_[kMaxPlanes];
+  const int8_t subsampling_y_[kMaxPlanes];
+
+  // The dimensions (in order) are: segment_id, level_index (based on plane
+  // and direction), reference_frame and mode_id.
+  uint8_t deblock_filter_levels_[kMaxSegments][kFrameLfCount]
+                                [kNumReferenceFrameTypes][2];
+
+  // current_quantizer_index_ is in the range [0, 255].
+  uint8_t current_quantizer_index_;
+  // These two arrays (|coefficient_levels_| and |dc_categories_|) are used to
+  // store the entropy context. Their dimensions are as follows: First -
+  // left/top; Second - plane; Third - row4x4 (if first dimension is
+  // left)/column4x4 (if first dimension is top).
+  //
+  // This is equivalent to the LeftLevelContext and AboveLevelContext arrays
+  // in the spec. In the spec, it stores values from 0 through 63 (inclusive).
+  // The stored values are used to compute the left and top contexts in
+  // GetTransformAllZeroContext. In that function, we only care about the
+  // following values: 0, 1, 2, 3 and >= 4. So instead of clamping to 63, we
+  // clamp to 4, i.e. all the values greater than 4 are stored as 4.
+  std::array<Array2D<uint8_t>, 2> coefficient_levels_;
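+
+  // For example, a left/top coefficient level that the spec would store as 63
+  // is stored here as 4. GetTransformAllZeroContext only distinguishes the
+  // values 0, 1, 2, 3 and ">= 4", so the clamped representation loses no
+  // information.
+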
+  // This is equivalent to the LeftDcContext and AboveDcContext arrays in the
+  // spec. In the spec, it can store 3 possible values: 0, 1 and 2 (where 1
+  // means the value is < 0, 2 means the value is > 0 and 0 means the value is
+  // equal to 0).
+  //
+  // The stored values are used in two places:
+  //  * GetTransformAllZeroContext: Here, we only care about whether the
+  //    value is 0 or not (whether it is 1 or 2 is irrelevant).
+  //  * GetDcSignContext: Here, we do the following computation: if the
+  //    stored value is 1, we decrement a counter. If the stored value is 2
+  //    we increment a counter.
+  //
+  // Based on this usage, we can simply replace 1 with -1 and 2 with 1 and
+  // use that value to compute the counter.
+  //
+  // The usage in GetTransformAllZeroContext is unaffected since there we
+  // only care about whether it is 0 or not.
+  std::array<Array2D<int8_t>, 2> dc_categories_;
+  const ObuSequenceHeader& sequence_header_;
+  const ObuFrameHeader& frame_header_;
+  const std::array<bool, kNumReferenceFrameTypes>& reference_frame_sign_bias_;
+  const std::array<RefCountedBufferPtr, kNumReferenceFrameTypes>&
+      reference_frames_;
+  TemporalMotionField& motion_field_;
+  const std::array<uint8_t, kNumReferenceFrameTypes>& reference_order_hint_;
+  const WedgeMaskArray& wedge_masks_;
+  const QuantizerMatrix& quantizer_matrix_;
+  EntropyDecoder reader_;
+  SymbolDecoderContext symbol_decoder_context_;
+  SymbolDecoderContext* const saved_symbol_decoder_context_;
+  const SegmentationMap* prev_segment_ids_;
+  const dsp::Dsp& dsp_;
+  PostFilter& post_filter_;
+  BlockParametersHolder& block_parameters_holder_;
+  Quantizer quantizer_;
+  // When there is no multi-threading within the Tile, |residual_buffer_| is
+  // used. When there is multi-threading within the Tile,
+  // |residual_buffer_threaded_| is used. In the following comment,
+  // |residual_buffer| refers to either |residual_buffer_| or
+  // |residual_buffer_threaded_| depending on whether multi-threading is
+  // enabled within the Tile or not.
+  // The |residual_buffer| is used to help with the dequantization and the
+  // inverse transform processes. It is declared as a uint8_t, but is always
+  // accessed either as an int16_t or int32_t depending on |bitdepth|. Here is
+  // what it stores at various stages of the decoding process (in the order
+  // in which they happen):
+  //   1) In ReadTransformCoefficients(), this buffer is used to store the
+  //      dequantized values.
+  //   2) In Reconstruct(), this buffer is used as the input to the row
+  //      transform process.
+  // The size of this buffer is:
+  //   For |residual_buffer_|: (4096 + 32 * |kResidualPaddingVertical|) *
+  //       |residual_size_|. Where 4096 = 64x64 is the maximum transform
+  //       size, and 32 * |kResidualPaddingVertical| is the padding to avoid
+  //       bottom boundary checks when parsing quantized coefficients. This
+  //       memory is allocated and owned by the Tile class.
+  //   For |residual_buffer_threaded_|: See the comment below. This memory is
+  //       not allocated or owned by the Tile class.
+  AlignedUniquePtr<uint8_t> residual_buffer_;
+  // This is a 2d array of pointers of size |superblock_rows_| by
+  // |superblock_columns_| where each pointer points to a ResidualBuffer for a
+  // single super block. The array is populated when the parsing process
+  // begins by calling |residual_buffer_pool_->Get()| and the memory is
+  // released back to the pool by calling |residual_buffer_pool_->Release()|
+  // when the decoding process is complete.
+  Array2D<std::unique_ptr<ResidualBuffer>> residual_buffer_threaded_;
+  // sizeof(int16_t or int32_t) depending on |bitdepth|.
+  const size_t residual_size_;
+  // Number of superblocks on the top-right that will have to be decoded
+  // before the current superblock can be decoded. This will be 1 if
+  // allow_intrabc is false. If allow_intrabc is true, then this value will be
+  // use_128x128_superblock ? 3 : 5. This is the allowed range of reference
+  // for the top rows for intrabc.
+  const int intra_block_copy_lag_;
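+
+  // For example, with 64x64 superblocks the lag is 256 / 64 + 1 = 5 and with
+  // 128x128 superblocks it is 256 / 128 + 1 = 3: the allowed reference area
+  // for intra block copy trails the superblock being decoded by 256 luma
+  // samples in the rows above, plus the superblock currently in flight.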
+
+  // In the Tile class, we use the "current_frame" in two ways:
+  //   1) To write the decoded output into (using the |buffer_| view).
+  //   2) To read the pixels for intra block copy (using the |current_frame_|
+  //      reference).
+  //
+  // When intra block copy is off, |buffer_| and |current_frame_| may or may
+  // not point to the same plane pointers. But that is okay since
+  // |current_frame_| is never used in this case.
+  //
+  // When intra block copy is on, |buffer_| and |current_frame_| always point
+  // to the same plane pointers (since post filtering is disabled). So the
+  // usage in both case 1 and case 2 remains valid.
+  Array2DView<uint8_t> buffer_[kMaxPlanes];
+  RefCountedBuffer& current_frame_;
+
+  Array2D<int8_t>& cdef_index_;
+  Array2D<uint8_t>& cdef_skip_;
+  Array2D<TransformSize>& inter_transform_sizes_;
+  std::array reference_unit_info_;
+  // If |thread_pool_| is nullptr, the calling thread will do the parsing and
+  // the decoding in one pass. If |thread_pool_| is not nullptr, then the main
+  // thread will do the parsing while the thread pool workers will do the
+  // decoding.
+  ThreadPool* const thread_pool_;
+  ThreadingParameters threading_;
+  ResidualBufferPool* const residual_buffer_pool_;
+  TileScratchBufferPool* const tile_scratch_buffer_pool_;
+  BlockingCounterWithStatus* const pending_tiles_;
+  bool split_parse_and_decode_;
+  // This is used only when |split_parse_and_decode_| is false.
+  std::unique_ptr<PredictionParameters> prediction_parameters_ = nullptr;
+  // Stores the |transform_type| for the super block being decoded at a 4x4
+  // granularity. The spec uses absolute indices for this array but it is
+  // sufficient to use indices relative to the super block being decoded.
+  TransformType transform_types_[32][32];
+  // delta_lf_[i] is in the range [-63, 63].
+  int8_t delta_lf_[kFrameLfCount];
+  // True if all the values in |delta_lf_| are zero. False otherwise.
+  bool delta_lf_all_zero_;
+  const bool frame_parallel_;
+  const bool use_intra_prediction_buffer_;
+  // Buffer used to store the unfiltered pixels that are necessary for
+  // decoding the next superblock row (for the intra prediction process). Used
+  // only if |use_intra_prediction_buffer_| is true. The
+  // |frame_scratch_buffer| contains one row buffer for each tile row. This
+  // tile will have to use the buffer corresponding to this tile's row.
+  IntraPredictionBuffer* const intra_prediction_buffer_;
+  // Stores the progress of the reference frames. This will be used to avoid
+  // unnecessary calls into RefCountedBuffer::WaitUntil().
+  std::array<int, kNumReferenceFrameTypes> reference_frame_progress_cache_;
+  // Stores the CDF contexts necessary for the "left" block.
+  BlockCdfContext left_context_;
+  // Stores the CDF contexts necessary for the "top" block. The size of this
+  // buffer is the number of superblock columns in this tile. For each block,
+  // the access index will be the corresponding SuperBlockColumnIndex()'th
+  // entry.
+  DynamicBuffer<BlockCdfContext> top_context_;
+};
+
+struct Tile::Block {
+  Block(Tile* tile_ptr, BlockSize size, int row4x4, int column4x4,
+        TileScratchBuffer* const scratch_buffer, ResidualPtr* residual)
+      : tile(*tile_ptr),
+        size(size),
+        row4x4(row4x4),
+        column4x4(column4x4),
+        width(kBlockWidthPixels[size]),
+        height(kBlockHeightPixels[size]),
+        width4x4(width >> 2),
+        height4x4(height >> 2),
+        scratch_buffer(scratch_buffer),
+        residual(residual),
+        top_context(tile.top_context_.get() +
+                    tile.SuperBlockColumnIndex(column4x4)),
+        top_context_index(tile.CdfContextIndex(column4x4)),
+        left_context_index(tile.CdfContextIndex(row4x4)) {
+    assert(size != kBlockInvalid);
+    residual_size[kPlaneY] = kPlaneResidualSize[size][0][0];
+    residual_size[kPlaneU] = residual_size[kPlaneV] =
+        kPlaneResidualSize[size][tile.subsampling_x_[kPlaneU]]
+                          [tile.subsampling_y_[kPlaneU]];
+    assert(residual_size[kPlaneY] != kBlockInvalid);
+    if (tile.PlaneCount() > 1) {
+      assert(residual_size[kPlaneU] != kBlockInvalid);
+    }
+    if ((row4x4 & 1) == 0 &&
+        (tile.sequence_header_.color_config.subsampling_y & height4x4) == 1) {
+      has_chroma = false;
+    } else if ((column4x4 & 1) == 0 &&
+               (tile.sequence_header_.color_config.subsampling_x & width4x4) ==
+                   1) {
+      has_chroma = false;
+    } else {
+      has_chroma = !tile.sequence_header_.color_config.is_monochrome;
+    }
+    top_available[kPlaneY] = tile.IsTopInside(row4x4);
+    left_available[kPlaneY] = tile.IsLeftInside(column4x4);
+    if (has_chroma) {
+      // top_available[kPlaneU] and top_available[kPlaneV] are valid only if
+      // has_chroma is true.
+      // The next 3 lines are equivalent to:
+      //   top_available[kPlaneU] = top_available[kPlaneV] =
+      //       top_available[kPlaneY] &&
+      //       ((tile.sequence_header_.color_config.subsampling_y & height4x4)
+      //        == 0 || tile.IsTopInside(row4x4 - 1));
+      top_available[kPlaneU] = top_available[kPlaneV] = tile.IsTopInside(
+          row4x4 -
+          (tile.sequence_header_.color_config.subsampling_y & height4x4));
+      // left_available[kPlaneU] and left_available[kPlaneV] are valid only if
+      // has_chroma is true.
+      // The next 3 lines are equivalent to:
+      //   left_available[kPlaneU] = left_available[kPlaneV] =
+      //       left_available[kPlaneY] &&
+      //       ((tile.sequence_header_.color_config.subsampling_x & width4x4)
+      //        == 0 || tile.IsLeftInside(column4x4 - 1));
+      left_available[kPlaneU] = left_available[kPlaneV] = tile.IsLeftInside(
+          column4x4 -
+          (tile.sequence_header_.color_config.subsampling_x & width4x4));
+    }
+    const ptrdiff_t stride = tile.BlockParametersStride();
+    BlockParameters** const bps =
+        tile.BlockParametersAddress(row4x4, column4x4);
+    bp = *bps;
+    // bp_top is valid only if top_available[kPlaneY] is true.
+    if (top_available[kPlaneY]) {
+      bp_top = *(bps - stride);
+    }
+    // bp_left is valid only if left_available[kPlaneY] is true.
+    if (left_available[kPlaneY]) {
+      bp_left = *(bps - 1);
+    }
+  }
+
+  bool HasChroma() const { return has_chroma; }
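+
+  // For example, in a 4:2:0 stream (subsampling_x == subsampling_y == 1) a
+  // 4x4 block at an even row4x4 has has_chroma == false: its chroma samples
+  // are coded together with those of its neighbors, so only the block
+  // covering the bottom-right corner of each 2x2 group of 4x4 luma blocks
+  // carries the chroma information.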
+
+  // The return values of this group of functions are valid only if the
+  // corresponding top_available or left_available is true.
+  ReferenceFrameType TopReference(int index) const {
+    return bp_top->reference_frame[index];
+  }
+
+  ReferenceFrameType LeftReference(int index) const {
+    return bp_left->reference_frame[index];
+  }
+
+  bool IsTopIntra() const { return TopReference(0) <= kReferenceFrameIntra; }
+  bool IsLeftIntra() const { return LeftReference(0) <= kReferenceFrameIntra; }
+
+  bool IsTopSingle() const { return TopReference(1) <= kReferenceFrameIntra; }
+  bool IsLeftSingle() const { return LeftReference(1) <= kReferenceFrameIntra; }
+
+  int CountReferences(ReferenceFrameType type) const {
+    return static_cast<int>(top_available[kPlaneY] &&
+                            bp_top->reference_frame[0] == type) +
+           static_cast<int>(top_available[kPlaneY] &&
+                            bp_top->reference_frame[1] == type) +
+           static_cast<int>(left_available[kPlaneY] &&
+                            bp_left->reference_frame[0] == type) +
+           static_cast<int>(left_available[kPlaneY] &&
+                            bp_left->reference_frame[1] == type);
+  }
+
+  // 7.10.3.
+  // Checks if there are any inter blocks to the left or above. If so, it
+  // returns true indicating that the block has neighbors that are suitable
+  // for use by overlapped motion compensation.
+  bool HasOverlappableCandidates() const {
+    const ptrdiff_t stride = tile.BlockParametersStride();
+    BlockParameters** const bps = tile.BlockParametersAddress(0, 0);
+    if (top_available[kPlaneY]) {
+      BlockParameters** bps_top = bps + (row4x4 - 1) * stride + (column4x4 | 1);
+      const int columns = std::min(tile.frame_header_.columns4x4 - column4x4,
+                                   static_cast<int>(width4x4));
+      BlockParameters** const bps_top_end = bps_top + columns;
+      do {
+        if ((*bps_top)->reference_frame[0] > kReferenceFrameIntra) {
+          return true;
+        }
+        bps_top += 2;
+      } while (bps_top < bps_top_end);
+    }
+    if (left_available[kPlaneY]) {
+      BlockParameters** bps_left = bps + (row4x4 | 1) * stride + column4x4 - 1;
+      const int rows = std::min(tile.frame_header_.rows4x4 - row4x4,
+                                static_cast<int>(height4x4));
+      BlockParameters** const bps_left_end = bps_left + rows * stride;
+      do {
+        if ((*bps_left)->reference_frame[0] > kReferenceFrameIntra) {
+          return true;
+        }
+        bps_left += 2 * stride;
+      } while (bps_left < bps_left_end);
+    }
+    return false;
+  }
+
+  Tile& tile;
+  bool has_chroma;
+  const BlockSize size;
+  bool top_available[kMaxPlanes];
+  bool left_available[kMaxPlanes];
+  BlockSize residual_size[kMaxPlanes];
+  const int row4x4;
+  const int column4x4;
+  const int width;
+  const int height;
+  const int width4x4;
+  const int height4x4;
+  const BlockParameters* bp_top;
+  const BlockParameters* bp_left;
+  BlockParameters* bp;
+  TileScratchBuffer* const scratch_buffer;
+  ResidualPtr* const residual;
+  BlockCdfContext* const top_context;
+  const int top_context_index;
+  const int left_context_index;
+};
+
+extern template bool
+Tile::ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
+    int row4x4, TileScratchBuffer* scratch_buffer);
+extern template bool
+Tile::ProcessSuperBlockRow<kProcessingModeParseAndDecode, true>(
+    int row4x4, TileScratchBuffer* scratch_buffer);
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_TILE_H_
diff --git a/src/tile/bitstream/mode_info.cc b/src/tile/bitstream/mode_info.cc
new file mode 100644
index 0000000..cb7b311
--- /dev/null
+++ b/src/tile/bitstream/mode_info.cc
@@ -0,0 +1,1435 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <vector>
+
+#include "src/buffer_pool.h"
+#include "src/dsp/constants.h"
+#include "src/motion_vector.h"
+#include "src/obu_parser.h"
+#include "src/prediction_mask.h"
+#include "src/symbol_decoder_context.h"
+#include "src/tile.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/entropy_decoder.h"
+#include "src/utils/logging.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/segmentation_map.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int kDeltaQSmall = 3;
+constexpr int kDeltaLfSmall = 3;
+
+constexpr uint8_t kIntraYModeContext[kIntraPredictionModesY] = {
+    0, 1, 2, 3, 4, 4, 4, 4, 3, 0, 1, 2, 0};
+
+constexpr uint8_t kSizeGroup[kMaxBlockSizes] = {
+    0, 0, 0, 0, 1, 1, 1, 0, 1, 2, 2, 2, 1, 2, 3, 3, 2, 3, 3, 3, 3, 3};
+
+constexpr int kCompoundModeNewMvContexts = 5;
+constexpr uint8_t kCompoundModeContextMap[3][kCompoundModeNewMvContexts] = {
+    {0, 1, 1, 1, 1}, {1, 2, 3, 4, 4}, {4, 4, 5, 6, 7}};
+
+enum CflSign : uint8_t {
+  kCflSignZero = 0,
+  kCflSignNegative = 1,
+  kCflSignPositive = 2
+};
+
+// For each possible value of the combined signs (which is read from the
+// bitstream), this array stores the following: sign_u, sign_v,
+// alpha_u_context, alpha_v_context. Only positive entries are used. Entry at
+// index i is computed as follows:
+//   sign_u = (i + 1) / 3
+//   sign_v = (i + 1) % 3
+//   alpha_u_context = i - 2
+//   alpha_v_context = (sign_v - 1) * 3 + sign_u
+constexpr int8_t kCflAlphaLookup[kCflAlphaSignsSymbolCount][4] = {
+    {0, 1, -2, 0}, {0, 2, -1, 3}, {1, 0, 0, -2}, {1, 1, 1, 1},
+    {1, 2, 2, 4},  {2, 0, 3, -1}, {2, 1, 4, 2},  {2, 2, 5, 5},
+};
+
+constexpr BitMaskSet kPredictionModeHasNearMvMask(kPredictionModeNearMv,
+                                                  kPredictionModeNearNearMv,
+                                                  kPredictionModeNearNewMv,
+                                                  kPredictionModeNewNearMv);
+
+constexpr BitMaskSet kIsInterIntraModeAllowedMask(kBlock8x8, kBlock8x16,
+                                                  kBlock16x8, kBlock16x16,
+                                                  kBlock16x32, kBlock32x16,
+                                                  kBlock32x32);
+
+bool IsBackwardReference(ReferenceFrameType type) {
+  return type >= kReferenceFrameBackward && type <= kReferenceFrameAlternate;
+}
+
+bool IsSameDirectionReferencePair(ReferenceFrameType type1,
+                                  ReferenceFrameType type2) {
+  return (type1 >= kReferenceFrameBackward) ==
+         (type2 >= kReferenceFrameBackward);
+}
+
+// This is called neg_deinterleave() in the spec.
+int DecodeSegmentId(int diff, int reference, int max) {
+  if (reference == 0) return diff;
+  if (reference >= max - 1) return max - diff - 1;
+  const int value = ((diff & 1) != 0) ? reference + ((diff + 1) >> 1)
+                                      : reference - (diff >> 1);
+  const int reference2 = (reference << 1);
+  if (reference2 < max) {
+    return (diff <= reference2) ? value : diff;
+  }
+  return (diff <= ((max - reference - 1) << 1)) ? value : max - (diff + 1);
+}
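+
+// For example, with reference == 2 and max == 8, diff values 0, 1, 2, 3, 4
+// decode to segment ids 2, 3, 1, 4, 0 respectively, and diff values 5, 6, 7
+// decode to themselves: small values of |diff| land close to the predicted
+// segment id.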
+
+// This is called DrlCtxStack in section 7.10.2.14 of the spec.
+// In the spec, the weights of all the nearest mvs are incremented by a bonus
+// weight which is larger than any natural weight, and the weights of the mvs
+// are compared with this bonus weight to determine their contexts. We replace
+// this procedure by introducing |nearest_mv_count| in PredictionParameters,
+// which records the count of the nearest mvs. Since all the nearest mvs are
+// in the beginning of the mv stack, the |index| of a mv in the mv stack can
+// be compared with |nearest_mv_count| to get that mv's context.
+int GetRefMvIndexContext(int nearest_mv_count, int index) {
+  if (index + 1 < nearest_mv_count) {
+    return 0;
+  }
+  if (index + 1 == nearest_mv_count) {
+    return 1;
+  }
+  return 2;
+}
+
+// Returns true if both the width and height of the block are less than 64.
+bool IsBlockDimensionLessThan64(BlockSize size) {
+  return size <= kBlock32x32 && size != kBlock16x64;
+}
+
+int GetUseCompoundReferenceContext(const Tile::Block& block) {
+  if (block.top_available[kPlaneY] && block.left_available[kPlaneY]) {
+    if (block.IsTopSingle() && block.IsLeftSingle()) {
+      return static_cast<int>(IsBackwardReference(block.TopReference(0))) ^
+             static_cast<int>(IsBackwardReference(block.LeftReference(0)));
+    }
+    if (block.IsTopSingle()) {
+      return 2 + static_cast<int>(IsBackwardReference(block.TopReference(0)) ||
+                                  block.IsTopIntra());
+    }
+    if (block.IsLeftSingle()) {
+      return 2 + static_cast<int>(IsBackwardReference(block.LeftReference(0)) ||
+                                  block.IsLeftIntra());
+    }
+    return 4;
+  }
+  if (block.top_available[kPlaneY]) {
+    return block.IsTopSingle()
+               ? static_cast<int>(IsBackwardReference(block.TopReference(0)))
+               : 3;
+  }
+  if (block.left_available[kPlaneY]) {
+    return block.IsLeftSingle()
+               ? static_cast<int>(IsBackwardReference(block.LeftReference(0)))
+               : 3;
+  }
+  return 1;
+}
+
+// Calculates count0 by calling block.CountReferences() on the frame types
+// from type0_start to type0_end, inclusive, and summing the results.
+// Calculates count1 by calling block.CountReferences() on the frame types
+// from type1_start to type1_end, inclusive, and summing the results.
+// Compares count0 with count1 and returns 0, 1 or 2.
+//
+// See count_refs and ref_count_ctx in 8.3.2.
+int GetReferenceContext(const Tile::Block& block,
+                        ReferenceFrameType type0_start,
+                        ReferenceFrameType type0_end,
+                        ReferenceFrameType type1_start,
+                        ReferenceFrameType type1_end) {
+  int count0 = 0;
+  int count1 = 0;
+  for (int type = type0_start; type <= type0_end; ++type) {
+    count0 += block.CountReferences(static_cast<ReferenceFrameType>(type));
+  }
+  for (int type = type1_start; type <= type1_end; ++type) {
+    count1 += block.CountReferences(static_cast<ReferenceFrameType>(type));
+  }
+  return (count0 < count1) ? 0 : (count0 == count1 ? 1 : 2);
+}
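+
+// For example, the call GetReferenceContext(block, kReferenceFrameLast,
+// kReferenceFrameGolden, kReferenceFrameBackward, kReferenceFrameAlternate)
+// in GetReferenceCdf() compares how many forward references (LAST..GOLDEN)
+// the top and left neighbors use against how many backward references
+// (BWDREF..ALTREF) they use, and yields context 0, 1 or 2 accordingly.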
+
+}  // namespace
+
+bool Tile::ReadSegmentId(const Block& block) {
+  // These two asserts ensure that current_frame_.segmentation_map() is not
+  // nullptr.
+  assert(frame_header_.segmentation.enabled);
+  assert(frame_header_.segmentation.update_map);
+  const SegmentationMap& map = *current_frame_.segmentation_map();
+  int top_left = -1;
+  if (block.top_available[kPlaneY] && block.left_available[kPlaneY]) {
+    top_left = map.segment_id(block.row4x4 - 1, block.column4x4 - 1);
+  }
+  int top = -1;
+  if (block.top_available[kPlaneY]) {
+    top = map.segment_id(block.row4x4 - 1, block.column4x4);
+  }
+  int left = -1;
+  if (block.left_available[kPlaneY]) {
+    left = map.segment_id(block.row4x4, block.column4x4 - 1);
+  }
+  int pred;
+  if (top == -1) {
+    pred = (left == -1) ? 0 : left;
+  } else if (left == -1) {
+    pred = top;
+  } else {
+    pred = (top_left == top) ? top : left;
+  }
+  BlockParameters& bp = *block.bp;
+  if (bp.skip) {
+    bp.prediction_parameters->segment_id = pred;
+    return true;
+  }
+  int context = 0;
+  if (top_left < 0) {
+    context = 0;
+  } else if (top_left == top && top_left == left) {
+    context = 2;
+  } else if (top_left == top || top_left == left || top == left) {
+    context = 1;
+  }
+  uint16_t* const segment_id_cdf =
+      symbol_decoder_context_.segment_id_cdf[context];
+  const int encoded_segment_id =
+      reader_.ReadSymbol<kMaxSegments>(segment_id_cdf);
+  bp.prediction_parameters->segment_id =
+      DecodeSegmentId(encoded_segment_id, pred,
+                      frame_header_.segmentation.last_active_segment_id + 1);
+  // Check the bitstream conformance requirement in Section 6.10.8 of the
+  // spec.
+  if (bp.prediction_parameters->segment_id < 0 ||
+      bp.prediction_parameters->segment_id >
+          frame_header_.segmentation.last_active_segment_id) {
+    LIBGAV1_DLOG(
+        ERROR,
+        "Corrupted segment_ids: encoded %d, last active %d, postprocessed %d",
+        encoded_segment_id, frame_header_.segmentation.last_active_segment_id,
+        bp.prediction_parameters->segment_id);
+    return false;
+  }
+  return true;
+}
+
+bool Tile::ReadIntraSegmentId(const Block& block) {
+  BlockParameters& bp = *block.bp;
+  if (!frame_header_.segmentation.enabled) {
+    bp.prediction_parameters->segment_id = 0;
+    return true;
+  }
+  return ReadSegmentId(block);
+}
+
+void Tile::ReadSkip(const Block& block) {
+  BlockParameters& bp = *block.bp;
+  if (frame_header_.segmentation.segment_id_pre_skip &&
+      frame_header_.segmentation.FeatureActive(
+          bp.prediction_parameters->segment_id, kSegmentFeatureSkip)) {
+    bp.skip = true;
+    return;
+  }
+  int context = 0;
+  if (block.top_available[kPlaneY] && block.bp_top->skip) {
+    ++context;
+  }
+  if (block.left_available[kPlaneY] && block.bp_left->skip) {
+    ++context;
+  }
+  uint16_t* const skip_cdf = symbol_decoder_context_.skip_cdf[context];
+  bp.skip = reader_.ReadSymbol(skip_cdf);
+}
+
+bool Tile::ReadSkipMode(const Block& block) {
+  BlockParameters& bp = *block.bp;
+  if (!frame_header_.skip_mode_present ||
+      frame_header_.segmentation.FeatureActive(
+          bp.prediction_parameters->segment_id, kSegmentFeatureSkip) ||
+      frame_header_.segmentation.FeatureActive(
+          bp.prediction_parameters->segment_id,
+          kSegmentFeatureReferenceFrame) ||
+      frame_header_.segmentation.FeatureActive(
+          bp.prediction_parameters->segment_id, kSegmentFeatureGlobalMv) ||
+      IsBlockDimension4(block.size)) {
+    return false;
+  }
+  const int context =
+      (block.left_available[kPlaneY]
+           ? static_cast<int>(
+                 left_context_.skip_mode[block.left_context_index])
+           : 0) +
+      (block.top_available[kPlaneY]
+           ? static_cast<int>(
+                 block.top_context->skip_mode[block.top_context_index])
+           : 0);
+  return reader_.ReadSymbol(symbol_decoder_context_.skip_mode_cdf[context]);
+}
+
+void Tile::ReadCdef(const Block& block) {
+  BlockParameters& bp = *block.bp;
+  if (bp.skip || frame_header_.coded_lossless ||
+      !sequence_header_.enable_cdef || frame_header_.allow_intrabc ||
+      frame_header_.cdef.bits == 0) {
+    return;
+  }
+  int8_t* const cdef_index =
+      &cdef_index_[DivideBy16(block.row4x4)][DivideBy16(block.column4x4)];
+  const int stride = cdef_index_.columns();
+  if (cdef_index[0] == -1) {
+    cdef_index[0] =
+        static_cast<int8_t>(reader_.ReadLiteral(frame_header_.cdef.bits));
+    if (block.size == kBlock128x128) {
+      // This condition is shorthand for block.width4x4 > 16 &&
+      // block.height4x4 > 16.
+      cdef_index[1] = cdef_index[0];
+      cdef_index[stride] = cdef_index[0];
+      cdef_index[stride + 1] = cdef_index[0];
+    } else if (block.width4x4 > 16) {
+      cdef_index[1] = cdef_index[0];
+    } else if (block.height4x4 > 16) {
+      cdef_index[stride] = cdef_index[0];
+    }
+  }
+}
+
+int Tile::ReadAndClipDelta(uint16_t* const cdf, int delta_small, int scale,
+                           int min_value, int max_value, int value) {
+  int abs = reader_.ReadSymbol<kDeltaSymbolCount>(cdf);
+  if (abs == delta_small) {
+    const int remaining_bit_count =
+        static_cast<int>(reader_.ReadLiteral(3)) + 1;
+    const int abs_remaining_bits =
+        static_cast<int>(reader_.ReadLiteral(remaining_bit_count));
+    abs = abs_remaining_bits + (1 << remaining_bit_count) + 1;
+  }
+  if (abs != 0) {
+    const bool sign = reader_.ReadBit() != 0;
+    const int scaled_abs = abs << scale;
+    const int reduced_delta = sign ? -scaled_abs : scaled_abs;
+    value += reduced_delta;
+    value = Clip3(value, min_value, max_value);
+  }
+  return value;
+}
+
+void Tile::ReadQuantizerIndexDelta(const Block& block) {
+  assert(read_deltas_);
+  BlockParameters& bp = *block.bp;
+  if (block.size == SuperBlockSize() && bp.skip) {
+    return;
+  }
+  current_quantizer_index_ =
+      ReadAndClipDelta(symbol_decoder_context_.delta_q_cdf, kDeltaQSmall,
+                       frame_header_.delta_q.scale, kMinLossyQuantizer,
+                       kMaxQuantizer, current_quantizer_index_);
+}
+
+void Tile::ReadLoopFilterDelta(const Block& block) {
+  assert(read_deltas_);
+  BlockParameters& bp = *block.bp;
+  if (!frame_header_.delta_lf.present ||
+      (block.size == SuperBlockSize() && bp.skip)) {
+    return;
+  }
+  int frame_lf_count = 1;
+  if (frame_header_.delta_lf.multi) {
+    frame_lf_count = kFrameLfCount - (PlaneCount() > 1 ? 0 : 2);
+  }
+  bool recompute_deblock_filter_levels = false;
+  for (int i = 0; i < frame_lf_count; ++i) {
+    uint16_t* const delta_lf_abs_cdf =
+        frame_header_.delta_lf.multi
+            ? symbol_decoder_context_.delta_lf_multi_cdf[i]
+            : symbol_decoder_context_.delta_lf_cdf;
+    const int8_t old_delta_lf = delta_lf_[i];
+    delta_lf_[i] = ReadAndClipDelta(
+        delta_lf_abs_cdf, kDeltaLfSmall, frame_header_.delta_lf.scale,
+        -kMaxLoopFilterValue, kMaxLoopFilterValue, delta_lf_[i]);
+    recompute_deblock_filter_levels =
+        recompute_deblock_filter_levels || (old_delta_lf != delta_lf_[i]);
+  }
+  delta_lf_all_zero_ =
+      (delta_lf_[0] | delta_lf_[1] | delta_lf_[2] | delta_lf_[3]) == 0;
+  if (!delta_lf_all_zero_ && recompute_deblock_filter_levels) {
+    post_filter_.ComputeDeblockFilterLevels(delta_lf_, deblock_filter_levels_);
+  }
+}
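+
+// Worked example for the delta decoding above: when the coded symbol equals
+// |delta_small| (kDeltaQSmall or kDeltaLfSmall, both 3), the magnitude is
+// extended with a 3-bit length field plus that many extra bits, e.g.
+// remaining_bit_count == 2 with literal bits 01 gives
+// abs == 1 + (1 << 2) + 1 == 6. The signed, scaled result is then clipped,
+// so current_quantizer_index_ and delta_lf_[i] always stay in range.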
+
+void Tile::ReadPredictionModeY(const Block& block, bool intra_y_mode) {
+  uint16_t* cdf;
+  if (intra_y_mode) {
+    const PredictionMode top_mode =
+        block.top_available[kPlaneY] ? block.bp_top->y_mode : kPredictionModeDc;
+    const PredictionMode left_mode = block.left_available[kPlaneY]
+                                         ? block.bp_left->y_mode
+                                         : kPredictionModeDc;
+    const int top_context = kIntraYModeContext[top_mode];
+    const int left_context = kIntraYModeContext[left_mode];
+    cdf = symbol_decoder_context_
+              .intra_frame_y_mode_cdf[top_context][left_context];
+  } else {
+    cdf = symbol_decoder_context_.y_mode_cdf[kSizeGroup[block.size]];
+  }
+  block.bp->y_mode = static_cast<PredictionMode>(
+      reader_.ReadSymbol<kIntraPredictionModesY>(cdf));
+}
+
+void Tile::ReadIntraAngleInfo(const Block& block, PlaneType plane_type) {
+  BlockParameters& bp = *block.bp;
+  PredictionParameters& prediction_parameters =
+      *block.bp->prediction_parameters;
+  prediction_parameters.angle_delta[plane_type] = 0;
+  const PredictionMode mode = (plane_type == kPlaneTypeY)
+                                  ? bp.y_mode
+                                  : bp.prediction_parameters->uv_mode;
+  if (IsBlockSmallerThan8x8(block.size) || !IsDirectionalMode(mode)) return;
+  uint16_t* const cdf =
+      symbol_decoder_context_.angle_delta_cdf[mode - kPredictionModeVertical];
+  prediction_parameters.angle_delta[plane_type] =
+      reader_.ReadSymbol<kAngleDeltaSymbolCount>(cdf);
+  prediction_parameters.angle_delta[plane_type] -= kMaxAngleDelta;
+}
+
+void Tile::ReadCflAlpha(const Block& block) {
+  const int signs = reader_.ReadSymbol<kCflAlphaSignsSymbolCount>(
+      symbol_decoder_context_.cfl_alpha_signs_cdf);
+  const int8_t* const cfl_lookup = kCflAlphaLookup[signs];
+  const auto sign_u = static_cast<CflSign>(cfl_lookup[0]);
+  const auto sign_v = static_cast<CflSign>(cfl_lookup[1]);
+  PredictionParameters& prediction_parameters =
+      *block.bp->prediction_parameters;
+  prediction_parameters.cfl_alpha_u = 0;
+  if (sign_u != kCflSignZero) {
+    assert(cfl_lookup[2] >= 0);
+    prediction_parameters.cfl_alpha_u =
+        reader_.ReadSymbol<kCflAlphaSymbolCount>(
+            symbol_decoder_context_.cfl_alpha_cdf[cfl_lookup[2]]) +
+        1;
+    if (sign_u == kCflSignNegative) prediction_parameters.cfl_alpha_u *= -1;
+  }
+  prediction_parameters.cfl_alpha_v = 0;
+  if (sign_v != kCflSignZero) {
+    assert(cfl_lookup[3] >= 0);
+    prediction_parameters.cfl_alpha_v =
+        reader_.ReadSymbol<kCflAlphaSymbolCount>(
+            symbol_decoder_context_.cfl_alpha_cdf[cfl_lookup[3]]) +
+        1;
+    if (sign_v == kCflSignNegative) prediction_parameters.cfl_alpha_v *= -1;
+  }
+}
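+
+// Worked example: if the combined sign symbol is 3, kCflAlphaLookup[3] is
+// {kCflSignNegative, kCflSignNegative, 1, 1}, so both alpha magnitudes are
+// read with context 1 and negated; a coded magnitude symbol of 2 then yields
+// cfl_alpha_u == -(2 + 1) == -3.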
+
+void Tile::ReadPredictionModeUV(const Block& block) {
+  BlockParameters& bp = *block.bp;
+  bool chroma_from_luma_allowed;
+  if (frame_header_.segmentation
+          .lossless[bp.prediction_parameters->segment_id]) {
+    chroma_from_luma_allowed = block.residual_size[kPlaneU] == kBlock4x4;
+  } else {
+    chroma_from_luma_allowed = IsBlockDimensionLessThan64(block.size);
+  }
+  uint16_t* const cdf =
+      symbol_decoder_context_
+          .uv_mode_cdf[static_cast<int>(chroma_from_luma_allowed)][bp.y_mode];
+  if (chroma_from_luma_allowed) {
+    bp.prediction_parameters->uv_mode = static_cast<PredictionMode>(
+        reader_.ReadSymbol<kIntraPredictionModesUV>(cdf));
+  } else {
+    bp.prediction_parameters->uv_mode = static_cast<PredictionMode>(
+        reader_.ReadSymbol<kIntraPredictionModesUV - 1>(cdf));
+  }
+}
+
+int Tile::ReadMotionVectorComponent(const Block& block, const int component) {
+  const int context =
+      static_cast<int>(block.bp->prediction_parameters->use_intra_block_copy);
+  const bool sign = reader_.ReadSymbol(
+      symbol_decoder_context_.mv_sign_cdf[component][context]);
+  const int mv_class = reader_.ReadSymbol<kMvClassSymbolCount>(
+      symbol_decoder_context_.mv_class_cdf[component][context]);
+  int magnitude = 1;
+  int value;
+  uint16_t* fraction_cdf;
+  uint16_t* precision_cdf;
+  if (mv_class == 0) {
+    value = static_cast<int>(reader_.ReadSymbol(
+        symbol_decoder_context_.mv_class0_bit_cdf[component][context]));
+    fraction_cdf = symbol_decoder_context_
+                       .mv_class0_fraction_cdf[component][context][value];
+    precision_cdf = symbol_decoder_context_
+                        .mv_class0_high_precision_cdf[component][context];
+  } else {
+    assert(mv_class <= kMvBitSymbolCount);
+    value = 0;
+    for (int i = 0; i < mv_class; ++i) {
+      const int bit = static_cast<int>(reader_.ReadSymbol(
+          symbol_decoder_context_.mv_bit_cdf[component][context][i]));
+      value |= bit << i;
+    }
+    magnitude += 2 << (mv_class + 2);
+    fraction_cdf = symbol_decoder_context_.mv_fraction_cdf[component][context];
+    precision_cdf =
+        symbol_decoder_context_.mv_high_precision_cdf[component][context];
+  }
+  const int fraction =
+      (frame_header_.force_integer_mv == 0)
+          ? reader_.ReadSymbol<kMvFractionSymbolCount>(fraction_cdf)
+          : 3;
+  const int precision =
+      frame_header_.allow_high_precision_mv
+          ? static_cast<int>(reader_.ReadSymbol(precision_cdf))
+          : 1;
+  magnitude += (value << 3) | (fraction << 1) | precision;
+  return sign ? -magnitude : magnitude;
+}
+
+void Tile::ReadMotionVector(const Block& block, int index) {
+  BlockParameters& bp = *block.bp;
+  const int context =
+      static_cast<int>(block.bp->prediction_parameters->use_intra_block_copy);
+  const auto mv_joint =
+      static_cast<MvJointType>(reader_.ReadSymbol<kNumMvJointTypes>(
+          symbol_decoder_context_.mv_joint_cdf[context]));
+  if (mv_joint == kMvJointTypeHorizontalZeroVerticalNonZero ||
+      mv_joint == kMvJointTypeNonZero) {
+    bp.mv.mv[index].mv[0] = ReadMotionVectorComponent(block, 0);
+  }
+  if (mv_joint == kMvJointTypeHorizontalNonZeroVerticalZero ||
+      mv_joint == kMvJointTypeNonZero) {
+    bp.mv.mv[index].mv[1] = ReadMotionVectorComponent(block, 1);
+  }
+}
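+
+// Worked example: for mv_class == 0, class0_bit == 1, fraction == 3 and
+// precision == 1, the magnitude is 1 + ((1 << 3) | (3 << 1) | 1) == 16 in
+// 1/8-pel units, i.e. exactly 2 pixels; the separately coded sign then
+// selects the direction.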
+
+void Tile::ReadFilterIntraModeInfo(const Block& block) {
+  BlockParameters& bp = *block.bp;
+  PredictionParameters& prediction_parameters =
+      *block.bp->prediction_parameters;
+  prediction_parameters.use_filter_intra = false;
+  if (!sequence_header_.enable_filter_intra || bp.y_mode != kPredictionModeDc ||
+      bp.prediction_parameters->palette_mode_info.size[kPlaneTypeY] != 0 ||
+      !IsBlockDimensionLessThan64(block.size)) {
+    return;
+  }
+  prediction_parameters.use_filter_intra = reader_.ReadSymbol(
+      symbol_decoder_context_.use_filter_intra_cdf[block.size]);
+  if (prediction_parameters.use_filter_intra) {
+    prediction_parameters.filter_intra_mode =
+        static_cast<FilterIntraPredictor>(
+            reader_.ReadSymbol<kNumFilterIntraPredictors>(
+                symbol_decoder_context_.filter_intra_mode_cdf));
+  }
+}
+
+bool Tile::DecodeIntraModeInfo(const Block& block) {
+  BlockParameters& bp = *block.bp;
+  bp.skip = false;
+  if (frame_header_.segmentation.segment_id_pre_skip &&
+      !ReadIntraSegmentId(block)) {
+    return false;
+  }
+  SetCdfContextSkipMode(block, false);
+  ReadSkip(block);
+  if (!frame_header_.segmentation.segment_id_pre_skip &&
+      !ReadIntraSegmentId(block)) {
+    return false;
+  }
+  ReadCdef(block);
+  if (read_deltas_) {
+    ReadQuantizerIndexDelta(block);
+    ReadLoopFilterDelta(block);
+    read_deltas_ = false;
+  }
+  PredictionParameters& prediction_parameters =
+      *block.bp->prediction_parameters;
+  prediction_parameters.use_intra_block_copy = false;
+  if (frame_header_.allow_intrabc) {
+    prediction_parameters.use_intra_block_copy =
+        reader_.ReadSymbol(symbol_decoder_context_.intra_block_copy_cdf);
+  }
+  if (prediction_parameters.use_intra_block_copy) {
+    bp.is_inter = true;
+    bp.reference_frame[0] = kReferenceFrameIntra;
+    bp.reference_frame[1] = kReferenceFrameNone;
+    bp.y_mode = kPredictionModeDc;
+    bp.prediction_parameters->uv_mode = kPredictionModeDc;
+    SetCdfContextUVMode(block);
+    prediction_parameters.motion_mode = kMotionModeSimple;
+    prediction_parameters.compound_prediction_type =
+        kCompoundPredictionTypeAverage;
+    bp.prediction_parameters->palette_mode_info.size[kPlaneTypeY] = 0;
+    bp.prediction_parameters->palette_mode_info.size[kPlaneTypeUV] = 0;
+    SetCdfContextPaletteSize(block);
+    bp.interpolation_filter[0] = kInterpolationFilterBilinear;
+    bp.interpolation_filter[1] = kInterpolationFilterBilinear;
+    MvContexts dummy_mode_contexts;
+    FindMvStack(block, /*is_compound=*/false, &dummy_mode_contexts);
+    return AssignIntraMv(block);
+  }
+  bp.is_inter = false;
+  return ReadIntraBlockModeInfo(block, /*intra_y_mode=*/true);
+}
+
+int8_t Tile::ComputePredictedSegmentId(const Block& block) const {
+  // If prev_segment_ids_ is null, treat it as if it pointed to a segmentation
+  // map containing all 0s.
+  if (prev_segment_ids_ == nullptr) return 0;
+
+  const int x_limit = std::min(frame_header_.columns4x4 - block.column4x4,
+                               static_cast<int>(block.width4x4));
+  const int y_limit = std::min(frame_header_.rows4x4 - block.row4x4,
+                               static_cast<int>(block.height4x4));
+  int8_t id = 7;
+  for (int y = 0; y < y_limit; ++y) {
+    for (int x = 0; x < x_limit; ++x) {
+      const int8_t prev_segment_id =
+          prev_segment_ids_->segment_id(block.row4x4 + y, block.column4x4 + x);
+      id = std::min(id, prev_segment_id);
+    }
+  }
+  return id;
+}
+
+void Tile::SetCdfContextUsePredictedSegmentId(const Block& block,
+                                              bool use_predicted_segment_id) {
+  memset(left_context_.use_predicted_segment_id + block.left_context_index,
+         static_cast<int>(use_predicted_segment_id), block.height4x4);
+  memset(block.top_context->use_predicted_segment_id + block.top_context_index,
+         static_cast<int>(use_predicted_segment_id), block.width4x4);
+}
+
+bool Tile::ReadInterSegmentId(const Block& block, bool pre_skip) {
+  BlockParameters& bp = *block.bp;
+  if (!frame_header_.segmentation.enabled) {
+    bp.prediction_parameters->segment_id = 0;
+    return true;
+  }
+  if (!frame_header_.segmentation.update_map) {
+    bp.prediction_parameters->segment_id = ComputePredictedSegmentId(block);
+    return true;
+  }
+  if (pre_skip) {
+    if (!frame_header_.segmentation.segment_id_pre_skip) {
+      bp.prediction_parameters->segment_id = 0;
+      return true;
+    }
+  } else if (bp.skip) {
+    SetCdfContextUsePredictedSegmentId(block, false);
+    return ReadSegmentId(block);
+  }
+  if (frame_header_.segmentation.temporal_update) {
+    const int context =
+        (block.left_available[kPlaneY]
+             ? static_cast<int>(
+                   left_context_
+                       .use_predicted_segment_id[block.left_context_index])
+             : 0) +
+        (block.top_available[kPlaneY]
+             ? static_cast<int>(
+                   block.top_context
+                       ->use_predicted_segment_id[block.top_context_index])
+             : 0);
+    const bool use_predicted_segment_id = reader_.ReadSymbol(
+        symbol_decoder_context_.use_predicted_segment_id_cdf[context]);
+    SetCdfContextUsePredictedSegmentId(block, use_predicted_segment_id);
+    if (use_predicted_segment_id) {
+      bp.prediction_parameters->segment_id = ComputePredictedSegmentId(block);
+      return true;
+    }
+  }
+  return ReadSegmentId(block);
+}
+
+void Tile::ReadIsInter(const Block& block, bool skip_mode) {
+  BlockParameters& bp = *block.bp;
+  if (skip_mode) {
+    bp.is_inter = true;
+    return;
+  }
+  if (frame_header_.segmentation.FeatureActive(
+          bp.prediction_parameters->segment_id,
+          kSegmentFeatureReferenceFrame)) {
+    bp.is_inter = frame_header_.segmentation
+                      .feature_data[bp.prediction_parameters->segment_id]
+                                   [kSegmentFeatureReferenceFrame] !=
+                  kReferenceFrameIntra;
+    return;
+  }
+  if (frame_header_.segmentation.FeatureActive(
+          bp.prediction_parameters->segment_id, kSegmentFeatureGlobalMv)) {
+    bp.is_inter = true;
+    return;
+  }
+  int context = 0;
+  if (block.top_available[kPlaneY] && block.left_available[kPlaneY]) {
+    context = (block.IsTopIntra() && block.IsLeftIntra())
+                  ? 3
+                  : static_cast<int>(block.IsTopIntra() || block.IsLeftIntra());
+  } else if (block.top_available[kPlaneY] || block.left_available[kPlaneY]) {
+    context = 2 * static_cast<int>(block.top_available[kPlaneY]
+                                       ? block.IsTopIntra()
+                                       : block.IsLeftIntra());
block.IsTopIntra() + : block.IsLeftIntra()); + } + bp.is_inter = + reader_.ReadSymbol(symbol_decoder_context_.is_inter_cdf[context]); +} + +void Tile::SetCdfContextPaletteSize(const Block& block) { + const PaletteModeInfo& palette_mode_info = + block.bp->prediction_parameters->palette_mode_info; + for (int plane_type = kPlaneTypeY; plane_type <= kPlaneTypeUV; ++plane_type) { + memset(left_context_.palette_size[plane_type] + block.left_context_index, + palette_mode_info.size[plane_type], block.height4x4); + memset( + block.top_context->palette_size[plane_type] + block.top_context_index, + palette_mode_info.size[plane_type], block.width4x4); + if (palette_mode_info.size[plane_type] == 0) continue; + for (int i = block.left_context_index; + i < block.left_context_index + block.height4x4; ++i) { + memcpy(left_context_.palette_color[i][plane_type], + palette_mode_info.color[plane_type], + kMaxPaletteSize * sizeof(palette_mode_info.color[0][0])); + } + for (int i = block.top_context_index; + i < block.top_context_index + block.width4x4; ++i) { + memcpy(block.top_context->palette_color[i][plane_type], + palette_mode_info.color[plane_type], + kMaxPaletteSize * sizeof(palette_mode_info.color[0][0])); + } + } +} + +void Tile::SetCdfContextUVMode(const Block& block) { + // BlockCdfContext.uv_mode is only used to compute is_smooth_prediction for + // the intra edge upsamplers in the subsequent blocks. They have some special + // rules for subsampled UV planes. For subsampled UV planes, update left + // context only if current block contains the last odd column and update top + // context only if current block contains the last odd row. + if (subsampling_x_[kPlaneU] == 0 || (block.column4x4 & 1) == 1 || + block.width4x4 > 1) { + memset(left_context_.uv_mode + block.left_context_index, + block.bp->prediction_parameters->uv_mode, block.height4x4); + } + if (subsampling_y_[kPlaneU] == 0 || (block.row4x4 & 1) == 1 || + block.height4x4 > 1) { + memset(block.top_context->uv_mode + block.top_context_index, + block.bp->prediction_parameters->uv_mode, block.width4x4); + } +} + +bool Tile::ReadIntraBlockModeInfo(const Block& block, bool intra_y_mode) { + BlockParameters& bp = *block.bp; + bp.reference_frame[0] = kReferenceFrameIntra; + bp.reference_frame[1] = kReferenceFrameNone; + ReadPredictionModeY(block, intra_y_mode); + ReadIntraAngleInfo(block, kPlaneTypeY); + if (block.HasChroma()) { + ReadPredictionModeUV(block); + if (bp.prediction_parameters->uv_mode == kPredictionModeChromaFromLuma) { + ReadCflAlpha(block); + } + if (block.left_available[kPlaneU]) { + const int smooth_row = + block.row4x4 + (~block.row4x4 & subsampling_y_[kPlaneU]); + const int smooth_column = + block.column4x4 - 1 - (block.column4x4 & subsampling_x_[kPlaneU]); + const BlockParameters& bp_left = + *block_parameters_holder_.Find(smooth_row, smooth_column); + bp.prediction_parameters->chroma_left_uses_smooth_prediction = + (bp_left.reference_frame[0] <= kReferenceFrameIntra) && + kPredictionModeSmoothMask.Contains( + left_context_.uv_mode[CdfContextIndex(smooth_row)]); + } + if (block.top_available[kPlaneU]) { + const int smooth_row = + block.row4x4 - 1 - (block.row4x4 & subsampling_y_[kPlaneU]); + const int smooth_column = + block.column4x4 + (~block.column4x4 & subsampling_x_[kPlaneU]); + const BlockParameters& bp_top = + *block_parameters_holder_.Find(smooth_row, smooth_column); + bp.prediction_parameters->chroma_top_uses_smooth_prediction = + (bp_top.reference_frame[0] <= kReferenceFrameIntra) && + 
kPredictionModeSmoothMask.Contains(
+              top_context_.get()[SuperBlockColumnIndex(smooth_column)]
+                  .uv_mode[CdfContextIndex(smooth_column)]);
+    }
+    SetCdfContextUVMode(block);
+    ReadIntraAngleInfo(block, kPlaneTypeUV);
+  }
+  ReadPaletteModeInfo(block);
+  SetCdfContextPaletteSize(block);
+  ReadFilterIntraModeInfo(block);
+  return true;
+}
+
+CompoundReferenceType Tile::ReadCompoundReferenceType(const Block& block) {
+  // compound and inter.
+  const bool top_comp_inter = block.top_available[kPlaneY] &&
+                              !block.IsTopIntra() && !block.IsTopSingle();
+  const bool left_comp_inter = block.left_available[kPlaneY] &&
+                               !block.IsLeftIntra() && !block.IsLeftSingle();
+  // unidirectional compound.
+  const bool top_uni_comp =
+      top_comp_inter && IsSameDirectionReferencePair(block.TopReference(0),
+                                                     block.TopReference(1));
+  const bool left_uni_comp =
+      left_comp_inter && IsSameDirectionReferencePair(block.LeftReference(0),
+                                                      block.LeftReference(1));
+  int context;
+  if (block.top_available[kPlaneY] && !block.IsTopIntra() &&
+      block.left_available[kPlaneY] && !block.IsLeftIntra()) {
+    const int same_direction = static_cast<int>(IsSameDirectionReferencePair(
+        block.TopReference(0), block.LeftReference(0)));
+    if (!top_comp_inter && !left_comp_inter) {
+      context = 1 + MultiplyBy2(same_direction);
+    } else if (!top_comp_inter) {
+      context = left_uni_comp ? 3 + same_direction : 1;
+    } else if (!left_comp_inter) {
+      context = top_uni_comp ? 3 + same_direction : 1;
+    } else {
+      if (!top_uni_comp && !left_uni_comp) {
+        context = 0;
+      } else if (!top_uni_comp || !left_uni_comp) {
+        context = 2;
+      } else {
+        context = 3 + static_cast<int>(
+                          (block.TopReference(0) == kReferenceFrameBackward) ==
+                          (block.LeftReference(0) == kReferenceFrameBackward));
+      }
+    }
+  } else if (block.top_available[kPlaneY] && block.left_available[kPlaneY]) {
+    if (top_comp_inter) {
+      context = 1 + MultiplyBy2(static_cast<int>(top_uni_comp));
+    } else if (left_comp_inter) {
+      context = 1 + MultiplyBy2(static_cast<int>(left_uni_comp));
+    } else {
+      context = 2;
+    }
+  } else if (top_comp_inter) {
+    context = MultiplyBy4(static_cast<int>(top_uni_comp));
+  } else if (left_comp_inter) {
+    context = MultiplyBy4(static_cast<int>(left_uni_comp));
+  } else {
+    context = 2;
+  }
+  return static_cast<CompoundReferenceType>(reader_.ReadSymbol(
+      symbol_decoder_context_.compound_reference_type_cdf[context]));
+}
+
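+// Illustrative mapping, derived from the comments inside GetReferenceCdf()
+// below: the template arguments select the spec syntax element whose CDF is
+// returned. For instance:
+//   GetReferenceCdf<true, false, 1>(block)        -> single_ref_p1
+//   GetReferenceCdf<false, true, 0>(block)        -> comp_bwdref
+//   GetReferenceCdf<false, false, 0>(block, type) -> uni_comp_ref or comp_ref,
+//                                                    depending on |type|.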
+template <bool is_single, bool is_backward, int index>
+uint16_t* Tile::GetReferenceCdf(
+    const Block& block,
+    CompoundReferenceType type /*= kNumCompoundReferenceTypes*/) {
+  int context = 0;
+  if ((type == kCompoundReferenceUnidirectional && index == 0) ||
+      (is_single && index == 1)) {
+    // uni_comp_ref and single_ref_p1.
+    context =
+        GetReferenceContext(block, kReferenceFrameLast, kReferenceFrameGolden,
+                            kReferenceFrameBackward, kReferenceFrameAlternate);
+  } else if (type == kCompoundReferenceUnidirectional && index == 1) {
+    // uni_comp_ref_p1.
+    context =
+        GetReferenceContext(block, kReferenceFrameLast2, kReferenceFrameLast2,
+                            kReferenceFrameLast3, kReferenceFrameGolden);
+  } else if ((type == kCompoundReferenceUnidirectional && index == 2) ||
+             (type == kCompoundReferenceBidirectional && index == 2) ||
+             (is_single && index == 5)) {
+    // uni_comp_ref_p2, comp_ref_p2 and single_ref_p5.
+    context =
+        GetReferenceContext(block, kReferenceFrameLast3, kReferenceFrameLast3,
+                            kReferenceFrameGolden, kReferenceFrameGolden);
+  } else if ((type == kCompoundReferenceBidirectional && index == 0) ||
+             (is_single && index == 3)) {
+    // comp_ref and single_ref_p3.
+    context =
+        GetReferenceContext(block, kReferenceFrameLast, kReferenceFrameLast2,
+                            kReferenceFrameLast3, kReferenceFrameGolden);
+  } else if ((type == kCompoundReferenceBidirectional && index == 1) ||
+             (is_single && index == 4)) {
+    // comp_ref_p1 and single_ref_p4.
+    context =
+        GetReferenceContext(block, kReferenceFrameLast, kReferenceFrameLast,
+                            kReferenceFrameLast2, kReferenceFrameLast2);
+  } else if ((is_single && index == 2) || (is_backward && index == 0)) {
+    // single_ref_p2 and comp_bwdref.
+    context = GetReferenceContext(
+        block, kReferenceFrameBackward, kReferenceFrameAlternate2,
+        kReferenceFrameAlternate, kReferenceFrameAlternate);
+  } else if ((is_single && index == 6) || (is_backward && index == 1)) {
+    // single_ref_p6 and comp_bwdref_p1.
+    context = GetReferenceContext(
+        block, kReferenceFrameBackward, kReferenceFrameBackward,
+        kReferenceFrameAlternate2, kReferenceFrameAlternate2);
+  }
+  if (is_single) {
+    // The index parameter for single references is offset by one since the
+    // spec uses a 1-based index for these elements.
+    return symbol_decoder_context_.single_reference_cdf[context][index - 1];
+  }
+  if (is_backward) {
+    return symbol_decoder_context_
+        .compound_backward_reference_cdf[context][index];
+  }
+  return symbol_decoder_context_.compound_reference_cdf[type][context][index];
+}
+
+void Tile::ReadReferenceFrames(const Block& block, bool skip_mode) {
+  BlockParameters& bp = *block.bp;
+  if (skip_mode) {
+    bp.reference_frame[0] = frame_header_.skip_mode_frame[0];
+    bp.reference_frame[1] = frame_header_.skip_mode_frame[1];
+    return;
+  }
+  if (frame_header_.segmentation.FeatureActive(
+          bp.prediction_parameters->segment_id,
+          kSegmentFeatureReferenceFrame)) {
+    bp.reference_frame[0] = static_cast<ReferenceFrameType>(
+        frame_header_.segmentation
+            .feature_data[bp.prediction_parameters->segment_id]
+                         [kSegmentFeatureReferenceFrame]);
+    bp.reference_frame[1] = kReferenceFrameNone;
+    return;
+  }
+  if (frame_header_.segmentation.FeatureActive(
+          bp.prediction_parameters->segment_id, kSegmentFeatureSkip) ||
+      frame_header_.segmentation.FeatureActive(
+          bp.prediction_parameters->segment_id, kSegmentFeatureGlobalMv)) {
+    bp.reference_frame[0] = kReferenceFrameLast;
+    bp.reference_frame[1] = kReferenceFrameNone;
+    return;
+  }
+  const bool use_compound_reference =
+      frame_header_.reference_mode_select &&
+      std::min(block.width4x4, block.height4x4) >= 2 &&
+      reader_.ReadSymbol(symbol_decoder_context_.use_compound_reference_cdf
+                             [GetUseCompoundReferenceContext(block)]);
+  if (use_compound_reference) {
+    CompoundReferenceType reference_type = ReadCompoundReferenceType(block);
+    if (reference_type == kCompoundReferenceUnidirectional) {
+      // uni_comp_ref.
+      if (reader_.ReadSymbol(
+              GetReferenceCdf<false, false, 0>(block, reference_type))) {
+        bp.reference_frame[0] = kReferenceFrameBackward;
+        bp.reference_frame[1] = kReferenceFrameAlternate;
+        return;
+      }
+      // uni_comp_ref_p1.
+      if (!reader_.ReadSymbol(
+              GetReferenceCdf<false, false, 1>(block, reference_type))) {
+        bp.reference_frame[0] = kReferenceFrameLast;
+        bp.reference_frame[1] = kReferenceFrameLast2;
+        return;
+      }
+      // uni_comp_ref_p2.
+      if (reader_.ReadSymbol(
+              GetReferenceCdf<false, false, 2>(block, reference_type))) {
+        bp.reference_frame[0] = kReferenceFrameLast;
+        bp.reference_frame[1] = kReferenceFrameGolden;
+        return;
+      }
+      bp.reference_frame[0] = kReferenceFrameLast;
+      bp.reference_frame[1] = kReferenceFrameLast3;
+      return;
+    }
+    assert(reference_type == kCompoundReferenceBidirectional);
+    // comp_ref.
+    if (reader_.ReadSymbol(
+            GetReferenceCdf<false, false, 0>(block, reference_type))) {
+      // comp_ref_p2.
+      bp.reference_frame[0] =
+          reader_.ReadSymbol(
+              GetReferenceCdf<false, false, 2>(block, reference_type))
+              ? kReferenceFrameGolden
+              : kReferenceFrameLast3;
+    } else {
+      // comp_ref_p1.
+      bp.reference_frame[0] =
+          reader_.ReadSymbol(
+              GetReferenceCdf<false, false, 1>(block, reference_type))
+              ? kReferenceFrameLast2
+              : kReferenceFrameLast;
+    }
+    // comp_bwdref.
+    if (reader_.ReadSymbol(GetReferenceCdf<false, true, 0>(block))) {
+      bp.reference_frame[1] = kReferenceFrameAlternate;
+    } else {
+      // comp_bwdref_p1.
+      bp.reference_frame[1] =
+          reader_.ReadSymbol(GetReferenceCdf<false, true, 1>(block))
+              ? kReferenceFrameAlternate2
+              : kReferenceFrameBackward;
+    }
+    return;
+  }
+  assert(!use_compound_reference);
+  bp.reference_frame[1] = kReferenceFrameNone;
+  // single_ref_p1.
+  if (reader_.ReadSymbol(GetReferenceCdf<true, false, 1>(block))) {
+    // single_ref_p2.
+    if (reader_.ReadSymbol(GetReferenceCdf<true, false, 2>(block))) {
+      bp.reference_frame[0] = kReferenceFrameAlternate;
+      return;
+    }
+    // single_ref_p6.
+    bp.reference_frame[0] =
+        reader_.ReadSymbol(GetReferenceCdf<true, false, 6>(block))
+            ? kReferenceFrameAlternate2
+            : kReferenceFrameBackward;
+    return;
+  }
+  // single_ref_p3.
+  if (reader_.ReadSymbol(GetReferenceCdf<true, false, 3>(block))) {
+    // single_ref_p5.
+    bp.reference_frame[0] =
+        reader_.ReadSymbol(GetReferenceCdf<true, false, 5>(block))
+            ? kReferenceFrameGolden
+            : kReferenceFrameLast3;
+    return;
+  }
+  // single_ref_p4.
+  bp.reference_frame[0] =
+      reader_.ReadSymbol(GetReferenceCdf<true, false, 4>(block))
+          ? kReferenceFrameLast2
+          : kReferenceFrameLast;
+}
+
+void Tile::ReadInterPredictionModeY(const Block& block,
+                                    const MvContexts& mode_contexts,
+                                    bool skip_mode) {
+  BlockParameters& bp = *block.bp;
+  if (skip_mode) {
+    bp.y_mode = kPredictionModeNearestNearestMv;
+    return;
+  }
+  if (frame_header_.segmentation.FeatureActive(
+          bp.prediction_parameters->segment_id, kSegmentFeatureSkip) ||
+      frame_header_.segmentation.FeatureActive(
+          bp.prediction_parameters->segment_id, kSegmentFeatureGlobalMv)) {
+    bp.y_mode = kPredictionModeGlobalMv;
+    return;
+  }
+  if (bp.reference_frame[1] > kReferenceFrameIntra) {
+    const int idx0 = mode_contexts.reference_mv >> 1;
+    const int idx1 =
+        std::min(mode_contexts.new_mv, kCompoundModeNewMvContexts - 1);
+    const int context = kCompoundModeContextMap[idx0][idx1];
+    const int offset = reader_.ReadSymbol<kNumCompoundInterPredictionModes>(
+        symbol_decoder_context_.compound_prediction_mode_cdf[context]);
+    bp.y_mode =
+        static_cast<PredictionMode>(kPredictionModeNearestNearestMv + offset);
+    return;
+  }
+  // new_mv.
+  if (!reader_.ReadSymbol(
+          symbol_decoder_context_.new_mv_cdf[mode_contexts.new_mv])) {
+    bp.y_mode = kPredictionModeNewMv;
+    return;
+  }
+  // zero_mv.
+  if (!reader_.ReadSymbol(
+          symbol_decoder_context_.zero_mv_cdf[mode_contexts.zero_mv])) {
+    bp.y_mode = kPredictionModeGlobalMv;
+    return;
+  }
+  // ref_mv.
+  bp.y_mode =
+      reader_.ReadSymbol(
+          symbol_decoder_context_.reference_mv_cdf[mode_contexts.reference_mv])
+          ?
kPredictionModeNearMv + : kPredictionModeNearestMv; +} + +void Tile::ReadRefMvIndex(const Block& block) { + BlockParameters& bp = *block.bp; + PredictionParameters& prediction_parameters = + *block.bp->prediction_parameters; + prediction_parameters.ref_mv_index = 0; + if (bp.y_mode != kPredictionModeNewMv && + bp.y_mode != kPredictionModeNewNewMv && + !kPredictionModeHasNearMvMask.Contains(bp.y_mode)) { + return; + } + const int start = + static_cast(kPredictionModeHasNearMvMask.Contains(bp.y_mode)); + prediction_parameters.ref_mv_index = start; + for (int i = start; i < start + 2; ++i) { + if (prediction_parameters.ref_mv_count <= i + 1) break; + // drl_mode in the spec. + const bool ref_mv_index_bit = reader_.ReadSymbol( + symbol_decoder_context_.ref_mv_index_cdf[GetRefMvIndexContext( + prediction_parameters.nearest_mv_count, i)]); + prediction_parameters.ref_mv_index = i + static_cast(ref_mv_index_bit); + if (!ref_mv_index_bit) return; + } +} + +void Tile::ReadInterIntraMode(const Block& block, bool is_compound, + bool skip_mode) { + BlockParameters& bp = *block.bp; + PredictionParameters& prediction_parameters = + *block.bp->prediction_parameters; + prediction_parameters.inter_intra_mode = kNumInterIntraModes; + prediction_parameters.is_wedge_inter_intra = false; + if (skip_mode || !sequence_header_.enable_interintra_compound || + is_compound || !kIsInterIntraModeAllowedMask.Contains(block.size)) { + return; + } + // kSizeGroup[block.size] is guaranteed to be non-zero because of the block + // size constraint enforced in the above condition. + assert(kSizeGroup[block.size] - 1 >= 0); + if (!reader_.ReadSymbol( + symbol_decoder_context_ + .is_inter_intra_cdf[kSizeGroup[block.size] - 1])) { + prediction_parameters.inter_intra_mode = kNumInterIntraModes; + return; + } + prediction_parameters.inter_intra_mode = + static_cast(reader_.ReadSymbol( + symbol_decoder_context_ + .inter_intra_mode_cdf[kSizeGroup[block.size] - 1])); + bp.reference_frame[1] = kReferenceFrameIntra; + prediction_parameters.angle_delta[kPlaneTypeY] = 0; + prediction_parameters.angle_delta[kPlaneTypeUV] = 0; + prediction_parameters.use_filter_intra = false; + prediction_parameters.is_wedge_inter_intra = reader_.ReadSymbol( + symbol_decoder_context_.is_wedge_inter_intra_cdf[block.size]); + if (!prediction_parameters.is_wedge_inter_intra) return; + prediction_parameters.wedge_index = + reader_.ReadSymbol( + symbol_decoder_context_.wedge_index_cdf[block.size]); + prediction_parameters.wedge_sign = 0; +} + +void Tile::ReadMotionMode(const Block& block, bool is_compound, + bool skip_mode) { + BlockParameters& bp = *block.bp; + PredictionParameters& prediction_parameters = + *block.bp->prediction_parameters; + const auto global_motion_type = + frame_header_.global_motion[bp.reference_frame[0]].type; + if (skip_mode || !frame_header_.is_motion_mode_switchable || + IsBlockDimension4(block.size) || + (frame_header_.force_integer_mv == 0 && + (bp.y_mode == kPredictionModeGlobalMv || + bp.y_mode == kPredictionModeGlobalGlobalMv) && + global_motion_type > kGlobalMotionTransformationTypeTranslation) || + is_compound || bp.reference_frame[1] == kReferenceFrameIntra || + !block.HasOverlappableCandidates()) { + prediction_parameters.motion_mode = kMotionModeSimple; + return; + } + prediction_parameters.num_warp_samples = 0; + int num_samples_scanned = 0; + memset(prediction_parameters.warp_estimate_candidates, 0, + sizeof(prediction_parameters.warp_estimate_candidates)); + FindWarpSamples(block, &prediction_parameters.num_warp_samples, 
+ &num_samples_scanned, + prediction_parameters.warp_estimate_candidates); + if (frame_header_.force_integer_mv != 0 || + prediction_parameters.num_warp_samples == 0 || + !frame_header_.allow_warped_motion || IsScaled(bp.reference_frame[0])) { + prediction_parameters.motion_mode = + reader_.ReadSymbol(symbol_decoder_context_.use_obmc_cdf[block.size]) + ? kMotionModeObmc + : kMotionModeSimple; + return; + } + prediction_parameters.motion_mode = + static_cast(reader_.ReadSymbol( + symbol_decoder_context_.motion_mode_cdf[block.size])); +} + +uint16_t* Tile::GetIsExplicitCompoundTypeCdf(const Block& block) { + int context = 0; + if (block.top_available[kPlaneY]) { + if (!block.IsTopSingle()) { + context += static_cast( + block.top_context + ->is_explicit_compound_type[block.top_context_index]); + } else if (block.TopReference(0) == kReferenceFrameAlternate) { + context += 3; + } + } + if (block.left_available[kPlaneY]) { + if (!block.IsLeftSingle()) { + context += static_cast( + left_context_.is_explicit_compound_type[block.left_context_index]); + } else if (block.LeftReference(0) == kReferenceFrameAlternate) { + context += 3; + } + } + return symbol_decoder_context_.is_explicit_compound_type_cdf[std::min( + context, kIsExplicitCompoundTypeContexts - 1)]; +} + +uint16_t* Tile::GetIsCompoundTypeAverageCdf(const Block& block) { + const BlockParameters& bp = *block.bp; + const ReferenceInfo& reference_info = *current_frame_.reference_info(); + const int forward = + std::abs(reference_info.relative_distance_from[bp.reference_frame[0]]); + const int backward = + std::abs(reference_info.relative_distance_from[bp.reference_frame[1]]); + int context = (forward == backward) ? 3 : 0; + if (block.top_available[kPlaneY]) { + if (!block.IsTopSingle()) { + context += static_cast( + block.top_context->is_compound_type_average[block.top_context_index]); + } else if (block.TopReference(0) == kReferenceFrameAlternate) { + ++context; + } + } + if (block.left_available[kPlaneY]) { + if (!block.IsLeftSingle()) { + context += static_cast( + left_context_.is_compound_type_average[block.left_context_index]); + } else if (block.LeftReference(0) == kReferenceFrameAlternate) { + ++context; + } + } + return symbol_decoder_context_.is_compound_type_average_cdf[context]; +} + +void Tile::ReadCompoundType(const Block& block, bool is_compound, + bool skip_mode, + bool* const is_explicit_compound_type, + bool* const is_compound_type_average) { + *is_explicit_compound_type = false; + *is_compound_type_average = true; + PredictionParameters& prediction_parameters = + *block.bp->prediction_parameters; + if (skip_mode) { + prediction_parameters.compound_prediction_type = + kCompoundPredictionTypeAverage; + return; + } + if (is_compound) { + if (sequence_header_.enable_masked_compound) { + *is_explicit_compound_type = + reader_.ReadSymbol(GetIsExplicitCompoundTypeCdf(block)); + } + if (*is_explicit_compound_type) { + if (kIsWedgeCompoundModeAllowed.Contains(block.size)) { + // Only kCompoundPredictionTypeWedge and + // kCompoundPredictionTypeDiffWeighted are signaled explicitly. 
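+        // The decoded symbol is cast directly to CompoundPredictionType
+        // below, which implies that a decoded 0 selects
+        // kCompoundPredictionTypeWedge and a decoded 1 selects
+        // kCompoundPredictionTypeDiffWeighted; the remaining (implicit) types
+        // are assigned on the other paths through this function.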
+ prediction_parameters.compound_prediction_type = + static_cast(reader_.ReadSymbol( + symbol_decoder_context_.compound_type_cdf[block.size])); + } else { + prediction_parameters.compound_prediction_type = + kCompoundPredictionTypeDiffWeighted; + } + } else { + if (sequence_header_.enable_jnt_comp) { + *is_compound_type_average = + reader_.ReadSymbol(GetIsCompoundTypeAverageCdf(block)); + prediction_parameters.compound_prediction_type = + *is_compound_type_average ? kCompoundPredictionTypeAverage + : kCompoundPredictionTypeDistance; + } else { + prediction_parameters.compound_prediction_type = + kCompoundPredictionTypeAverage; + return; + } + } + if (prediction_parameters.compound_prediction_type == + kCompoundPredictionTypeWedge) { + prediction_parameters.wedge_index = + reader_.ReadSymbol( + symbol_decoder_context_.wedge_index_cdf[block.size]); + prediction_parameters.wedge_sign = static_cast(reader_.ReadBit()); + } else if (prediction_parameters.compound_prediction_type == + kCompoundPredictionTypeDiffWeighted) { + prediction_parameters.mask_is_inverse = reader_.ReadBit() != 0; + } + return; + } + if (prediction_parameters.inter_intra_mode != kNumInterIntraModes) { + prediction_parameters.compound_prediction_type = + prediction_parameters.is_wedge_inter_intra + ? kCompoundPredictionTypeWedge + : kCompoundPredictionTypeIntra; + return; + } + prediction_parameters.compound_prediction_type = + kCompoundPredictionTypeAverage; +} + +uint16_t* Tile::GetInterpolationFilterCdf(const Block& block, int direction) { + const BlockParameters& bp = *block.bp; + int context = MultiplyBy8(direction) + + MultiplyBy4(static_cast(bp.reference_frame[1] > + kReferenceFrameIntra)); + int top_type = kNumExplicitInterpolationFilters; + if (block.top_available[kPlaneY]) { + if (block.bp_top->reference_frame[0] == bp.reference_frame[0] || + block.bp_top->reference_frame[1] == bp.reference_frame[0]) { + top_type = block.bp_top->interpolation_filter[direction]; + } + } + int left_type = kNumExplicitInterpolationFilters; + if (block.left_available[kPlaneY]) { + if (block.bp_left->reference_frame[0] == bp.reference_frame[0] || + block.bp_left->reference_frame[1] == bp.reference_frame[0]) { + left_type = block.bp_left->interpolation_filter[direction]; + } + } + if (left_type == top_type) { + context += left_type; + } else if (left_type == kNumExplicitInterpolationFilters) { + context += top_type; + } else if (top_type == kNumExplicitInterpolationFilters) { + context += left_type; + } else { + context += kNumExplicitInterpolationFilters; + } + return symbol_decoder_context_.interpolation_filter_cdf[context]; +} + +void Tile::ReadInterpolationFilter(const Block& block, bool skip_mode) { + BlockParameters& bp = *block.bp; + if (frame_header_.interpolation_filter != kInterpolationFilterSwitchable) { + static_assert( + sizeof(bp.interpolation_filter) / sizeof(bp.interpolation_filter[0]) == + 2, + "Interpolation filter array size is not 2"); + for (auto& interpolation_filter : bp.interpolation_filter) { + interpolation_filter = frame_header_.interpolation_filter; + } + return; + } + bool interpolation_filter_present = true; + if (skip_mode || + block.bp->prediction_parameters->motion_mode == kMotionModeLocalWarp) { + interpolation_filter_present = false; + } else if (!IsBlockDimension4(block.size) && + bp.y_mode == kPredictionModeGlobalMv) { + interpolation_filter_present = + frame_header_.global_motion[bp.reference_frame[0]].type == + kGlobalMotionTransformationTypeTranslation; + } else if (!IsBlockDimension4(block.size) 
&& + bp.y_mode == kPredictionModeGlobalGlobalMv) { + interpolation_filter_present = + frame_header_.global_motion[bp.reference_frame[0]].type == + kGlobalMotionTransformationTypeTranslation || + frame_header_.global_motion[bp.reference_frame[1]].type == + kGlobalMotionTransformationTypeTranslation; + } + for (int i = 0; i < (sequence_header_.enable_dual_filter ? 2 : 1); ++i) { + bp.interpolation_filter[i] = + interpolation_filter_present + ? static_cast( + reader_.ReadSymbol( + GetInterpolationFilterCdf(block, i))) + : kInterpolationFilterEightTap; + } + if (!sequence_header_.enable_dual_filter) { + bp.interpolation_filter[1] = bp.interpolation_filter[0]; + } +} + +void Tile::SetCdfContextCompoundType(const Block& block, + bool is_explicit_compound_type, + bool is_compound_type_average) { + memset(left_context_.is_explicit_compound_type + block.left_context_index, + static_cast(is_explicit_compound_type), block.height4x4); + memset(left_context_.is_compound_type_average + block.left_context_index, + static_cast(is_compound_type_average), block.height4x4); + memset(block.top_context->is_explicit_compound_type + block.top_context_index, + static_cast(is_explicit_compound_type), block.width4x4); + memset(block.top_context->is_compound_type_average + block.top_context_index, + static_cast(is_compound_type_average), block.width4x4); +} + +bool Tile::ReadInterBlockModeInfo(const Block& block, bool skip_mode) { + BlockParameters& bp = *block.bp; + bp.prediction_parameters->palette_mode_info.size[kPlaneTypeY] = 0; + bp.prediction_parameters->palette_mode_info.size[kPlaneTypeUV] = 0; + SetCdfContextPaletteSize(block); + ReadReferenceFrames(block, skip_mode); + const bool is_compound = bp.reference_frame[1] > kReferenceFrameIntra; + MvContexts mode_contexts; + FindMvStack(block, is_compound, &mode_contexts); + ReadInterPredictionModeY(block, mode_contexts, skip_mode); + ReadRefMvIndex(block); + if (!AssignInterMv(block, is_compound)) return false; + ReadInterIntraMode(block, is_compound, skip_mode); + ReadMotionMode(block, is_compound, skip_mode); + bool is_explicit_compound_type; + bool is_compound_type_average; + ReadCompoundType(block, is_compound, skip_mode, &is_explicit_compound_type, + &is_compound_type_average); + SetCdfContextCompoundType(block, is_explicit_compound_type, + is_compound_type_average); + ReadInterpolationFilter(block, skip_mode); + return true; +} + +void Tile::SetCdfContextSkipMode(const Block& block, bool skip_mode) { + memset(left_context_.skip_mode + block.left_context_index, + static_cast(skip_mode), block.height4x4); + memset(block.top_context->skip_mode + block.top_context_index, + static_cast(skip_mode), block.width4x4); +} + +bool Tile::DecodeInterModeInfo(const Block& block) { + BlockParameters& bp = *block.bp; + block.bp->prediction_parameters->use_intra_block_copy = false; + bp.skip = false; + if (!ReadInterSegmentId(block, /*pre_skip=*/true)) return false; + bool skip_mode = ReadSkipMode(block); + SetCdfContextSkipMode(block, skip_mode); + if (skip_mode) { + bp.skip = true; + } else { + ReadSkip(block); + } + if (!frame_header_.segmentation.segment_id_pre_skip && + !ReadInterSegmentId(block, /*pre_skip=*/false)) { + return false; + } + ReadCdef(block); + if (read_deltas_) { + ReadQuantizerIndexDelta(block); + ReadLoopFilterDelta(block); + read_deltas_ = false; + } + ReadIsInter(block, skip_mode); + return bp.is_inter ? 
ReadInterBlockModeInfo(block, skip_mode)
+                     : ReadIntraBlockModeInfo(block, /*intra_y_mode=*/false);
+}
+
+bool Tile::DecodeModeInfo(const Block& block) {
+  return IsIntraFrame(frame_header_.frame_type) ? DecodeIntraModeInfo(block)
+                                                : DecodeInterModeInfo(block);
+}
+
+}  // namespace libgav1
diff --git a/src/tile/bitstream/palette.cc b/src/tile/bitstream/palette.cc
new file mode 100644
index 0000000..27e5110
--- /dev/null
+++ b/src/tile/bitstream/palette.cc
@@ -0,0 +1,329 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <iterator>
+#include <memory>
+
+#include "src/obu_parser.h"
+#include "src/symbol_decoder_context.h"
+#include "src/tile.h"
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/entropy_decoder.h"
+#include "src/utils/memory.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+int Tile::GetPaletteCache(const Block& block, PlaneType plane_type,
+                          uint16_t* const cache) {
+  const int top_size =
+      (block.top_available[kPlaneY] && Mod64(MultiplyBy4(block.row4x4)) != 0)
+          ? block.top_context->palette_size[plane_type][block.top_context_index]
+          : 0;
+  const int left_size =
+      block.left_available[kPlaneY]
+          ? left_context_.palette_size[plane_type][block.left_context_index]
+          : 0;
+  if (left_size == 0 && top_size == 0) return 0;
+  // Merge the left and top colors in sorted order and store them in |cache|.
+  uint16_t empty_palette[1];
+  const uint16_t* top =
+      (top_size > 0) ? block.top_context
+                           ->palette_color[block.top_context_index][plane_type]
+                     : empty_palette;
+  const uint16_t* left =
+      (left_size > 0)
+          ? left_context_.palette_color[block.left_context_index][plane_type]
+          : empty_palette;
+  std::merge(top, top + top_size, left, left + left_size, cache);
+  // Deduplicate the entries in |cache| and return the number of unique
+  // entries.
+  return static_cast<int>(
+      std::distance(cache, std::unique(cache, cache + left_size + top_size)));
+}
+
+void Tile::ReadPaletteColors(const Block& block, Plane plane) {
+  const PlaneType plane_type = GetPlaneType(plane);
+  uint16_t cache[2 * kMaxPaletteSize];
+  const int n = GetPaletteCache(block, plane_type, cache);
+  BlockParameters& bp = *block.bp;
+  const uint8_t palette_size =
+      bp.prediction_parameters->palette_mode_info.size[plane_type];
+  uint16_t* const palette_color =
+      bp.prediction_parameters->palette_mode_info.color[plane];
+  const int8_t bitdepth = sequence_header_.color_config.bitdepth;
+  int index = 0;
+  for (int i = 0; i < n && index < palette_size; ++i) {
+    if (reader_.ReadBit() != 0) {  // use_palette_color_cache.
+ palette_color[index++] = cache[i]; + } + } + const int merge_pivot = index; + if (index < palette_size) { + palette_color[index++] = + static_cast(reader_.ReadLiteral(bitdepth)); + } + const int max_value = (1 << bitdepth) - 1; + if (index < palette_size) { + int bits = bitdepth - 3 + static_cast(reader_.ReadLiteral(2)); + do { + const int delta = static_cast(reader_.ReadLiteral(bits)) + + (plane_type == kPlaneTypeY ? 1 : 0); + palette_color[index] = + std::min(palette_color[index - 1] + delta, max_value); + if (palette_color[index] + (plane_type == kPlaneTypeY ? 1 : 0) >= + max_value) { + // Once the color exceeds max_value, all others can be set to max_value + // (since they are computed as a delta on top of the current color and + // then clipped). + Memset(&palette_color[index + 1], max_value, palette_size - index - 1); + break; + } + const int range = (1 << bitdepth) - palette_color[index] - + (plane_type == kPlaneTypeY ? 1 : 0); + bits = std::min(bits, CeilLog2(range)); + } while (++index < palette_size); + } + // Palette colors are generated using two ascending arrays. So sorting them is + // simply a matter of merging the two sorted portions of the array. + std::inplace_merge(palette_color, palette_color + merge_pivot, + palette_color + palette_size); + if (plane_type == kPlaneTypeUV) { + uint16_t* const palette_color_v = + bp.prediction_parameters->palette_mode_info.color[kPlaneV]; + if (reader_.ReadBit() != 0) { // delta_encode_palette_colors_v. + const int bits = bitdepth - 4 + static_cast(reader_.ReadLiteral(2)); + palette_color_v[0] = reader_.ReadLiteral(bitdepth); + for (int i = 1; i < palette_size; ++i) { + int delta = static_cast(reader_.ReadLiteral(bits)); + if (delta != 0 && reader_.ReadBit() != 0) delta = -delta; + // This line is equivalent to the following lines in the spec: + // val = palette_colors_v[ idx - 1 ] + palette_delta_v + // if ( val < 0 ) val += maxVal + // if ( val >= maxVal ) val -= maxVal + // palette_colors_v[ idx ] = Clip1( val ) + // + // The difference is that in the code, max_value is (1 << bitdepth) - 1. + // So "& max_value" has the desired effect of computing both the "if" + // conditions and the Clip. 
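+        // For example, with bitdepth == 8 (max_value == 255): a previous
+        // color of 250 and delta == 10 gives 260, and 260 & 255 == 4, the
+        // same as the spec's 260 - 256. A previous color of 3 and
+        // delta == -10 gives -7, and -7 & 255 == 249, the same as the
+        // spec's -7 + 256.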
+ palette_color_v[i] = (palette_color_v[i - 1] + delta) & max_value; + } + } else { + for (int i = 0; i < palette_size; ++i) { + palette_color_v[i] = + static_cast(reader_.ReadLiteral(bitdepth)); + } + } + } +} + +void Tile::ReadPaletteModeInfo(const Block& block) { + BlockParameters& bp = *block.bp; + bp.prediction_parameters->palette_mode_info.size[kPlaneTypeY] = 0; + bp.prediction_parameters->palette_mode_info.size[kPlaneTypeUV] = 0; + if (IsBlockSmallerThan8x8(block.size) || block.size > kBlock64x64 || + !frame_header_.allow_screen_content_tools) { + return; + } + const int block_size_context = + k4x4WidthLog2[block.size] + k4x4HeightLog2[block.size] - 2; + if (bp.y_mode == kPredictionModeDc) { + const int context = + static_cast( + block.top_available[kPlaneY] && + block.top_context + ->palette_size[kPlaneTypeY][block.top_context_index] > 0) + + static_cast( + block.left_available[kPlaneY] && + left_context_.palette_size[kPlaneTypeY][block.left_context_index] > + 0); + const bool has_palette_y = reader_.ReadSymbol( + symbol_decoder_context_.has_palette_y_cdf[block_size_context][context]); + if (has_palette_y) { + bp.prediction_parameters->palette_mode_info.size[kPlaneTypeY] = + kMinPaletteSize + + reader_.ReadSymbol( + symbol_decoder_context_.palette_y_size_cdf[block_size_context]); + ReadPaletteColors(block, kPlaneY); + } + } + if (block.HasChroma() && + bp.prediction_parameters->uv_mode == kPredictionModeDc) { + const int context = static_cast( + bp.prediction_parameters->palette_mode_info.size[kPlaneTypeY] > 0); + const bool has_palette_uv = + reader_.ReadSymbol(symbol_decoder_context_.has_palette_uv_cdf[context]); + if (has_palette_uv) { + bp.prediction_parameters->palette_mode_info.size[kPlaneTypeUV] = + kMinPaletteSize + + reader_.ReadSymbol( + symbol_decoder_context_.palette_uv_size_cdf[block_size_context]); + ReadPaletteColors(block, kPlaneU); + } + } +} + +void Tile::PopulatePaletteColorContexts( + const Block& block, PlaneType plane_type, int i, int start, int end, + uint8_t color_order[kMaxPaletteSquare][kMaxPaletteSize], + uint8_t color_context[kMaxPaletteSquare]) { + const PredictionParameters& prediction_parameters = + *block.bp->prediction_parameters; + for (int column = start, counter = 0; column >= end; --column, ++counter) { + const int row = i - column; + assert(row > 0 || column > 0); + const uint8_t top = + (row > 0) + ? prediction_parameters.color_index_map[plane_type][row - 1][column] + : 0; + const uint8_t left = + (column > 0) + ? 
prediction_parameters.color_index_map[plane_type][row][column - 1] + : 0; + uint8_t index_mask; + static_assert(kMaxPaletteSize <= 8, ""); + int index; + if (column <= 0) { + color_context[counter] = 0; + color_order[counter][0] = top; + index_mask = 1 << top; + index = 1; + } else if (row <= 0) { + color_context[counter] = 0; + color_order[counter][0] = left; + index_mask = 1 << left; + index = 1; + } else { + const uint8_t top_left = + prediction_parameters + .color_index_map[plane_type][row - 1][column - 1]; + index_mask = (1 << top) | (1 << left) | (1 << top_left); + if (top == left && top == top_left) { + color_context[counter] = 4; + color_order[counter][0] = top; + index = 1; + } else if (top == left) { + color_context[counter] = 3; + color_order[counter][0] = top; + color_order[counter][1] = top_left; + index = 2; + } else if (top == top_left) { + color_context[counter] = 2; + color_order[counter][0] = top_left; + color_order[counter][1] = left; + index = 2; + } else if (left == top_left) { + color_context[counter] = 2; + color_order[counter][0] = top_left; + color_order[counter][1] = top; + index = 2; + } else { + color_context[counter] = 1; + color_order[counter][0] = std::min(top, left); + color_order[counter][1] = std::max(top, left); + color_order[counter][2] = top_left; + index = 3; + } + } + // Even though only the first |palette_size| entries of this array are ever + // used, it is faster to populate all 8 because of the vectorization of the + // constant sized loop. + for (uint8_t j = 0; j < kMaxPaletteSize; ++j) { + if (BitMaskSet::MaskContainsValue(index_mask, j)) continue; + color_order[counter][index++] = j; + } + } +} + +bool Tile::ReadPaletteTokens(const Block& block) { + const PaletteModeInfo& palette_mode_info = + block.bp->prediction_parameters->palette_mode_info; + PredictionParameters& prediction_parameters = + *block.bp->prediction_parameters; + for (int plane_type = kPlaneTypeY; + plane_type < (block.HasChroma() ? 
kNumPlaneTypes : kPlaneTypeUV); + ++plane_type) { + const int palette_size = palette_mode_info.size[plane_type]; + if (palette_size == 0) continue; + int block_height = block.height; + int block_width = block.width; + int screen_height = std::min( + block_height, MultiplyBy4(frame_header_.rows4x4 - block.row4x4)); + int screen_width = std::min( + block_width, MultiplyBy4(frame_header_.columns4x4 - block.column4x4)); + if (plane_type == kPlaneTypeUV) { + block_height >>= sequence_header_.color_config.subsampling_y; + block_width >>= sequence_header_.color_config.subsampling_x; + screen_height >>= sequence_header_.color_config.subsampling_y; + screen_width >>= sequence_header_.color_config.subsampling_x; + if (block_height < 4) { + block_height += 2; + screen_height += 2; + } + if (block_width < 4) { + block_width += 2; + screen_width += 2; + } + } + if (!prediction_parameters.color_index_map[plane_type].Reset( + block_height, block_width, /*zero_initialize=*/false)) { + return false; + } + int first_value = 0; + reader_.DecodeUniform(palette_size, &first_value); + prediction_parameters.color_index_map[plane_type][0][0] = first_value; + for (int i = 1; i < screen_height + screen_width - 1; ++i) { + const int start = std::min(i, screen_width - 1); + const int end = std::max(0, i - screen_height + 1); + uint8_t color_order[kMaxPaletteSquare][kMaxPaletteSize]; + uint8_t color_context[kMaxPaletteSquare]; + PopulatePaletteColorContexts(block, static_cast(plane_type), i, + start, end, color_order, color_context); + for (int j = start, counter = 0; j >= end; --j, ++counter) { + uint16_t* const cdf = + symbol_decoder_context_ + .palette_color_index_cdf[plane_type] + [palette_size - kMinPaletteSize] + [color_context[counter]]; + const int color_order_index = reader_.ReadSymbol(cdf, palette_size); + prediction_parameters.color_index_map[plane_type][i - j][j] = + color_order[counter][color_order_index]; + } + } + if (screen_width < block_width) { + for (int i = 0; i < screen_height; ++i) { + memset( + &prediction_parameters.color_index_map[plane_type][i][screen_width], + prediction_parameters + .color_index_map[plane_type][i][screen_width - 1], + block_width - screen_width); + } + } + for (int i = screen_height; i < block_height; ++i) { + memcpy( + prediction_parameters.color_index_map[plane_type][i], + prediction_parameters.color_index_map[plane_type][screen_height - 1], + block_width); + } + } + return true; +} + +} // namespace libgav1 diff --git a/src/tile/bitstream/partition.cc b/src/tile/bitstream/partition.cc new file mode 100644 index 0000000..f3dbbb0 --- /dev/null +++ b/src/tile/bitstream/partition.cc @@ -0,0 +1,148 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
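The wavefront order used by Tile::ReadPaletteTokens above guarantees that a position's top and left neighbors are decoded before the position itself, which is what PopulatePaletteColorContexts relies on. A minimal, self-contained sketch of that traversal order (the 3x3 dimensions are hypothetical, chosen only for illustration; kHeight/kWidth are not libgav1 names):

    #include <algorithm>
    #include <cstdio>

    int main() {
      constexpr int kHeight = 3;  // Illustrative block dimensions.
      constexpr int kWidth = 3;
      // (0, 0) is decoded up front; every later position (row, column) lies
      // on the anti-diagonal i == row + column, visited in increasing i.
      std::printf("(0,0)");
      for (int i = 1; i < kHeight + kWidth - 1; ++i) {
        const int start = std::min(i, kWidth - 1);
        const int end = std::max(0, i - kHeight + 1);
        for (int j = start; j >= end; --j) {
          std::printf(" (%d,%d)", i - j, j);  // row = i - j, column = j.
        }
      }
      // Prints: (0,0) (0,1) (1,0) (0,2) (1,1) (2,0) (1,2) (2,1) (2,2)
      std::printf("\n");
      return 0;
    }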
+
+#include <cassert>
+#include <cstdint>
+
+#include "src/symbol_decoder_context.h"
+#include "src/tile.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/entropy_decoder.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace {
+
+uint16_t PartitionCdfGatherHorizontalAlike(const uint16_t* const partition_cdf,
+                                           BlockSize block_size) {
+  // The spec computes the cdf value using the following formula (not writing
+  // partition_cdf[] and using short forms for partition names for clarity):
+  // cdf = None - H + V - S + S - HTS + HTS - HBS + HBS - VLS;
+  // if (block_size != 128x128) {
+  //   cdf += VRS - H4;
+  // }
+  // After canceling out the repeated terms with opposite signs, we have:
+  // cdf = None - H + V - VLS;
+  // if (block_size != 128x128) {
+  //   cdf += VRS - H4;
+  // }
+  uint16_t cdf = partition_cdf[kPartitionNone] -
+                 partition_cdf[kPartitionHorizontal] +
+                 partition_cdf[kPartitionVertical] -
+                 partition_cdf[kPartitionVerticalWithLeftSplit];
+  if (block_size != kBlock128x128) {
+    cdf += partition_cdf[kPartitionVerticalWithRightSplit] -
+           partition_cdf[kPartitionHorizontal4];
+  }
+  return cdf;
+}
+
+uint16_t PartitionCdfGatherVerticalAlike(const uint16_t* const partition_cdf,
+                                         BlockSize block_size) {
+  // The spec computes the cdf value using the following formula (not writing
+  // partition_cdf[] and using short forms for partition names for clarity):
+  // cdf = H - V + V - S + HBS - VLS + VLS - VRS + S - HTS;
+  // if (block_size != 128x128) {
+  //   cdf += H4 - V4;
+  // }
+  // V4 is always zero. So, after canceling out the repeated terms with
+  // opposite signs, we have:
+  // cdf = H + HBS - VRS - HTS;
+  // if (block_size != 128x128) {
+  //   cdf += H4;
+  // }
+  // VRS is zero for 128x128 blocks. So, further simplifying we have:
+  // cdf = H + HBS - HTS;
+  // if (block_size != 128x128) {
+  //   cdf += H4 - VRS;
+  // }
+  uint16_t cdf = partition_cdf[kPartitionHorizontal] +
+                 partition_cdf[kPartitionHorizontalWithBottomSplit] -
+                 partition_cdf[kPartitionHorizontalWithTopSplit];
+  if (block_size != kBlock128x128) {
+    cdf += partition_cdf[kPartitionHorizontal4] -
+           partition_cdf[kPartitionVerticalWithRightSplit];
+  }
+  return cdf;
+}
+
+}  // namespace
+
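The term cancellation worked out in the comments above is a telescoping sum: per those comments, the difference of two CDF entries contributes a single partition's probability mass, so the total mass of a set of partitions collapses to a few additions and subtractions. A standalone sketch of the identity (illustrative only; this models the layout suggested by the comments, not libgav1's actual CDF tables):

    #include <cstdint>

    // Model a symbol's probability mass as cdf[s] - cdf[s + 1]. Summing the
    // masses of symbols first..last then telescopes:
    //   (cdf[first] - cdf[first+1]) + ... + (cdf[last] - cdf[last+1])
    //     == cdf[first] - cdf[last + 1].
    uint16_t MassOfSymbolRange(const uint16_t* cdf, int first, int last) {
      uint16_t sum = 0;
      for (int s = first; s <= last; ++s) sum += cdf[s] - cdf[s + 1];
      return sum;  // Always equal to cdf[first] - cdf[last + 1].
    }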
+uint16_t* Tile::GetPartitionCdf(int row4x4, int column4x4,
+                                BlockSize block_size) {
+  const int block_size_log2 = k4x4WidthLog2[block_size];
+  int top = 0;
+  if (IsTopInside(row4x4)) {
+    top = static_cast<int>(
+        k4x4WidthLog2[block_parameters_holder_.Find(row4x4 - 1, column4x4)
+                          ->size] < block_size_log2);
+  }
+  int left = 0;
+  if (IsLeftInside(column4x4)) {
+    left = static_cast<int>(
+        k4x4HeightLog2[block_parameters_holder_.Find(row4x4, column4x4 - 1)
+                           ->size] < block_size_log2);
+  }
+  const int context = left * 2 + top;
+  return symbol_decoder_context_.partition_cdf[block_size_log2 - 1][context];
+}
+
+bool Tile::ReadPartition(int row4x4, int column4x4, BlockSize block_size,
+                         bool has_rows, bool has_columns,
+                         Partition* const partition) {
+  if (IsBlockSmallerThan8x8(block_size)) {
+    *partition = kPartitionNone;
+    return true;
+  }
+  if (!has_rows && !has_columns) {
+    *partition = kPartitionSplit;
+    return true;
+  }
+  uint16_t* const partition_cdf =
+      GetPartitionCdf(row4x4, column4x4, block_size);
+  if (partition_cdf == nullptr) {
+    return false;
+  }
+  if (has_rows && has_columns) {
+    const int bsize_log2 = k4x4WidthLog2[block_size];
+    // The partition block size should be 8x8 or above.
+    assert(bsize_log2 > 0);
+    if (bsize_log2 == 1) {
+      *partition = static_cast<Partition>(
+          reader_.ReadSymbol<kPartitionSplit + 1>(partition_cdf));
+    } else if (bsize_log2 == 5) {
+      *partition = static_cast<Partition>(
+          reader_.ReadSymbol<kPartitionVerticalWithRightSplit + 1>(
+              partition_cdf));
+    } else {
+      *partition = static_cast<Partition>(
+          reader_.ReadSymbol<kMaxPartitionTypes>(partition_cdf));
+    }
+  } else if (has_columns) {
+    const uint16_t cdf =
+        PartitionCdfGatherVerticalAlike(partition_cdf, block_size);
+    *partition = reader_.ReadSymbolWithoutCdfUpdate(cdf) ? kPartitionSplit
+                                                         : kPartitionHorizontal;
+  } else {
+    const uint16_t cdf =
+        PartitionCdfGatherHorizontalAlike(partition_cdf, block_size);
+    *partition = reader_.ReadSymbolWithoutCdfUpdate(cdf) ? kPartitionSplit
+                                                         : kPartitionVertical;
+  }
+  return true;
+}
+
+}  // namespace libgav1
diff --git a/src/tile/bitstream/transform_size.cc b/src/tile/bitstream/transform_size.cc
new file mode 100644
index 0000000..7197400
--- /dev/null
+++ b/src/tile/bitstream/transform_size.cc
@@ -0,0 +1,222 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/obu_parser.h"
+#include "src/symbol_decoder_context.h"
+#include "src/tile.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/entropy_decoder.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/stack.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr uint8_t kMaxVariableTransformTreeDepth = 2;
+// Max_Tx_Depth array from section 5.11.5 in the spec, with one modification:
+// if an element is not zero, one is subtracted from it. That is the only way
+// in which this array is used.
+constexpr int kTxDepthCdfIndex[kMaxBlockSizes] = { + 0, 0, 1, 0, 0, 1, 2, 1, 1, 1, 2, 3, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3}; + +constexpr TransformSize kMaxTransformSizeRectangle[kMaxBlockSizes] = { + kTransformSize4x4, kTransformSize4x8, kTransformSize4x16, + kTransformSize8x4, kTransformSize8x8, kTransformSize8x16, + kTransformSize8x32, kTransformSize16x4, kTransformSize16x8, + kTransformSize16x16, kTransformSize16x32, kTransformSize16x64, + kTransformSize32x8, kTransformSize32x16, kTransformSize32x32, + kTransformSize32x64, kTransformSize64x16, kTransformSize64x32, + kTransformSize64x64, kTransformSize64x64, kTransformSize64x64, + kTransformSize64x64}; + +TransformSize GetSquareTransformSize(uint8_t pixels) { + switch (pixels) { + case 128: + case 64: + return kTransformSize64x64; + case 32: + return kTransformSize32x32; + case 16: + return kTransformSize16x16; + case 8: + return kTransformSize8x8; + default: + return kTransformSize4x4; + } +} + +} // namespace + +int Tile::GetTopTransformWidth(const Block& block, int row4x4, int column4x4, + bool ignore_skip) { + if (row4x4 == block.row4x4) { + if (!block.top_available[kPlaneY]) return 64; + const BlockParameters& bp_top = + *block_parameters_holder_.Find(row4x4 - 1, column4x4); + if ((ignore_skip || bp_top.skip) && bp_top.is_inter) { + return kBlockWidthPixels[bp_top.size]; + } + } + return kTransformWidth[inter_transform_sizes_[row4x4 - 1][column4x4]]; +} + +int Tile::GetLeftTransformHeight(const Block& block, int row4x4, int column4x4, + bool ignore_skip) { + if (column4x4 == block.column4x4) { + if (!block.left_available[kPlaneY]) return 64; + const BlockParameters& bp_left = + *block_parameters_holder_.Find(row4x4, column4x4 - 1); + if ((ignore_skip || bp_left.skip) && bp_left.is_inter) { + return kBlockHeightPixels[bp_left.size]; + } + } + return kTransformHeight[inter_transform_sizes_[row4x4][column4x4 - 1]]; +} + +TransformSize Tile::ReadFixedTransformSize(const Block& block) { + BlockParameters& bp = *block.bp; + if (frame_header_.segmentation + .lossless[bp.prediction_parameters->segment_id]) { + return kTransformSize4x4; + } + const TransformSize max_rect_tx_size = kMaxTransformSizeRectangle[block.size]; + const bool allow_select = !bp.skip || !bp.is_inter; + if (block.size == kBlock4x4 || !allow_select || + frame_header_.tx_mode != kTxModeSelect) { + return max_rect_tx_size; + } + const int max_tx_width = kTransformWidth[max_rect_tx_size]; + const int max_tx_height = kTransformHeight[max_rect_tx_size]; + const int top_width = + block.top_available[kPlaneY] + ? GetTopTransformWidth(block, block.row4x4, block.column4x4, true) + : 0; + const int left_height = + block.left_available[kPlaneY] + ? GetLeftTransformHeight(block, block.row4x4, block.column4x4, true) + : 0; + const auto context = static_cast(top_width >= max_tx_width) + + static_cast(left_height >= max_tx_height); + const int cdf_index = kTxDepthCdfIndex[block.size]; + uint16_t* const cdf = + symbol_decoder_context_.tx_depth_cdf[cdf_index][context]; + const int tx_depth = (cdf_index == 0) + ? 
static_cast(reader_.ReadSymbol(cdf)) + : reader_.ReadSymbol<3>(cdf); + assert(tx_depth < 3); + TransformSize tx_size = max_rect_tx_size; + if (tx_depth == 0) return tx_size; + tx_size = kSplitTransformSize[tx_size]; + if (tx_depth == 1) return tx_size; + return kSplitTransformSize[tx_size]; +} + +void Tile::ReadVariableTransformTree(const Block& block, int row4x4, + int column4x4, TransformSize tx_size) { + const uint8_t pixels = std::max(block.width, block.height); + const TransformSize max_tx_size = GetSquareTransformSize(pixels); + const int context_delta = (kNumSquareTransformSizes - 1 - + TransformSizeToSquareTransformIndex(max_tx_size)) * + 6; + + // Branching factor is 4 and maximum depth is 2. So the maximum stack size + // necessary is (4 - 1) + 4 = 7. + Stack stack; + stack.Push(TransformTreeNode(column4x4, row4x4, tx_size, 0)); + + do { + TransformTreeNode node = stack.Pop(); + const int tx_width4x4 = kTransformWidth4x4[node.tx_size]; + const int tx_height4x4 = kTransformHeight4x4[node.tx_size]; + if (node.tx_size != kTransformSize4x4 && + node.depth != kMaxVariableTransformTreeDepth) { + const auto top = + static_cast(GetTopTransformWidth(block, node.y, node.x, false) < + kTransformWidth[node.tx_size]); + const auto left = static_cast( + GetLeftTransformHeight(block, node.y, node.x, false) < + kTransformHeight[node.tx_size]); + const int context = + static_cast(max_tx_size > kTransformSize8x8 && + kTransformSizeSquareMax[node.tx_size] != + max_tx_size) * + 3 + + context_delta + top + left; + // tx_split. + if (reader_.ReadSymbol(symbol_decoder_context_.tx_split_cdf[context])) { + const TransformSize sub_tx_size = kSplitTransformSize[node.tx_size]; + const int step_width4x4 = kTransformWidth4x4[sub_tx_size]; + const int step_height4x4 = kTransformHeight4x4[sub_tx_size]; + // The loops have to run in reverse order because we use a stack for + // DFS. + for (int i = tx_height4x4 - step_height4x4; i >= 0; + i -= step_height4x4) { + for (int j = tx_width4x4 - step_width4x4; j >= 0; + j -= step_width4x4) { + if (node.y + i >= frame_header_.rows4x4 || + node.x + j >= frame_header_.columns4x4) { + continue; + } + stack.Push(TransformTreeNode(node.x + j, node.y + i, sub_tx_size, + node.depth + 1)); + } + } + continue; + } + } + // tx_split is false. 
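+    // This node is therefore a leaf of the transform tree: record its
+    // transform size for every 4x4 unit it covers, so that later blocks can
+    // look up their neighbors' sizes in |inter_transform_sizes_| via
+    // GetTopTransformWidth() and GetLeftTransformHeight().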
+    for (int i = 0; i < tx_height4x4; ++i) {
+      static_assert(sizeof(TransformSize) == 1, "");
+      memset(&inter_transform_sizes_[node.y + i][node.x], node.tx_size,
+             tx_width4x4);
+    }
+  } while (!stack.Empty());
+}
+
+void Tile::DecodeTransformSize(const Block& block) {
+  BlockParameters& bp = *block.bp;
+  if (frame_header_.tx_mode == kTxModeSelect && block.size > kBlock4x4 &&
+      bp.is_inter && !bp.skip &&
+      !frame_header_.segmentation
+           .lossless[bp.prediction_parameters->segment_id]) {
+    const TransformSize max_tx_size = kMaxTransformSizeRectangle[block.size];
+    const int tx_width4x4 = kTransformWidth4x4[max_tx_size];
+    const int tx_height4x4 = kTransformHeight4x4[max_tx_size];
+    for (int row = block.row4x4; row < block.row4x4 + block.height4x4;
+         row += tx_height4x4) {
+      for (int column = block.column4x4;
+           column < block.column4x4 + block.width4x4; column += tx_width4x4) {
+        ReadVariableTransformTree(block, row, column, max_tx_size);
+      }
+    }
+  } else {
+    const TransformSize transform_size = ReadFixedTransformSize(block);
+    for (int row = block.row4x4; row < block.row4x4 + block.height4x4; ++row) {
+      static_assert(sizeof(TransformSize) == 1, "");
+      memset(&inter_transform_sizes_[row][block.column4x4], transform_size,
+             block.width4x4);
+    }
+  }
+}
+
+}  // namespace libgav1
diff --git a/src/tile/prediction.cc b/src/tile/prediction.cc
new file mode 100644
index 0000000..bba5a69
--- /dev/null
+++ b/src/tile/prediction.cc
@@ -0,0 +1,1349 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+
+#include "src/buffer_pool.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/motion_vector.h"
+#include "src/obu_parser.h"
+#include "src/prediction_mask.h"
+#include "src/tile.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+#include "src/utils/memory.h"
+#include "src/utils/types.h"
+#include "src/warp_prediction.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+namespace {
+
+// Import all the constants in the anonymous namespace.
+#include "src/inter_intra_masks.inc"
+
+// Precision bits when scaling reference frames.
+constexpr int kReferenceScaleShift = 14;
+constexpr int kAngleStep = 3;
+constexpr int kPredictionModeToAngle[kIntraPredictionModesUV] = {
+    0, 90, 180, 45, 135, 113, 157, 203, 67, 0, 0, 0, 0};
+
+// The following modes need both the left_column and top_row for intra
+// prediction. For directional modes, the left/top requirement is inferred
+// based on the prediction angle. For Dc modes, the left/top requirement is
+// inferred based on whether or not left/top is available.
+constexpr BitMaskSet kNeedsLeftAndTop(kPredictionModeSmooth, + kPredictionModeSmoothHorizontal, + kPredictionModeSmoothVertical, + kPredictionModePaeth); + +int16_t GetDirectionalIntraPredictorDerivative(const int angle) { + assert(angle >= 3); + assert(angle <= 87); + return kDirectionalIntraPredictorDerivative[DivideBy2(angle) - 1]; +} + +// Maps the block_size to an index as follows: +// kBlock8x8 => 0. +// kBlock8x16 => 1. +// kBlock8x32 => 2. +// kBlock16x8 => 3. +// kBlock16x16 => 4. +// kBlock16x32 => 5. +// kBlock32x8 => 6. +// kBlock32x16 => 7. +// kBlock32x32 => 8. +int GetWedgeBlockSizeIndex(BlockSize block_size) { + assert(block_size >= kBlock8x8); + return block_size - kBlock8x8 - static_cast(block_size >= kBlock16x8) - + static_cast(block_size >= kBlock32x8); +} + +// Maps a dimension of 4, 8, 16 and 32 to indices 0, 1, 2 and 3 respectively. +int GetInterIntraMaskLookupIndex(int dimension) { + assert(dimension == 4 || dimension == 8 || dimension == 16 || + dimension == 32); + return FloorLog2(dimension) - 2; +} + +// 7.11.2.9. +int GetIntraEdgeFilterStrength(int width, int height, int filter_type, + int delta) { + const int sum = width + height; + delta = std::abs(delta); + if (filter_type == 0) { + if (sum <= 8) { + if (delta >= 56) return 1; + } else if (sum <= 16) { + if (delta >= 40) return 1; + } else if (sum <= 24) { + if (delta >= 32) return 3; + if (delta >= 16) return 2; + if (delta >= 8) return 1; + } else if (sum <= 32) { + if (delta >= 32) return 3; + if (delta >= 4) return 2; + return 1; + } else { + return 3; + } + } else { + if (sum <= 8) { + if (delta >= 64) return 2; + if (delta >= 40) return 1; + } else if (sum <= 16) { + if (delta >= 48) return 2; + if (delta >= 20) return 1; + } else if (sum <= 24) { + if (delta >= 4) return 3; + } else { + return 3; + } + } + return 0; +} + +// 7.11.2.10. +bool DoIntraEdgeUpsampling(int width, int height, int filter_type, int delta) { + const int sum = width + height; + delta = std::abs(delta); + // This function should not be called when the prediction angle is 90 or 180. + assert(delta != 0); + if (delta >= 40) return false; + return (filter_type == 1) ? sum <= 8 : sum <= 16; +} + +constexpr uint8_t kQuantizedDistanceWeight[4][2] = { + {2, 3}, {2, 5}, {2, 7}, {1, kMaxFrameDistance}}; + +constexpr uint8_t kQuantizedDistanceLookup[4][2] = { + {9, 7}, {11, 5}, {12, 4}, {13, 3}}; + +void GetDistanceWeights(const int distance[2], int weight[2]) { + // Note: distance[0] and distance[1] correspond to relative distance + // between current frame and reference frame [1] and [0], respectively. 
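+  // For example, with distance == {3, 1} (so order == 0), the loop below
+  // exits at i == 2, the first row where 3 * 2 < 1 * 7, giving
+  // weight == {12, 4}. Note that every pair in kQuantizedDistanceLookup sums
+  // to 16.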
+  const int order = static_cast<int>(distance[0] <= distance[1]);
+  if (distance[0] == 0 || distance[1] == 0) {
+    weight[0] = kQuantizedDistanceLookup[3][order];
+    weight[1] = kQuantizedDistanceLookup[3][1 - order];
+  } else {
+    int i;
+    for (i = 0; i < 3; ++i) {
+      const int weight_0 = kQuantizedDistanceWeight[i][order];
+      const int weight_1 = kQuantizedDistanceWeight[i][1 - order];
+      if (order == 0) {
+        if (distance[0] * weight_0 < distance[1] * weight_1) break;
+      } else {
+        if (distance[0] * weight_0 > distance[1] * weight_1) break;
+      }
+    }
+    weight[0] = kQuantizedDistanceLookup[i][order];
+    weight[1] = kQuantizedDistanceLookup[i][1 - order];
+  }
+}
+
+dsp::IntraPredictor GetIntraPredictor(PredictionMode mode, bool has_left,
+                                      bool has_top) {
+  if (mode == kPredictionModeDc) {
+    if (has_left && has_top) {
+      return dsp::kIntraPredictorDc;
+    }
+    if (has_left) {
+      return dsp::kIntraPredictorDcLeft;
+    }
+    if (has_top) {
+      return dsp::kIntraPredictorDcTop;
+    }
+    return dsp::kIntraPredictorDcFill;
+  }
+  switch (mode) {
+    case kPredictionModePaeth:
+      return dsp::kIntraPredictorPaeth;
+    case kPredictionModeSmooth:
+      return dsp::kIntraPredictorSmooth;
+    case kPredictionModeSmoothVertical:
+      return dsp::kIntraPredictorSmoothVertical;
+    case kPredictionModeSmoothHorizontal:
+      return dsp::kIntraPredictorSmoothHorizontal;
+    default:
+      return dsp::kNumIntraPredictors;
+  }
+}
+
+uint8_t* GetStartPoint(Array2DView<uint8_t>* const buffer, const int plane,
+                       const int x, const int y, const int bitdepth) {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  if (bitdepth > 8) {
+    Array2DView<uint16_t> buffer16(
+        buffer[plane].rows(), buffer[plane].columns() / sizeof(uint16_t),
+        reinterpret_cast<uint16_t*>(&buffer[plane][0][0]));
+    return reinterpret_cast<uint8_t*>(&buffer16[y][x]);
+  }
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+  static_cast<void>(bitdepth);
+  return &buffer[plane][y][x];
+}
+
+int GetPixelPositionFromHighScale(int start, int step, int offset) {
+  return (start + step * offset) >> kScaleSubPixelBits;
+}
+
+dsp::MaskBlendFunc GetMaskBlendFunc(const dsp::Dsp& dsp, bool is_inter_intra,
+                                    bool is_wedge_inter_intra,
+                                    int subsampling_x, int subsampling_y) {
+  return (is_inter_intra && !is_wedge_inter_intra)
+             ? dsp.mask_blend[0][/*is_inter_intra=*/true]
+             : dsp.mask_blend[subsampling_x + subsampling_y][is_inter_intra];
+}
+
+}  // namespace
+
+template <typename Pixel>
+void Tile::IntraPrediction(const Block& block, Plane plane, int x, int y,
+                           bool has_left, bool has_top, bool has_top_right,
+                           bool has_bottom_left, PredictionMode mode,
+                           TransformSize tx_size) {
+  const int width = kTransformWidth[tx_size];
+  const int height = kTransformHeight[tx_size];
+  const int x_shift = subsampling_x_[plane];
+  const int y_shift = subsampling_y_[plane];
+  const int max_x = (MultiplyBy4(frame_header_.columns4x4) >> x_shift) - 1;
+  const int max_y = (MultiplyBy4(frame_header_.rows4x4) >> y_shift) - 1;
+  // For performance reasons, do not initialize the following two buffers.
+  alignas(kMaxAlignment) Pixel top_row_data[160];
+  alignas(kMaxAlignment) Pixel left_column_data[160];
+#if LIBGAV1_MSAN
+  if (IsDirectionalMode(mode)) {
+    memset(top_row_data, 0, sizeof(top_row_data));
+    memset(left_column_data, 0, sizeof(left_column_data));
+  }
+#endif
+  // Some predictors use |top_row_data| and |left_column_data| with a negative
+  // offset to access pixels to the top-left of the current block. So have
+  // some space before the arrays to allow populating those without having to
+  // move the rest of the array.
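+  // With the +16 offsets below, top_row[-1] and left_column[-1] both hold the
+  // neighbor just above-left of the block, and up to 15 further pixels before
+  // them stay addressable without shifting the rest of the arrays.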
+  Pixel* const top_row = top_row_data + 16;
+  Pixel* const left_column = left_column_data + 16;
+  const int bitdepth = sequence_header_.color_config.bitdepth;
+  const int top_and_left_size = width + height;
+  const bool is_directional_mode = IsDirectionalMode(mode);
+  const PredictionParameters& prediction_parameters =
+      *block.bp->prediction_parameters;
+  const bool use_filter_intra =
+      (plane == kPlaneY && prediction_parameters.use_filter_intra);
+  const int prediction_angle =
+      is_directional_mode
+          ? kPredictionModeToAngle[mode] +
+                prediction_parameters.angle_delta[GetPlaneType(plane)] *
+                    kAngleStep
+          : 0;
+  // Directional prediction requires buffers larger than the width or height.
+  const int top_size = is_directional_mode ? top_and_left_size : width;
+  const int left_size = is_directional_mode ? top_and_left_size : height;
+  const int top_right_size =
+      is_directional_mode ? (has_top_right ? 2 : 1) * width : width;
+  const int bottom_left_size =
+      is_directional_mode ? (has_bottom_left ? 2 : 1) * height : height;
+
+  Array2DView<Pixel> buffer(buffer_[plane].rows(),
+                            buffer_[plane].columns() / sizeof(Pixel),
+                            reinterpret_cast<Pixel*>(&buffer_[plane][0][0]));
+  const bool needs_top = use_filter_intra || kNeedsLeftAndTop.Contains(mode) ||
+                         (is_directional_mode && prediction_angle < 180) ||
+                         (mode == kPredictionModeDc && has_top);
+  const bool needs_left = use_filter_intra ||
+                          kNeedsLeftAndTop.Contains(mode) ||
+                          (is_directional_mode && prediction_angle > 90) ||
+                          (mode == kPredictionModeDc && has_left);
+
+  const Pixel* top_row_src = buffer[y - 1];
+
+  // Determine if we need to retrieve the top row from
+  // |intra_prediction_buffer_|.
+  if ((needs_top || needs_left) && use_intra_prediction_buffer_) {
+    // Superblock index of block.row4x4. block.row4x4 is always in luma
+    // dimension (no subsampling).
+    const int current_superblock_index =
+        block.row4x4 >> (sequence_header_.use_128x128_superblock ? 5 : 4);
+    // Superblock index of y - 1. y is in the plane dimension (chroma planes
+    // could be subsampled).
+    const int plane_shift = (sequence_header_.use_128x128_superblock ? 7 : 6) -
+                            subsampling_y_[plane];
+    const int top_row_superblock_index = (y - 1) >> plane_shift;
+    // If the superblock index of y - 1 is not that of the current superblock,
+    // then we will have to retrieve the top row from the
+    // |intra_prediction_buffer_|.
+    if (current_superblock_index != top_row_superblock_index) {
+      top_row_src = reinterpret_cast<const Pixel*>(
+          (*intra_prediction_buffer_)[plane].get());
+    }
+  }
+
+  if (needs_top) {
+    // Compute top_row.
+    if (has_top || has_left) {
+      const int left_index = has_left ? x - 1 : x;
+      top_row[-1] = has_top ? top_row_src[left_index] : buffer[y][left_index];
+    } else {
+      top_row[-1] = 1 << (bitdepth - 1);
+    }
+    if (!has_top && has_left) {
+      Memset(top_row, buffer[y][x - 1], top_size);
+    } else if (!has_top && !has_left) {
+      Memset(top_row, (1 << (bitdepth - 1)) - 1, top_size);
+    } else {
+      const int top_limit = std::min(max_x - x + 1, top_right_size);
+      memcpy(top_row, &top_row_src[x], top_limit * sizeof(Pixel));
+      // Even though it is safe to call Memset with a size of 0, accessing
+      // top_row_src[top_limit + x - 1] is not allowed when this condition is
+      // false.
+      if (top_size - top_limit > 0) {
+        Memset(top_row + top_limit, top_row_src[top_limit + x - 1],
+               top_size - top_limit);
+      }
+    }
+  }
+  if (needs_left) {
+    // Compute left_column.
+    if (has_top || has_left) {
+      const int left_index = has_left ? x - 1 : x;
+      left_column[-1] =
+          has_top ? top_row_src[left_index] : buffer[y][left_index];
+    } else {
+      left_column[-1] = 1 << (bitdepth - 1);
+    }
+    if (!has_left && has_top) {
+      Memset(left_column, top_row_src[x], left_size);
+    } else if (!has_left && !has_top) {
+      Memset(left_column, (1 << (bitdepth - 1)) + 1, left_size);
+    } else {
+      const int left_limit = std::min(max_y - y + 1, bottom_left_size);
+      for (int i = 0; i < left_limit; ++i) {
+        left_column[i] = buffer[y + i][x - 1];
+      }
+      // Even though it is safe to call Memset with a size of 0, accessing
+      // buffer[left_limit + y - 1][x - 1] is not allowed when this condition
+      // is false.
+      if (left_size - left_limit > 0) {
+        Memset(left_column + left_limit, buffer[left_limit + y - 1][x - 1],
+               left_size - left_limit);
+      }
+    }
+  }
+  Pixel* const dest = &buffer[y][x];
+  const ptrdiff_t dest_stride = buffer_[plane].columns();
+  if (use_filter_intra) {
+    dsp_.filter_intra_predictor(dest, dest_stride, top_row, left_column,
+                                prediction_parameters.filter_intra_mode, width,
+                                height);
+  } else if (is_directional_mode) {
+    DirectionalPrediction(block, plane, x, y, has_left, has_top, needs_left,
+                          needs_top, prediction_angle, width, height, max_x,
+                          max_y, tx_size, top_row, left_column);
+  } else {
+    const dsp::IntraPredictor predictor =
+        GetIntraPredictor(mode, has_left, has_top);
+    assert(predictor != dsp::kNumIntraPredictors);
+    dsp_.intra_predictors[tx_size][predictor](dest, dest_stride, top_row,
+                                              left_column);
+  }
+}
+
+template void Tile::IntraPrediction<uint8_t>(const Block& block, Plane plane,
+                                             int x, int y, bool has_left,
+                                             bool has_top, bool has_top_right,
+                                             bool has_bottom_left,
+                                             PredictionMode mode,
+                                             TransformSize tx_size);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template void Tile::IntraPrediction<uint16_t>(const Block& block, Plane plane,
+                                              int x, int y, bool has_left,
+                                              bool has_top, bool has_top_right,
+                                              bool has_bottom_left,
+                                              PredictionMode mode,
+                                              TransformSize tx_size);
+#endif
+
+int Tile::GetIntraEdgeFilterType(const Block& block, Plane plane) const {
+  bool top;
+  bool left;
+  if (plane == kPlaneY) {
+    top = block.top_available[kPlaneY] &&
+          kPredictionModeSmoothMask.Contains(block.bp_top->y_mode);
+    left = block.left_available[kPlaneY] &&
+           kPredictionModeSmoothMask.Contains(block.bp_left->y_mode);
+  } else {
+    top = block.top_available[plane] &&
+          block.bp->prediction_parameters->chroma_top_uses_smooth_prediction;
+    left = block.left_available[plane] &&
+           block.bp->prediction_parameters->chroma_left_uses_smooth_prediction;
+  }
+  return static_cast<int>(top || left);
+}
+
+template <typename Pixel>
+void Tile::DirectionalPrediction(const Block& block, Plane plane, int x, int y,
+                                 bool has_left, bool has_top, bool needs_left,
+                                 bool needs_top, int prediction_angle,
+                                 int width, int height, int max_x, int max_y,
+                                 TransformSize tx_size, Pixel* const top_row,
+                                 Pixel* const left_column) {
+  Array2DView<Pixel> buffer(buffer_[plane].rows(),
+                            buffer_[plane].columns() / sizeof(Pixel),
+                            reinterpret_cast<Pixel*>(&buffer_[plane][0][0]));
+  Pixel* const dest = &buffer[y][x];
+  const ptrdiff_t stride = buffer_[plane].columns();
+  if (prediction_angle == 90) {
+    dsp_.intra_predictors[tx_size][dsp::kIntraPredictorVertical](
+        dest, stride, top_row, left_column);
+    return;
+  }
+  if (prediction_angle == 180) {
+    dsp_.intra_predictors[tx_size][dsp::kIntraPredictorHorizontal](
+        dest, stride, top_row, left_column);
+    return;
+  }
+
+  bool upsampled_top = false;
+  bool upsampled_left = false;
+  if (sequence_header_.enable_intra_edge_filter) {
+    const int filter_type = GetIntraEdgeFilterType(block, plane);
+    if (prediction_angle > 90 && prediction_angle < 180 &&
+        (width + height) >= 24) {
+      // 7.11.2.7.
+      left_column[-1] = top_row[-1] = RightShiftWithRounding(
+          left_column[0] * 5 + top_row[-1] * 6 + top_row[0] * 5, 4);
+    }
+    if (has_top && needs_top) {
+      const int strength = GetIntraEdgeFilterStrength(
+          width, height, filter_type, prediction_angle - 90);
+      if (strength > 0) {
+        const int num_pixels = std::min(width, max_x - x + 1) +
+                               ((prediction_angle < 90) ? height : 0) + 1;
+        dsp_.intra_edge_filter(top_row - 1, num_pixels, strength);
+      }
+    }
+    if (has_left && needs_left) {
+      const int strength = GetIntraEdgeFilterStrength(
+          width, height, filter_type, prediction_angle - 180);
+      if (strength > 0) {
+        const int num_pixels = std::min(height, max_y - y + 1) +
+                               ((prediction_angle > 180) ? width : 0) + 1;
+        dsp_.intra_edge_filter(left_column - 1, num_pixels, strength);
+      }
+    }
+    upsampled_top = DoIntraEdgeUpsampling(width, height, filter_type,
+                                          prediction_angle - 90);
+    if (upsampled_top && needs_top) {
+      const int num_pixels = width + ((prediction_angle < 90) ? height : 0);
+      dsp_.intra_edge_upsampler(top_row, num_pixels);
+    }
+    upsampled_left = DoIntraEdgeUpsampling(width, height, filter_type,
+                                           prediction_angle - 180);
+    if (upsampled_left && needs_left) {
+      const int num_pixels = height + ((prediction_angle > 180) ? width : 0);
+      dsp_.intra_edge_upsampler(left_column, num_pixels);
+    }
+  }
+
+  if (prediction_angle < 90) {
+    const int dx = GetDirectionalIntraPredictorDerivative(prediction_angle);
+    dsp_.directional_intra_predictor_zone1(dest, stride, top_row, width,
+                                           height, dx, upsampled_top);
+  } else if (prediction_angle < 180) {
+    const int dx =
+        GetDirectionalIntraPredictorDerivative(180 - prediction_angle);
+    const int dy =
+        GetDirectionalIntraPredictorDerivative(prediction_angle - 90);
+    dsp_.directional_intra_predictor_zone2(dest, stride, top_row, left_column,
+                                           width, height, dx, dy,
+                                           upsampled_top, upsampled_left);
+  } else {
+    assert(prediction_angle < 270);
+    const int dy =
+        GetDirectionalIntraPredictorDerivative(270 - prediction_angle);
+    dsp_.directional_intra_predictor_zone3(dest, stride, left_column, width,
+                                           height, dy, upsampled_left);
+  }
+}
+
+template <typename Pixel>
+void Tile::PalettePrediction(const Block& block, const Plane plane,
+                             const int start_x, const int start_y, const int x,
+                             const int y, const TransformSize tx_size) {
+  const int tx_width = kTransformWidth[tx_size];
+  const int tx_height = kTransformHeight[tx_size];
+  const uint16_t* const palette =
+      block.bp->prediction_parameters->palette_mode_info.color[plane];
+  const PlaneType plane_type = GetPlaneType(plane);
+  const int x4 = MultiplyBy4(x);
+  const int y4 = MultiplyBy4(y);
+  Array2DView<Pixel> buffer(buffer_[plane].rows(),
+                            buffer_[plane].columns() / sizeof(Pixel),
+                            reinterpret_cast<Pixel*>(&buffer_[plane][0][0]));
+  for (int row = 0; row < tx_height; ++row) {
+    assert(block.bp->prediction_parameters
+               ->color_index_map[plane_type][y4 + row] != nullptr);
+    for (int column = 0; column < tx_width; ++column) {
+      buffer[start_y + row][start_x + column] =
+          palette[block.bp->prediction_parameters
+                      ->color_index_map[plane_type][y4 + row][x4 + column]];
+    }
+  }
+}
+
+template void Tile::PalettePrediction<uint8_t>(
+    const Block& block, const Plane plane, const int start_x,
+    const int start_y, const int x, const int y, const TransformSize tx_size);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template void Tile::PalettePrediction<uint16_t>(
+    const Block& block, const Plane plane, const int start_x,
+    const int start_y, const int x, const int y, const TransformSize tx_size);
+#endif
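+
+// Chroma-from-luma (CfL) prediction forms each chroma sample as
+// alpha * (subsampled luma - luma average) added to the chroma DC prediction
+// (section 7.11.5 of the spec). The luma samples are therefore downsampled
+// into cfl_luma_buffer once and reused for both the U and V planes;
+// cfl_luma_buffer_valid guards that reuse.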
+
+template <typename Pixel>
+void Tile::ChromaFromLumaPrediction(const Block& block, const Plane plane,
+                                    const int start_x, const int start_y,
+                                    const TransformSize tx_size) {
+  const int subsampling_x = subsampling_x_[plane];
+  const int subsampling_y = subsampling_y_[plane];
+  const PredictionParameters& prediction_parameters =
+      *block.bp->prediction_parameters;
+  Array2DView<Pixel> y_buffer(
+      buffer_[kPlaneY].rows(), buffer_[kPlaneY].columns() / sizeof(Pixel),
+      reinterpret_cast<Pixel*>(&buffer_[kPlaneY][0][0]));
+  if (!block.scratch_buffer->cfl_luma_buffer_valid) {
+    const int luma_x = start_x << subsampling_x;
+    const int luma_y = start_y << subsampling_y;
+    dsp_.cfl_subsamplers[tx_size][subsampling_x + subsampling_y](
+        block.scratch_buffer->cfl_luma_buffer,
+        prediction_parameters.max_luma_width - luma_x,
+        prediction_parameters.max_luma_height - luma_y,
+        reinterpret_cast<uint8_t*>(&y_buffer[luma_y][luma_x]),
+        buffer_[kPlaneY].columns());
+    block.scratch_buffer->cfl_luma_buffer_valid = true;
+  }
+  Array2DView<Pixel> buffer(buffer_[plane].rows(),
+                            buffer_[plane].columns() / sizeof(Pixel),
+                            reinterpret_cast<Pixel*>(&buffer_[plane][0][0]));
+  dsp_.cfl_intra_predictors[tx_size](
+      reinterpret_cast<uint8_t*>(&buffer[start_y][start_x]),
+      buffer_[plane].columns(), block.scratch_buffer->cfl_luma_buffer,
+      (plane == kPlaneU) ? prediction_parameters.cfl_alpha_u
+                         : prediction_parameters.cfl_alpha_v);
+}
+
+template void Tile::ChromaFromLumaPrediction<uint8_t>(
+    const Block& block, const Plane plane, const int start_x,
+    const int start_y, const TransformSize tx_size);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template void Tile::ChromaFromLumaPrediction<uint16_t>(
+    const Block& block, const Plane plane, const int start_x,
+    const int start_y, const TransformSize tx_size);
+#endif
+
+void Tile::InterIntraPrediction(
+    uint16_t* const prediction_0, const uint8_t* const prediction_mask,
+    const ptrdiff_t prediction_mask_stride,
+    const PredictionParameters& prediction_parameters,
+    const int prediction_width, const int prediction_height,
+    const int subsampling_x, const int subsampling_y, uint8_t* const dest,
+    const ptrdiff_t dest_stride) {
+  assert(prediction_mask != nullptr);
+  assert(prediction_parameters.compound_prediction_type ==
+             kCompoundPredictionTypeIntra ||
+         prediction_parameters.compound_prediction_type ==
+             kCompoundPredictionTypeWedge);
+  // The first buffer of InterIntra is from inter prediction.
+  // The second buffer is from intra prediction.
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  if (sequence_header_.color_config.bitdepth > 8) {
+    GetMaskBlendFunc(dsp_, /*is_inter_intra=*/true,
+                     prediction_parameters.is_wedge_inter_intra, subsampling_x,
+                     subsampling_y)(
+        prediction_0, reinterpret_cast<uint16_t*>(dest),
+        dest_stride / sizeof(uint16_t), prediction_mask,
+        prediction_mask_stride, prediction_width, prediction_height, dest,
+        dest_stride);
+    return;
+  }
+#endif
+  const int function_index = prediction_parameters.is_wedge_inter_intra
+                                 ? subsampling_x + subsampling_y
+                                 : 0;
+  // |is_inter_intra| prediction values are stored in a Pixel buffer but it is
+  // currently declared as a uint16_t buffer.
+  // TODO(johannkoenig): convert the prediction buffer to a uint8_t buffer and
+  // remove the reinterpret_cast.
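+  // Each output pixel is a convex combination of the inter and intra
+  // predictions in 6-bit fixed point: the two mask weights applied to a pixel
+  // always sum to 64.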
+  dsp_.inter_intra_mask_blend_8bpp[function_index](
+      reinterpret_cast<uint8_t*>(prediction_0), dest, dest_stride,
+      prediction_mask, prediction_mask_stride, prediction_width,
+      prediction_height);
+}
+
+void Tile::CompoundInterPrediction(
+    const Block& block, const uint8_t* const prediction_mask,
+    const ptrdiff_t prediction_mask_stride, const int prediction_width,
+    const int prediction_height, const int subsampling_x,
+    const int subsampling_y, const int candidate_row,
+    const int candidate_column, uint8_t* dest, const ptrdiff_t dest_stride) {
+  const PredictionParameters& prediction_parameters =
+      *block.bp->prediction_parameters;
+
+  void* prediction[2];
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  const int bitdepth = sequence_header_.color_config.bitdepth;
+  if (bitdepth > 8) {
+    prediction[0] = block.scratch_buffer->prediction_buffer[0];
+    prediction[1] = block.scratch_buffer->prediction_buffer[1];
+  } else {
+#endif
+    prediction[0] = block.scratch_buffer->compound_prediction_buffer_8bpp[0];
+    prediction[1] = block.scratch_buffer->compound_prediction_buffer_8bpp[1];
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  }
+#endif
+
+  switch (prediction_parameters.compound_prediction_type) {
+    case kCompoundPredictionTypeWedge:
+    case kCompoundPredictionTypeDiffWeighted:
+      GetMaskBlendFunc(dsp_, /*is_inter_intra=*/false,
+                       prediction_parameters.is_wedge_inter_intra,
+                       subsampling_x, subsampling_y)(
+          prediction[0], prediction[1],
+          /*prediction_stride=*/prediction_width, prediction_mask,
+          prediction_mask_stride, prediction_width, prediction_height, dest,
+          dest_stride);
+      break;
+    case kCompoundPredictionTypeDistance:
+      DistanceWeightedPrediction(prediction[0], prediction[1],
+                                 prediction_width, prediction_height,
+                                 candidate_row, candidate_column, dest,
+                                 dest_stride);
+      break;
+    default:
+      assert(prediction_parameters.compound_prediction_type ==
+             kCompoundPredictionTypeAverage);
+      dsp_.average_blend(prediction[0], prediction[1], prediction_width,
+                         prediction_height, dest, dest_stride);
+      break;
+  }
+}
+
+GlobalMotion* Tile::GetWarpParams(
+    const Block& block, const Plane plane, const int prediction_width,
+    const int prediction_height,
+    const PredictionParameters& prediction_parameters,
+    const ReferenceFrameType reference_type, bool* const is_local_valid,
+    GlobalMotion* const global_motion_params,
+    GlobalMotion* const local_warp_params) const {
+  if (prediction_width < 8 || prediction_height < 8 ||
+      frame_header_.force_integer_mv == 1) {
+    return nullptr;
+  }
+  if (plane == kPlaneY) {
+    *is_local_valid =
+        prediction_parameters.motion_mode == kMotionModeLocalWarp &&
+        WarpEstimation(
+            prediction_parameters.num_warp_samples,
+            DivideBy4(prediction_width), DivideBy4(prediction_height),
+            block.row4x4, block.column4x4, block.bp->mv.mv[0],
+            prediction_parameters.warp_estimate_candidates,
+            local_warp_params) &&
+        SetupShear(local_warp_params);
+  }
+  if (prediction_parameters.motion_mode == kMotionModeLocalWarp &&
+      *is_local_valid) {
+    return local_warp_params;
+  }
+  if (!IsScaled(reference_type)) {
+    GlobalMotionTransformationType global_motion_type =
+        (reference_type != kReferenceFrameIntra)
+            ? global_motion_params->type
+            : kNumGlobalMotionTransformationTypes;
+    const bool is_global_valid =
+        IsGlobalMvBlock(*block.bp, global_motion_type) &&
+        SetupShear(global_motion_params);
+    // Valid global motion type implies reference type can't be intra.
+    assert(!is_global_valid || reference_type != kReferenceFrameIntra);
+    if (is_global_valid) return global_motion_params;
+  }
+  return nullptr;
+}
+
+bool Tile::InterPrediction(const Block& block, const Plane plane, const int x,
+                           const int y, const int prediction_width,
+                           const int prediction_height, int candidate_row,
+                           int candidate_column, bool* const is_local_valid,
+                           GlobalMotion* const local_warp_params) {
+  const int bitdepth = sequence_header_.color_config.bitdepth;
+  const BlockParameters& bp = *block.bp;
+  const BlockParameters& bp_reference =
+      *block_parameters_holder_.Find(candidate_row, candidate_column);
+  const bool is_compound =
+      bp_reference.reference_frame[1] > kReferenceFrameIntra;
+  assert(bp.is_inter);
+  const bool is_inter_intra = bp.reference_frame[1] == kReferenceFrameIntra;
+
+  const PredictionParameters& prediction_parameters =
+      *block.bp->prediction_parameters;
+  uint8_t* const dest = GetStartPoint(buffer_, plane, x, y, bitdepth);
+  const ptrdiff_t dest_stride = buffer_[plane].columns();  // In bytes.
+  for (int index = 0; index < 1 + static_cast<int>(is_compound); ++index) {
+    const ReferenceFrameType reference_type =
+        bp_reference.reference_frame[index];
+    GlobalMotion global_motion_params =
+        frame_header_.global_motion[reference_type];
+    GlobalMotion* warp_params =
+        GetWarpParams(block, plane, prediction_width, prediction_height,
+                      prediction_parameters, reference_type, is_local_valid,
+                      &global_motion_params, local_warp_params);
+    if (warp_params != nullptr) {
+      if (!BlockWarpProcess(block, plane, index, x, y, prediction_width,
+                            prediction_height, warp_params, is_compound,
+                            is_inter_intra, dest, dest_stride)) {
+        return false;
+      }
+    } else {
+      const int reference_index =
+          prediction_parameters.use_intra_block_copy
+              ? -1
+              : frame_header_.reference_frame_index[reference_type -
+                                                    kReferenceFrameLast];
+      if (!BlockInterPrediction(
+              block, plane, reference_index, bp_reference.mv.mv[index], x, y,
+              prediction_width, prediction_height, candidate_row,
+              candidate_column, block.scratch_buffer->prediction_buffer[index],
+              is_compound, is_inter_intra, dest, dest_stride)) {
+        return false;
+      }
+    }
+  }
+
+  const int subsampling_x = subsampling_x_[plane];
+  const int subsampling_y = subsampling_y_[plane];
+  ptrdiff_t prediction_mask_stride = 0;
+  const uint8_t* prediction_mask = nullptr;
+  if (prediction_parameters.compound_prediction_type ==
+      kCompoundPredictionTypeWedge) {
+    const Array2D<uint8_t>& wedge_mask =
+        wedge_masks_[GetWedgeBlockSizeIndex(block.size)]
+                    [prediction_parameters.wedge_sign]
+                    [prediction_parameters.wedge_index];
+    prediction_mask = wedge_mask[0];
+    prediction_mask_stride = wedge_mask.columns();
+  } else if (prediction_parameters.compound_prediction_type ==
+             kCompoundPredictionTypeIntra) {
+    // 7.11.3.13. The inter intra masks are precomputed and stored as a set of
+    // look up tables.
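+    // kInterIntraMasks is indexed as [inter_intra_mode][log2(width) - 2]
+    // [log2(height) - 2]; e.g. a 16x8 inter-intra block uses
+    // kInterIntraMasks[mode][2][1], and the mask rows are stored with a
+    // stride equal to the prediction width.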
+    assert(prediction_parameters.inter_intra_mode < kNumInterIntraModes);
+    prediction_mask =
+        kInterIntraMasks[prediction_parameters.inter_intra_mode]
+                        [GetInterIntraMaskLookupIndex(prediction_width)]
+                        [GetInterIntraMaskLookupIndex(prediction_height)];
+    prediction_mask_stride = prediction_width;
+  } else if (prediction_parameters.compound_prediction_type ==
+             kCompoundPredictionTypeDiffWeighted) {
+    if (plane == kPlaneY) {
+      assert(prediction_width >= 8);
+      assert(prediction_height >= 8);
+      dsp_.weight_mask[FloorLog2(prediction_width) - 3]
+                      [FloorLog2(prediction_height) - 3]
+                      [static_cast<int>(
+                          prediction_parameters.mask_is_inverse)](
+          block.scratch_buffer->prediction_buffer[0],
+          block.scratch_buffer->prediction_buffer[1],
+          block.scratch_buffer->weight_mask, kMaxSuperBlockSizeInPixels);
+    }
+    prediction_mask = block.scratch_buffer->weight_mask;
+    prediction_mask_stride = kMaxSuperBlockSizeInPixels;
+  }
+
+  if (is_compound) {
+    CompoundInterPrediction(block, prediction_mask, prediction_mask_stride,
+                            prediction_width, prediction_height, subsampling_x,
+                            subsampling_y, candidate_row, candidate_column,
+                            dest, dest_stride);
+  } else if (prediction_parameters.motion_mode == kMotionModeObmc) {
+    // Obmc mode is allowed only for single reference (!is_compound).
+    return ObmcPrediction(block, plane, prediction_width, prediction_height);
+  } else if (is_inter_intra) {
+    // InterIntra and obmc must be mutually exclusive.
+    InterIntraPrediction(
+        block.scratch_buffer->prediction_buffer[0], prediction_mask,
+        prediction_mask_stride, prediction_parameters, prediction_width,
+        prediction_height, subsampling_x, subsampling_y, dest, dest_stride);
+  }
+  return true;
+}
+
+bool Tile::ObmcBlockPrediction(const Block& block, const MotionVector& mv,
+                               const Plane plane,
+                               const int reference_frame_index,
+                               const int width, const int height, const int x,
+                               const int y, const int candidate_row,
+                               const int candidate_column,
+                               const ObmcDirection blending_direction) {
+  const int bitdepth = sequence_header_.color_config.bitdepth;
+  // Obmc's prediction needs to be clipped before blending with above/left
+  // prediction blocks.
+  // Obmc prediction is used only when is_compound is false. So it is safe to
+  // use prediction_buffer[1] as a temporary buffer for the Obmc prediction.
+  static_assert(sizeof(block.scratch_buffer->prediction_buffer[1]) >=
+                    64 * 64 * sizeof(uint16_t),
+                "");
+  auto* const obmc_buffer =
+      reinterpret_cast<uint8_t*>(block.scratch_buffer->prediction_buffer[1]);
+  const ptrdiff_t obmc_buffer_stride =
+      (bitdepth == 8) ?
width : width * sizeof(uint16_t); + if (!BlockInterPrediction(block, plane, reference_frame_index, mv, x, y, + width, height, candidate_row, candidate_column, + nullptr, false, false, obmc_buffer, + obmc_buffer_stride)) { + return false; + } + + uint8_t* const prediction = GetStartPoint(buffer_, plane, x, y, bitdepth); + const ptrdiff_t prediction_stride = buffer_[plane].columns(); + dsp_.obmc_blend[blending_direction](prediction, prediction_stride, width, + height, obmc_buffer, obmc_buffer_stride); + return true; +} + +bool Tile::ObmcPrediction(const Block& block, const Plane plane, + const int width, const int height) { + const int subsampling_x = subsampling_x_[plane]; + const int subsampling_y = subsampling_y_[plane]; + if (block.top_available[kPlaneY] && + !IsBlockSmallerThan8x8(block.residual_size[plane])) { + const int num_limit = std::min(uint8_t{4}, k4x4WidthLog2[block.size]); + const int column4x4_max = + std::min(block.column4x4 + block.width4x4, frame_header_.columns4x4); + const int candidate_row = block.row4x4 - 1; + const int block_start_y = MultiplyBy4(block.row4x4) >> subsampling_y; + int column4x4 = block.column4x4; + const int prediction_height = std::min(height >> 1, 32 >> subsampling_y); + for (int i = 0, step; i < num_limit && column4x4 < column4x4_max; + column4x4 += step) { + const int candidate_column = column4x4 | 1; + const BlockParameters& bp_top = + *block_parameters_holder_.Find(candidate_row, candidate_column); + const int candidate_block_size = bp_top.size; + step = Clip3(kNum4x4BlocksWide[candidate_block_size], 2, 16); + if (bp_top.reference_frame[0] > kReferenceFrameIntra) { + i++; + const int candidate_reference_frame_index = + frame_header_.reference_frame_index[bp_top.reference_frame[0] - + kReferenceFrameLast]; + const int prediction_width = + std::min(width, MultiplyBy4(step) >> subsampling_x); + if (!ObmcBlockPrediction( + block, bp_top.mv.mv[0], plane, candidate_reference_frame_index, + prediction_width, prediction_height, + MultiplyBy4(column4x4) >> subsampling_x, block_start_y, + candidate_row, candidate_column, kObmcDirectionVertical)) { + return false; + } + } + } + } + + if (block.left_available[kPlaneY]) { + const int num_limit = std::min(uint8_t{4}, k4x4HeightLog2[block.size]); + const int row4x4_max = + std::min(block.row4x4 + block.height4x4, frame_header_.rows4x4); + const int candidate_column = block.column4x4 - 1; + int row4x4 = block.row4x4; + const int block_start_x = MultiplyBy4(block.column4x4) >> subsampling_x; + const int prediction_width = std::min(width >> 1, 32 >> subsampling_x); + for (int i = 0, step; i < num_limit && row4x4 < row4x4_max; + row4x4 += step) { + const int candidate_row = row4x4 | 1; + const BlockParameters& bp_left = + *block_parameters_holder_.Find(candidate_row, candidate_column); + const int candidate_block_size = bp_left.size; + step = Clip3(kNum4x4BlocksHigh[candidate_block_size], 2, 16); + if (bp_left.reference_frame[0] > kReferenceFrameIntra) { + i++; + const int candidate_reference_frame_index = + frame_header_.reference_frame_index[bp_left.reference_frame[0] - + kReferenceFrameLast]; + const int prediction_height = + std::min(height, MultiplyBy4(step) >> subsampling_y); + if (!ObmcBlockPrediction( + block, bp_left.mv.mv[0], plane, candidate_reference_frame_index, + prediction_width, prediction_height, block_start_x, + MultiplyBy4(row4x4) >> subsampling_y, candidate_row, + candidate_column, kObmcDirectionHorizontal)) { + return false; + } + } + } + } + return true; +} + +void 
Tile::DistanceWeightedPrediction(void* prediction_0, void* prediction_1,
+                                 const int width, const int height,
+                                 const int candidate_row,
+                                 const int candidate_column, uint8_t* dest,
+                                 ptrdiff_t dest_stride) {
+  int distance[2];
+  int weight[2];
+  for (int reference = 0; reference < 2; ++reference) {
+    const BlockParameters& bp =
+        *block_parameters_holder_.Find(candidate_row, candidate_column);
+    // Note: distance[0] and distance[1] correspond to relative distance
+    // between current frame and reference frame [1] and [0], respectively.
+    distance[1 - reference] = std::min(
+        std::abs(static_cast<int>(
+            current_frame_.reference_info()
+                ->relative_distance_from[bp.reference_frame[reference]])),
+        static_cast<int>(kMaxFrameDistance));
+  }
+  GetDistanceWeights(distance, weight);
+
+  dsp_.distance_weighted_blend(prediction_0, prediction_1, weight[0],
+                               weight[1], width, height, dest, dest_stride);
+}
+
+void Tile::ScaleMotionVector(const MotionVector& mv, const Plane plane,
+                             const int reference_frame_index, const int x,
+                             const int y, int* const start_x,
+                             int* const start_y, int* const step_x,
+                             int* const step_y) {
+  const int reference_upscaled_width =
+      (reference_frame_index == -1)
+          ? frame_header_.upscaled_width
+          : reference_frames_[reference_frame_index]->upscaled_width();
+  const int reference_height =
+      (reference_frame_index == -1)
+          ? frame_header_.height
+          : reference_frames_[reference_frame_index]->frame_height();
+  assert(2 * frame_header_.width >= reference_upscaled_width &&
+         2 * frame_header_.height >= reference_height &&
+         frame_header_.width <= 16 * reference_upscaled_width &&
+         frame_header_.height <= 16 * reference_height);
+  const bool is_scaled_x = reference_upscaled_width != frame_header_.width;
+  const bool is_scaled_y = reference_height != frame_header_.height;
+  const int half_sample = 1 << (kSubPixelBits - 1);
+  int orig_x =
+      (x << kSubPixelBits) + ((2 * mv.mv[1]) >> subsampling_x_[plane]);
+  int orig_y =
+      (y << kSubPixelBits) + ((2 * mv.mv[0]) >> subsampling_y_[plane]);
+  const int rounding_offset =
+      DivideBy2(1 << (kScaleSubPixelBits - kSubPixelBits));
+  if (is_scaled_x) {
+    const int scale_x = ((reference_upscaled_width << kReferenceScaleShift) +
+                         DivideBy2(frame_header_.width)) /
+                        frame_header_.width;
+    *step_x = RightShiftWithRoundingSigned(
+        scale_x, kReferenceScaleShift - kScaleSubPixelBits);
+    orig_x += half_sample;
+    // When the frame size is 4K or above, orig_x can exceed 16 bits and
+    // scale_x can be up to 15 bits, so int64_t is used to hold base_x.
+    const int64_t base_x = static_cast<int64_t>(orig_x) * scale_x -
+                           (half_sample << kReferenceScaleShift);
+    *start_x =
+        RightShiftWithRoundingSigned(
+            base_x,
+            kReferenceScaleShift + kSubPixelBits - kScaleSubPixelBits) +
+        rounding_offset;
+  } else {
+    *step_x = 1 << kScaleSubPixelBits;
+    *start_x = LeftShift(orig_x, 6) + rounding_offset;
+  }
+  if (is_scaled_y) {
+    const int scale_y = ((reference_height << kReferenceScaleShift) +
+                         DivideBy2(frame_header_.height)) /
+                        frame_header_.height;
+    *step_y = RightShiftWithRoundingSigned(
+        scale_y, kReferenceScaleShift - kScaleSubPixelBits);
+    orig_y += half_sample;
+    const int64_t base_y = static_cast<int64_t>(orig_y) * scale_y -
+                           (half_sample << kReferenceScaleShift);
+    *start_y =
+        RightShiftWithRoundingSigned(
+            base_y,
+            kReferenceScaleShift + kSubPixelBits - kScaleSubPixelBits) +
+        rounding_offset;
+  } else {
+    *step_y = 1 << kScaleSubPixelBits;
+    *start_y = LeftShift(orig_y, 6) + rounding_offset;
+  }
+}
+
+// static.
+bool Tile::GetReferenceBlockPosition(
+    const int reference_frame_index, const bool is_scaled, const int width,
+    const int height, const int ref_start_x, const int ref_last_x,
+    const int ref_start_y, const int ref_last_y, const int start_x,
+    const int start_y, const int step_x, const int step_y,
+    const int left_border, const int right_border, const int top_border,
+    const int bottom_border, int* ref_block_start_x, int* ref_block_start_y,
+    int* ref_block_end_x) {
+  *ref_block_start_x = GetPixelPositionFromHighScale(start_x, 0, 0);
+  *ref_block_start_y = GetPixelPositionFromHighScale(start_y, 0, 0);
+  if (reference_frame_index == -1) {
+    return false;
+  }
+  *ref_block_start_x -= kConvolveBorderLeftTop;
+  *ref_block_start_y -= kConvolveBorderLeftTop;
+  *ref_block_end_x =
+      GetPixelPositionFromHighScale(start_x, step_x, width - 1) +
+      kConvolveBorderRight;
+  int ref_block_end_y =
+      GetPixelPositionFromHighScale(start_y, step_y, height - 1) +
+      kConvolveBorderBottom;
+  if (is_scaled) {
+    const int block_height =
+        (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+         kScaleSubPixelBits) +
+        kSubPixelTaps;
+    *ref_block_end_x += kConvolveScaleBorderRight - kConvolveBorderRight;
+    ref_block_end_y = *ref_block_start_y + block_height - 1;
+  }
+  // Determines if we need to extend beyond the left/right/top/bottom border.
+  return *ref_block_start_x < (ref_start_x - left_border) ||
+         *ref_block_end_x > (ref_last_x + right_border) ||
+         *ref_block_start_y < (ref_start_y - top_border) ||
+         ref_block_end_y > (ref_last_y + bottom_border);
+}
+
+// Builds a block as the input for convolve, by copying the content of
+// reference frame (either a decoded reference frame, or current frame).
+// |block_extended_width| is the combined width of the block and its borders.
+template <typename Pixel>
+void Tile::BuildConvolveBlock(
+    const Plane plane, const int reference_frame_index, const bool is_scaled,
+    const int height, const int ref_start_x, const int ref_last_x,
+    const int ref_start_y, const int ref_last_y, const int step_y,
+    const int ref_block_start_x, const int ref_block_end_x,
+    const int ref_block_start_y, uint8_t* block_buffer,
+    ptrdiff_t convolve_buffer_stride, ptrdiff_t block_extended_width) {
+  const YuvBuffer* const reference_buffer =
+      (reference_frame_index == -1)
+          ? current_frame_.buffer()
+          : reference_frames_[reference_frame_index]->buffer();
+  Array2DView<const Pixel> reference_block(
+      reference_buffer->height(plane),
+      reference_buffer->stride(plane) / sizeof(Pixel),
+      reinterpret_cast<const Pixel*>(reference_buffer->data(plane)));
+  auto* const block_head = reinterpret_cast<Pixel*>(block_buffer);
+  convolve_buffer_stride /= sizeof(Pixel);
+  int block_height = height + kConvolveBorderLeftTop + kConvolveBorderBottom;
+  if (is_scaled) {
+    block_height = (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+                    kScaleSubPixelBits) +
+                   kSubPixelTaps;
+  }
+  const int copy_start_x = Clip3(ref_block_start_x, ref_start_x, ref_last_x);
+  const int copy_start_y = Clip3(ref_block_start_y, ref_start_y, ref_last_y);
+  const int copy_end_x = Clip3(ref_block_end_x, copy_start_x, ref_last_x);
+  const int block_width = copy_end_x - copy_start_x + 1;
+  const bool extend_left = ref_block_start_x < ref_start_x;
+  const bool extend_right = ref_block_end_x > ref_last_x;
+  const bool out_of_left = copy_start_x > ref_block_end_x;
+  const bool out_of_right = copy_end_x < ref_block_start_x;
+  if (out_of_left || out_of_right) {
+    const int ref_x = out_of_left ?
copy_start_x : copy_end_x; + Pixel* buf_ptr = block_head; + for (int y = 0, ref_y = copy_start_y; y < block_height; ++y) { + Memset(buf_ptr, reference_block[ref_y][ref_x], block_extended_width); + if (ref_block_start_y + y >= ref_start_y && + ref_block_start_y + y < ref_last_y) { + ++ref_y; + } + buf_ptr += convolve_buffer_stride; + } + } else { + Pixel* buf_ptr = block_head; + const int left_width = copy_start_x - ref_block_start_x; + for (int y = 0, ref_y = copy_start_y; y < block_height; ++y) { + if (extend_left) { + Memset(buf_ptr, reference_block[ref_y][copy_start_x], left_width); + } + memcpy(buf_ptr + left_width, &reference_block[ref_y][copy_start_x], + block_width * sizeof(Pixel)); + if (extend_right) { + Memset(buf_ptr + left_width + block_width, + reference_block[ref_y][copy_end_x], + block_extended_width - left_width - block_width); + } + if (ref_block_start_y + y >= ref_start_y && + ref_block_start_y + y < ref_last_y) { + ++ref_y; + } + buf_ptr += convolve_buffer_stride; + } + } +} + +bool Tile::BlockInterPrediction( + const Block& block, const Plane plane, const int reference_frame_index, + const MotionVector& mv, const int x, const int y, const int width, + const int height, const int candidate_row, const int candidate_column, + uint16_t* const prediction, const bool is_compound, + const bool is_inter_intra, uint8_t* const dest, + const ptrdiff_t dest_stride) { + const BlockParameters& bp = + *block_parameters_holder_.Find(candidate_row, candidate_column); + int start_x; + int start_y; + int step_x; + int step_y; + ScaleMotionVector(mv, plane, reference_frame_index, x, y, &start_x, &start_y, + &step_x, &step_y); + const int horizontal_filter_index = bp.interpolation_filter[1]; + const int vertical_filter_index = bp.interpolation_filter[0]; + const int subsampling_x = subsampling_x_[plane]; + const int subsampling_y = subsampling_y_[plane]; + // reference_frame_index equal to -1 indicates using current frame as + // reference. + const YuvBuffer* const reference_buffer = + (reference_frame_index == -1) + ? current_frame_.buffer() + : reference_frames_[reference_frame_index]->buffer(); + const int reference_upscaled_width = + (reference_frame_index == -1) + ? MultiplyBy4(frame_header_.columns4x4) + : reference_frames_[reference_frame_index]->upscaled_width(); + const int reference_height = + (reference_frame_index == -1) + ? MultiplyBy4(frame_header_.rows4x4) + : reference_frames_[reference_frame_index]->frame_height(); + const int ref_start_x = 0; + const int ref_last_x = + SubsampledValue(reference_upscaled_width, subsampling_x) - 1; + const int ref_start_y = 0; + const int ref_last_y = SubsampledValue(reference_height, subsampling_y) - 1; + + const bool is_scaled = (reference_frame_index != -1) && + (frame_header_.width != reference_upscaled_width || + frame_header_.height != reference_height); + const int bitdepth = sequence_header_.color_config.bitdepth; + const int pixel_size = (bitdepth == 8) ? 
sizeof(uint8_t) : sizeof(uint16_t);
+  int ref_block_start_x;
+  int ref_block_start_y;
+  int ref_block_end_x;
+  const bool extend_block = GetReferenceBlockPosition(
+      reference_frame_index, is_scaled, width, height, ref_start_x, ref_last_x,
+      ref_start_y, ref_last_y, start_x, start_y, step_x, step_y,
+      reference_buffer->left_border(plane),
+      reference_buffer->right_border(plane),
+      reference_buffer->top_border(plane),
+      reference_buffer->bottom_border(plane), &ref_block_start_x,
+      &ref_block_start_y, &ref_block_end_x);
+
+  // In frame parallel mode, ensure that the reference block has been decoded
+  // and available for referencing.
+  if (reference_frame_index != -1 && frame_parallel_) {
+    int reference_y_max;
+    if (is_scaled) {
+      // TODO(vigneshv): For now, we wait for the entire reference frame to be
+      // decoded if we are using scaled references. This will eventually be
+      // fixed.
+      reference_y_max = reference_height;
+    } else {
+      reference_y_max =
+          std::min(ref_block_start_y + height + kSubPixelTaps, ref_last_y);
+      // For U and V planes with subsampling, we need to multiply
+      // reference_y_max by 2 since we only track the progress of Y planes.
+      reference_y_max = LeftShift(reference_y_max, subsampling_y);
+    }
+    if (reference_frame_progress_cache_[reference_frame_index] <
+            reference_y_max &&
+        !reference_frames_[reference_frame_index]->WaitUntil(
+            reference_y_max,
+            &reference_frame_progress_cache_[reference_frame_index])) {
+      return false;
+    }
+  }
+
+  const uint8_t* block_start = nullptr;
+  ptrdiff_t convolve_buffer_stride;
+  if (!extend_block) {
+    const YuvBuffer* const reference_buffer =
+        (reference_frame_index == -1)
+            ? current_frame_.buffer()
+            : reference_frames_[reference_frame_index]->buffer();
+    convolve_buffer_stride = reference_buffer->stride(plane);
+    if (reference_frame_index == -1 || is_scaled) {
+      block_start = reference_buffer->data(plane) +
+                    ref_block_start_y * reference_buffer->stride(plane) +
+                    ref_block_start_x * pixel_size;
+    } else {
+      block_start = reference_buffer->data(plane) +
+                    (ref_block_start_y + kConvolveBorderLeftTop) *
+                        reference_buffer->stride(plane) +
+                    (ref_block_start_x + kConvolveBorderLeftTop) * pixel_size;
+    }
+  } else {
+    const int border_right =
+        is_scaled ? kConvolveScaleBorderRight : kConvolveBorderRight;
+    // The block width can be at most 2 times as much as current
+    // block's width because of scaling.
+    auto block_extended_width = Align(
+        (2 * width + kConvolveBorderLeftTop + border_right) * pixel_size,
+        kMaxAlignment);
+    convolve_buffer_stride =
+        block.scratch_buffer->convolve_block_buffer_stride;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    if (bitdepth > 8) {
+      BuildConvolveBlock<uint16_t>(
+          plane, reference_frame_index, is_scaled, height, ref_start_x,
+          ref_last_x, ref_start_y, ref_last_y, step_y, ref_block_start_x,
+          ref_block_end_x, ref_block_start_y,
+          block.scratch_buffer->convolve_block_buffer.get(),
+          convolve_buffer_stride, block_extended_width);
+    } else {
+#endif
+      BuildConvolveBlock<uint8_t>(
+          plane, reference_frame_index, is_scaled, height, ref_start_x,
+          ref_last_x, ref_start_y, ref_last_y, step_y, ref_block_start_x,
+          ref_block_end_x, ref_block_start_y,
+          block.scratch_buffer->convolve_block_buffer.get(),
+          convolve_buffer_stride, block_extended_width);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    }
+#endif
+    block_start = block.scratch_buffer->convolve_block_buffer.get() +
+                  (is_scaled ? 0
+                             : kConvolveBorderLeftTop * convolve_buffer_stride +
+                                   kConvolveBorderLeftTop * pixel_size);
+  }
+
+  void* const output =
+      (is_compound || is_inter_intra) ? prediction : static_cast<void*>(dest);
+  ptrdiff_t output_stride = (is_compound || is_inter_intra)
+                                ? /*prediction_stride=*/width
+                                : dest_stride;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  // |is_inter_intra| calculations are written to the |prediction| buffer.
+  // Unlike the |is_compound| calculations the output is Pixel and not
+  // uint16_t. convolve_func() expects |output_stride| to be in bytes and not
+  // Pixels. |prediction_stride| is in units of uint16_t. Adjust
+  // |output_stride| to account for this.
+  if (is_inter_intra && sequence_header_.color_config.bitdepth > 8) {
+    output_stride *= 2;
+  }
+#endif
+  assert(output != nullptr);
+  if (is_scaled) {
+    dsp::ConvolveScaleFunc convolve_func = dsp_.convolve_scale[is_compound];
+    assert(convolve_func != nullptr);
+
+    convolve_func(block_start, convolve_buffer_stride, horizontal_filter_index,
+                  vertical_filter_index, start_x, start_y, step_x, step_y,
+                  width, height, output, output_stride);
+  } else {
+    const int horizontal_filter_id = (start_x >> 6) & kSubPixelMask;
+    const int vertical_filter_id = (start_y >> 6) & kSubPixelMask;
+
+    dsp::ConvolveFunc convolve_func =
+        dsp_.convolve[reference_frame_index == -1][is_compound]
+                     [vertical_filter_id != 0][horizontal_filter_id != 0];
+    assert(convolve_func != nullptr);
+
+    convolve_func(block_start, convolve_buffer_stride, horizontal_filter_index,
+                  vertical_filter_index, horizontal_filter_id,
+                  vertical_filter_id, width, height, output, output_stride);
+  }
+  return true;
+}
+
+bool Tile::BlockWarpProcess(const Block& block, const Plane plane,
+                            const int index, const int block_start_x,
+                            const int block_start_y, const int width,
+                            const int height, GlobalMotion* const warp_params,
+                            const bool is_compound, const bool is_inter_intra,
+                            uint8_t* const dest, const ptrdiff_t dest_stride) {
+  assert(width >= 8 && height >= 8);
+  const BlockParameters& bp = *block.bp;
+  const int reference_frame_index =
+      frame_header_.reference_frame_index[bp.reference_frame[index] -
+                                          kReferenceFrameLast];
+  const uint8_t* const source =
+      reference_frames_[reference_frame_index]->buffer()->data(plane);
+  ptrdiff_t source_stride =
+      reference_frames_[reference_frame_index]->buffer()->stride(plane);
+  const int source_width =
+      reference_frames_[reference_frame_index]->buffer()->width(plane);
+  const int source_height =
+      reference_frames_[reference_frame_index]->buffer()->height(plane);
+  uint16_t* const prediction = block.scratch_buffer->prediction_buffer[index];
+
+  // In frame parallel mode, ensure that the reference block has been decoded
+  // and available for referencing.
+  if (frame_parallel_) {
+    int reference_y_max = -1;
+    // Find out the maximum y-coordinate for warping.
+    for (int start_y = block_start_y; start_y < block_start_y + height;
+         start_y += 8) {
+      for (int start_x = block_start_x; start_x < block_start_x + width;
+           start_x += 8) {
+        const int src_x = (start_x + 4) << subsampling_x_[plane];
+        const int src_y = (start_y + 4) << subsampling_y_[plane];
+        const int dst_y = src_x * warp_params->params[4] +
+                          src_y * warp_params->params[5] +
+                          warp_params->params[1];
+        const int y4 = dst_y >> subsampling_y_[plane];
+        const int iy4 = y4 >> kWarpedModelPrecisionBits;
+        reference_y_max = std::max(iy4 + 8, reference_y_max);
+      }
+    }
+    // For U and V planes with subsampling, we need to multiply
+    // reference_y_max by 2 since we only track the progress of Y planes.
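+    // For example, with 4:2:0 chroma (subsampling_y = 1), chroma row n maps
+    // to luma row 2n, hence the left shift below.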
+    reference_y_max = LeftShift(reference_y_max, subsampling_y_[plane]);
+    if (reference_frame_progress_cache_[reference_frame_index] <
+            reference_y_max &&
+        !reference_frames_[reference_frame_index]->WaitUntil(
+            reference_y_max,
+            &reference_frame_progress_cache_[reference_frame_index])) {
+      return false;
+    }
+  }
+  if (is_compound) {
+    dsp_.warp_compound(source, source_stride, source_width, source_height,
+                       warp_params->params, subsampling_x_[plane],
+                       subsampling_y_[plane], block_start_x, block_start_y,
+                       width, height, warp_params->alpha, warp_params->beta,
+                       warp_params->gamma, warp_params->delta, prediction,
+                       /*prediction_stride=*/width);
+  } else {
+    void* const output =
+        is_inter_intra ? static_cast<void*>(prediction) : dest;
+    ptrdiff_t output_stride =
+        is_inter_intra ? /*prediction_stride=*/width : dest_stride;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    // |is_inter_intra| calculations are written to the |prediction| buffer.
+    // Unlike the |is_compound| calculations the output is Pixel and not
+    // uint16_t. warp_clip() expects |output_stride| to be in bytes and not
+    // Pixels. |prediction_stride| is in units of uint16_t. Adjust
+    // |output_stride| to account for this.
+    if (is_inter_intra && sequence_header_.color_config.bitdepth > 8) {
+      output_stride *= 2;
+    }
+#endif
+    dsp_.warp(source, source_stride, source_width, source_height,
+              warp_params->params, subsampling_x_[plane],
+              subsampling_y_[plane], block_start_x, block_start_y, width,
+              height, warp_params->alpha, warp_params->beta,
+              warp_params->gamma, warp_params->delta, output, output_stride);
+  }
+  return true;
+}
+
+}  // namespace libgav1
diff --git a/src/tile/tile.cc b/src/tile/tile.cc
new file mode 100644
index 0000000..5070bb6
--- /dev/null
+++ b/src/tile/tile.cc
@@ -0,0 +1,2670 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/tile.h"
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <climits>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <new>
+#include <numeric>
+#include <type_traits>
+#include <utility>
+
+#include "src/frame_scratch_buffer.h"
+#include "src/motion_vector.h"
+#include "src/reconstruction.h"
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/stack.h"
+
+namespace libgav1 {
+namespace {
+
+// Import all the constants in the anonymous namespace.
+#include "src/scan_tables.inc"
+
+// Size of the coefficient range above kNumQuantizerBaseLevels, above which
+// the exponential Golomb coding process is activated.
+constexpr int kQuantizerCoefficientBaseRange = 12;
+constexpr int kNumQuantizerBaseLevels = 2;
+constexpr int kCoeffBaseRangeMaxIterations =
+    kQuantizerCoefficientBaseRange / (kCoeffBaseRangeSymbolCount - 1);
+constexpr int kEntropyContextLeft = 0;
+constexpr int kEntropyContextTop = 1;
+
+constexpr uint8_t kAllZeroContextsByTopLeft[5][5] = {{1, 2, 2, 2, 3},
+                                                     {2, 4, 4, 4, 5},
+                                                     {2, 4, 4, 4, 5},
+                                                     {2, 4, 4, 4, 5},
+                                                     {3, 5, 5, 5, 6}};
+
+// The space complexity of DFS is O(branching_factor * max_depth). For the
+// parameter tree, branching_factor = 4 (there could be up to 4 children for
+// every node) and max_depth (excluding the root) = 5 (to go from a 128x128
+// block all the way to a 4x4 block). The worst-case stack size is 16, by
+// counting the number of 'o' nodes in the diagram:
+//
+// |                   128x128  The highest level (corresponding to the root
+// |                            of the tree) has no node in the stack.
+// |-----------------+
+// |   |    |    |
+// |   o    o    o    64x64
+// |
+// |-----------------+
+// |   |    |    |
+// |   o    o    o    32x32   Higher levels have three nodes in the stack,
+// |                          because we pop one node off the stack before
+// |-----------------+        pushing its four children onto the stack.
+// |   |    |    |
+// |   o    o    o    16x16
+// |
+// |-----------------+
+// |   |    |    |
+// |   o    o    o    8x8
+// |
+// |-----------------+
+// |   |    |    |
+// o   o    o    o    4x4     Only the lowest level has four nodes in the
+//                            stack.
+constexpr int kDfsStackSize = 16;
+
+// Mask indicating whether the transform sets contain a particular transform
+// type. If |tx_type| is present in |tx_set|, then the |tx_type|th LSB is set.
+constexpr BitMaskSet kTransformTypeInSetMask[kNumTransformSets] = {
+    BitMaskSet(0x1),    BitMaskSet(0xE0F), BitMaskSet(0x20F),
+    BitMaskSet(0xFFFF), BitMaskSet(0xFFF), BitMaskSet(0x201)};
+
+constexpr PredictionMode
+    kFilterIntraModeToIntraPredictor[kNumFilterIntraPredictors] = {
+        kPredictionModeDc, kPredictionModeVertical, kPredictionModeHorizontal,
+        kPredictionModeD157, kPredictionModeDc};
+
+// Mask used to determine the index for mode_deltas lookup.
+constexpr BitMaskSet kPredictionModeDeltasMask(
+    kPredictionModeNearestMv, kPredictionModeNearMv, kPredictionModeNewMv,
+    kPredictionModeNearestNearestMv, kPredictionModeNearNearMv,
+    kPredictionModeNearestNewMv, kPredictionModeNewNearestMv,
+    kPredictionModeNearNewMv, kPredictionModeNewNearMv,
+    kPredictionModeNewNewMv);
+
+// This is computed as:
+// min(transform_width_log2, 5) + min(transform_height_log2, 5) - 4.
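+// For example, kTransformSize64x16 yields min(6, 5) + min(4, 5) - 4 = 5, the
+// value stored at its index below.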
+constexpr uint8_t kEobMultiSizeLookup[kNumTransformSizes] = { + 0, 1, 2, 1, 2, 3, 4, 2, 3, 4, 5, 5, 4, 5, 6, 6, 5, 6, 6}; + +/* clang-format off */ +constexpr uint8_t kCoeffBaseContextOffset[kNumTransformSizes][5][5] = { + {{0, 1, 6, 6, 0}, {1, 6, 6, 21, 0}, {6, 6, 21, 21, 0}, {6, 21, 21, 21, 0}, + {0, 0, 0, 0, 0}}, + {{0, 11, 11, 11, 0}, {11, 11, 11, 11, 0}, {6, 6, 21, 21, 0}, + {6, 21, 21, 21, 0}, {21, 21, 21, 21, 0}}, + {{0, 11, 11, 11, 0}, {11, 11, 11, 11, 0}, {6, 6, 21, 21, 0}, + {6, 21, 21, 21, 0}, {21, 21, 21, 21, 0}}, + {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21}, + {16, 16, 21, 21, 21}, {0, 0, 0, 0, 0}}, + {{0, 1, 6, 6, 21}, {1, 6, 6, 21, 21}, {6, 6, 21, 21, 21}, + {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}}, + {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21}, + {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}}, + {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21}, + {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}}, + {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21}, + {16, 16, 21, 21, 21}, {0, 0, 0, 0, 0}}, + {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21}, + {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}}, + {{0, 1, 6, 6, 21}, {1, 6, 6, 21, 21}, {6, 6, 21, 21, 21}, + {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}}, + {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21}, + {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}}, + {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21}, + {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}}, + {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21}, + {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}}, + {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21}, + {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}}, + {{0, 1, 6, 6, 21}, {1, 6, 6, 21, 21}, {6, 6, 21, 21, 21}, + {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}}, + {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21}, + {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}}, + {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21}, + {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}}, + {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21}, + {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}}, + {{0, 1, 6, 6, 21}, {1, 6, 6, 21, 21}, {6, 6, 21, 21, 21}, + {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}}}; +/* clang-format on */ + +// Extended the table size from 3 to 16 by repeating the last element to avoid +// the clips to row or column indices. +constexpr uint8_t kCoeffBasePositionContextOffset[16] = { + 26, 31, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36}; + +constexpr PredictionMode kInterIntraToIntraMode[kNumInterIntraModes] = { + kPredictionModeDc, kPredictionModeVertical, kPredictionModeHorizontal, + kPredictionModeSmooth}; + +// Number of horizontal luma samples before intra block copy can be used. +constexpr int kIntraBlockCopyDelayPixels = 256; +// Number of 64 by 64 blocks before intra block copy can be used. +constexpr int kIntraBlockCopyDelay64x64Blocks = kIntraBlockCopyDelayPixels / 64; + +// Index [i][j] corresponds to the transform size of width 1 << (i + 2) and +// height 1 << (j + 2). 
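+// e.g. k4x4SizeToTransformSize[1][2] is the 8x16 transform (width 1 << 3,
+// height 1 << 4); combinations that do not exist as transform sizes, such as
+// [0][3] (4x32), hold kNumTransformSizes.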
+constexpr TransformSize k4x4SizeToTransformSize[5][5] = {
+    {kTransformSize4x4, kTransformSize4x8, kTransformSize4x16,
+     kNumTransformSizes, kNumTransformSizes},
+    {kTransformSize8x4, kTransformSize8x8, kTransformSize8x16,
+     kTransformSize8x32, kNumTransformSizes},
+    {kTransformSize16x4, kTransformSize16x8, kTransformSize16x16,
+     kTransformSize16x32, kTransformSize16x64},
+    {kNumTransformSizes, kTransformSize32x8, kTransformSize32x16,
+     kTransformSize32x32, kTransformSize32x64},
+    {kNumTransformSizes, kNumTransformSizes, kTransformSize64x16,
+     kTransformSize64x32, kTransformSize64x64}};
+
+// Defined in section 9.3 of the spec.
+constexpr TransformType kModeToTransformType[kIntraPredictionModesUV] = {
+    kTransformTypeDctDct,   kTransformTypeDctAdst,  kTransformTypeAdstDct,
+    kTransformTypeDctDct,   kTransformTypeAdstAdst, kTransformTypeDctAdst,
+    kTransformTypeAdstDct,  kTransformTypeAdstDct,  kTransformTypeDctAdst,
+    kTransformTypeAdstAdst, kTransformTypeDctAdst,  kTransformTypeAdstDct,
+    kTransformTypeAdstAdst, kTransformTypeDctDct};
+
+// Defined in section 5.11.47 of the spec. This array does not contain an
+// entry for kTransformSetDctOnly, so the first dimension needs to be
+// |kNumTransformSets| - 1.
+constexpr TransformType kInverseTransformTypeBySet[kNumTransformSets - 1][16] =
+    {{kTransformTypeIdentityIdentity, kTransformTypeDctDct,
+      kTransformTypeIdentityDct, kTransformTypeDctIdentity,
+      kTransformTypeAdstAdst, kTransformTypeDctAdst, kTransformTypeAdstDct},
+     {kTransformTypeIdentityIdentity, kTransformTypeDctDct,
+      kTransformTypeAdstAdst, kTransformTypeDctAdst, kTransformTypeAdstDct},
+     {kTransformTypeIdentityIdentity, kTransformTypeIdentityDct,
+      kTransformTypeDctIdentity, kTransformTypeIdentityAdst,
+      kTransformTypeAdstIdentity, kTransformTypeIdentityFlipadst,
+      kTransformTypeFlipadstIdentity, kTransformTypeDctDct,
+      kTransformTypeDctAdst, kTransformTypeAdstDct, kTransformTypeDctFlipadst,
+      kTransformTypeFlipadstDct, kTransformTypeAdstAdst,
+      kTransformTypeFlipadstFlipadst, kTransformTypeFlipadstAdst,
+      kTransformTypeAdstFlipadst},
+     {kTransformTypeIdentityIdentity, kTransformTypeIdentityDct,
+      kTransformTypeDctIdentity, kTransformTypeDctDct, kTransformTypeDctAdst,
+      kTransformTypeAdstDct, kTransformTypeDctFlipadst,
+      kTransformTypeFlipadstDct, kTransformTypeAdstAdst,
+      kTransformTypeFlipadstFlipadst, kTransformTypeFlipadstAdst,
+      kTransformTypeAdstFlipadst},
+     {kTransformTypeIdentityIdentity, kTransformTypeDctDct}};
+
+// Replaces all occurrences of 64x* and *x64 with 32x* and *x32 respectively.
+constexpr TransformSize kAdjustedTransformSize[kNumTransformSizes] = {
+    kTransformSize4x4,   kTransformSize4x8,   kTransformSize4x16,
+    kTransformSize8x4,   kTransformSize8x8,   kTransformSize8x16,
+    kTransformSize8x32,  kTransformSize16x4,  kTransformSize16x8,
+    kTransformSize16x16, kTransformSize16x32, kTransformSize16x32,
+    kTransformSize32x8,  kTransformSize32x16, kTransformSize32x32,
+    kTransformSize32x32, kTransformSize32x16, kTransformSize32x32,
+    kTransformSize32x32};
+
+// This is the same as the Max_Tx_Size_Rect array in the spec, but with *x64
+// and 64x* transforms replaced with *x32 and 32x* respectively.
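+// For example, a kBlock64x64 block maps to kTransformSize32x32 here: chroma
+// planes never use 64-point transforms.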
+constexpr TransformSize kUVTransformSize[kMaxBlockSizes] = {
+ kTransformSize4x4, kTransformSize4x8, kTransformSize4x16,
+ kTransformSize8x4, kTransformSize8x8, kTransformSize8x16,
+ kTransformSize8x32, kTransformSize16x4, kTransformSize16x8,
+ kTransformSize16x16, kTransformSize16x32, kTransformSize16x32,
+ kTransformSize32x8, kTransformSize32x16, kTransformSize32x32,
+ kTransformSize32x32, kTransformSize32x16, kTransformSize32x32,
+ kTransformSize32x32, kTransformSize32x32, kTransformSize32x32,
+ kTransformSize32x32};
+
+// The ith entry of this array is computed as:
+// DivideBy2(TransformSizeToSquareTransformIndex(kTransformSizeSquareMin[i]) +
+// TransformSizeToSquareTransformIndex(kTransformSizeSquareMax[i]) +
+// 1)
+constexpr uint8_t kTransformSizeContext[kNumTransformSizes] = {
+ 0, 1, 1, 1, 1, 2, 2, 1, 2, 2, 3, 3, 2, 3, 3, 4, 3, 4, 4};
+
+constexpr int8_t kSgrProjDefaultMultiplier[2] = {-32, 31};
+
+constexpr int8_t kWienerDefaultFilter[kNumWienerCoefficients] = {3, -7, 15};
+
+// Maps compound prediction modes into single modes. For example,
+// kPredictionModeNearestNewMv will map to kPredictionModeNearestMv for index 0
+// and kPredictionModeNewMv for index 1. It is used to simplify the logic in
+// AssignMv (and avoid duplicate code). This is section 5.11.30 in the spec.
+constexpr PredictionMode
+ kCompoundToSinglePredictionMode[kNumCompoundInterPredictionModes][2] = {
+ {kPredictionModeNearestMv, kPredictionModeNearestMv},
+ {kPredictionModeNearMv, kPredictionModeNearMv},
+ {kPredictionModeNearestMv, kPredictionModeNewMv},
+ {kPredictionModeNewMv, kPredictionModeNearestMv},
+ {kPredictionModeNearMv, kPredictionModeNewMv},
+ {kPredictionModeNewMv, kPredictionModeNearMv},
+ {kPredictionModeGlobalMv, kPredictionModeGlobalMv},
+ {kPredictionModeNewMv, kPredictionModeNewMv},
+};
+PredictionMode GetSinglePredictionMode(int index, PredictionMode y_mode) {
+ if (y_mode < kPredictionModeNearestNearestMv) {
+ return y_mode;
+ }
+ const int lookup_index = y_mode - kPredictionModeNearestNearestMv;
+ assert(lookup_index >= 0);
+ return kCompoundToSinglePredictionMode[lookup_index][index];
+}
+
+// log2(dqDenom) in section 7.12.3 of the spec. We use the log2 value because
+// dqDenom is always a power of two and hence right shift can be used instead of
+// division.
+constexpr uint8_t kQuantizationShift[kNumTransformSizes] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 2, 1, 2, 2};
+
+// Returns the minimum of |length| or |max|-|start|. This is used to clamp array
+// indices when accessing arrays whose bound is equal to |max|.
+int GetNumElements(int length, int start, int max) {
+ return std::min(length, max - start);
+}
+
+template <typename T>
+void SetBlockValues(int rows, int columns, T value, T* dst, ptrdiff_t stride) {
+ // Specialize all columns cases (values in kTransformWidth4x4[]) for better
+ // performance.
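+ // Each case passes a compile-time constant column count to MemSetBlock(),
+ // which lets the compiler emit fixed-size stores for each row instead of a
+ // variable-length fill.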
+ switch (columns) {
+ case 1:
+ MemSetBlock(rows, 1, value, dst, stride);
+ break;
+ case 2:
+ MemSetBlock(rows, 2, value, dst, stride);
+ break;
+ case 4:
+ MemSetBlock(rows, 4, value, dst, stride);
+ break;
+ case 8:
+ MemSetBlock(rows, 8, value, dst, stride);
+ break;
+ default:
+ assert(columns == 16);
+ MemSetBlock(rows, 16, value, dst, stride);
+ break;
+ }
+}
+
+void SetTransformType(const Tile::Block& block, int x4, int y4, int w4, int h4,
+ TransformType tx_type,
+ TransformType transform_types[32][32]) {
+ const int y_offset = y4 - block.row4x4;
+ const int x_offset = x4 - block.column4x4;
+ TransformType* const dst = &transform_types[y_offset][x_offset];
+ SetBlockValues(h4, w4, tx_type, dst, 32);
+}
+
+void StoreMotionFieldMvs(ReferenceFrameType reference_frame_to_store,
+ const MotionVector& mv_to_store, ptrdiff_t stride,
+ int rows, int columns,
+ ReferenceFrameType* reference_frame_row_start,
+ MotionVector* mv) {
+ static_assert(sizeof(*reference_frame_row_start) == sizeof(int8_t), "");
+ do {
+ // Don't switch the following two memory setting functions.
+ // Some ARM CPUs are quite sensitive to the order.
+ memset(reference_frame_row_start, reference_frame_to_store, columns);
+ std::fill(mv, mv + columns, mv_to_store);
+ reference_frame_row_start += stride;
+ mv += stride;
+ } while (--rows != 0);
+}
+
+// The inverse transform process assumes that the quantized coefficients are
+// stored as a virtual 2d array of size |tx_width| x |tx_height|. If the
+// transform width is 64, then this assumption is broken because the scan order
+// used for populating the coefficients for such transforms is the same as the
+// one used for the corresponding transform with width 32 (e.g. the scan order
+// used for 64x16 is the same as the one used for 32x16). So we must restore
+// the coefficients to their correct positions and clean the positions they
+// occupied.
+template <typename ResidualType>
+void MoveCoefficientsForTxWidth64(int clamped_tx_height, int tx_width,
+ ResidualType* residual) {
+ if (tx_width != 64) return;
+ const int rows = clamped_tx_height - 2;
+ auto* src = residual + 32 * rows;
+ residual += 64 * rows;
+ // Process 2 rows in each loop in reverse order to avoid overwrite.
+ int x = rows >> 1;
+ do {
+ // The 2 rows can be processed in order.
+ memcpy(residual, src, 32 * sizeof(src[0]));
+ memcpy(residual + 64, src + 32, 32 * sizeof(src[0]));
+ memset(src + 32, 0, 32 * sizeof(src[0]));
+ src -= 64;
+ residual -= 128;
+ } while (--x);
+ // Process the second row. The first row is already correct.
+ memcpy(residual + 64, src + 32, 32 * sizeof(src[0]));
+ memset(src + 32, 0, 32 * sizeof(src[0]));
+}
+
+void GetClampParameters(const Tile::Block& block, int min[2], int max[2]) {
+ // 7.10.2.14 (part 1). (also contains implementations of 5.11.53
+ // and 5.11.54).
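+ // The limits computed below are in 1/8th-pel units: one 4x4 block unit
+ // corresponds to MultiplyBy32() == 4 pixels * 8 subpel positions. They
+ // allow a motion vector to point at most |kMvBorder4x4| 4x4 blocks (plus
+ // the block dimensions) beyond the frame edges.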
+ constexpr int kMvBorder4x4 = 4;
+ const int row_border = kMvBorder4x4 + block.height4x4;
+ const int column_border = kMvBorder4x4 + block.width4x4;
+ const int macroblocks_to_top_edge = -block.row4x4;
+ const int macroblocks_to_bottom_edge =
+ block.tile.frame_header().rows4x4 - block.height4x4 - block.row4x4;
+ const int macroblocks_to_left_edge = -block.column4x4;
+ const int macroblocks_to_right_edge =
+ block.tile.frame_header().columns4x4 - block.width4x4 - block.column4x4;
+ min[0] = MultiplyBy32(macroblocks_to_top_edge - row_border);
+ min[1] = MultiplyBy32(macroblocks_to_left_edge - column_border);
+ max[0] = MultiplyBy32(macroblocks_to_bottom_edge + row_border);
+ max[1] = MultiplyBy32(macroblocks_to_right_edge + column_border);
+}
+
+// Section 8.3.2 in the spec, under coeff_base_eob.
+int GetCoeffBaseContextEob(TransformSize tx_size, int index) {
+ if (index == 0) return 0;
+ const TransformSize adjusted_tx_size = kAdjustedTransformSize[tx_size];
+ const int tx_width_log2 = kTransformWidthLog2[adjusted_tx_size];
+ const int tx_height = kTransformHeight[adjusted_tx_size];
+ if (index <= DivideBy8(tx_height << tx_width_log2)) return 1;
+ if (index <= DivideBy4(tx_height << tx_width_log2)) return 2;
+ return 3;
+}
+
+// Section 8.3.2 in the spec, under coeff_br. Optimized for end of block based
+// on the fact that {0, 1}, {1, 0}, {1, 1}, {0, 2} and {2, 0} will all be 0 in
+// the end of block case.
+int GetCoeffBaseRangeContextEob(int adjusted_tx_width_log2, int pos,
+ TransformClass tx_class) {
+ if (pos == 0) return 0;
+ const int tx_width = 1 << adjusted_tx_width_log2;
+ const int row = pos >> adjusted_tx_width_log2;
+ const int column = pos & (tx_width - 1);
+ // This return statement is equivalent to:
+ // return ((tx_class == kTransformClass2D && (row | column) < 2) ||
+ // (tx_class == kTransformClassHorizontal && column == 0) ||
+ // (tx_class == kTransformClassVertical && row == 0))
+ // ? 7
+ // : 14;
+ // kTransformClass2D == 0, kTransformClassHorizontal == 1 and
+ // kTransformClassVertical == 2, so each masked term below selects the test
+ // for one class; whenever one of the terms is 1 the shift amount is 1 and
+ // 14 >> 1 == 7.
+ return 14 >> ((static_cast<int>(tx_class == kTransformClass2D) &
+ static_cast<int>((row | column) < 2)) |
+ (tx_class & static_cast<int>(column == 0)) |
+ ((tx_class >> 1) & static_cast<int>(row == 0)));
+}
+
+} // namespace
+
+Tile::Tile(int tile_number, const uint8_t* const data, size_t size,
+ const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header,
+ RefCountedBuffer* const current_frame, const DecoderState& state,
+ FrameScratchBuffer* const frame_scratch_buffer,
+ const WedgeMaskArray& wedge_masks,
+ const QuantizerMatrix& quantizer_matrix,
+ SymbolDecoderContext* const saved_symbol_decoder_context,
+ const SegmentationMap* prev_segment_ids,
+ PostFilter* const post_filter, const dsp::Dsp* const dsp,
+ ThreadPool* const thread_pool,
+ BlockingCounterWithStatus* const pending_tiles, bool frame_parallel,
+ bool use_intra_prediction_buffer)
+ : number_(tile_number),
+ row_(number_ / frame_header.tile_info.tile_columns),
+ column_(number_ % frame_header.tile_info.tile_columns),
+ data_(data),
+ size_(size),
+ read_deltas_(false),
+ subsampling_x_{0, sequence_header.color_config.subsampling_x,
+ sequence_header.color_config.subsampling_x},
+ subsampling_y_{0, sequence_header.color_config.subsampling_y,
+ sequence_header.color_config.subsampling_y},
+ current_quantizer_index_(frame_header.quantizer.base_index),
+ sequence_header_(sequence_header),
+ frame_header_(frame_header),
+ reference_frame_sign_bias_(state.reference_frame_sign_bias),
+ reference_frames_(state.reference_frame),
+ motion_field_(frame_scratch_buffer->motion_field),
+ reference_order_hint_(state.reference_order_hint),
+ wedge_masks_(wedge_masks),
+ quantizer_matrix_(quantizer_matrix),
+ reader_(data_, size_, frame_header_.enable_cdf_update),
+ symbol_decoder_context_(frame_scratch_buffer->symbol_decoder_context),
+ saved_symbol_decoder_context_(saved_symbol_decoder_context),
+ prev_segment_ids_(prev_segment_ids),
+ dsp_(*dsp),
+ post_filter_(*post_filter),
+ block_parameters_holder_(frame_scratch_buffer->block_parameters_holder),
+ quantizer_(sequence_header_.color_config.bitdepth,
+ &frame_header_.quantizer),
+ residual_size_((sequence_header_.color_config.bitdepth == 8)
+ ? sizeof(int16_t)
+ : sizeof(int32_t)),
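+ // The lag is expressed in superblock units:
+ // kIntraBlockCopyDelayPixels / superblock width + 1, i.e.
+ // 256 / 128 + 1 == 3 and 256 / 64 + 1 == 5.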
+ intra_block_copy_lag_(
+ frame_header_.allow_intrabc
+ ? (sequence_header_.use_128x128_superblock ? 3 : 5)
+ : 1),
+ current_frame_(*current_frame),
+ cdef_index_(frame_scratch_buffer->cdef_index),
+ cdef_skip_(frame_scratch_buffer->cdef_skip),
+ inter_transform_sizes_(frame_scratch_buffer->inter_transform_sizes),
+ thread_pool_(thread_pool),
+ residual_buffer_pool_(frame_scratch_buffer->residual_buffer_pool.get()),
+ tile_scratch_buffer_pool_(
+ &frame_scratch_buffer->tile_scratch_buffer_pool),
+ pending_tiles_(pending_tiles),
+ frame_parallel_(frame_parallel),
+ use_intra_prediction_buffer_(use_intra_prediction_buffer),
+ intra_prediction_buffer_(
+ use_intra_prediction_buffer_
+ ? &frame_scratch_buffer->intra_prediction_buffers.get()[row_]
+ : nullptr) {
+ row4x4_start_ = frame_header.tile_info.tile_row_start[row_];
+ row4x4_end_ = frame_header.tile_info.tile_row_start[row_ + 1];
+ column4x4_start_ = frame_header.tile_info.tile_column_start[column_];
+ column4x4_end_ = frame_header.tile_info.tile_column_start[column_ + 1];
+ const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+ const int block_width4x4_log2 = k4x4HeightLog2[SuperBlockSize()];
+ superblock_rows_ =
+ (row4x4_end_ - row4x4_start_ + block_width4x4 - 1) >> block_width4x4_log2;
+ superblock_columns_ =
+ (column4x4_end_ - column4x4_start_ + block_width4x4 - 1) >>
+ block_width4x4_log2;
+ // If |split_parse_and_decode_| is true, we do the necessary setup for
+ // splitting the parsing and the decoding steps. This is done in the following
+ // two cases:
+ // 1) If there is multi-threading within a tile (this is done if
+ // |thread_pool_| is not nullptr and if there are at least as many
+ // superblock columns as |intra_block_copy_lag_|).
+ // 2) If |frame_parallel| is true.
+ split_parse_and_decode_ = (thread_pool_ != nullptr &&
+ superblock_columns_ > intra_block_copy_lag_) ||
+ frame_parallel;
+ if (frame_parallel_) {
+ reference_frame_progress_cache_.fill(INT_MIN);
+ }
+ memset(delta_lf_, 0, sizeof(delta_lf_));
+ delta_lf_all_zero_ = true;
+ const YuvBuffer& buffer = post_filter_.frame_buffer();
+ for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
+ // Verify that the borders are big enough for Reconstruct(). max_tx_length
+ // is the maximum value of tx_width and tx_height for the plane.
+ const int max_tx_length = (plane == kPlaneY) ? 64 : 32;
+ // Reconstruct() may overwrite on the right. Since the right border of a
+ // row is followed in memory by the left border of the next row, the
+ // number of extra pixels to the right of a row is at least the sum of the
+ // left and right borders.
+ //
+ // Note: This assertion actually checks the sum of the left and right
+ // borders of post_filter_.GetUnfilteredBuffer(), which is a horizontally
+ // and vertically shifted version of |buffer|. Since the sum of the left and
+ // right borders is not changed by the shift, we can just check the sum of
+ // the left and right borders of |buffer|.
+ assert(buffer.left_border(plane) + buffer.right_border(plane) >=
+ max_tx_length - 1);
+ // Reconstruct() may overwrite on the bottom. We need an extra border row
+ // on the bottom because we need the left border of that row.
+ //
+ // Note: This assertion checks the bottom border of
+ // post_filter_.GetUnfilteredBuffer(). So we need to calculate the vertical
+ // shift that the PostFilter constructor applied to |buffer| and reduce the
+ // bottom border by that amount.
+#ifndef NDEBUG
+ const int vertical_shift = static_cast<int>(
+ (post_filter_.GetUnfilteredBuffer(plane) - buffer.data(plane)) /
+ buffer.stride(plane));
+ const int bottom_border = buffer.bottom_border(plane) - vertical_shift;
+ assert(bottom_border >= max_tx_length);
+#endif
+ // In AV1, a transform block of height H starts at a y coordinate that is
+ // a multiple of H. If a transform block at the bottom of the frame has
+ // height H, then Reconstruct() will write up to the row with index
+ // Align(buffer.height(plane), H) - 1. Therefore the maximum number of
+ // rows Reconstruct() may write to is
+ // Align(buffer.height(plane), max_tx_length).
+ buffer_[plane].Reset(Align(buffer.height(plane), max_tx_length),
+ buffer.stride(plane),
+ post_filter_.GetUnfilteredBuffer(plane));
+ }
+}
+
+bool Tile::Init() {
+ assert(coefficient_levels_.size() == dc_categories_.size());
+ for (size_t i = 0; i < coefficient_levels_.size(); ++i) {
+ const int contexts_per_plane = (i == kEntropyContextLeft)
+ ? frame_header_.rows4x4
+ : frame_header_.columns4x4;
+ if (!coefficient_levels_[i].Reset(PlaneCount(), contexts_per_plane)) {
+ LIBGAV1_DLOG(ERROR, "coefficient_levels_[%zu].Reset() failed.", i);
+ return false;
+ }
+ if (!dc_categories_[i].Reset(PlaneCount(), contexts_per_plane)) {
+ LIBGAV1_DLOG(ERROR, "dc_categories_[%zu].Reset() failed.", i);
+ return false;
+ }
+ }
+ if (split_parse_and_decode_) {
+ assert(residual_buffer_pool_ != nullptr);
+ if (!residual_buffer_threaded_.Reset(superblock_rows_, superblock_columns_,
+ /*zero_initialize=*/false)) {
+ LIBGAV1_DLOG(ERROR, "residual_buffer_threaded_.Reset() failed.");
+ return false;
+ }
+ } else {
+ // Add 32 * |kResidualPaddingVertical| padding to avoid bottom boundary
+ // checks when parsing quantized coefficients.
+ residual_buffer_ = MakeAlignedUniquePtr<uint8_t>(
+ 32, (4096 + 32 * kResidualPaddingVertical) * residual_size_);
+ if (residual_buffer_ == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Allocation of residual_buffer_ failed.");
+ return false;
+ }
+ prediction_parameters_.reset(new (std::nothrow) PredictionParameters());
+ if (prediction_parameters_ == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Allocation of prediction_parameters_ failed.");
+ return false;
+ }
+ }
+ if (frame_header_.use_ref_frame_mvs) {
+ assert(sequence_header_.enable_order_hint);
+ SetupMotionField(frame_header_, current_frame_, reference_frames_,
+ row4x4_start_, row4x4_end_, column4x4_start_,
+ column4x4_end_, &motion_field_);
+ }
+ ResetLoopRestorationParams();
+ if (!top_context_.Resize(superblock_columns_)) {
+ LIBGAV1_DLOG(ERROR, "Allocation of top_context_ failed.");
+ return false;
+ }
+ return true;
+}
+
+template <ProcessingMode processing_mode, bool save_symbol_decoder_context>
+bool Tile::ProcessSuperBlockRow(int row4x4,
+ TileScratchBuffer* const scratch_buffer) {
+ if (row4x4 < row4x4_start_ || row4x4 >= row4x4_end_) return true;
+ assert(scratch_buffer != nullptr);
+ const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+ for (int column4x4 = column4x4_start_; column4x4 < column4x4_end_;
+ column4x4 += block_width4x4) {
+ if (!ProcessSuperBlock(row4x4, column4x4, scratch_buffer,
+ processing_mode)) {
+ LIBGAV1_DLOG(ERROR, "Error decoding super block row: %d column: %d",
+ row4x4, column4x4);
+ return false;
+ }
+ }
+ if (save_symbol_decoder_context && row4x4 + block_width4x4 >= row4x4_end_) {
+ SaveSymbolDecoderContext();
+ }
+ if (processing_mode == kProcessingModeDecodeOnly ||
+ processing_mode == kProcessingModeParseAndDecode) {
+ PopulateIntraPredictionBuffer(row4x4);
+ }
+ return true;
+}
+
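+// Explicit instantiations for the combinations that are used outside of this
+// translation unit; other combinations are implicitly instantiated by the
+// calls in this file.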
+// Used in frame parallel mode. The symbol decoder context need not be saved in
+// this case since it was done when parsing was complete.
+template bool Tile::ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
+ int row4x4, TileScratchBuffer* scratch_buffer);
+// Used in non frame parallel mode.
+template bool Tile::ProcessSuperBlockRow<kProcessingModeParseAndDecode, true>(
+ int row4x4, TileScratchBuffer* scratch_buffer);
+
+void Tile::SaveSymbolDecoderContext() {
+ if (frame_header_.enable_frame_end_update_cdf &&
+ number_ == frame_header_.tile_info.context_update_id) {
+ *saved_symbol_decoder_context_ = symbol_decoder_context_;
+ }
+}
+
+bool Tile::ParseAndDecode() {
+ if (split_parse_and_decode_) {
+ if (!ThreadedParseAndDecode()) return false;
+ SaveSymbolDecoderContext();
+ return true;
+ }
+ std::unique_ptr<TileScratchBuffer> scratch_buffer =
+ tile_scratch_buffer_pool_->Get();
+ if (scratch_buffer == nullptr) {
+ pending_tiles_->Decrement(false);
+ LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
+ return false;
+ }
+ const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+ for (int row4x4 = row4x4_start_; row4x4 < row4x4_end_;
+ row4x4 += block_width4x4) {
+ if (!ProcessSuperBlockRow<kProcessingModeParseAndDecode, true>(
+ row4x4, scratch_buffer.get())) {
+ pending_tiles_->Decrement(false);
+ return false;
+ }
+ }
+ tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
+ pending_tiles_->Decrement(true);
+ return true;
+}
+
+bool Tile::Parse() {
+ const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+ std::unique_ptr<TileScratchBuffer> scratch_buffer =
+ tile_scratch_buffer_pool_->Get();
+ if (scratch_buffer == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
+ return false;
+ }
+ for (int row4x4 = row4x4_start_; row4x4 < row4x4_end_;
+ row4x4 += block_width4x4) {
+ if (!ProcessSuperBlockRow<kProcessingModeParseOnly, false>(
+ row4x4, scratch_buffer.get())) {
+ return false;
+ }
+ }
+ tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
+ SaveSymbolDecoderContext();
+ return true;
+}
+
+bool Tile::Decode(
+ std::mutex* const mutex, int* const superblock_row_progress,
+ std::condition_variable* const superblock_row_progress_condvar) {
+ const int block_width4x4 = sequence_header_.use_128x128_superblock ? 32 : 16;
+ const int block_width4x4_log2 =
+ sequence_header_.use_128x128_superblock ? 5 : 4;
+ std::unique_ptr<TileScratchBuffer> scratch_buffer =
+ tile_scratch_buffer_pool_->Get();
+ if (scratch_buffer == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
+ return false;
+ }
+ for (int row4x4 = row4x4_start_, index = row4x4_start_ >> block_width4x4_log2;
+ row4x4 < row4x4_end_; row4x4 += block_width4x4, ++index) {
+ if (!ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
+ row4x4, scratch_buffer.get())) {
+ return false;
+ }
+ if (post_filter_.DoDeblock()) {
+ // Apply vertical deblock filtering for all the columns in this tile
+ // except for the first 64 columns.
+ post_filter_.ApplyDeblockFilter(
+ kLoopFilterTypeVertical, row4x4,
+ column4x4_start_ + kNum4x4InLoopFilterUnit, column4x4_end_,
+ block_width4x4);
+ // If this is the first superblock row of the tile, then we cannot apply
+ // horizontal deblocking here since we don't know if the top row is
+ // available. So it will be done by the calling thread in that case.
+ if (row4x4 != row4x4_start_) {
+ // Apply horizontal deblock filtering for all the columns in this tile
+ // except for the first and the last 64 columns.
+ // Note about the last tile of each row: For the last tile,
+ // column4x4_end may not be a multiple of 16. In that case it is still
+ // okay to simply subtract 16 since ApplyDeblockFilter() will only do
+ // the filters in increments of 64 columns (or 32 columns for chroma
+ // with subsampling).
+ post_filter_.ApplyDeblockFilter(
+ kLoopFilterTypeHorizontal, row4x4,
+ column4x4_start_ + kNum4x4InLoopFilterUnit,
+ column4x4_end_ - kNum4x4InLoopFilterUnit, block_width4x4);
+ }
+ }
+ bool notify;
+ {
+ std::unique_lock<std::mutex> lock(*mutex);
+ notify = ++superblock_row_progress[index] ==
+ frame_header_.tile_info.tile_columns;
+ }
+ if (notify) {
+ // We are done decoding this superblock row. Notify the post filtering
+ // thread.
+ superblock_row_progress_condvar[index].notify_one();
+ }
+ }
+ tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
+ return true;
+}
+
+bool Tile::ThreadedParseAndDecode() {
+ {
+ std::lock_guard<std::mutex> lock(threading_.mutex);
+ if (!threading_.sb_state.Reset(superblock_rows_, superblock_columns_)) {
+ pending_tiles_->Decrement(false);
+ LIBGAV1_DLOG(ERROR, "threading.sb_state.Reset() failed.");
+ return false;
+ }
+ // Account for the parsing job.
+ ++threading_.pending_jobs;
+ }
+
+ const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+
+ // Begin parsing.
+ std::unique_ptr<TileScratchBuffer> scratch_buffer =
+ tile_scratch_buffer_pool_->Get();
+ if (scratch_buffer == nullptr) {
+ pending_tiles_->Decrement(false);
+ LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
+ return false;
+ }
+ for (int row4x4 = row4x4_start_, row_index = 0; row4x4 < row4x4_end_;
+ row4x4 += block_width4x4, ++row_index) {
+ for (int column4x4 = column4x4_start_, column_index = 0;
+ column4x4 < column4x4_end_;
+ column4x4 += block_width4x4, ++column_index) {
+ if (!ProcessSuperBlock(row4x4, column4x4, scratch_buffer.get(),
+ kProcessingModeParseOnly)) {
+ std::lock_guard<std::mutex> lock(threading_.mutex);
+ threading_.abort = true;
+ break;
+ }
+ std::unique_lock<std::mutex> lock(threading_.mutex);
+ if (threading_.abort) break;
+ threading_.sb_state[row_index][column_index] = kSuperBlockStateParsed;
+ // Schedule the decoding of this superblock if it is allowed.
+ if (CanDecode(row_index, column_index)) {
+ ++threading_.pending_jobs;
+ threading_.sb_state[row_index][column_index] =
+ kSuperBlockStateScheduled;
+ lock.unlock();
+ thread_pool_->Schedule(
+ [this, row_index, column_index, block_width4x4]() {
+ DecodeSuperBlock(row_index, column_index, block_width4x4);
+ });
+ }
+ }
+ std::lock_guard<std::mutex> lock(threading_.mutex);
+ if (threading_.abort) break;
+ }
+ tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
+
+ // We are done parsing. We can return here since the calling thread will make
+ // sure that it waits for all the superblocks to be decoded.
+ //
+ // Finish using |threading_| before |pending_tiles_->Decrement()| because the
+ // Tile object could go out of scope as soon as |pending_tiles_->Decrement()|
+ // is called.
+ threading_.mutex.lock();
+ const bool no_pending_jobs = (--threading_.pending_jobs == 0);
+ const bool job_succeeded = !threading_.abort;
+ threading_.mutex.unlock();
+ if (no_pending_jobs) {
+ // We are done parsing and decoding this tile.
+ pending_tiles_->Decrement(job_succeeded);
+ }
+ return job_succeeded;
+}
+
+bool Tile::CanDecode(int row_index, int column_index) const {
+ assert(row_index >= 0);
+ assert(column_index >= 0);
+ // If |threading_.sb_state[row_index][column_index]| is not equal to
+ // kSuperBlockStateParsed, then return false. This is ok because if
+ // |threading_.sb_state[row_index][column_index]| is equal to:
+ // kSuperBlockStateNone - then the superblock is not yet parsed.
+ // kSuperBlockStateScheduled - then the superblock is already scheduled for
+ // decode.
+ // kSuperBlockStateDecoded - then the superblock has already been decoded.
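+ // For example, with |intra_block_copy_lag_| == 5 and enough superblock
+ // columns, the superblock at (row_index, column_index) == (2, 3) becomes
+ // decodable only after the superblocks at (1, 8) and (2, 2) have been
+ // decoded.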
+ if (row_index >= superblock_rows_ || column_index >= superblock_columns_ ||
+ threading_.sb_state[row_index][column_index] != kSuperBlockStateParsed) {
+ return false;
+ }
+ // First superblock has no dependencies.
+ if (row_index == 0 && column_index == 0) {
+ return true;
+ }
+ // Superblocks in the first row only depend on the superblock to the left of
+ // it.
+ if (row_index == 0) {
+ return threading_.sb_state[0][column_index - 1] == kSuperBlockStateDecoded;
+ }
+ // All other superblocks depend on superblock to the left of it (if one
+ // exists) and superblock to the top right with a lag of
+ // |intra_block_copy_lag_| (if one exists).
+ const int top_right_column_index =
+ std::min(column_index + intra_block_copy_lag_, superblock_columns_ - 1);
+ return threading_.sb_state[row_index - 1][top_right_column_index] ==
+ kSuperBlockStateDecoded &&
+ (column_index == 0 ||
+ threading_.sb_state[row_index][column_index - 1] ==
+ kSuperBlockStateDecoded);
+}
+
+void Tile::DecodeSuperBlock(int row_index, int column_index,
+ int block_width4x4) {
+ const int row4x4 = row4x4_start_ + (row_index * block_width4x4);
+ const int column4x4 = column4x4_start_ + (column_index * block_width4x4);
+ std::unique_ptr<TileScratchBuffer> scratch_buffer =
+ tile_scratch_buffer_pool_->Get();
+ bool ok = scratch_buffer != nullptr;
+ if (ok) {
+ ok = ProcessSuperBlock(row4x4, column4x4, scratch_buffer.get(),
+ kProcessingModeDecodeOnly);
+ tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
+ }
+ std::unique_lock<std::mutex> lock(threading_.mutex);
+ if (ok) {
+ threading_.sb_state[row_index][column_index] = kSuperBlockStateDecoded;
+ // Candidate rows and columns where we could potentially begin decoding
+ // (if it is allowed). The candidates are:
+ // 1) The superblock to the bottom-left of the current superblock with a
+ // lag of |intra_block_copy_lag_| (or the beginning of the next superblock
+ // row in case there are fewer than |intra_block_copy_lag_| superblock
+ // columns in the Tile).
+ // 2) The superblock to the right of the current superblock.
+ const int candidate_row_indices[] = {row_index + 1, row_index};
+ const int candidate_column_indices[] = {
+ std::max(0, column_index - intra_block_copy_lag_), column_index + 1};
+ for (size_t i = 0; i < std::extent<decltype(candidate_row_indices)>::value;
+ ++i) {
+ const int candidate_row_index = candidate_row_indices[i];
+ const int candidate_column_index = candidate_column_indices[i];
+ if (!CanDecode(candidate_row_index, candidate_column_index)) {
+ continue;
+ }
+ ++threading_.pending_jobs;
+ threading_.sb_state[candidate_row_index][candidate_column_index] =
+ kSuperBlockStateScheduled;
+ lock.unlock();
+ thread_pool_->Schedule([this, candidate_row_index, candidate_column_index,
+ block_width4x4]() {
+ DecodeSuperBlock(candidate_row_index, candidate_column_index,
+ block_width4x4);
+ });
+ lock.lock();
+ }
+ } else {
+ threading_.abort = true;
+ }
+ // Finish using |threading_| before |pending_tiles_->Decrement()| because the
+ // Tile object could go out of scope as soon as |pending_tiles_->Decrement()|
+ // is called.
+ const bool no_pending_jobs = (--threading_.pending_jobs == 0);
+ const bool job_succeeded = !threading_.abort;
+ lock.unlock();
+ if (no_pending_jobs) {
+ // We are done parsing and decoding this tile.
+ pending_tiles_->Decrement(job_succeeded);
+ }
+}
+
+void Tile::PopulateIntraPredictionBuffer(int row4x4) {
+ const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+ if (!use_intra_prediction_buffer_ || row4x4 + block_width4x4 >= row4x4_end_) {
+ return;
+ }
+ const size_t pixel_size =
+ (sequence_header_.color_config.bitdepth == 8 ? sizeof(uint8_t)
+ : sizeof(uint16_t));
+ for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
+ const int row_to_copy =
+ (MultiplyBy4(row4x4 + block_width4x4) >> subsampling_y_[plane]) - 1;
+ const size_t pixels_to_copy =
+ (MultiplyBy4(column4x4_end_ - column4x4_start_) >>
+ subsampling_x_[plane]) *
+ pixel_size;
+ const size_t column_start =
+ MultiplyBy4(column4x4_start_) >> subsampling_x_[plane];
+ void* start;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (sequence_header_.color_config.bitdepth > 8) {
+ Array2DView<uint16_t> buffer(
+ buffer_[plane].rows(), buffer_[plane].columns() / sizeof(uint16_t),
+ reinterpret_cast<uint16_t*>(&buffer_[plane][0][0]));
+ start = &buffer[row_to_copy][column_start];
+ } else // NOLINT
+#endif
+ {
+ start = &buffer_[plane][row_to_copy][column_start];
+ }
+ memcpy((*intra_prediction_buffer_)[plane].get() + column_start * pixel_size,
+ start, pixels_to_copy);
+ }
+}
+
+int Tile::GetTransformAllZeroContext(const Block& block, Plane plane,
+ TransformSize tx_size, int x4, int y4,
+ int w4, int h4) {
+ const int max_x4x4 = frame_header_.columns4x4 >> subsampling_x_[plane];
+ const int max_y4x4 = frame_header_.rows4x4 >> subsampling_y_[plane];
+
+ const int tx_width = kTransformWidth[tx_size];
+ const int tx_height = kTransformHeight[tx_size];
+ const BlockSize plane_size = block.residual_size[plane];
+ const int block_width = kBlockWidthPixels[plane_size];
+ const int block_height = kBlockHeightPixels[plane_size];
+
+ int top = 0;
+ int left = 0;
+ const int num_top_elements = GetNumElements(w4, x4, max_x4x4);
+ const int num_left_elements = GetNumElements(h4, y4, max_y4x4);
+ if (plane == kPlaneY) {
+ if (block_width == tx_width && block_height == tx_height) return 0;
+ const uint8_t* coefficient_levels =
+ &coefficient_levels_[kEntropyContextTop][plane][x4];
+ for (int i = 0; i < num_top_elements; ++i) {
+ top = std::max(top, static_cast<int>(coefficient_levels[i]));
+ }
+ coefficient_levels = &coefficient_levels_[kEntropyContextLeft][plane][y4];
+ for (int i = 0; i < num_left_elements; ++i) {
+ left = std::max(left, static_cast<int>(coefficient_levels[i]));
+ }
+ assert(top <= 4);
+ assert(left <= 4);
+ // kAllZeroContextsByTopLeft is pre-computed based on the logic in the spec
+ // for top and left.
+ return kAllZeroContextsByTopLeft[top][left];
+ }
+ const uint8_t* coefficient_levels =
+ &coefficient_levels_[kEntropyContextTop][plane][x4];
+ const int8_t* dc_categories = &dc_categories_[kEntropyContextTop][plane][x4];
+ for (int i = 0; i < num_top_elements; ++i) {
+ top |= coefficient_levels[i];
+ top |= dc_categories[i];
+ }
+ coefficient_levels = &coefficient_levels_[kEntropyContextLeft][plane][y4];
+ dc_categories = &dc_categories_[kEntropyContextLeft][plane][y4];
+ for (int i = 0; i < num_left_elements; ++i) {
+ left |= coefficient_levels[i];
+ left |= dc_categories[i];
+ }
+ return static_cast<int>(top != 0) + static_cast<int>(left != 0) + 7 +
+ 3 * static_cast<int>(block_width * block_height >
+ tx_width * tx_height);
+}
+
+TransformSet Tile::GetTransformSet(TransformSize tx_size, bool is_inter) const {
+ const TransformSize tx_size_square_min = kTransformSizeSquareMin[tx_size];
+ const TransformSize tx_size_square_max = kTransformSizeSquareMax[tx_size];
+ if (tx_size_square_max == kTransformSize64x64) return kTransformSetDctOnly;
+ if (is_inter) {
+ if (frame_header_.reduced_tx_set ||
+ tx_size_square_max == kTransformSize32x32) {
+ return kTransformSetInter3;
+ }
+ if (tx_size_square_min == kTransformSize16x16) return kTransformSetInter2;
+ return kTransformSetInter1;
+ }
+ if (tx_size_square_max == kTransformSize32x32) return kTransformSetDctOnly;
+ if (frame_header_.reduced_tx_set ||
+ tx_size_square_min == kTransformSize16x16) {
+ return kTransformSetIntra2;
+ }
+ return kTransformSetIntra1;
+}
+
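+// For example, GetTransformSet(kTransformSize16x16, /*is_inter=*/true)
+// returns kTransformSetInter2 (12 candidate transform types) when
+// reduced_tx_set is false, while GetTransformSet(kTransformSize8x8,
+// /*is_inter=*/true) returns kTransformSetInter1 (16 candidate types).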
+TransformType Tile::ComputeTransformType(const Block& block, Plane plane,
+ TransformSize tx_size, int block_x,
+ int block_y) {
+ const BlockParameters& bp = *block.bp;
+ const TransformSize tx_size_square_max = kTransformSizeSquareMax[tx_size];
+ if (frame_header_.segmentation
+ .lossless[bp.prediction_parameters->segment_id] ||
+ tx_size_square_max == kTransformSize64x64) {
+ return kTransformTypeDctDct;
+ }
+ if (plane == kPlaneY) {
+ return transform_types_[block_y - block.row4x4][block_x - block.column4x4];
+ }
+ const TransformSet tx_set = GetTransformSet(tx_size, bp.is_inter);
+ TransformType tx_type;
+ if (bp.is_inter) {
+ const int x4 =
+ std::max(block.column4x4, block_x << subsampling_x_[kPlaneU]);
+ const int y4 = std::max(block.row4x4, block_y << subsampling_y_[kPlaneU]);
+ tx_type = transform_types_[y4 - block.row4x4][x4 - block.column4x4];
+ } else {
+ tx_type = kModeToTransformType[bp.prediction_parameters->uv_mode];
+ }
+ return kTransformTypeInSetMask[tx_set].Contains(tx_type)
+ ? tx_type
+ : kTransformTypeDctDct;
+}
+
+void Tile::ReadTransformType(const Block& block, int x4, int y4,
+ TransformSize tx_size) {
+ BlockParameters& bp = *block.bp;
+ const TransformSet tx_set = GetTransformSet(tx_size, bp.is_inter);
+
+ TransformType tx_type = kTransformTypeDctDct;
+ if (tx_set != kTransformSetDctOnly &&
+ frame_header_.segmentation.qindex[bp.prediction_parameters->segment_id] >
+ 0) {
+ const int cdf_index = SymbolDecoderContext::TxTypeIndex(tx_set);
+ const int cdf_tx_size_index =
+ TransformSizeToSquareTransformIndex(kTransformSizeSquareMin[tx_size]);
+ uint16_t* cdf;
+ if (bp.is_inter) {
+ cdf = symbol_decoder_context_
+ .inter_tx_type_cdf[cdf_index][cdf_tx_size_index];
+ switch (tx_set) {
+ case kTransformSetInter1:
+ tx_type = static_cast<TransformType>(reader_.ReadSymbol<16>(cdf));
+ break;
+ case kTransformSetInter2:
+ tx_type = static_cast<TransformType>(reader_.ReadSymbol<12>(cdf));
+ break;
+ default:
+ assert(tx_set == kTransformSetInter3);
+ tx_type = static_cast<TransformType>(reader_.ReadSymbol(cdf));
+ break;
+ }
+ } else {
+ const PredictionMode intra_direction =
+ block.bp->prediction_parameters->use_filter_intra
+ ? kFilterIntraModeToIntraPredictor[block.bp->prediction_parameters
+ ->filter_intra_mode]
+ : bp.y_mode;
+ cdf =
+ symbol_decoder_context_
+ .intra_tx_type_cdf[cdf_index][cdf_tx_size_index][intra_direction];
+ assert(tx_set == kTransformSetIntra1 || tx_set == kTransformSetIntra2);
+ tx_type = static_cast<TransformType>((tx_set == kTransformSetIntra1)
+ ? reader_.ReadSymbol<7>(cdf)
+ : reader_.ReadSymbol<5>(cdf));
+ }
+
+ // This array does not contain an entry for kTransformSetDctOnly, so the
+ // first dimension needs to be offset by 1.
+ tx_type = kInverseTransformTypeBySet[tx_set - 1][tx_type];
+ }
+ SetTransformType(block, x4, y4, kTransformWidth4x4[tx_size],
+ kTransformHeight4x4[tx_size], tx_type, transform_types_);
+}
+
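+// In the three ReadCoeffBase*() helpers below, |neighbor_sum| is 1 plus the
+// sum of up to five already-decoded neighbor levels, so
+// DivideBy2(neighbor_sum) is the neighbor sum halved with rounding. The
+// context is that value (capped at 4) plus a position dependent offset.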
+// Section 8.3.2 in the spec, under coeff_base and coeff_br.
+// Bottom boundary checks are avoided by the padded rows.
+// For a coefficient near the right boundary, the two right neighbors and the
+// one bottom-right neighbor may be out of boundary. We don't check the right
+// boundary for them, because the out of boundary neighbors project to positions
+// above the diagonal line which goes through the current coefficient and these
+// positions are still all 0s according to the diagonal scan order.
+template <typename ResidualType>
+void Tile::ReadCoeffBase2D(
+ const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+ int eob,
+ uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+ uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+ [kCoeffBaseRangeSymbolCount + 1],
+ ResidualType* const quantized_buffer, uint8_t* const level_buffer) {
+ const int tx_width = 1 << adjusted_tx_width_log2;
+ for (int i = eob - 2; i >= 1; --i) {
+ const uint16_t pos = scan[i];
+ const int row = pos >> adjusted_tx_width_log2;
+ const int column = pos & (tx_width - 1);
+ auto* const quantized = &quantized_buffer[pos];
+ auto* const levels = &level_buffer[pos];
+ const int neighbor_sum = 1 + levels[1] + levels[tx_width] +
+ levels[tx_width + 1] + levels[2] +
+ levels[MultiplyBy2(tx_width)];
+ const int context =
+ ((neighbor_sum > 7) ? 4 : DivideBy2(neighbor_sum)) +
+ kCoeffBaseContextOffset[tx_size][std::min(row, 4)][std::min(column, 4)];
+ int level =
+ reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]);
+ levels[0] = level;
+ if (level > kNumQuantizerBaseLevels) {
+ // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
+ // + 1, because we clip the overall output to 6 and the unclipped
+ // quantized values will always result in an output of greater than 6.
+ int context = std::min(6, DivideBy2(1 + quantized[1] + // {0, 1}
+ quantized[tx_width] + // {1, 0}
+ quantized[tx_width + 1])); // {1, 1}
+ context += 14 >> static_cast<int>((row | column) < 2);
+ level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
+ }
+ quantized[0] = level;
+ }
+ // Read position 0.
+ {
+ auto* const quantized = &quantized_buffer[0];
+ int level = reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[0]);
+ level_buffer[0] = level;
+ if (level > kNumQuantizerBaseLevels) {
+ // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
+ // + 1, because we clip the overall output to 6 and the unclipped
+ // quantized values will always result in an output of greater than 6.
+ const int context =
+ std::min(6, DivideBy2(1 + quantized[1] + // {0, 1}
+ quantized[tx_width] + // {1, 0}
+ quantized[tx_width + 1])); // {1, 1}
+ level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
+ }
+ quantized[0] = level;
+ }
+}
+
+// Section 8.3.2 in the spec, under coeff_base and coeff_br.
+// Bottom boundary checks are avoided by the padded rows.
+// For a coefficient near the right boundary, the four right neighbors may be
+// out of boundary. We don't do the boundary check for the first three right
+// neighbors, because even for the transform blocks with smallest width 4, the
+// first three out of boundary neighbors project to positions left of the
+// current coefficient and these positions are still all 0s according to the
+// column scan order. However, when the transform block width is 4 and the
+// current coefficient is on the right boundary, its fourth right neighbor
+// projects to the position immediately below it in the same column, which
+// could be nonzero. Therefore, we must skip the fourth right neighbor. To make
+// it simple, for any coefficient, we always do the boundary check for its
+// fourth right neighbor.
+template <typename ResidualType>
+void Tile::ReadCoeffBaseHorizontal(
+ const uint16_t* scan, TransformSize /*tx_size*/, int adjusted_tx_width_log2,
+ int eob,
+ uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+ uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+ [kCoeffBaseRangeSymbolCount + 1],
+ ResidualType* const quantized_buffer, uint8_t* const level_buffer) {
+ const int tx_width = 1 << adjusted_tx_width_log2;
+ int i = eob - 2;
+ do {
+ const uint16_t pos = scan[i];
+ const int column = pos & (tx_width - 1);
+ auto* const quantized = &quantized_buffer[pos];
+ auto* const levels = &level_buffer[pos];
+ const int neighbor_sum =
+ 1 + (levels[1] + // {0, 1}
+ levels[tx_width] + // {1, 0}
+ levels[2] + // {0, 2}
+ levels[3] + // {0, 3}
+ ((column + 4 < tx_width) ? levels[4] : 0)); // {0, 4}
+ const int context = ((neighbor_sum > 7) ? 4 : DivideBy2(neighbor_sum)) +
+ kCoeffBasePositionContextOffset[column];
+ int level =
+ reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]);
+ levels[0] = level;
+ if (level > kNumQuantizerBaseLevels) {
+ // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
+ // + 1, because we clip the overall output to 6 and the unclipped
+ // quantized values will always result in an output of greater than 6.
+ int context = std::min(6, DivideBy2(1 + quantized[1] + // {0, 1}
+ quantized[tx_width] + // {1, 0}
+ quantized[2])); // {0, 2}
+ if (pos != 0) {
+ context += 14 >> static_cast<int>(column == 0);
+ }
+ level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
+ }
+ quantized[0] = level;
+ } while (--i >= 0);
+}
+
+// Section 8.3.2 in the spec, under coeff_base and coeff_br.
+// Bottom boundary checks are avoided by the padded rows.
+// Right boundary check is performed explicitly.
+template <typename ResidualType>
+void Tile::ReadCoeffBaseVertical(
+ const uint16_t* scan, TransformSize /*tx_size*/, int adjusted_tx_width_log2,
+ int eob,
+ uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+ uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+ [kCoeffBaseRangeSymbolCount + 1],
+ ResidualType* const quantized_buffer, uint8_t* const level_buffer) {
+ const int tx_width = 1 << adjusted_tx_width_log2;
+ int i = eob - 2;
+ do {
+ const uint16_t pos = scan[i];
+ const int row = pos >> adjusted_tx_width_log2;
+ const int column = pos & (tx_width - 1);
+ auto* const quantized = &quantized_buffer[pos];
+ auto* const levels = &level_buffer[pos];
+ const int neighbor_sum =
+ 1 + (((column + 1 < tx_width) ? levels[1] : 0) + // {0, 1}
+ levels[tx_width] + // {1, 0}
+ levels[MultiplyBy2(tx_width)] + // {2, 0}
+ levels[tx_width * 3] + // {3, 0}
+ levels[MultiplyBy4(tx_width)]); // {4, 0}
+ const int context = ((neighbor_sum > 7) ? 4 : DivideBy2(neighbor_sum)) +
+ kCoeffBasePositionContextOffset[row];
+ int level =
+ reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]);
+ levels[0] = level;
+ if (level > kNumQuantizerBaseLevels) {
+ // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
+ // + 1, because we clip the overall output to 6 and the unclipped
+ // quantized values will always result in an output of greater than 6.
+ const int quantized_column1 = (column + 1 < tx_width) ? quantized[1] : 0;
+ int context =
+ std::min(6, DivideBy2(1 + quantized_column1 + // {0, 1}
+ quantized[tx_width] + // {1, 0}
+ quantized[MultiplyBy2(tx_width)])); // {2, 0}
+ if (pos != 0) {
+ context += 14 >> static_cast<int>(row == 0);
+ }
+ level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
+ }
+ quantized[0] = level;
+ } while (--i >= 0);
+}
+
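+// Returns the context used to decode the dc_sign symbol: 0 when the DC sign
+// categories of the neighboring blocks sum to zero, 1 when negative signs
+// dominate and 2 when positive signs dominate.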
+int Tile::GetDcSignContext(int x4, int y4, int w4, int h4, Plane plane) {
+ const int max_x4x4 = frame_header_.columns4x4 >> subsampling_x_[plane];
+ const int8_t* dc_categories = &dc_categories_[kEntropyContextTop][plane][x4];
+ // Keep |dc_sign| 8 bits long so that std::accumulate() can skip the sign
+ // extension of each element.
+ int8_t dc_sign = std::accumulate(
+ dc_categories, dc_categories + GetNumElements(w4, x4, max_x4x4), 0);
+ const int max_y4x4 = frame_header_.rows4x4 >> subsampling_y_[plane];
+ dc_categories = &dc_categories_[kEntropyContextLeft][plane][y4];
+ dc_sign = std::accumulate(
+ dc_categories, dc_categories + GetNumElements(h4, y4, max_y4x4), dc_sign);
+ // This return statement is equivalent to:
+ // if (dc_sign < 0) return 1;
+ // if (dc_sign > 0) return 2;
+ // return 0;
+ // And it is better than:
+ // return static_cast<int>(dc_sign != 0) + static_cast<int>(dc_sign > 0);
+ return static_cast<int>(dc_sign < 0) +
+ MultiplyBy2(static_cast<int>(dc_sign > 0));
+}
+
+void Tile::SetEntropyContexts(int x4, int y4, int w4, int h4, Plane plane,
+ uint8_t coefficient_level, int8_t dc_category) {
+ const int max_x4x4 = frame_header_.columns4x4 >> subsampling_x_[plane];
+ const int num_top_elements = GetNumElements(w4, x4, max_x4x4);
+ memset(&coefficient_levels_[kEntropyContextTop][plane][x4], coefficient_level,
+ num_top_elements);
+ memset(&dc_categories_[kEntropyContextTop][plane][x4], dc_category,
+ num_top_elements);
+ const int max_y4x4 = frame_header_.rows4x4 >> subsampling_y_[plane];
+ const int num_left_elements = GetNumElements(h4, y4, max_y4x4);
+ memset(&coefficient_levels_[kEntropyContextLeft][plane][y4],
+ coefficient_level, num_left_elements);
+ memset(&dc_categories_[kEntropyContextLeft][plane][y4], dc_category,
+ num_left_elements);
+}
+
+template <bool is_dc_coefficient, typename ResidualType>
+bool Tile::ReadSignAndApplyDequantization(
+ const uint16_t* const scan, int i, int q_value,
+ const uint8_t* const quantizer_matrix, int shift, int max_value,
+ uint16_t* const dc_sign_cdf, int8_t* const dc_category,
+ int* const coefficient_level, ResidualType* residual_buffer) {
+ const int pos = is_dc_coefficient ? 0 : scan[i];
+ // If residual_buffer[pos] is zero, then the rest of the function has no
+ // effect.
+ int level = residual_buffer[pos];
+ if (level == 0) return true;
+ const int sign = is_dc_coefficient
+ ? static_cast<int>(reader_.ReadSymbol(dc_sign_cdf))
+ : reader_.ReadBit();
+ if (level > kNumQuantizerBaseLevels + kQuantizerCoefficientBaseRange) {
+ int length = 0;
+ bool golomb_length_bit = false;
+ do {
+ golomb_length_bit = reader_.ReadBit() != 0;
+ ++length;
+ if (length > 20) {
+ LIBGAV1_DLOG(ERROR, "Invalid golomb_length %d", length);
+ return false;
+ }
+ } while (!golomb_length_bit);
+ int x = 1;
+ for (int i = length - 2; i >= 0; --i) {
+ x = (x << 1) | reader_.ReadBit();
+ }
+ level += x - 1;
+ }
+ if (is_dc_coefficient) {
+ *dc_category = (sign != 0) ? -1 : 1;
+ }
+ level &= 0xfffff;
+ *coefficient_level += level;
+ // Apply dequantization. Step 1 of section 7.12.3 in the spec.
+ int q = q_value;
+ if (quantizer_matrix != nullptr) {
+ q = RightShiftWithRounding(q * quantizer_matrix[pos], 5);
+ }
+ // The intermediate multiplication can exceed 32 bits, so it has to be
+ // performed by promoting one of the values to int64_t.
+ int32_t dequantized_value = (static_cast<int64_t>(q) * level) & 0xffffff;
+ dequantized_value >>= shift;
+ // At this point:
+ // * |dequantized_value| is always non-negative.
+ // * |sign| can be either 0 or 1.
+ // * min_value = -(max_value + 1).
+ // We need to apply the following:
+ // dequantized_value = sign ? -dequantized_value : dequantized_value;
+ // dequantized_value = Clip3(dequantized_value, min_value, max_value);
+ //
+ // Note that -x == ~(x - 1).
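+ // For example, with 8-bit video (max_value == 32767), sign == 1 and an
+ // unclipped |dequantized_value| of 40000, the result is -40000 clamped to
+ // [-32768, 32767], i.e. -32768.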
+ //
+ // Now, the above two lines can be done with a std::min and xor as follows:
+ dequantized_value = std::min(dequantized_value - sign, max_value) ^ -sign;
+ residual_buffer[pos] = dequantized_value;
+ return true;
+}
+
+int Tile::ReadCoeffBaseRange(uint16_t* cdf) {
+ int level = 0;
+ for (int j = 0; j < kCoeffBaseRangeMaxIterations; ++j) {
+ const int coeff_base_range =
+ reader_.ReadSymbol<kCoeffBaseRangeSymbolCount>(cdf);
+ level += coeff_base_range;
+ if (coeff_base_range < (kCoeffBaseRangeSymbolCount - 1)) break;
+ }
+ return level;
+}
+
+template <typename ResidualType>
+int Tile::ReadTransformCoefficients(const Block& block, Plane plane,
+ int start_x, int start_y,
+ TransformSize tx_size,
+ TransformType* const tx_type) {
+ const int x4 = DivideBy4(start_x);
+ const int y4 = DivideBy4(start_y);
+ const int w4 = kTransformWidth4x4[tx_size];
+ const int h4 = kTransformHeight4x4[tx_size];
+ const int tx_size_context = kTransformSizeContext[tx_size];
+ int context =
+ GetTransformAllZeroContext(block, plane, tx_size, x4, y4, w4, h4);
+ const bool all_zero = reader_.ReadSymbol(
+ symbol_decoder_context_.all_zero_cdf[tx_size_context][context]);
+ if (all_zero) {
+ if (plane == kPlaneY) {
+ SetTransformType(block, x4, y4, w4, h4, kTransformTypeDctDct,
+ transform_types_);
+ }
+ SetEntropyContexts(x4, y4, w4, h4, plane, 0, 0);
+ // This is not used in this case, so it can be set to any value.
+ *tx_type = kNumTransformTypes;
+ return 0;
+ }
+ const int tx_width = kTransformWidth[tx_size];
+ const int tx_height = kTransformHeight[tx_size];
+ const TransformSize adjusted_tx_size = kAdjustedTransformSize[tx_size];
+ const int adjusted_tx_width_log2 = kTransformWidthLog2[adjusted_tx_size];
+ const int tx_padding =
+ (1 << adjusted_tx_width_log2) * kResidualPaddingVertical;
+ auto* residual = reinterpret_cast<ResidualType*>(*block.residual);
+ // Clear padding to avoid bottom boundary checks when parsing quantized
+ // coefficients.
+ memset(residual, 0, (tx_width * tx_height + tx_padding) * residual_size_);
+ uint8_t level_buffer[(32 + kResidualPaddingVertical) * 32];
+ memset(
+ level_buffer, 0,
+ kTransformWidth[adjusted_tx_size] * kTransformHeight[adjusted_tx_size] +
+ tx_padding);
+ const int clamped_tx_height = std::min(tx_height, 32);
+ if (plane == kPlaneY) {
+ ReadTransformType(block, x4, y4, tx_size);
+ }
+ BlockParameters& bp = *block.bp;
+ *tx_type = ComputeTransformType(block, plane, tx_size, x4, y4);
+ const int eob_multi_size = kEobMultiSizeLookup[tx_size];
+ const PlaneType plane_type = GetPlaneType(plane);
+ const TransformClass tx_class = GetTransformClass(*tx_type);
+ context = static_cast<int>(tx_class != kTransformClass2D);
+ int eob_pt = 1;
+ switch (eob_multi_size) {
+ case 0:
+ eob_pt += reader_.ReadSymbol<kEobPt16SymbolCount>(
+ symbol_decoder_context_.eob_pt_16_cdf[plane_type][context]);
+ break;
+ case 1:
+ eob_pt += reader_.ReadSymbol<kEobPt32SymbolCount>(
+ symbol_decoder_context_.eob_pt_32_cdf[plane_type][context]);
+ break;
+ case 2:
+ eob_pt += reader_.ReadSymbol<kEobPt64SymbolCount>(
+ symbol_decoder_context_.eob_pt_64_cdf[plane_type][context]);
+ break;
+ case 3:
+ eob_pt += reader_.ReadSymbol<kEobPt128SymbolCount>(
+ symbol_decoder_context_.eob_pt_128_cdf[plane_type][context]);
+ break;
+ case 4:
+ eob_pt += reader_.ReadSymbol<kEobPt256SymbolCount>(
+ symbol_decoder_context_.eob_pt_256_cdf[plane_type][context]);
+ break;
+ case 5:
+ eob_pt += reader_.ReadSymbol<kEobPt512SymbolCount>(
+ symbol_decoder_context_.eob_pt_512_cdf[plane_type]);
+ break;
+ case 6:
+ default:
+ eob_pt += reader_.ReadSymbol<kEobPt1024SymbolCount>(
+ symbol_decoder_context_.eob_pt_1024_cdf[plane_type]);
+ break;
+ }
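+ // eob_pt == 1 encodes eob == 1; eob_pt == k >= 2 encodes an eob in the
+ // range [(1 << (k - 2)) + 1, 1 << (k - 1)], which is narrowed down by
+ // eob_extra and the literal bits read below. For example, eob_pt == 5
+ // yields an eob in [9, 16].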
+ int eob = (eob_pt < 2) ? eob_pt : ((1 << (eob_pt - 2)) + 1);
+ if (eob_pt >= 3) {
+ context = eob_pt - 3;
+ const bool eob_extra = reader_.ReadSymbol(
+ symbol_decoder_context_
+ .eob_extra_cdf[tx_size_context][plane_type][context]);
+ if (eob_extra) eob += 1 << (eob_pt - 3);
+ for (int i = 1; i < eob_pt - 2; ++i) {
+ assert(eob_pt - i >= 3);
+ assert(eob_pt <= kEobPt1024SymbolCount);
+ if (reader_.ReadBit() != 0) {
+ eob += 1 << (eob_pt - i - 3);
+ }
+ }
+ }
+ const uint16_t* scan = kScan[tx_class][tx_size];
+ const int clamped_tx_size_context = std::min(tx_size_context, 3);
+ auto coeff_base_range_cdf =
+ symbol_decoder_context_
+ .coeff_base_range_cdf[clamped_tx_size_context][plane_type];
+ // Read the last coefficient.
+ {
+ context = GetCoeffBaseContextEob(tx_size, eob - 1);
+ const uint16_t pos = scan[eob - 1];
+ int level =
+ 1 + reader_.ReadSymbol<kCoeffBaseEobSymbolCount>(
+ symbol_decoder_context_
+ .coeff_base_eob_cdf[tx_size_context][plane_type][context]);
+ level_buffer[pos] = level;
+ if (level > kNumQuantizerBaseLevels) {
+ level +=
+ ReadCoeffBaseRange(coeff_base_range_cdf[GetCoeffBaseRangeContextEob(
+ adjusted_tx_width_log2, pos, tx_class)]);
+ }
+ residual[pos] = level;
+ }
+ if (eob > 1) {
+ // Read all the other coefficients.
+ // Lookup used to call the right variant of ReadCoeffBase*() based on the
+ // transform class.
+ static constexpr void (Tile::*kGetCoeffBaseFunc[])(
+ const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+ int eob,
+ uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+ uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+ [kCoeffBaseRangeSymbolCount + 1],
+ ResidualType* quantized_buffer,
+ uint8_t* level_buffer) = {&Tile::ReadCoeffBase2D<ResidualType>,
+ &Tile::ReadCoeffBaseHorizontal<ResidualType>,
+ &Tile::ReadCoeffBaseVertical<ResidualType>};
+ (this->*kGetCoeffBaseFunc[tx_class])(
+ scan, tx_size, adjusted_tx_width_log2, eob,
+ symbol_decoder_context_.coeff_base_cdf[tx_size_context][plane_type],
+ coeff_base_range_cdf, residual, level_buffer);
+ }
+ const int max_value = (1 << (7 + sequence_header_.color_config.bitdepth)) - 1;
+ const int current_quantizer_index =
+ GetQIndex(frame_header_.segmentation,
+ bp.prediction_parameters->segment_id, current_quantizer_index_);
+ const int dc_q_value = quantizer_.GetDcValue(plane, current_quantizer_index);
+ const int ac_q_value = quantizer_.GetAcValue(plane, current_quantizer_index);
+ const int shift = kQuantizationShift[tx_size];
+ const uint8_t* const quantizer_matrix =
+ (frame_header_.quantizer.use_matrix &&
+ *tx_type < kTransformTypeIdentityIdentity &&
+ !frame_header_.segmentation
+ .lossless[bp.prediction_parameters->segment_id] &&
+ frame_header_.quantizer.matrix_level[plane] < 15)
+ ? quantizer_matrix_[frame_header_.quantizer.matrix_level[plane]]
+ [plane_type][adjusted_tx_size]
+ .get()
+ : nullptr;
+ int coefficient_level = 0;
+ int8_t dc_category = 0;
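+ // The signs and Golomb-coded suffixes are read in scan order below.
+ // Position 0 (the DC coefficient) is special-cased: its sign is
+ // arithmetically coded using |dc_sign_cdf| and is also recorded in
+ // |dc_category| for the dc_sign contexts of later blocks.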
+ uint16_t* const dc_sign_cdf =
+ (residual[0] != 0)
+ ? symbol_decoder_context_.dc_sign_cdf[plane_type][GetDcSignContext(
+ x4, y4, w4, h4, plane)]
+ : nullptr;
+ assert(scan[0] == 0);
+ if (!ReadSignAndApplyDequantization<true>(
+ scan, 0, dc_q_value, quantizer_matrix, shift, max_value, dc_sign_cdf,
+ &dc_category, &coefficient_level, residual)) {
+ return -1;
+ }
+ if (eob > 1) {
+ int i = 1;
+ do {
+ if (!ReadSignAndApplyDequantization<false>(
+ scan, i, ac_q_value, quantizer_matrix, shift, max_value, nullptr,
+ nullptr, &coefficient_level, residual)) {
+ return -1;
+ }
+ } while (++i < eob);
+ MoveCoefficientsForTxWidth64(clamped_tx_height, tx_width, residual);
+ }
+ SetEntropyContexts(x4, y4, w4, h4, plane, std::min(4, coefficient_level),
+ dc_category);
+ if (split_parse_and_decode_) {
+ *block.residual += tx_width * tx_height * residual_size_;
+ }
+ return eob;
+}
+
+// CALL_BITDEPTH_FUNCTION is a macro that calls the appropriate template
+// |function| depending on the value of |sequence_header_.color_config.bitdepth|
+// with the variadic arguments.
+#if LIBGAV1_MAX_BITDEPTH >= 10
+#define CALL_BITDEPTH_FUNCTION(function, ...) \
+ do { \
+ if (sequence_header_.color_config.bitdepth > 8) { \
+ function<uint16_t>(__VA_ARGS__); \
+ } else { \
+ function<uint8_t>(__VA_ARGS__); \
+ } \
+ } while (false)
+#else
+#define CALL_BITDEPTH_FUNCTION(function, ...) \
+ do { \
+ function<uint8_t>(__VA_ARGS__); \
+ } while (false)
+#endif
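+// For example, CALL_BITDEPTH_FUNCTION(IntraPrediction, block, plane, ...)
+// expands to IntraPrediction<uint16_t>(block, plane, ...) for bitdepths
+// greater than 8 and to IntraPrediction<uint8_t>(block, plane, ...)
+// otherwise.
+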
+bool Tile::TransformBlock(const Block& block, Plane plane, int base_x,
+ int base_y, TransformSize tx_size, int x, int y,
+ ProcessingMode mode) {
+ BlockParameters& bp = *block.bp;
+ const int subsampling_x = subsampling_x_[plane];
+ const int subsampling_y = subsampling_y_[plane];
+ const int start_x = base_x + MultiplyBy4(x);
+ const int start_y = base_y + MultiplyBy4(y);
+ const int max_x = MultiplyBy4(frame_header_.columns4x4) >> subsampling_x;
+ const int max_y = MultiplyBy4(frame_header_.rows4x4) >> subsampling_y;
+ if (start_x >= max_x || start_y >= max_y) return true;
+ const int row = DivideBy4(start_y << subsampling_y);
+ const int column = DivideBy4(start_x << subsampling_x);
+ const int mask = sequence_header_.use_128x128_superblock ? 31 : 15;
+ const int sub_block_row4x4 = row & mask;
+ const int sub_block_column4x4 = column & mask;
+ const int step_x = kTransformWidth4x4[tx_size];
+ const int step_y = kTransformHeight4x4[tx_size];
+ const bool do_decode = mode == kProcessingModeDecodeOnly ||
+ mode == kProcessingModeParseAndDecode;
+ if (do_decode && !bp.is_inter) {
+ if (bp.prediction_parameters->palette_mode_info.size[GetPlaneType(plane)] >
+ 0) {
+ CALL_BITDEPTH_FUNCTION(PalettePrediction, block, plane, start_x, start_y,
+ x, y, tx_size);
+ } else {
+ const PredictionMode mode =
+ (plane == kPlaneY) ? bp.y_mode
+ : (bp.prediction_parameters->uv_mode ==
+ kPredictionModeChromaFromLuma
+ ? kPredictionModeDc
+ : bp.prediction_parameters->uv_mode);
+ const int tr_row4x4 = (sub_block_row4x4 >> subsampling_y);
+ const int tr_column4x4 =
+ (sub_block_column4x4 >> subsampling_x) + step_x + 1;
+ const int bl_row4x4 = (sub_block_row4x4 >> subsampling_y) + step_y + 1;
+ const int bl_column4x4 = (sub_block_column4x4 >> subsampling_x);
+ const bool has_left = x > 0 || block.left_available[plane];
+ const bool has_top = y > 0 || block.top_available[plane];
+
+ CALL_BITDEPTH_FUNCTION(
+ IntraPrediction, block, plane, start_x, start_y, has_left, has_top,
+ block.scratch_buffer->block_decoded[plane][tr_row4x4][tr_column4x4],
+ block.scratch_buffer->block_decoded[plane][bl_row4x4][bl_column4x4],
+ mode, tx_size);
+ if (plane != kPlaneY &&
+ bp.prediction_parameters->uv_mode == kPredictionModeChromaFromLuma) {
+ CALL_BITDEPTH_FUNCTION(ChromaFromLumaPrediction, block, plane, start_x,
+ start_y, tx_size);
+ }
+ }
+ if (plane == kPlaneY) {
+ block.bp->prediction_parameters->max_luma_width =
+ start_x + MultiplyBy4(step_x);
+ block.bp->prediction_parameters->max_luma_height =
+ start_y + MultiplyBy4(step_y);
+ block.scratch_buffer->cfl_luma_buffer_valid = false;
+ }
+ }
+ if (!bp.skip) {
+ const int sb_row_index = SuperBlockRowIndex(block.row4x4);
+ const int sb_column_index = SuperBlockColumnIndex(block.column4x4);
+ if (mode == kProcessingModeDecodeOnly) {
+ Queue<TransformParameters>& tx_params =
+ *residual_buffer_threaded_[sb_row_index][sb_column_index]
+ ->transform_parameters();
+ ReconstructBlock(block, plane, start_x, start_y, tx_size,
+ tx_params.Front().type,
+ tx_params.Front().non_zero_coeff_count);
+ tx_params.Pop();
+ } else {
+ TransformType tx_type;
+ int non_zero_coeff_count;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (sequence_header_.color_config.bitdepth > 8) {
+ non_zero_coeff_count = ReadTransformCoefficients<int32_t>(
+ block, plane, start_x, start_y, tx_size, &tx_type);
+ } else // NOLINT
+#endif
+ {
+ non_zero_coeff_count = ReadTransformCoefficients<int16_t>(
+ block, plane, start_x, start_y, tx_size, &tx_type);
+ }
+ if (non_zero_coeff_count < 0) return false;
+ if (mode == kProcessingModeParseAndDecode) {
+ ReconstructBlock(block, plane, start_x, start_y, tx_size, tx_type,
+ non_zero_coeff_count);
+ } else {
+ assert(mode == kProcessingModeParseOnly);
+ residual_buffer_threaded_[sb_row_index][sb_column_index]
+ ->transform_parameters()
+ ->Push(TransformParameters(tx_type, non_zero_coeff_count));
+ }
+ }
+ }
+ if (do_decode) {
+ bool* block_decoded =
+ &block.scratch_buffer
+ ->block_decoded[plane][(sub_block_row4x4 >> subsampling_y) + 1]
+ [(sub_block_column4x4 >> subsampling_x) + 1];
+ SetBlockValues(step_y, step_x, true, block_decoded,
+ TileScratchBuffer::kBlockDecodedStride);
+ }
+ return true;
+}
+
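+// Walks the transform split tree iteratively and calls TransformBlock() for
+// every leaf. For example, a 64x16 region whose stored inter transform size
+// is 32x16 is split once into two 32x16 leaves.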
+  stack.Push(TransformTreeNode(start_x, start_y,
+                               static_cast<TransformSize>(plane_size)));
+
+  do {
+    TransformTreeNode node = stack.Pop();
+    const int row = DivideBy4(node.y);
+    const int column = DivideBy4(node.x);
+    if (row >= frame_header_.rows4x4 || column >= frame_header_.columns4x4) {
+      continue;
+    }
+    const TransformSize inter_tx_size = inter_transform_sizes_[row][column];
+    const int width = kTransformWidth[node.tx_size];
+    const int height = kTransformHeight[node.tx_size];
+    if (width <= kTransformWidth[inter_tx_size] &&
+        height <= kTransformHeight[inter_tx_size]) {
+      if (!TransformBlock(block, kPlaneY, node.x, node.y, node.tx_size, 0, 0,
+                          mode)) {
+        return false;
+      }
+      continue;
+    }
+    // The split transform size look up gives the right transform size that we
+    // should push in the stack.
+    //   if (width > height) => transform size whose width is half.
+    //   if (width < height) => transform size whose height is half.
+    //   if (width == height) => transform size whose width and height are half.
+    const TransformSize split_tx_size = kSplitTransformSize[node.tx_size];
+    const int half_width = DivideBy2(width);
+    if (width > height) {
+      stack.Push(TransformTreeNode(node.x + half_width, node.y, split_tx_size));
+      stack.Push(TransformTreeNode(node.x, node.y, split_tx_size));
+      continue;
+    }
+    const int half_height = DivideBy2(height);
+    if (width < height) {
+      stack.Push(
+          TransformTreeNode(node.x, node.y + half_height, split_tx_size));
+      stack.Push(TransformTreeNode(node.x, node.y, split_tx_size));
+      continue;
+    }
+    stack.Push(TransformTreeNode(node.x + half_width, node.y + half_height,
+                                 split_tx_size));
+    stack.Push(TransformTreeNode(node.x, node.y + half_height, split_tx_size));
+    stack.Push(TransformTreeNode(node.x + half_width, node.y, split_tx_size));
+    stack.Push(TransformTreeNode(node.x, node.y, split_tx_size));
+  } while (!stack.Empty());
+  return true;
+}
+
+void Tile::ReconstructBlock(const Block& block, Plane plane, int start_x,
+                            int start_y, TransformSize tx_size,
+                            TransformType tx_type, int non_zero_coeff_count) {
+  // Reconstruction process. Steps 2 and 3 of Section 7.12.3 in the spec.
+  assert(non_zero_coeff_count >= 0);
+  if (non_zero_coeff_count == 0) return;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  if (sequence_header_.color_config.bitdepth > 8) {
+    Array2DView<uint16_t> buffer(
+        buffer_[plane].rows(), buffer_[plane].columns() / sizeof(uint16_t),
+        reinterpret_cast<uint16_t*>(&buffer_[plane][0][0]));
+    Reconstruct(dsp_, tx_type, tx_size,
+                frame_header_.segmentation
+                    .lossless[block.bp->prediction_parameters->segment_id],
+                reinterpret_cast<int32_t*>(*block.residual), start_x, start_y,
+                &buffer, non_zero_coeff_count);
+  } else  // NOLINT
+#endif
+  {
+    Reconstruct(dsp_, tx_type, tx_size,
+                frame_header_.segmentation
+                    .lossless[block.bp->prediction_parameters->segment_id],
+                reinterpret_cast<int16_t*>(*block.residual), start_x, start_y,
+                &buffer_[plane], non_zero_coeff_count);
+  }
+  if (split_parse_and_decode_) {
+    *block.residual +=
+        kTransformWidth[tx_size] * kTransformHeight[tx_size] * residual_size_;
+  }
+}
+
+bool Tile::Residual(const Block& block, ProcessingMode mode) {
+  const int width_chunks = std::max(1, block.width >> 6);
+  const int height_chunks = std::max(1, block.height >> 6);
+  const BlockSize size_chunk4x4 =
+      (width_chunks > 1 || height_chunks > 1) ? kBlock64x64 : block.size;
+  const BlockParameters& bp = *block.bp;
+  for (int chunk_y = 0; chunk_y < height_chunks; ++chunk_y) {
+    for (int chunk_x = 0; chunk_x < width_chunks; ++chunk_x) {
+      const int num_planes = block.HasChroma() ? PlaneCount() : 1;
+      int plane = kPlaneY;
+      do {
+        const int subsampling_x = subsampling_x_[plane];
+        const int subsampling_y = subsampling_y_[plane];
+        // For Y Plane, when lossless is true |bp.transform_size| is always
+        // kTransformSize4x4. So we can simply use |bp.transform_size| here as
+        // the Y plane's transform size (part of Section 5.11.37 in the spec).
+        const TransformSize tx_size =
+            (plane == kPlaneY)
+                ? inter_transform_sizes_[block.row4x4][block.column4x4]
+                : bp.uv_transform_size;
+        const BlockSize plane_size =
+            kPlaneResidualSize[size_chunk4x4][subsampling_x][subsampling_y];
+        assert(plane_size != kBlockInvalid);
+        if (bp.is_inter &&
+            !frame_header_.segmentation
+                 .lossless[bp.prediction_parameters->segment_id] &&
+            plane == kPlaneY) {
+          const int row_chunk4x4 = block.row4x4 + MultiplyBy16(chunk_y);
+          const int column_chunk4x4 = block.column4x4 + MultiplyBy16(chunk_x);
+          const int base_x = MultiplyBy4(column_chunk4x4 >> subsampling_x);
+          const int base_y = MultiplyBy4(row_chunk4x4 >> subsampling_y);
+          if (!TransformTree(block, base_x, base_y, plane_size, mode)) {
+            return false;
+          }
+        } else {
+          const int base_x = MultiplyBy4(block.column4x4 >> subsampling_x);
+          const int base_y = MultiplyBy4(block.row4x4 >> subsampling_y);
+          const int step_x = kTransformWidth4x4[tx_size];
+          const int step_y = kTransformHeight4x4[tx_size];
+          const int num4x4_wide = kNum4x4BlocksWide[plane_size];
+          const int num4x4_high = kNum4x4BlocksHigh[plane_size];
+          for (int y = 0; y < num4x4_high; y += step_y) {
+            for (int x = 0; x < num4x4_wide; x += step_x) {
+              if (!TransformBlock(
+                      block, static_cast<Plane>(plane), base_x, base_y, tx_size,
+                      x + (MultiplyBy16(chunk_x) >> subsampling_x),
+                      y + (MultiplyBy16(chunk_y) >> subsampling_y), mode)) {
+                return false;
+              }
+            }
+          }
+        }
+      } while (++plane < num_planes);
+    }
+  }
+  return true;
+}
+
+// The purpose of this function is to limit the maximum size of motion vectors
+// and also, if use_intra_block_copy is true, to additionally constrain the
+// motion vector so that the data is fetched from parts of the tile that have
+// already been decoded and are not too close to the current block (in order to
+// make a pipelined decoder implementation feasible).
+bool Tile::IsMvValid(const Block& block, bool is_compound) const {
+  const BlockParameters& bp = *block.bp;
+  for (int i = 0; i < 1 + static_cast<int>(is_compound); ++i) {
+    for (int mv_component : bp.mv.mv[i].mv) {
+      if (std::abs(mv_component) >= (1 << 14)) {
+        return false;
+      }
+    }
+  }
+  if (!block.bp->prediction_parameters->use_intra_block_copy) {
+    return true;
+  }
+  if ((bp.mv.mv[0].mv32 & 0x00070007) != 0) {
+    return false;
+  }
+  const int delta_row = bp.mv.mv[0].mv[0] >> 3;
+  const int delta_column = bp.mv.mv[0].mv[1] >> 3;
+  int src_top_edge = MultiplyBy4(block.row4x4) + delta_row;
+  int src_left_edge = MultiplyBy4(block.column4x4) + delta_column;
+  const int src_bottom_edge = src_top_edge + block.height;
+  const int src_right_edge = src_left_edge + block.width;
+  if (block.HasChroma()) {
+    if (block.width < 8 && subsampling_x_[kPlaneU] != 0) {
+      src_left_edge -= 4;
+    }
+    if (block.height < 8 && subsampling_y_[kPlaneU] != 0) {
+      src_top_edge -= 4;
+    }
+  }
+  if (src_top_edge < MultiplyBy4(row4x4_start_) ||
+      src_left_edge < MultiplyBy4(column4x4_start_) ||
+      src_bottom_edge > MultiplyBy4(row4x4_end_) ||
+      src_right_edge > MultiplyBy4(column4x4_end_)) {
+    return false;
+  }
+  // sb_height_log2 = use_128x128_superblock ? log2(128) : log2(64)
+  const int sb_height_log2 =
+      6 + static_cast<int>(sequence_header_.use_128x128_superblock);
+  const int active_sb_row = MultiplyBy4(block.row4x4) >> sb_height_log2;
+  const int active_64x64_block_column = MultiplyBy4(block.column4x4) >> 6;
+  const int src_sb_row = (src_bottom_edge - 1) >> sb_height_log2;
+  const int src_64x64_block_column = (src_right_edge - 1) >> 6;
+  const int total_64x64_blocks_per_row =
+      ((column4x4_end_ - column4x4_start_ - 1) >> 4) + 1;
+  const int active_64x64_block =
+      active_sb_row * total_64x64_blocks_per_row + active_64x64_block_column;
+  const int src_64x64_block =
+      src_sb_row * total_64x64_blocks_per_row + src_64x64_block_column;
+  if (src_64x64_block >= active_64x64_block - kIntraBlockCopyDelay64x64Blocks) {
+    return false;
+  }
+
+  // Wavefront constraint: use only top left area of frame for reference.
+  if (src_sb_row > active_sb_row) return false;
+  const int gradient =
+      1 + kIntraBlockCopyDelay64x64Blocks +
+      static_cast<int>(sequence_header_.use_128x128_superblock);
+  const int wavefront_offset = gradient * (active_sb_row - src_sb_row);
+  return src_64x64_block_column < active_64x64_block_column -
+                                      kIntraBlockCopyDelay64x64Blocks +
+                                      wavefront_offset;
+}
+
+bool Tile::AssignInterMv(const Block& block, bool is_compound) {
+  int min[2];
+  int max[2];
+  GetClampParameters(block, min, max);
+  BlockParameters& bp = *block.bp;
+  const PredictionParameters& prediction_parameters = *bp.prediction_parameters;
+  bp.mv.mv64 = 0;
+  if (is_compound) {
+    for (int i = 0; i < 2; ++i) {
+      const PredictionMode mode = GetSinglePredictionMode(i, bp.y_mode);
+      MotionVector predicted_mv;
+      if (mode == kPredictionModeGlobalMv) {
+        predicted_mv = prediction_parameters.global_mv[i];
+      } else {
+        const int ref_mv_index = (mode == kPredictionModeNearestMv ||
+                                  (mode == kPredictionModeNewMv &&
+                                   prediction_parameters.ref_mv_count <= 1))
+                                     ? 0
+                                     : prediction_parameters.ref_mv_index;
+        predicted_mv = prediction_parameters.reference_mv(ref_mv_index, i);
+        if (ref_mv_index < prediction_parameters.ref_mv_count) {
+          predicted_mv.mv[0] = Clip3(predicted_mv.mv[0], min[0], max[0]);
+          predicted_mv.mv[1] = Clip3(predicted_mv.mv[1], min[1], max[1]);
+        }
+      }
+      if (mode == kPredictionModeNewMv) {
+        ReadMotionVector(block, i);
+        bp.mv.mv[i].mv[0] += predicted_mv.mv[0];
+        bp.mv.mv[i].mv[1] += predicted_mv.mv[1];
+      } else {
+        bp.mv.mv[i] = predicted_mv;
+      }
+    }
+  } else {
+    const PredictionMode mode = GetSinglePredictionMode(0, bp.y_mode);
+    MotionVector predicted_mv;
+    if (mode == kPredictionModeGlobalMv) {
+      predicted_mv = prediction_parameters.global_mv[0];
+    } else {
+      const int ref_mv_index = (mode == kPredictionModeNearestMv ||
+                                (mode == kPredictionModeNewMv &&
+                                 prediction_parameters.ref_mv_count <= 1))
+                                   ? 0
+                                   : prediction_parameters.ref_mv_index;
+      predicted_mv = prediction_parameters.reference_mv(ref_mv_index);
+      if (ref_mv_index < prediction_parameters.ref_mv_count) {
+        predicted_mv.mv[0] = Clip3(predicted_mv.mv[0], min[0], max[0]);
+        predicted_mv.mv[1] = Clip3(predicted_mv.mv[1], min[1], max[1]);
+      }
+    }
+    if (mode == kPredictionModeNewMv) {
+      ReadMotionVector(block, 0);
+      bp.mv.mv[0].mv[0] += predicted_mv.mv[0];
+      bp.mv.mv[0].mv[1] += predicted_mv.mv[1];
+    } else {
+      bp.mv.mv[0] = predicted_mv;
+    }
+  }
+  return IsMvValid(block, is_compound);
+}
+
+bool Tile::AssignIntraMv(const Block& block) {
+  // TODO(linfengz): Check if the clamping process is necessary.
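+  // For example: with 128x128 superblocks, super_block_size4x4 is 32, so when
+  // both candidate reference MVs are zero the fallback below points one
+  // superblock up (mv[0] -= 32 * 32 in 1/8th pel units), or, for blocks near
+  // the top of the tile, one superblock plus the intra block copy delay to
+  // the left (mv[1] -= 32 * 32 + 8 * kIntraBlockCopyDelayPixels).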
+  int min[2];
+  int max[2];
+  GetClampParameters(block, min, max);
+  BlockParameters& bp = *block.bp;
+  const PredictionParameters& prediction_parameters = *bp.prediction_parameters;
+  const MotionVector& ref_mv_0 = prediction_parameters.reference_mv(0);
+  bp.mv.mv64 = 0;
+  ReadMotionVector(block, 0);
+  if (ref_mv_0.mv32 == 0) {
+    const MotionVector& ref_mv_1 = prediction_parameters.reference_mv(1);
+    if (ref_mv_1.mv32 == 0) {
+      const int super_block_size4x4 = kNum4x4BlocksHigh[SuperBlockSize()];
+      if (block.row4x4 - super_block_size4x4 < row4x4_start_) {
+        bp.mv.mv[0].mv[1] -= MultiplyBy32(super_block_size4x4);
+        bp.mv.mv[0].mv[1] -= MultiplyBy8(kIntraBlockCopyDelayPixels);
+      } else {
+        bp.mv.mv[0].mv[0] -= MultiplyBy32(super_block_size4x4);
+      }
+    } else {
+      bp.mv.mv[0].mv[0] += Clip3(ref_mv_1.mv[0], min[0], max[0]);
+      bp.mv.mv[0].mv[1] += Clip3(ref_mv_1.mv[1], min[1], max[1]);
+    }
+  } else {
+    bp.mv.mv[0].mv[0] += Clip3(ref_mv_0.mv[0], min[0], max[0]);
+    bp.mv.mv[0].mv[1] += Clip3(ref_mv_0.mv[1], min[1], max[1]);
+  }
+  return IsMvValid(block, /*is_compound=*/false);
+}
+
+void Tile::ResetEntropyContext(const Block& block) {
+  const int num_planes = block.HasChroma() ? PlaneCount() : 1;
+  int plane = kPlaneY;
+  do {
+    const int subsampling_x = subsampling_x_[plane];
+    const int start_x = block.column4x4 >> subsampling_x;
+    const int end_x =
+        std::min((block.column4x4 + block.width4x4) >> subsampling_x,
+                 frame_header_.columns4x4);
+    memset(&coefficient_levels_[kEntropyContextTop][plane][start_x], 0,
+           end_x - start_x);
+    memset(&dc_categories_[kEntropyContextTop][plane][start_x], 0,
+           end_x - start_x);
+    const int subsampling_y = subsampling_y_[plane];
+    const int start_y = block.row4x4 >> subsampling_y;
+    const int end_y =
+        std::min((block.row4x4 + block.height4x4) >> subsampling_y,
+                 frame_header_.rows4x4);
+    memset(&coefficient_levels_[kEntropyContextLeft][plane][start_y], 0,
+           end_y - start_y);
+    memset(&dc_categories_[kEntropyContextLeft][plane][start_y], 0,
+           end_y - start_y);
+  } while (++plane < num_planes);
+}
+
+bool Tile::ComputePrediction(const Block& block) {
+  const BlockParameters& bp = *block.bp;
+  if (!bp.is_inter) return true;
+  const int mask =
+      (1 << (4 + static_cast<int>(sequence_header_.use_128x128_superblock))) -
+      1;
+  const int sub_block_row4x4 = block.row4x4 & mask;
+  const int sub_block_column4x4 = block.column4x4 & mask;
+  const int plane_count = block.HasChroma() ? PlaneCount() : 1;
+  // Returns true if this block applies local warping. The state is determined
+  // in the Y plane and carried for use in the U/V planes.
+  // But the U/V planes will not apply warping when the block size is smaller
+  // than 8x8, even if this variable is true.
+  bool is_local_valid = false;
+  // Local warping parameters, similar usage as is_local_valid.
+  GlobalMotion local_warp_params;
+  int plane = kPlaneY;
+  do {
+    const int8_t subsampling_x = subsampling_x_[plane];
+    const int8_t subsampling_y = subsampling_y_[plane];
+    const BlockSize plane_size = block.residual_size[plane];
+    const int block_width4x4 = kNum4x4BlocksWide[plane_size];
+    const int block_height4x4 = kNum4x4BlocksHigh[plane_size];
+    const int block_width = MultiplyBy4(block_width4x4);
+    const int block_height = MultiplyBy4(block_height4x4);
+    const int base_x = MultiplyBy4(block.column4x4 >> subsampling_x);
+    const int base_y = MultiplyBy4(block.row4x4 >> subsampling_y);
+    if (bp.reference_frame[1] == kReferenceFrameIntra) {
+      const int tr_row4x4 = sub_block_row4x4 >> subsampling_y;
+      const int tr_column4x4 =
+          (sub_block_column4x4 >> subsampling_x) + block_width4x4 + 1;
+      const int bl_row4x4 =
+          (sub_block_row4x4 >> subsampling_y) + block_height4x4;
+      const int bl_column4x4 = (sub_block_column4x4 >> subsampling_x) + 1;
+      const TransformSize tx_size =
+          k4x4SizeToTransformSize[k4x4WidthLog2[plane_size]]
+                                 [k4x4HeightLog2[plane_size]];
+      const bool has_left = block.left_available[plane];
+      const bool has_top = block.top_available[plane];
+      CALL_BITDEPTH_FUNCTION(
+          IntraPrediction, block, static_cast<Plane>(plane), base_x, base_y,
+          has_left, has_top,
+          block.scratch_buffer->block_decoded[plane][tr_row4x4][tr_column4x4],
+          block.scratch_buffer->block_decoded[plane][bl_row4x4][bl_column4x4],
+          kInterIntraToIntraMode[block.bp->prediction_parameters
+                                     ->inter_intra_mode],
+          tx_size);
+    }
+    int candidate_row = block.row4x4;
+    int candidate_column = block.column4x4;
+    bool some_use_intra = bp.reference_frame[0] == kReferenceFrameIntra;
+    if (!some_use_intra && plane != 0) {
+      candidate_row = (candidate_row >> subsampling_y) << subsampling_y;
+      candidate_column = (candidate_column >> subsampling_x) << subsampling_x;
+      if (candidate_row != block.row4x4) {
+        // Top block.
+        const BlockParameters& bp_top =
+            *block_parameters_holder_.Find(candidate_row, block.column4x4);
+        some_use_intra = bp_top.reference_frame[0] == kReferenceFrameIntra;
+        if (!some_use_intra && candidate_column != block.column4x4) {
+          // Top-left block.
+          const BlockParameters& bp_top_left =
+              *block_parameters_holder_.Find(candidate_row, candidate_column);
+          some_use_intra =
+              bp_top_left.reference_frame[0] == kReferenceFrameIntra;
+        }
+      }
+      if (!some_use_intra && candidate_column != block.column4x4) {
+        // Left block.
+        const BlockParameters& bp_left =
+            *block_parameters_holder_.Find(block.row4x4, candidate_column);
+        some_use_intra = bp_left.reference_frame[0] == kReferenceFrameIntra;
+      }
+    }
+    int prediction_width;
+    int prediction_height;
+    if (some_use_intra) {
+      candidate_row = block.row4x4;
+      candidate_column = block.column4x4;
+      prediction_width = block_width;
+      prediction_height = block_height;
+    } else {
+      prediction_width = block.width >> subsampling_x;
+      prediction_height = block.height >> subsampling_y;
+    }
+    int r = 0;
+    int y = 0;
+    do {
+      int c = 0;
+      int x = 0;
+      do {
+        if (!InterPrediction(block, static_cast<Plane>(plane), base_x + x,
+                             base_y + y, prediction_width, prediction_height,
+                             candidate_row + r, candidate_column + c,
+                             &is_local_valid, &local_warp_params)) {
+          return false;
+        }
+        ++c;
+        x += prediction_width;
+      } while (x < block_width);
+      ++r;
+      y += prediction_height;
+    } while (y < block_height);
+  } while (++plane < plane_count);
+  return true;
+}
+
+#undef CALL_BITDEPTH_FUNCTION
+
+void Tile::PopulateDeblockFilterLevel(const Block& block) {
+  if (!post_filter_.DoDeblock()) return;
+  BlockParameters& bp = *block.bp;
+  const int mode_id =
+      static_cast<int>(kPredictionModeDeltasMask.Contains(bp.y_mode));
+  for (int i = 0; i < kFrameLfCount; ++i) {
+    if (delta_lf_all_zero_) {
+      bp.deblock_filter_level[i] = post_filter_.GetZeroDeltaDeblockFilterLevel(
+          bp.prediction_parameters->segment_id, i, bp.reference_frame[0],
+          mode_id);
+    } else {
+      bp.deblock_filter_level[i] =
+          deblock_filter_levels_[bp.prediction_parameters->segment_id][i]
+                                [bp.reference_frame[0]][mode_id];
+    }
+  }
+}
+
+void Tile::PopulateCdefSkip(const Block& block) {
+  if (!post_filter_.DoCdef() || block.bp->skip ||
+      (frame_header_.cdef.bits > 0 &&
+       cdef_index_[DivideBy16(block.row4x4)][DivideBy16(block.column4x4)] ==
+           -1)) {
+    return;
+  }
+  // The rest of this function is an efficient version of the following code:
+  // for (int y = block.row4x4; y < block.row4x4 + block.height4x4; y++) {
+  //   for (int x = block.column4x4; x < block.column4x4 + block.width4x4;
+  //        x++) {
+  //     const uint8_t mask = uint8_t{1} << ((x >> 1) & 0x7);
+  //     cdef_skip_[y >> 1][x >> 4] |= mask;
+  //   }
+  // }
+
+  // For all block widths other than 32, the mask will fit in uint8_t. For
+  // block width == 32, the mask is always 0xFFFF.
+  const int bw4 =
+      std::max(DivideBy2(block.width4x4) + (block.column4x4 & 1), 1);
+  const uint8_t mask = (block.width4x4 == 32)
+                           ? 0xFF
+                           : (uint8_t{0xFF} >> (8 - bw4))
+                                 << (DivideBy2(block.column4x4) & 0x7);
+  uint8_t* cdef_skip = &cdef_skip_[block.row4x4 >> 1][block.column4x4 >> 4];
+  const int stride = cdef_skip_.columns();
+  int row = 0;
+  do {
+    *cdef_skip |= mask;
+    if (block.width4x4 == 32) {
+      *(cdef_skip + 1) = 0xFF;
+    }
+    cdef_skip += stride;
+    row += 2;
+  } while (row < block.height4x4);
+}
+
+bool Tile::ProcessBlock(int row4x4, int column4x4, BlockSize block_size,
+                        TileScratchBuffer* const scratch_buffer,
+                        ResidualPtr* residual) {
+  // Do not process the block if the starting point is beyond the visible frame.
+  // This is equivalent to the has_row/has_column check in the
+  // decode_partition() section of the spec when partition equals
+  // kPartitionHorizontal or kPartitionVertical.
+  if (row4x4 >= frame_header_.rows4x4 ||
+      column4x4 >= frame_header_.columns4x4) {
+    return true;
+  }
+
+  if (split_parse_and_decode_) {
+    // Push block ordering info to the queue. DecodeBlock() will use this queue
+    // to decode the blocks in the correct order.
+    const int sb_row_index = SuperBlockRowIndex(row4x4);
+    const int sb_column_index = SuperBlockColumnIndex(column4x4);
+    residual_buffer_threaded_[sb_row_index][sb_column_index]
+        ->partition_tree_order()
+        ->Push(PartitionTreeNode(row4x4, column4x4, block_size));
+  }
+
+  BlockParameters* bp_ptr =
+      block_parameters_holder_.Get(row4x4, column4x4, block_size);
+  if (bp_ptr == nullptr) {
+    LIBGAV1_DLOG(ERROR, "Failed to get BlockParameters.");
+    return false;
+  }
+  BlockParameters& bp = *bp_ptr;
+  Block block(this, block_size, row4x4, column4x4, scratch_buffer, residual);
+  bp.size = block_size;
+  bp.prediction_parameters =
+      split_parse_and_decode_ ? std::unique_ptr<PredictionParameters>(
+                                    new (std::nothrow) PredictionParameters())
+                              : std::move(prediction_parameters_);
+  if (bp.prediction_parameters == nullptr) return false;
+  if (!DecodeModeInfo(block)) return false;
+  PopulateDeblockFilterLevel(block);
+  if (!ReadPaletteTokens(block)) return false;
+  DecodeTransformSize(block);
+  // Part of Section 5.11.37 in the spec (implemented as a simple lookup).
+  bp.uv_transform_size =
+      frame_header_.segmentation.lossless[bp.prediction_parameters->segment_id]
+          ? kTransformSize4x4
+          : kUVTransformSize[block.residual_size[kPlaneU]];
+  if (bp.skip) ResetEntropyContext(block);
+  PopulateCdefSkip(block);
+  if (split_parse_and_decode_) {
+    if (!Residual(block, kProcessingModeParseOnly)) return false;
+  } else {
+    if (!ComputePrediction(block) ||
+        !Residual(block, kProcessingModeParseAndDecode)) {
+      return false;
+    }
+  }
+  // If frame_header_.segmentation.enabled is false,
+  // bp.prediction_parameters->segment_id is 0 for all blocks. We don't need to
+  // save bp.prediction_parameters->segment_id in the current frame because
+  // the current frame's segmentation map will be cleared to all 0s.
+  //
+  // If frame_header_.segmentation.enabled is true and
+  // frame_header_.segmentation.update_map is false, we will copy the previous
+  // frame's segmentation map to the current frame. So we don't need to
+  // save bp.prediction_parameters->segment_id in the current frame.
+  if (frame_header_.segmentation.enabled &&
+      frame_header_.segmentation.update_map) {
+    const int x_limit = std::min(frame_header_.columns4x4 - column4x4,
+                                 static_cast<int>(block.width4x4));
+    const int y_limit = std::min(frame_header_.rows4x4 - row4x4,
+                                 static_cast<int>(block.height4x4));
+    current_frame_.segmentation_map()->FillBlock(
+        row4x4, column4x4, x_limit, y_limit,
+        bp.prediction_parameters->segment_id);
+  }
+  StoreMotionFieldMvsIntoCurrentFrame(block);
+  if (!split_parse_and_decode_) {
+    prediction_parameters_ = std::move(bp.prediction_parameters);
+  }
+  return true;
+}
+
+bool Tile::DecodeBlock(int row4x4, int column4x4, BlockSize block_size,
+                       TileScratchBuffer* const scratch_buffer,
+                       ResidualPtr* residual) {
+  if (row4x4 >= frame_header_.rows4x4 ||
+      column4x4 >= frame_header_.columns4x4) {
+    return true;
+  }
+  Block block(this, block_size, row4x4, column4x4, scratch_buffer, residual);
+  if (!ComputePrediction(block) ||
+      !Residual(block, kProcessingModeDecodeOnly)) {
+    return false;
+  }
+  block.bp->prediction_parameters.reset(nullptr);
+  return true;
+}
+
+bool Tile::ProcessPartition(int row4x4_start, int column4x4_start,
+                            TileScratchBuffer* const scratch_buffer,
+                            ResidualPtr* residual) {
+  Stack<PartitionTreeNode, kDfsStackSize> stack;
+
+  // Set up the first iteration.
+  stack.Push(
+      PartitionTreeNode(row4x4_start, column4x4_start, SuperBlockSize()));
+
+  // DFS loop. If it sees a terminal node (leaf node), ProcessBlock is invoked.
+ // Otherwise, the children are pushed into the stack for future processing. + do { + PartitionTreeNode node = stack.Pop(); + int row4x4 = node.row4x4; + int column4x4 = node.column4x4; + BlockSize block_size = node.block_size; + + if (row4x4 >= frame_header_.rows4x4 || + column4x4 >= frame_header_.columns4x4) { + continue; + } + const int block_width4x4 = kNum4x4BlocksWide[block_size]; + assert(block_width4x4 == kNum4x4BlocksHigh[block_size]); + const int half_block4x4 = block_width4x4 >> 1; + const bool has_rows = (row4x4 + half_block4x4) < frame_header_.rows4x4; + const bool has_columns = + (column4x4 + half_block4x4) < frame_header_.columns4x4; + Partition partition; + if (!ReadPartition(row4x4, column4x4, block_size, has_rows, has_columns, + &partition)) { + LIBGAV1_DLOG(ERROR, "Failed to read partition for row: %d column: %d", + row4x4, column4x4); + return false; + } + const BlockSize sub_size = kSubSize[partition][block_size]; + // Section 6.10.4: It is a requirement of bitstream conformance that + // get_plane_residual_size( subSize, 1 ) is not equal to BLOCK_INVALID + // every time subSize is computed. + if (sub_size == kBlockInvalid || + kPlaneResidualSize[sub_size] + [sequence_header_.color_config.subsampling_x] + [sequence_header_.color_config.subsampling_y] == + kBlockInvalid) { + LIBGAV1_DLOG( + ERROR, + "Invalid sub-block/plane size for row: %d column: %d partition: " + "%d block_size: %d sub_size: %d subsampling_x/y: %d, %d", + row4x4, column4x4, partition, block_size, sub_size, + sequence_header_.color_config.subsampling_x, + sequence_header_.color_config.subsampling_y); + return false; + } + + const int quarter_block4x4 = half_block4x4 >> 1; + const BlockSize split_size = kSubSize[kPartitionSplit][block_size]; + assert(partition == kPartitionNone || sub_size != kBlockInvalid); + switch (partition) { + case kPartitionNone: + if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer, + residual)) { + return false; + } + break; + case kPartitionSplit: + // The children must be added in reverse order since a stack is being + // used. 
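+        // For example: the pushes below enter the stack in SE, SW, NE, NW
+        // order, so the nodes pop (and are processed) in NW, NE, SW, SE
+        // order, matching the spec's decode order for kPartitionSplit.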
+ stack.Push(PartitionTreeNode(row4x4 + half_block4x4, + column4x4 + half_block4x4, sub_size)); + stack.Push( + PartitionTreeNode(row4x4 + half_block4x4, column4x4, sub_size)); + stack.Push( + PartitionTreeNode(row4x4, column4x4 + half_block4x4, sub_size)); + stack.Push(PartitionTreeNode(row4x4, column4x4, sub_size)); + break; + case kPartitionHorizontal: + if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer, + residual) || + !ProcessBlock(row4x4 + half_block4x4, column4x4, sub_size, + scratch_buffer, residual)) { + return false; + } + break; + case kPartitionVertical: + if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer, + residual) || + !ProcessBlock(row4x4, column4x4 + half_block4x4, sub_size, + scratch_buffer, residual)) { + return false; + } + break; + case kPartitionHorizontalWithTopSplit: + if (!ProcessBlock(row4x4, column4x4, split_size, scratch_buffer, + residual) || + !ProcessBlock(row4x4, column4x4 + half_block4x4, split_size, + scratch_buffer, residual) || + !ProcessBlock(row4x4 + half_block4x4, column4x4, sub_size, + scratch_buffer, residual)) { + return false; + } + break; + case kPartitionHorizontalWithBottomSplit: + if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer, + residual) || + !ProcessBlock(row4x4 + half_block4x4, column4x4, split_size, + scratch_buffer, residual) || + !ProcessBlock(row4x4 + half_block4x4, column4x4 + half_block4x4, + split_size, scratch_buffer, residual)) { + return false; + } + break; + case kPartitionVerticalWithLeftSplit: + if (!ProcessBlock(row4x4, column4x4, split_size, scratch_buffer, + residual) || + !ProcessBlock(row4x4 + half_block4x4, column4x4, split_size, + scratch_buffer, residual) || + !ProcessBlock(row4x4, column4x4 + half_block4x4, sub_size, + scratch_buffer, residual)) { + return false; + } + break; + case kPartitionVerticalWithRightSplit: + if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer, + residual) || + !ProcessBlock(row4x4, column4x4 + half_block4x4, split_size, + scratch_buffer, residual) || + !ProcessBlock(row4x4 + half_block4x4, column4x4 + half_block4x4, + split_size, scratch_buffer, residual)) { + return false; + } + break; + case kPartitionHorizontal4: + for (int i = 0; i < 4; ++i) { + if (!ProcessBlock(row4x4 + i * quarter_block4x4, column4x4, sub_size, + scratch_buffer, residual)) { + return false; + } + } + break; + case kPartitionVertical4: + for (int i = 0; i < 4; ++i) { + if (!ProcessBlock(row4x4, column4x4 + i * quarter_block4x4, sub_size, + scratch_buffer, residual)) { + return false; + } + } + break; + } + } while (!stack.Empty()); + return true; +} + +void Tile::ResetLoopRestorationParams() { + for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) { + for (int i = WienerInfo::kVertical; i <= WienerInfo::kHorizontal; ++i) { + reference_unit_info_[plane].sgr_proj_info.multiplier[i] = + kSgrProjDefaultMultiplier[i]; + for (int j = 0; j < kNumWienerCoefficients; ++j) { + reference_unit_info_[plane].wiener_info.filter[i][j] = + kWienerDefaultFilter[j]; + } + } + } +} + +void Tile::ResetCdef(const int row4x4, const int column4x4) { + if (frame_header_.cdef.bits == 0) return; + const int row = DivideBy16(row4x4); + const int column = DivideBy16(column4x4); + cdef_index_[row][column] = -1; + if (sequence_header_.use_128x128_superblock) { + const int cdef_size4x4 = kNum4x4BlocksWide[kBlock64x64]; + const int border_row = DivideBy16(row4x4 + cdef_size4x4); + const int border_column = DivideBy16(column4x4 + cdef_size4x4); + cdef_index_[row][border_column] = -1; + 
cdef_index_[border_row][column] = -1; + cdef_index_[border_row][border_column] = -1; + } +} + +void Tile::ClearBlockDecoded(TileScratchBuffer* const scratch_buffer, + int row4x4, int column4x4) { + // Set everything to false. + memset(scratch_buffer->block_decoded, 0, + sizeof(scratch_buffer->block_decoded)); + // Set specific edge cases to true. + const int sb_size4 = sequence_header_.use_128x128_superblock ? 32 : 16; + for (int plane = kPlaneY; plane < PlaneCount(); ++plane) { + const int subsampling_x = subsampling_x_[plane]; + const int subsampling_y = subsampling_y_[plane]; + const int sb_width4 = (column4x4_end_ - column4x4) >> subsampling_x; + const int sb_height4 = (row4x4_end_ - row4x4) >> subsampling_y; + // The memset is equivalent to the following lines in the spec: + // for ( x = -1; x <= ( sbSize4 >> subX ); x++ ) { + // if ( y < 0 && x < sbWidth4 ) { + // BlockDecoded[plane][y][x] = 1 + // } + // } + const int num_elements = + std::min((sb_size4 >> subsampling_x_[plane]) + 1, sb_width4) + 1; + memset(&scratch_buffer->block_decoded[plane][0][0], 1, num_elements); + // The for loop is equivalent to the following lines in the spec: + // for ( y = -1; y <= ( sbSize4 >> subY ); y++ ) + // if ( x < 0 && y < sbHeight4 ) + // BlockDecoded[plane][y][x] = 1 + // } + // } + // BlockDecoded[plane][sbSize4 >> subY][-1] = 0 + for (int y = -1; y < std::min((sb_size4 >> subsampling_y), sb_height4); + ++y) { + scratch_buffer->block_decoded[plane][y + 1][0] = true; + } + } +} + +bool Tile::ProcessSuperBlock(int row4x4, int column4x4, + TileScratchBuffer* const scratch_buffer, + ProcessingMode mode) { + const bool parsing = + mode == kProcessingModeParseOnly || mode == kProcessingModeParseAndDecode; + const bool decoding = mode == kProcessingModeDecodeOnly || + mode == kProcessingModeParseAndDecode; + if (parsing) { + read_deltas_ = frame_header_.delta_q.present; + ResetCdef(row4x4, column4x4); + } + if (decoding) { + ClearBlockDecoded(scratch_buffer, row4x4, column4x4); + } + const BlockSize block_size = SuperBlockSize(); + if (parsing) { + ReadLoopRestorationCoefficients(row4x4, column4x4, block_size); + } + if (parsing && decoding) { + uint8_t* residual_buffer = residual_buffer_.get(); + if (!ProcessPartition(row4x4, column4x4, scratch_buffer, + &residual_buffer)) { + LIBGAV1_DLOG(ERROR, "Error decoding partition row: %d column: %d", row4x4, + column4x4); + return false; + } + return true; + } + const int sb_row_index = SuperBlockRowIndex(row4x4); + const int sb_column_index = SuperBlockColumnIndex(column4x4); + if (parsing) { + residual_buffer_threaded_[sb_row_index][sb_column_index] = + residual_buffer_pool_->Get(); + if (residual_buffer_threaded_[sb_row_index][sb_column_index] == nullptr) { + LIBGAV1_DLOG(ERROR, "Failed to get residual buffer."); + return false; + } + uint8_t* residual_buffer = + residual_buffer_threaded_[sb_row_index][sb_column_index]->buffer(); + if (!ProcessPartition(row4x4, column4x4, scratch_buffer, + &residual_buffer)) { + LIBGAV1_DLOG(ERROR, "Error parsing partition row: %d column: %d", row4x4, + column4x4); + return false; + } + } else { + if (!DecodeSuperBlock(sb_row_index, sb_column_index, scratch_buffer)) { + LIBGAV1_DLOG(ERROR, "Error decoding superblock row: %d column: %d", + row4x4, column4x4); + return false; + } + residual_buffer_pool_->Release( + std::move(residual_buffer_threaded_[sb_row_index][sb_column_index])); + } + return true; +} + +bool Tile::DecodeSuperBlock(int sb_row_index, int sb_column_index, + TileScratchBuffer* const scratch_buffer) { + 
uint8_t* residual_buffer =
+      residual_buffer_threaded_[sb_row_index][sb_column_index]->buffer();
+  Queue<PartitionTreeNode>& partition_tree_order =
+      *residual_buffer_threaded_[sb_row_index][sb_column_index]
+           ->partition_tree_order();
+  while (!partition_tree_order.Empty()) {
+    PartitionTreeNode block = partition_tree_order.Front();
+    if (!DecodeBlock(block.row4x4, block.column4x4, block.block_size,
+                     scratch_buffer, &residual_buffer)) {
+      LIBGAV1_DLOG(ERROR, "Error decoding block row: %d column: %d",
+                   block.row4x4, block.column4x4);
+      return false;
+    }
+    partition_tree_order.Pop();
+  }
+  return true;
+}
+
+void Tile::ReadLoopRestorationCoefficients(int row4x4, int column4x4,
+                                           BlockSize block_size) {
+  if (frame_header_.allow_intrabc) return;
+  LoopRestorationInfo* const restoration_info = post_filter_.restoration_info();
+  const bool is_superres_scaled =
+      frame_header_.width != frame_header_.upscaled_width;
+  for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
+    LoopRestorationUnitInfo unit_info;
+    if (restoration_info->PopulateUnitInfoForSuperBlock(
+            static_cast<Plane>(plane), block_size, is_superres_scaled,
+            frame_header_.superres_scale_denominator, row4x4, column4x4,
+            &unit_info)) {
+      for (int unit_row = unit_info.row_start; unit_row < unit_info.row_end;
+           ++unit_row) {
+        for (int unit_column = unit_info.column_start;
+             unit_column < unit_info.column_end; ++unit_column) {
+          const int unit_id = unit_row * restoration_info->num_horizontal_units(
+                                             static_cast<Plane>(plane)) +
+                              unit_column;
+          restoration_info->ReadUnitCoefficients(
+              &reader_, &symbol_decoder_context_, static_cast<Plane>(plane),
+              unit_id, &reference_unit_info_);
+        }
+      }
+    }
+  }
+}
+
+void Tile::StoreMotionFieldMvsIntoCurrentFrame(const Block& block) {
+  if (frame_header_.refresh_frame_flags == 0 ||
+      IsIntraFrame(frame_header_.frame_type)) {
+    return;
+  }
+  // Iterate over odd rows/columns beginning at the first odd row/column for the
+  // block. It is done this way because motion field mvs are only needed at an
+  // 8x8 granularity.
+  const int row_start4x4 = block.row4x4 | 1;
+  const int row_limit4x4 =
+      std::min(block.row4x4 + block.height4x4, frame_header_.rows4x4);
+  if (row_start4x4 >= row_limit4x4) return;
+  const int column_start4x4 = block.column4x4 | 1;
+  const int column_limit4x4 =
+      std::min(block.column4x4 + block.width4x4, frame_header_.columns4x4);
+  if (column_start4x4 >= column_limit4x4) return;
+
+  // The largest reference MV component that can be saved.
+  constexpr int kRefMvsLimit = (1 << 12) - 1;
+  const BlockParameters& bp = *block.bp;
+  ReferenceInfo* reference_info = current_frame_.reference_info();
+  for (int i = 1; i >= 0; --i) {
+    const ReferenceFrameType reference_frame_to_store = bp.reference_frame[i];
+    // Must make a local copy so that StoreMotionFieldMvs() knows there is no
+    // overlap between load and store.
+    const MotionVector mv_to_store = bp.mv.mv[i];
+    const int mv_row = std::abs(mv_to_store.mv[0]);
+    const int mv_column = std::abs(mv_to_store.mv[1]);
+    if (reference_frame_to_store > kReferenceFrameIntra &&
+        // kRefMvsLimit equals 0x0FFF, so we can first bitwise OR the two
+        // absolute values and then compare with kRefMvsLimit to save a branch.
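+        // For example: mv_row = 0x0040 and mv_column = 0x1200 give
+        // (mv_row | mv_column) = 0x1240, which exceeds 0x0FFF, so the pair is
+        // rejected with a single compare, just as the two separate compares
+        // would be.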
+ // The next line is equivalent to: + // mv_row <= kRefMvsLimit && mv_column <= kRefMvsLimit + (mv_row | mv_column) <= kRefMvsLimit && + reference_info->relative_distance_from[reference_frame_to_store] < 0) { + const int row_start8x8 = DivideBy2(row_start4x4); + const int row_limit8x8 = DivideBy2(row_limit4x4); + const int column_start8x8 = DivideBy2(column_start4x4); + const int column_limit8x8 = DivideBy2(column_limit4x4); + const int rows = row_limit8x8 - row_start8x8; + const int columns = column_limit8x8 - column_start8x8; + const ptrdiff_t stride = DivideBy2(current_frame_.columns4x4()); + ReferenceFrameType* const reference_frame_row_start = + &reference_info + ->motion_field_reference_frame[row_start8x8][column_start8x8]; + MotionVector* const mv = + &reference_info->motion_field_mv[row_start8x8][column_start8x8]; + + // Specialize columns cases 1, 2, 4, 8 and 16. This makes memset() inlined + // and simplifies std::fill() for these cases. + if (columns <= 1) { + // Don't change the above condition to (columns == 1). + // Condition (columns <= 1) may help the compiler simplify the inlining + // of the general case of StoreMotionFieldMvs() by eliminating the + // (columns == 0) case. + assert(columns == 1); + StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows, + 1, reference_frame_row_start, mv); + } else if (columns == 2) { + StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows, + 2, reference_frame_row_start, mv); + } else if (columns == 4) { + StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows, + 4, reference_frame_row_start, mv); + } else if (columns == 8) { + StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows, + 8, reference_frame_row_start, mv); + } else if (columns == 16) { + StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows, + 16, reference_frame_row_start, mv); + } else if (columns < 16) { + // This always true condition (columns < 16) may help the compiler + // simplify the inlining of the following function. + // This general case is rare and usually only happens to the blocks + // which contain the right boundary of the frame. + StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows, + columns, reference_frame_row_start, mv); + } else { + assert(false); + } + return; + } + } +} + +} // namespace libgav1 diff --git a/src/tile_scratch_buffer.cc b/src/tile_scratch_buffer.cc new file mode 100644 index 0000000..0b5ac96 --- /dev/null +++ b/src/tile_scratch_buffer.cc @@ -0,0 +1,26 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "src/tile_scratch_buffer.h"
+
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+
+#if !LIBGAV1_CXX17
+// static
+constexpr int TileScratchBuffer::kBlockDecodedStride;
+#endif
+
+}  // namespace libgav1
diff --git a/src/tile_scratch_buffer.h b/src/tile_scratch_buffer.h
new file mode 100644
index 0000000..828f550
--- /dev/null
+++ b/src/tile_scratch_buffer.h
@@ -0,0 +1,173 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_TILE_SCRATCH_BUFFER_H_
+#define LIBGAV1_SRC_TILE_SCRATCH_BUFFER_H_
+
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <mutex>  // NOLINT (unapproved c++11 header)
+#include <new>
+#include <utility>
+
+#include "src/dsp/constants.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/stack.h"
+
+namespace libgav1 {
+
+// Buffer to facilitate decoding a superblock.
+struct TileScratchBuffer : public MaxAlignedAllocable {
+  static constexpr int kBlockDecodedStride = 34;
+
+  LIBGAV1_MUST_USE_RESULT bool Init(int bitdepth) {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    const int pixel_size = (bitdepth == 8) ? 1 : 2;
+#else
+    assert(bitdepth == 8);
+    static_cast<void>(bitdepth);
+    const int pixel_size = 1;
+#endif
+
+    static_assert(kConvolveScaleBorderRight >= kConvolveBorderRight, "");
+    constexpr int unaligned_convolve_buffer_stride =
+        kMaxScaledSuperBlockSizeInPixels + kConvolveBorderLeftTop +
+        kConvolveScaleBorderRight;
+    convolve_block_buffer_stride = Align(
+        unaligned_convolve_buffer_stride * pixel_size, kMaxAlignment);
+    constexpr int convolve_buffer_height = kMaxScaledSuperBlockSizeInPixels +
+                                           kConvolveBorderLeftTop +
+                                           kConvolveBorderBottom;
+
+    convolve_block_buffer = MakeAlignedUniquePtr<uint8_t>(
+        kMaxAlignment, convolve_buffer_height * convolve_block_buffer_stride);
+#if LIBGAV1_MSAN
+    // Quiet msan warnings in ConvolveScale2D_NEON(). Set with random non-zero
+    // value to aid in future debugging.
+    memset(convolve_block_buffer.get(), 0x66,
+           convolve_buffer_height * convolve_block_buffer_stride);
+#endif
+
+    return convolve_block_buffer != nullptr;
+  }
+
+  // kCompoundPredictionTypeDiffWeighted prediction mode needs a mask of the
+  // prediction block size. This buffer is used to store that mask. The masks
+  // will be created for the Y plane and will be re-used for the U & V planes.
+  alignas(kMaxAlignment) uint8_t weight_mask[kMaxSuperBlockSizeSquareInPixels];
+
+  // For each instance of the TileScratchBuffer, only one of the following
+  // buffers will be used at any given time, so it is ok to share them in a
+  // union.
+  union {
+    // Buffers used for prediction process.
+    // Compound prediction calculations always output 16-bit values. Depending
+    // on the bitdepth the values may be treated as int16_t or uint16_t. See
+    // src/dsp/convolve.cc and src/dsp/warp.cc for explanations.
+    // Inter/intra calculations output Pixel values.
+    // These buffers always use width as the stride. This enables packing the
+    // values in and simplifies loads/stores for small values.
+
+    // 10/12 bit compound prediction and 10/12 bit inter/intra prediction.
+    alignas(kMaxAlignment) uint16_t
+        prediction_buffer[2][kMaxSuperBlockSizeSquareInPixels];
+    // 8 bit compound prediction buffer.
+    alignas(kMaxAlignment) int16_t
+        compound_prediction_buffer_8bpp[2][kMaxSuperBlockSizeSquareInPixels];
+
+    // Union usage note: This is used only by functions in the "intra"
+    // prediction path.
+    //
+    // Buffer used for storing subsampled luma samples needed for CFL
+    // prediction. This buffer is used to avoid repetition of the subsampling
+    // for the V plane when it is already done for the U plane.
+    int16_t cfl_luma_buffer[kCflLumaBufferStride][kCflLumaBufferStride];
+  };
+
+  // Buffer used for convolve. The maximum size required for this buffer is:
+  //   maximum block height (with scaling and border) = 2 * 128 + 3 + 4 = 263.
+  //   maximum block stride (with scaling and border aligned to 16) =
+  //       (2 * 128 + 3 + 8 + 5) * pixel_size = 272 * pixel_size.
+  //   Where pixel_size is (bitdepth == 8) ? 1 : 2.
+  // Has an alignment of kMaxAlignment when allocated.
+  AlignedUniquePtr<uint8_t> convolve_block_buffer;
+  ptrdiff_t convolve_block_buffer_stride;
+
+  // Flag indicating whether the data in |cfl_luma_buffer| is valid.
+  bool cfl_luma_buffer_valid;
+
+  // Equivalent to BlockDecoded array in the spec. This stores the decoded
+  // state of every 4x4 block in a superblock. It has 1 row/column border on
+  // all 4 sides (hence the 34x34 dimension instead of 32x32). Note that the
+  // spec uses "-1" as an index to access the left and top borders. In the
+  // code, we treat the index (1, 1) as equivalent to the spec's (0, 0). So
+  // all accesses into this array will be offset by +1 when compared with the
+  // spec.
+  bool block_decoded[kMaxPlanes][kBlockDecodedStride][kBlockDecodedStride];
+};
+
+class TileScratchBufferPool {
+ public:
+  void Reset(int bitdepth) {
+    if (bitdepth_ == bitdepth) return;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    if (bitdepth_ == 8 && bitdepth != 8) {
+      // We are going from a pixel size of 1 to a pixel size of 2. So invalidate
+      // the stack.
+      std::lock_guard<std::mutex> lock(mutex_);
+      while (!buffers_.Empty()) {
+        buffers_.Pop();
+      }
+    }
+#endif
+    bitdepth_ = bitdepth;
+  }
+
+  std::unique_ptr<TileScratchBuffer> Get() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (buffers_.Empty()) {
+      std::unique_ptr<TileScratchBuffer> scratch_buffer(new (std::nothrow)
+                                                            TileScratchBuffer);
+      if (scratch_buffer == nullptr || !scratch_buffer->Init(bitdepth_)) {
+        return nullptr;
+      }
+      return scratch_buffer;
+    }
+    return buffers_.Pop();
+  }
+
+  void Release(std::unique_ptr<TileScratchBuffer> scratch_buffer) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    buffers_.Push(std::move(scratch_buffer));
+  }
+
+ private:
+  std::mutex mutex_;
+  // We will never need more than kMaxThreads scratch buffers since that is the
+  // maximum amount of work that will be done at any given time.
+  Stack<std::unique_ptr<TileScratchBuffer>, kMaxThreads> buffers_
+      LIBGAV1_GUARDED_BY(mutex_);
+  int bitdepth_ = 0;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_TILE_SCRATCH_BUFFER_H_
diff --git a/src/utils/array_2d.h b/src/utils/array_2d.h
new file mode 100644
index 0000000..df2da9f
--- /dev/null
+++ b/src/utils/array_2d.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_ARRAY_2D_H_
+#define LIBGAV1_SRC_UTILS_ARRAY_2D_H_
+
+#include <cassert>
+#include <cstddef>
+#include <cstring>
+#include <memory>
+#include <new>
+#include <type_traits>
+
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+
+// Exposes a 1D allocated memory buffer as a 2D array.
+template <typename T>
+class Array2DView {
+ public:
+  Array2DView() = default;
+  Array2DView(int rows, int columns, T* const data) {
+    Reset(rows, columns, data);
+  }
+
+  // Copyable and Movable.
+  Array2DView(const Array2DView& rhs) = default;
+  Array2DView& operator=(const Array2DView& rhs) = default;
+
+  void Reset(int rows, int columns, T* const data) {
+    rows_ = rows;
+    columns_ = columns;
+    data_ = data;
+  }
+
+  int rows() const { return rows_; }
+  int columns() const { return columns_; }
+
+  T* operator[](int row) { return const_cast<T*>(GetRow(row)); }
+
+  const T* operator[](int row) const { return GetRow(row); }
+
+ private:
+  const T* GetRow(int row) const {
+    assert(row < rows_);
+    const ptrdiff_t offset = static_cast<ptrdiff_t>(row) * columns_;
+    return data_ + offset;
+  }
+
+  int rows_ = 0;
+  int columns_ = 0;
+  T* data_ = nullptr;
+};
+
+// Allocates and owns the contiguous memory and exposes an Array2DView of
+// dimension |rows| x |columns|.
+template <typename T>
+class Array2D {
+ public:
+  Array2D() = default;
+
+  // Copyable and Movable.
+  Array2D(const Array2D& rhs) = default;
+  Array2D& operator=(const Array2D& rhs) = default;
+
+  LIBGAV1_MUST_USE_RESULT bool Reset(int rows, int columns,
+                                     bool zero_initialize = true) {
+    size_ = rows * columns;
+    // If T is not a trivial type, we should always reallocate the data_
+    // buffer, so that the destructors of any existing objects are invoked.
+    if (!std::is_trivial<T>::value || allocated_size_ < size_) {
+      // Note: This invokes the global operator new if T is a non-class type,
+      // such as integer or enum types, or a class type that is not derived
+      // from libgav1::Allocable, such as std::unique_ptr. If we enforce a
+      // maximum allocation size or keep track of our own heap memory
+      // consumption, we will need to handle the allocations here that use the
+      // global operator new.
+      if (zero_initialize) {
+        data_.reset(new (std::nothrow) T[size_]());
+      } else {
+        data_.reset(new (std::nothrow) T[size_]);
+      }
+      if (data_ == nullptr) {
+        allocated_size_ = 0;
+        return false;
+      }
+      allocated_size_ = size_;
+    } else if (zero_initialize) {
+      // Cast the data_ pointer to void* to avoid the GCC -Wclass-memaccess
+      // warning. The memset is safe because T is a trivial type.
+      void* dest = data_.get();
+      memset(dest, 0, sizeof(T) * size_);
+    }
+    data_view_.Reset(rows, columns, data_.get());
+    return true;
+  }
+
+  int rows() const { return data_view_.rows(); }
+  int columns() const { return data_view_.columns(); }
+  size_t size() const { return size_; }
+  T* data() { return data_.get(); }
+  const T* data() const { return data_.get(); }
+
+  T* operator[](int row) { return data_view_[row]; }
+
+  const T* operator[](int row) const { return data_view_[row]; }
+
+ private:
+  std::unique_ptr<T[]> data_;
+  size_t allocated_size_ = 0;
+  size_t size_ = 0;
+  Array2DView<T> data_view_;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_ARRAY_2D_H_
diff --git a/src/utils/array_2d_test.cc b/src/utils/array_2d_test.cc
new file mode 100644
index 0000000..0535274
--- /dev/null
+++ b/src/utils/array_2d_test.cc
@@ -0,0 +1,248 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/array_2d.h"
+
+#include <cstdint>
+#include <memory>
+#include <new>
+#include <type_traits>
+
+#include "gtest/gtest.h"
+#include "src/utils/compiler_attributes.h"
+
+#if LIBGAV1_MSAN
+#include <sanitizer/msan_interface.h>
+#endif
+
+namespace libgav1 {
+namespace {
+
+constexpr int kRows = 50;
+constexpr int kColumns = 200;
+
+TEST(Array2dViewTest, TestUint8) {
+  uint8_t data[kRows * kColumns] = {};
+  Array2DView<uint8_t> data2d(kRows, kColumns, data);
+
+  // Verify data.
+  data[kColumns] = 100;
+  data[kColumns + 1] = 101;
+  data[kColumns * 2 + 10] = 210;
+  data[kColumns * 2 + 40] = 240;
+  EXPECT_EQ(data2d[1][0], 100);
+  EXPECT_EQ(data2d[1][1], 101);
+  EXPECT_EQ(data2d[2][10], 210);
+  EXPECT_EQ(data2d[2][40], 240);
+
+  // Verify pointers.
+  EXPECT_EQ(data2d[10], data + 10 * kColumns);
+}
+
+TEST(Array2dViewTest, TestUint16) {
+  uint16_t data[kRows * kColumns] = {};
+  Array2DView<uint16_t> data2d(kRows, kColumns, data);
+
+  // Verify data.
+  data[kColumns] = 100;
+  data[kColumns + 1] = 101;
+  data[kColumns * 2 + 10] = 210;
+  data[kColumns * 2 + 40] = 240;
+  EXPECT_EQ(data2d[1][0], 100);
+  EXPECT_EQ(data2d[1][1], 101);
+  EXPECT_EQ(data2d[2][10], 210);
+  EXPECT_EQ(data2d[2][40], 240);
+
+  // Verify pointers.
+  EXPECT_EQ(data2d[10], data + 10 * kColumns);
+}
+
+TEST(Array2dViewTest, TestUint8Const) {
+  uint8_t data[kRows * kColumns] = {};
+  // Declared as const to provide a read-only view of |data|.
+  const Array2DView<uint8_t> data2d(kRows, kColumns, data);
+
+  // Verify data.
+  data[kColumns] = 100;
+  data[kColumns + 1] = 101;
+  data[kColumns * 2 + 10] = 210;
+  data[kColumns * 2 + 40] = 240;
+  EXPECT_EQ(data2d[1][0], 100);
+  EXPECT_EQ(data2d[1][1], 101);
+  EXPECT_EQ(data2d[2][10], 210);
+  EXPECT_EQ(data2d[2][40], 240);
+
+  // Verify pointers.
+  EXPECT_EQ(data2d[10], data + 10 * kColumns);
+}
+
+TEST(Array2dTest, TestUint8) {
+  Array2D<uint8_t> data2d;
+  ASSERT_TRUE(data2d.Reset(kRows, kColumns, true));
+
+  EXPECT_EQ(data2d.rows(), kRows);
+  EXPECT_EQ(data2d.columns(), kColumns);
+
+  // Verify pointers.
+  for (int i = 0; i < kRows; ++i) {
+    EXPECT_NE(data2d[i], nullptr);
+  }
+
+  // Verify data (must be zero initialized).
+  for (int i = 0; i < kRows; ++i) {
+    for (int j = 0; j < kColumns; ++j) {
+      EXPECT_EQ(data2d[i][j], 0) << "Mismatch in [" << i << "][" << j << "]";
+    }
+  }
+
+  // Reset to a 2d array of smaller size with zero_initialize == false.
+  data2d[0][0] = 10;
+  ASSERT_TRUE(data2d.Reset(kRows - 1, kColumns - 1, false));
+
+  EXPECT_EQ(data2d.rows(), kRows - 1);
+  EXPECT_EQ(data2d.columns(), kColumns - 1);
+
+  // Verify pointers.
+  for (int i = 0; i < kRows - 1; ++i) {
+    EXPECT_NE(data2d[i], nullptr);
+  }
+
+  // Verify data (must be zero except for 0,0 because it was zero initialized in
+  // the previous call to Reset).
+  for (int i = 0; i < kRows - 1; ++i) {
+    for (int j = 0; j < kColumns - 1; ++j) {
+      if (i == 0 && j == 0) {
+        EXPECT_EQ(data2d[i][j], 10) << "Mismatch in [" << i << "][" << j << "]";
+      } else {
+        EXPECT_EQ(data2d[i][j], 0) << "Mismatch in [" << i << "][" << j << "]";
+      }
+    }
+  }
+
+  // Reset to a 2d array of smaller size with zero_initialize == true.
+  ASSERT_TRUE(data2d.Reset(kRows - 2, kColumns - 2, true));
+
+  EXPECT_EQ(data2d.rows(), kRows - 2);
+  EXPECT_EQ(data2d.columns(), kColumns - 2);
+
+  // Verify pointers.
+  for (int i = 0; i < kRows - 2; ++i) {
+    EXPECT_NE(data2d[i], nullptr);
+  }
+
+  // Verify data (must be zero initialized).
+  for (int i = 0; i < kRows - 2; ++i) {
+    for (int j = 0; j < kColumns - 2; ++j) {
+      EXPECT_EQ(data2d[i][j], 0) << "Mismatch in [" << i << "][" << j << "]";
+    }
+  }
+}
+
+TEST(Array2dTest, TestUniquePtr1) {
+  // A simple class that sets an int value to 0 in the destructor.
+  class Cleaner {
+   public:
+    explicit Cleaner(int* value) : value_(value) {}
+    ~Cleaner() { *value_ = 0; }
+
+   private:
+    int* value_;
+  };
+  int value = 100;
+  Array2D<std::unique_ptr<Cleaner>> data2d;
+  ASSERT_TRUE(data2d.Reset(4, 4, true));
+  data2d[0][0].reset(new (std::nothrow) Cleaner(&value));
+  EXPECT_EQ(value, 100);
+  // Reset to a smaller size. Depending on the implementation, the data_ buffer
+  // may or may not be reused.
+  ASSERT_TRUE(data2d.Reset(2, 2, true));
+  // Reset to a much larger size. The data_ buffer will be reallocated.
+  ASSERT_TRUE(data2d.Reset(32, 32, true));
+  // The destructors of all elements in the former data_ buffer should have
+  // been invoked.
+  EXPECT_EQ(value, 0);
+}
+
+TEST(Array2dTest, TestUniquePtr2) {
+  // A simple class that sets an int value to 0 in the destructor.
+  class Cleaner {
+   public:
+    explicit Cleaner(int* value) : value_(value) {}
+    ~Cleaner() { *value_ = 0; }
+
+   private:
+    int* value_;
+  };
+  int value1 = 100;
+  int value2 = 200;
+  Array2D<std::unique_ptr<Cleaner>> data2d;
+  ASSERT_TRUE(data2d.Reset(4, 4, false));
+  data2d[0][0].reset(new (std::nothrow) Cleaner(&value1));
+  data2d[3][3].reset(new (std::nothrow) Cleaner(&value2));
+  EXPECT_EQ(value1, 100);
+  EXPECT_EQ(value2, 200);
+  // Reset to a smaller size. Whether or not the data_ buffer is reused, the
+  // destructors of all existing elements should be invoked.
+  ASSERT_TRUE(data2d.Reset(2, 2, false));
+  EXPECT_EQ(value1, 0);
+  EXPECT_EQ(value2, 0);
+}
+
+// Shows that std::is_standard_layout is not relevant to the default
+// initialization vs. value initialization issue, but std::is_trivial is.
+TEST(Array2dTest, TestStructInit) {
+  // Make one data member private so that this struct does not have a standard
+  // layout. This also makes the struct not a POD type.
+  struct Point {
+    int x;
+    int Y() const { return y; }
+
+   private:
+    int y;
+  };
+
+  EXPECT_TRUE(std::is_trivial<Point>::value);
+  EXPECT_FALSE(std::is_standard_layout<Point>::value);
+
+  // The Point structs in this array are default initialized.
+  Array2D<Point> data2d_default_init;
+  ASSERT_TRUE(data2d_default_init.Reset(kRows, kColumns, false));
+  // The Point structs in this array are value initialized (i.e., zero
+  // initialized).
+  Array2D<Point> data2d;
+  ASSERT_TRUE(data2d.Reset(kRows, kColumns, true));
+
+#if LIBGAV1_MSAN
+  // Use MemorySanitizer to check Reset(rows, columns, false) does not
+  // initialize the memory while Reset(rows, columns, true) does.
+  //
+  // __msan_test_shadow(const void *x, uptr size) returns the offset of the
+  // first (at least partially) poisoned byte in the range, or -1 if the whole
+  // range is good.
+  for (int i = 0; i < kRows; ++i) {
+    EXPECT_EQ(__msan_test_shadow(data2d_default_init[i],
+                                 sizeof(data2d_default_init[0][0]) * kColumns),
+              0);
+    EXPECT_EQ(__msan_test_shadow(data2d[i], sizeof(data2d[0][0]) * kColumns),
+              -1);
+    for (int j = 0; j < kColumns; ++j) {
+      EXPECT_EQ(data2d[i][j].x, 0);
+      EXPECT_EQ(data2d[i][j].Y(), 0);
+    }
+  }
+#endif
+}
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/utils/bit_mask_set.h b/src/utils/bit_mask_set.h
new file mode 100644
index 0000000..7371753
--- /dev/null
+++ b/src/utils/bit_mask_set.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_BIT_MASK_SET_H_
+#define LIBGAV1_SRC_UTILS_BIT_MASK_SET_H_
+
+#include <cstdint>
+
+namespace libgav1 {
+
+// This class is used to check if a given value is equal to one of the several
+// predetermined values using a bit mask instead of a chain of comparisons and
+// ||s. This usually results in fewer instructions.
+//
+// Usage:
+//   constexpr BitMaskSet set(value1, value2);
+//   set.Contains(value1) => returns true.
+//   set.Contains(value3) => returns false.
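+//
+// For example: BitMaskSet set(3, 5, 9) stores
+//   mask_ = (1U << 3) | (1U << 5) | (1U << 9) = 0x228,
+// and set.Contains(5) evaluates ((0x228 >> 5) & 1) != 0, which is true.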
+class BitMaskSet { + public: + explicit constexpr BitMaskSet(uint32_t mask) : mask_(mask) {} + + constexpr BitMaskSet(int v1, int v2) : mask_((1U << v1) | (1U << v2)) {} + + constexpr BitMaskSet(int v1, int v2, int v3) + : mask_((1U << v1) | (1U << v2) | (1U << v3)) {} + + constexpr BitMaskSet(int v1, int v2, int v3, int v4) + : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4)) {} + + constexpr BitMaskSet(int v1, int v2, int v3, int v4, int v5) + : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4) | (1U << v5)) {} + + constexpr BitMaskSet(int v1, int v2, int v3, int v4, int v5, int v6) + : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4) | (1U << v5) | + (1U << v6)) {} + + constexpr BitMaskSet(int v1, int v2, int v3, int v4, int v5, int v6, int v7) + : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4) | (1U << v5) | + (1U << v6) | (1U << v7)) {} + + constexpr BitMaskSet(int v1, int v2, int v3, int v4, int v5, int v6, int v7, + int v8, int v9) + : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4) | (1U << v5) | + (1U << v6) | (1U << v7) | (1U << v8) | (1U << v9)) {} + + constexpr BitMaskSet(int v1, int v2, int v3, int v4, int v5, int v6, int v7, + int v8, int v9, int v10) + : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4) | (1U << v5) | + (1U << v6) | (1U << v7) | (1U << v8) | (1U << v9) | (1U << v10)) { + } + + constexpr bool Contains(uint8_t value) const { + return MaskContainsValue(mask_, value); + } + + static constexpr bool MaskContainsValue(uint32_t mask, uint8_t value) { + return ((mask >> value) & 1) != 0; + } + + private: + const uint32_t mask_; +}; + +} // namespace libgav1 +#endif // LIBGAV1_SRC_UTILS_BIT_MASK_SET_H_ diff --git a/src/utils/bit_reader.cc b/src/utils/bit_reader.cc new file mode 100644 index 0000000..3234128 --- /dev/null +++ b/src/utils/bit_reader.cc @@ -0,0 +1,117 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/utils/bit_reader.h" + +#include <cassert> +#include <cstdint> + +#include "src/utils/common.h" + +namespace libgav1 { +namespace { + +bool Assign(int* const value, int assignment, bool return_value) { + *value = assignment; + return return_value; +} + +// 5.9.29.
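+// A worked example of the inverse recentering below: with r = 2, the coded +// values v = 0, 1, 2, 3 map back to 2, 1, 3, 0, so values closest to the +// reference r get the smallest codes, and any v > 2 * r maps to itself.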
+int InverseRecenter(int r, int v) { + if (v > (r << 1)) { + return v; + } + if ((v & 1) != 0) { + return r - ((v + 1) >> 1); + } + return r + (v >> 1); +} + +} // namespace + +bool BitReader::DecodeSignedSubexpWithReference(int low, int high, + int reference, int control, + int* const value) { + if (!DecodeUnsignedSubexpWithReference(high - low, reference - low, control, + value)) { + return false; + } + *value += low; + return true; +} + +bool BitReader::DecodeUniform(int n, int* const value) { + if (n <= 1) { + return Assign(value, 0, true); + } + const int w = FloorLog2(n) + 1; + const int m = (1 << w) - n; + assert(w - 1 < 32); + const int v = static_cast<int>(ReadLiteral(w - 1)); + if (v == -1) { + return Assign(value, 0, false); + } + if (v < m) { + return Assign(value, v, true); + } + const int extra_bit = ReadBit(); + if (extra_bit == -1) { + return Assign(value, 0, false); + } + return Assign(value, (v << 1) - m + extra_bit, true); +} + +bool BitReader::DecodeUnsignedSubexpWithReference(int mx, int reference, + int control, + int* const value) { + int v; + if (!DecodeSubexp(mx, control, &v)) return false; + if ((reference << 1) <= mx) { + *value = InverseRecenter(reference, v); + } else { + *value = mx - 1 - InverseRecenter(mx - 1 - reference, v); + } + return true; +} + +bool BitReader::DecodeSubexp(int num_symbols, int control, int* const value) { + int i = 0; + int mk = 0; + while (true) { + const int b = (i != 0) ? control + i - 1 : control; + if (b >= 32) { + return Assign(value, 0, false); + } + const int a = 1 << b; + if (num_symbols <= mk + 3 * a) { + if (!DecodeUniform(num_symbols - mk, value)) return false; + *value += mk; + return true; + } + const int8_t subexp_more_bits = ReadBit(); + if (subexp_more_bits == -1) return false; + if (subexp_more_bits != 0) { + ++i; + mk += a; + } else { + const int subexp_bits = static_cast<int>(ReadLiteral(b)); + if (subexp_bits == -1) { + return Assign(value, 0, false); + } + return Assign(value, subexp_bits + mk, true); + } + } +} + +} // namespace libgav1 diff --git a/src/utils/bit_reader.h b/src/utils/bit_reader.h new file mode 100644 index 0000000..5a10e12 --- /dev/null +++ b/src/utils/bit_reader.h @@ -0,0 +1,49 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_UTILS_BIT_READER_H_ +#define LIBGAV1_SRC_UTILS_BIT_READER_H_ + +#include <cstdint> + +namespace libgav1 { + +class BitReader { + public: + virtual ~BitReader() = default; + + virtual int ReadBit() = 0; + // |num_bits| has to be <= 32. The function returns a value in the range [0, + // 2^num_bits - 1] (inclusive) on success and -1 on failure. + virtual int64_t ReadLiteral(int num_bits) = 0; + + bool DecodeSignedSubexpWithReference(int low, int high, int reference, + int control, int* value); // 5.9.26. + // Decodes a nonnegative integer with maximum number of values |n| (i.e., + // output in range 0..n-1) by following the process specified in Section + // 4.10.7 ns(n) and Section 4.10.10 NS(n) of the spec.
+ bool DecodeUniform(int n, int* value); + + private: + // Helper functions for DecodeSignedSubexpWithReference. + bool DecodeUnsignedSubexpWithReference(int mx, int reference, int control, + int* value); // 5.9.27. + bool DecodeSubexp(int num_symbols, int control, int* value); // 5.9.28. +}; + +} // namespace libgav1 + +#endif // LIBGAV1_SRC_UTILS_BIT_READER_H_ diff --git a/src/utils/block_parameters_holder.cc b/src/utils/block_parameters_holder.cc new file mode 100644 index 0000000..3bb9f1e --- /dev/null +++ b/src/utils/block_parameters_holder.cc @@ -0,0 +1,83 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/utils/block_parameters_holder.h" + +#include <algorithm> + +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/logging.h" +#include "src/utils/types.h" + +namespace libgav1 { + +bool BlockParametersHolder::Reset(int rows4x4, int columns4x4) { + rows4x4_ = rows4x4; + columns4x4_ = columns4x4; + index_ = 0; + return block_parameters_cache_.Reset(rows4x4_, columns4x4_) && + block_parameters_.Resize(rows4x4_ * columns4x4_); +} + +BlockParameters* BlockParametersHolder::Get(int row4x4, int column4x4, + BlockSize block_size) { + const size_t index = index_.fetch_add(1, std::memory_order_relaxed); + if (index >= block_parameters_.size()) return nullptr; + auto& bp = block_parameters_.get()[index]; + if (bp == nullptr) { + bp.reset(new (std::nothrow) BlockParameters); + if (bp == nullptr) return nullptr; + } + FillCache(row4x4, column4x4, block_size, bp.get()); + return bp.get(); +} + +void BlockParametersHolder::FillCache(int row4x4, int column4x4, + BlockSize block_size, + BlockParameters* const bp) { + int rows = std::min(static_cast<int>(kNum4x4BlocksHigh[block_size]), + rows4x4_ - row4x4); + const int columns = std::min(static_cast<int>(kNum4x4BlocksWide[block_size]), + columns4x4_ - column4x4); + auto* bp_dst = &block_parameters_cache_[row4x4][column4x4]; + // Specialize columns cases (values in kNum4x4BlocksWide[]) for better + // performance. + if (columns == 1) { + SetBlock(rows, 1, bp, bp_dst, columns4x4_); + } else if (columns == 2) { + SetBlock(rows, 2, bp, bp_dst, columns4x4_); + } else if (columns == 4) { + SetBlock(rows, 4, bp, bp_dst, columns4x4_); + } else if (columns == 8) { + SetBlock(rows, 8, bp, bp_dst, columns4x4_); + } else if (columns == 16) { + SetBlock(rows, 16, bp, bp_dst, columns4x4_); + } else if (columns == 32) { + SetBlock(rows, 32, bp, bp_dst, columns4x4_); + } else { + do { + // The following loop has better performance than using std::fill(). + // std::fill() has some overhead in checking zero loop count.
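+ // The nested do/while loops below write |bp| to |rows| x |columns| cache + // entries, advancing one cache row (stride |columns4x4_|) per outer + // iteration.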
+ int x = columns; + auto* d = bp_dst; + do { + *d++ = bp; + } while (--x != 0); + bp_dst += columns4x4_; + } while (--rows != 0); + } +} + +} // namespace libgav1 diff --git a/src/utils/block_parameters_holder.h b/src/utils/block_parameters_holder.h new file mode 100644 index 0000000..ca36907 --- /dev/null +++ b/src/utils/block_parameters_holder.h @@ -0,0 +1,92 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_UTILS_BLOCK_PARAMETERS_HOLDER_H_ +#define LIBGAV1_SRC_UTILS_BLOCK_PARAMETERS_HOLDER_H_ + +#include <atomic> +#include <memory> + +#include "src/utils/array_2d.h" +#include "src/utils/compiler_attributes.h" +#include "src/utils/constants.h" +#include "src/utils/dynamic_buffer.h" +#include "src/utils/types.h" + +namespace libgav1 { + +// Holds the BlockParameters pointers to each 4x4 block in the frame. +class BlockParametersHolder { + public: + BlockParametersHolder() = default; + + // Not copyable or movable. + BlockParametersHolder(const BlockParametersHolder&) = delete; + BlockParametersHolder& operator=(const BlockParametersHolder&) = delete; + + LIBGAV1_MUST_USE_RESULT bool Reset(int rows4x4, int columns4x4); + + // Returns a pointer to a BlockParameters object that can be used safely until + // the next call to Reset(). Returns nullptr on memory allocation failure. It + // also fills the cache matrix for the block starting at |row4x4|, |column4x4| + // of size |block_size| with the returned pointer. + BlockParameters* Get(int row4x4, int column4x4, BlockSize block_size); + + // Finds the BlockParameters corresponding to |row4x4| and |column4x4|. This + // is done as a simple look up of the |block_parameters_cache_| matrix. + // Returns nullptr if the BlockParameters cannot be found. + BlockParameters* Find(int row4x4, int column4x4) const { + return block_parameters_cache_[row4x4][column4x4]; + } + + BlockParameters** Address(int row4x4, int column4x4) { + return block_parameters_cache_.data() + row4x4 * columns4x4_ + column4x4; + } + + BlockParameters* const* Address(int row4x4, int column4x4) const { + return block_parameters_cache_.data() + row4x4 * columns4x4_ + column4x4; + } + + int columns4x4() const { return columns4x4_; } + + private: + // Needs access to FillCache for testing Cdef. + template <int bitdepth, typename Pixel> + friend class PostFilterApplyCdefTest; + + void FillCache(int row4x4, int column4x4, BlockSize block_size, + BlockParameters* bp); + + int rows4x4_ = 0; + int columns4x4_ = 0; + + // Owns the memory of BlockParameters pointers for the entire frame. It can + // hold up to |rows4x4_| * |columns4x4_| objects. Each object will be allocated + // on demand and re-used across frames. + DynamicBuffer<std::unique_ptr<BlockParameters>> block_parameters_; + + // Points to the next available index of |block_parameters_|. + std::atomic<size_t> index_; + + // This is a 2d array of size |rows4x4_| * |columns4x4_|. This is filled in by + // FillCache() and used by Find() to perform each lookup with exactly one + // array read (instead of traversing the entire tree).
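+ // For example, a kBlock32x32 whose top-left 4x4 unit is (8, 8) fills the + // cache region [8, 16) x [8, 16) with a single pointer, so a later + // Find(11, 13) is one array read.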
+ Array2D<BlockParameters*> block_parameters_cache_; +}; + +} // namespace libgav1 + +#endif // LIBGAV1_SRC_UTILS_BLOCK_PARAMETERS_HOLDER_H_ diff --git a/src/utils/block_parameters_holder_test.cc b/src/utils/block_parameters_holder_test.cc new file mode 100644 index 0000000..212eba5 --- /dev/null +++ b/src/utils/block_parameters_holder_test.cc @@ -0,0 +1,76 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/utils/block_parameters_holder.h" + +#include "gtest/gtest.h" +#include "src/utils/constants.h" +#include "src/utils/types.h" + +namespace libgav1 { +namespace { + +TEST(BlockParametersHolder, TestBasic) { + BlockParametersHolder holder; + ASSERT_TRUE(holder.Reset(20, 20)); + + // Get a BlockParameters object. + BlockParameters* const bp1 = holder.Get(10, 10, kBlock32x32); + ASSERT_NE(bp1, nullptr); + // Ensure that cache was filled appropriately. From (10, 10) to (17, 17) + // should be bp1 (10 + 4x4 width/height of 32x32 block is 18). + for (int i = 10; i < 18; ++i) { + for (int j = 10; j < 18; ++j) { + EXPECT_EQ(holder.Find(i, j), bp1) + << "Mismatch in (" << i << ", " << j << ")"; + } + } + + // Get the maximum number of BlockParameters objects. + for (int i = 0; i < 399; ++i) { + EXPECT_NE(holder.Get(10, 10, kBlock32x32), nullptr) + << "Mismatch in index " << i; + } + + // Get() should now return nullptr since there are no more BlockParameters + // objects available. + EXPECT_EQ(holder.Get(10, 10, kBlock32x32), nullptr); + + // Reset the holder to the same size. + ASSERT_TRUE(holder.Reset(20, 20)); + + // Get a BlockParameters object. This should be the same as bp1 since the + // holder was Reset to the same size. + BlockParameters* const bp2 = holder.Get(10, 10, kBlock32x32); + EXPECT_EQ(bp2, bp1); + + // Reset the holder to a smaller size. + ASSERT_TRUE(holder.Reset(20, 10)); + + // Get a BlockParameters object. This should be the same as bp1 since the + // holder was Reset to a smaller size. + BlockParameters* const bp3 = holder.Get(0, 0, kBlock32x32); + EXPECT_EQ(bp3, bp1); + + // Reset the holder to a larger size. + ASSERT_TRUE(holder.Reset(30, 30)); + + // Get a BlockParameters object. This may or may not be the same as bp1 since + // the holder was Reset to a larger size. + BlockParameters* const bp4 = holder.Get(0, 0, kBlock32x32); + EXPECT_NE(bp4, nullptr); +} + +} // namespace +} // namespace libgav1 diff --git a/src/utils/blocking_counter.h b/src/utils/blocking_counter.h new file mode 100644 index 0000000..6d664f8 --- /dev/null +++ b/src/utils/blocking_counter.h @@ -0,0 +1,97 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_UTILS_BLOCKING_COUNTER_H_ +#define LIBGAV1_SRC_UTILS_BLOCKING_COUNTER_H_ + +#include <cassert> +#include <condition_variable> // NOLINT (unapproved c++11 header) +#include <mutex> // NOLINT (unapproved c++11 header) + +#include "src/utils/compiler_attributes.h" + +namespace libgav1 { + +// Implementation of a Blocking Counter that is used for the "fork-join" +// use case. Typical usage would be as follows: +// BlockingCounter counter(num_jobs); +// - spawn the jobs. +// - call counter.Wait() on the master thread. +// - worker threads will call counter.Decrement(). +// - master thread will return from counter.Wait() when all workers are +// complete. +template <bool has_failure_status> +class BlockingCounterImpl { + public: + explicit BlockingCounterImpl(int initial_count) + : count_(initial_count), job_failed_(false) {} + + // Increment the counter by |count|. This must be called before Wait() is + // called. This must be called from the same thread that will call Wait(). + void IncrementBy(int count) { + assert(count >= 0); + std::unique_lock lock(mutex_); + count_ += count; + } + + // Decrement the counter by 1. This function can be called only when + // |has_failure_status| is false (i.e., when this class is being used with + // the |BlockingCounter| alias). + void Decrement() { + static_assert(!has_failure_status, ""); + std::unique_lock lock(mutex_); + if (--count_ == 0) { + condition_.notify_one(); + } + } + + // Decrement the counter by 1. This function can be called only when + // |has_failure_status| is true (i.e., when this class is being used with + // the |BlockingCounterWithStatus| alias). |job_succeeded| is used to update + // the state of |job_failed_|. + void Decrement(bool job_succeeded) { + static_assert(has_failure_status, ""); + std::unique_lock lock(mutex_); + job_failed_ |= !job_succeeded; + if (--count_ == 0) { + condition_.notify_one(); + } + } + + // Block until the counter becomes 0. This function can be called only once + // per object. If |has_failure_status| is true, true is returned if all the + // jobs succeeded and false is returned if any of the jobs failed. If + // |has_failure_status| is false, this function always returns true. + bool Wait() { + std::unique_lock lock(mutex_); + condition_.wait(lock, [this]() { return count_ == 0; }); + // If |has_failure_status| is false, we simply return true. + return has_failure_status ? !job_failed_ : true; + } + + private: + std::mutex mutex_; + std::condition_variable condition_; + int count_ LIBGAV1_GUARDED_BY(mutex_); + bool job_failed_ LIBGAV1_GUARDED_BY(mutex_); +}; + +using BlockingCounterWithStatus = BlockingCounterImpl<true>; +using BlockingCounter = BlockingCounterImpl<false>; + +} // namespace libgav1 + +#endif // LIBGAV1_SRC_UTILS_BLOCKING_COUNTER_H_ diff --git a/src/utils/blocking_counter_test.cc b/src/utils/blocking_counter_test.cc new file mode 100644 index 0000000..1b6e7f5 --- /dev/null +++ b/src/utils/blocking_counter_test.cc @@ -0,0 +1,127 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/utils/blocking_counter.h" + +#include <array> +#include <memory> + +#include "absl/time/clock.h" +#include "absl/time/time.h" +#include "gtest/gtest.h" +#include "src/utils/threadpool.h" + +namespace libgav1 { +namespace { + +constexpr int kNumWorkers = 10; +constexpr int kNumJobs = 20; + +TEST(BlockingCounterTest, BasicFunctionality) { + std::unique_ptr<ThreadPool> pool = ThreadPool::Create(kNumWorkers); + BlockingCounter counter(kNumJobs); + std::array<bool, kNumJobs> done = {}; + + // Schedule the jobs. + for (int i = 0; i < kNumJobs; ++i) { + pool->Schedule([&counter, &done, i]() { + absl::SleepFor(absl::Seconds(1)); + done[i] = true; + counter.Decrement(); + }); + } + + // Wait for the jobs to complete. This should always return true. + ASSERT_TRUE(counter.Wait()); + + // Make sure the jobs were actually complete. + for (const auto& job_done : done) { + EXPECT_TRUE(job_done); + } +} + +TEST(BlockingCounterTest, IncrementBy) { + std::unique_ptr<ThreadPool> pool = ThreadPool::Create(kNumWorkers); + BlockingCounter counter(0); + std::array<bool, kNumJobs> done = {}; + + // Schedule the jobs. + for (int i = 0; i < kNumJobs; ++i) { + counter.IncrementBy(1); + pool->Schedule([&counter, &done, i]() { + absl::SleepFor(absl::Seconds(1)); + done[i] = true; + counter.Decrement(); + }); + } + + // Wait for the jobs to complete. This should always return true. + ASSERT_TRUE(counter.Wait()); + + // Make sure the jobs were actually complete. + for (const auto& job_done : done) { + EXPECT_TRUE(job_done); + } +} + +TEST(BlockingCounterWithStatusTest, BasicFunctionality) { + std::unique_ptr<ThreadPool> pool = ThreadPool::Create(kNumWorkers); + BlockingCounterWithStatus counter(kNumJobs); + std::array<bool, kNumJobs> done = {}; + + // Schedule the jobs. + for (int i = 0; i < kNumJobs; ++i) { + pool->Schedule([&counter, &done, i]() { + absl::SleepFor(absl::Seconds(1)); + done[i] = true; + counter.Decrement(true); + }); + } + + // Wait for the jobs to complete. This should return true since all the jobs + // reported |job_succeeded| as true. + ASSERT_TRUE(counter.Wait()); + + // Make sure the jobs were actually complete. + for (const auto& job_done : done) { + EXPECT_TRUE(job_done); + } +} + +TEST(BlockingCounterWithStatusTest, BasicFunctionalityWithStatus) { + std::unique_ptr<ThreadPool> pool = ThreadPool::Create(kNumWorkers); + BlockingCounterWithStatus counter(kNumJobs); + std::array<bool, kNumJobs> done = {}; + + // Schedule the jobs. + for (int i = 0; i < kNumJobs; ++i) { + pool->Schedule([&counter, &done, i]() { + absl::SleepFor(absl::Seconds(1)); + done[i] = true; + counter.Decrement(i != 10); + }); + } + + // Wait for the jobs to complete. This should return false since one of the + // jobs reported |job_succeeded| as false. + ASSERT_FALSE(counter.Wait()); + + // Make sure the jobs were actually complete.
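+ // (Wait() returning false only reports that some job failed; the |done| + // array verifies that every job body still ran to completion.)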
+ for (const auto& job_done : done) { + EXPECT_TRUE(job_done); + } +} + +} // namespace +} // namespace libgav1 diff --git a/src/utils/common.h b/src/utils/common.h new file mode 100644 index 0000000..f75ace8 --- /dev/null +++ b/src/utils/common.h @@ -0,0 +1,555 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_UTILS_COMMON_H_ +#define LIBGAV1_SRC_UTILS_COMMON_H_ + +#if defined(_MSC_VER) +#include <intrin.h> +#pragma intrinsic(_BitScanForward) +#pragma intrinsic(_BitScanReverse) +#if defined(_M_X64) || defined(_M_ARM64) +#pragma intrinsic(_BitScanReverse64) +#define HAVE_BITSCANREVERSE64 +#endif // defined(_M_X64) || defined(_M_ARM64) +#endif // defined(_MSC_VER) + +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <cstdlib> +#include <cstring> +#include <type_traits> + +#include "src/utils/bit_mask_set.h" +#include "src/utils/constants.h" +#include "src/utils/memory.h" +#include "src/utils/types.h" + +namespace libgav1 { + +// LIBGAV1_RESTRICT +// Declares a pointer with the restrict type qualifier if available. +// This allows code to hint to the compiler that only this pointer references a +// particular object or memory region within the scope of the block in which it +// is declared. This may allow for improved optimizations due to the lack of +// pointer aliasing. See also: +// https://en.cppreference.com/w/c/language/restrict +// Note a template alias is not used for compatibility with older compilers +// (e.g., gcc < 10) that do not expand the type when instantiating a template +// function, either explicitly or in an assignment to a function pointer as is +// done within the dsp code. RestrictPtr<T>::type is an alternative to this, +// similar to std::add_const, but for conciseness the macro is preferred. +#ifdef __GNUC__ +#define LIBGAV1_RESTRICT __restrict__ +#elif defined(_MSC_VER) +#define LIBGAV1_RESTRICT __restrict +#else +#define LIBGAV1_RESTRICT +#endif + +// Aligns |value| to the desired |alignment|. |alignment| must be a power of 2. +template <typename T> +inline T Align(T value, T alignment) { + assert(alignment != 0); + const T alignment_mask = alignment - 1; + return (value + alignment_mask) & ~alignment_mask; +} + +// Aligns |addr| to the desired |alignment|. |alignment| must be a power of 2. +inline uint8_t* AlignAddr(uint8_t* const addr, const uintptr_t alignment) { + const auto value = reinterpret_cast<uintptr_t>(addr); + return reinterpret_cast<uint8_t*>(Align(value, alignment)); +} + +inline int32_t Clip3(int32_t value, int32_t low, int32_t high) { + return value < low ? low : (value > high ? high : value); +} + +template <typename Pixel> +void ExtendLine(void* const line_start, const int width, const int left, + const int right) { + auto* const start = static_cast<Pixel*>(line_start); + const Pixel* src = start; + Pixel* dst = start - left; + // Copy to left and right borders. + Memset(dst, src[0], left); + Memset(dst + left + width, src[width - 1], right); +} + +// The following 2 templates set a block of data with uncontiguous memory to +// |value|.
The compilers usually generate several branches to handle different +// cases of |columns| when inlining memset() and std::fill(), and these branches +// are unfortunately within the loop of |rows|. So calling these templates +// directly could be inefficient. It is recommended to specialize common cases +// of |columns|, such as 1, 2, 4, 8, 16 and 32, etc. in advance before +// processing the generic case of |columns|. The code size may be larger, but +// there would be big speed gains. +// Call template MemSetBlock<> when sizeof(|T|) is 1. +// Call template SetBlock<> when sizeof(|T|) is larger than 1. +template <typename T> +void MemSetBlock(int rows, int columns, T value, T* dst, ptrdiff_t stride) { + static_assert(sizeof(T) == 1, ""); + do { + memset(dst, value, columns); + dst += stride; + } while (--rows != 0); +} + +template <typename T> +void SetBlock(int rows, int columns, T value, T* dst, ptrdiff_t stride) { + do { + std::fill(dst, dst + columns, value); + dst += stride; + } while (--rows != 0); +} + +#if defined(__GNUC__) + +inline int CountLeadingZeros(uint32_t n) { + assert(n != 0); + return __builtin_clz(n); +} + +inline int CountLeadingZeros(uint64_t n) { + assert(n != 0); + return __builtin_clzll(n); +} + +inline int CountTrailingZeros(uint32_t n) { + assert(n != 0); + return __builtin_ctz(n); +} + +#elif defined(_MSC_VER) + +inline int CountLeadingZeros(uint32_t n) { + assert(n != 0); + unsigned long first_set_bit; // NOLINT(runtime/int) + const unsigned char bit_set = _BitScanReverse(&first_set_bit, n); + assert(bit_set != 0); + static_cast<void>(bit_set); + return 31 ^ static_cast<int>(first_set_bit); +} + +inline int CountLeadingZeros(uint64_t n) { + assert(n != 0); + unsigned long first_set_bit; // NOLINT(runtime/int) +#if defined(HAVE_BITSCANREVERSE64) + const unsigned char bit_set = + _BitScanReverse64(&first_set_bit, static_cast<unsigned __int64>(n)); +#else // !defined(HAVE_BITSCANREVERSE64) + const auto n_hi = static_cast<unsigned long>(n >> 32); // NOLINT(runtime/int) + if (n_hi != 0) { + const unsigned char bit_set = _BitScanReverse(&first_set_bit, n_hi); + assert(bit_set != 0); + static_cast<void>(bit_set); + return 31 ^ static_cast<int>(first_set_bit); + } + const unsigned char bit_set = _BitScanReverse( + &first_set_bit, static_cast<unsigned long>(n)); // NOLINT(runtime/int) +#endif // defined(HAVE_BITSCANREVERSE64) + assert(bit_set != 0); + static_cast<void>(bit_set); + return 63 ^ static_cast<int>(first_set_bit); +} + +#undef HAVE_BITSCANREVERSE64 + +inline int CountTrailingZeros(uint32_t n) { + assert(n != 0); + unsigned long first_set_bit; // NOLINT(runtime/int) + const unsigned char bit_set = _BitScanForward(&first_set_bit, n); + assert(bit_set != 0); + static_cast<void>(bit_set); + return static_cast<int>(first_set_bit); +} + +#else // !defined(__GNUC__) && !defined(_MSC_VER) + +template <int kMSB, typename T> +inline int CountLeadingZeros(T n) { + assert(n != 0); + const T msb = T{1} << kMSB; + int count = 0; + while ((n & msb) == 0) { + ++count; + n <<= 1; + } + return count; +} + +inline int CountLeadingZeros(uint32_t n) { return CountLeadingZeros<31>(n); } + +inline int CountLeadingZeros(uint64_t n) { return CountLeadingZeros<63>(n); } + +// This is the algorithm on the left in Figure 5-23, Hacker's Delight, Second +// Edition, page 109. The book says: +// If the number of trailing 0's is expected to be small or large, then the +// simple loops shown in Figure 5-23 are quite fast. +inline int CountTrailingZeros(uint32_t n) { + assert(n != 0); + // Create a word with 1's at the positions of the trailing 0's in |n|, and + // 0's elsewhere (e.g., 01011000 => 00000111).
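+ // Step by step for n = 01011000: n - 1 = 01010111 and ~n = 10100111, so + // ~n & (n - 1) = 00000111; the loop below then counts the three set bits.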
+ n = ~n & (n - 1); + int count = 0; + while (n != 0) { + ++count; + n >>= 1; + } + return count; +} + +#endif // defined(__GNUC__) + +inline int FloorLog2(int32_t n) { + assert(n > 0); + return 31 ^ CountLeadingZeros(static_cast<uint32_t>(n)); +} + +inline int FloorLog2(uint32_t n) { + assert(n > 0); + return 31 ^ CountLeadingZeros(n); +} + +inline int FloorLog2(int64_t n) { + assert(n > 0); + return 63 ^ CountLeadingZeros(static_cast<uint64_t>(n)); +} + +inline int FloorLog2(uint64_t n) { + assert(n > 0); + return 63 ^ CountLeadingZeros(n); +} + +inline int CeilLog2(unsigned int n) { + // The expression FloorLog2(n - 1) + 1 is undefined not only for n == 0 but + // also for n == 1, so this expression must be guarded by the n < 2 test. An + // alternative implementation is: + // return (n == 0) ? 0 : FloorLog2(n) + static_cast<int>((n & (n - 1)) != 0); + return (n < 2) ? 0 : FloorLog2(n - 1) + 1; +} + +inline int RightShiftWithCeiling(int value, int bits) { + assert(bits > 0); + return (value + (1 << bits) - 1) >> bits; +} + +inline int32_t RightShiftWithRounding(int32_t value, int bits) { + assert(bits >= 0); + return (value + ((1 << bits) >> 1)) >> bits; +} + +inline uint32_t RightShiftWithRounding(uint32_t value, int bits) { + assert(bits >= 0); + return (value + ((1 << bits) >> 1)) >> bits; +} + +// This variant is used when |value| can exceed 32 bits, although the final +// result must always fit into int32_t. +inline int32_t RightShiftWithRounding(int64_t value, int bits) { + assert(bits >= 0); + return static_cast<int32_t>((value + ((int64_t{1} << bits) >> 1)) >> bits); +} + +inline int32_t RightShiftWithRoundingSigned(int32_t value, int bits) { + assert(bits > 0); + // The next line is equivalent to: + // return (value >= 0) ? RightShiftWithRounding(value, bits) + // : -RightShiftWithRounding(-value, bits); + return RightShiftWithRounding(value + (value >> 31), bits); +} + +// This variant is used when |value| can exceed 32 bits, although the final +// result must always fit into int32_t. +inline int32_t RightShiftWithRoundingSigned(int64_t value, int bits) { + assert(bits > 0); + // The next line is equivalent to: + // return (value >= 0) ? RightShiftWithRounding(value, bits) + // : -RightShiftWithRounding(-value, bits); + return RightShiftWithRounding(value + (value >> 63), bits); +} + +constexpr int DivideBy2(int n) { return n >> 1; } +constexpr int DivideBy4(int n) { return n >> 2; } +constexpr int DivideBy8(int n) { return n >> 3; } +constexpr int DivideBy16(int n) { return n >> 4; } +constexpr int DivideBy32(int n) { return n >> 5; } +constexpr int DivideBy64(int n) { return n >> 6; } +constexpr int DivideBy128(int n) { return n >> 7; } + +// Convert |value| to unsigned before shifting to avoid undefined behavior with +// negative values. +inline int LeftShift(int value, int bits) { + assert(bits >= 0); + assert(value >= -(int64_t{1} << (31 - bits))); + assert(value <= (int64_t{1} << (31 - bits)) - ((bits == 0) ?
1 : 0)); + return static_cast<int>(static_cast<uint32_t>(value) << bits); +} +inline int MultiplyBy2(int n) { return LeftShift(n, 1); } +inline int MultiplyBy4(int n) { return LeftShift(n, 2); } +inline int MultiplyBy8(int n) { return LeftShift(n, 3); } +inline int MultiplyBy16(int n) { return LeftShift(n, 4); } +inline int MultiplyBy32(int n) { return LeftShift(n, 5); } +inline int MultiplyBy64(int n) { return LeftShift(n, 6); } + +constexpr int Mod32(int n) { return n & 0x1f; } +constexpr int Mod64(int n) { return n & 0x3f; } + +//------------------------------------------------------------------------------ +// Bitstream functions + +constexpr bool IsIntraFrame(FrameType type) { + return type == kFrameKey || type == kFrameIntraOnly; +} + +inline TransformClass GetTransformClass(TransformType tx_type) { + constexpr BitMaskSet kTransformClassVerticalMask( + kTransformTypeIdentityDct, kTransformTypeIdentityAdst, + kTransformTypeIdentityFlipadst); + if (kTransformClassVerticalMask.Contains(tx_type)) { + return kTransformClassVertical; + } + constexpr BitMaskSet kTransformClassHorizontalMask( + kTransformTypeDctIdentity, kTransformTypeAdstIdentity, + kTransformTypeFlipadstIdentity); + if (kTransformClassHorizontalMask.Contains(tx_type)) { + return kTransformClassHorizontal; + } + return kTransformClass2D; +} + +inline int RowOrColumn4x4ToPixel(int row_or_column4x4, Plane plane, + int8_t subsampling) { + return MultiplyBy4(row_or_column4x4) >> (plane == kPlaneY ? 0 : subsampling); +} + +constexpr PlaneType GetPlaneType(Plane plane) { + return static_cast<PlaneType>(plane != kPlaneY); +} + +// 5.11.44. +constexpr bool IsDirectionalMode(PredictionMode mode) { + return mode >= kPredictionModeVertical && mode <= kPredictionModeD67; +} + +// 5.9.3. +// +// |a| and |b| are order hints, treated as unsigned order_hint_bits-bit +// integers. |order_hint_shift_bits| equals (32 - order_hint_bits) % 32. +// order_hint_bits is at most 8, so |order_hint_shift_bits| is zero or a +// value between 24 and 31 (inclusive). +// +// If |order_hint_shift_bits| is zero, |a| and |b| are both zeros, and the +// result is zero. If |order_hint_shift_bits| is not zero, returns the +// signed difference |a| - |b| using "modular arithmetic". More precisely, the +// signed difference |a| - |b| is treated as a signed order_hint_bits-bit +// integer and cast to an int. The returned difference is between +// -(1 << (order_hint_bits - 1)) and (1 << (order_hint_bits - 1)) - 1 +// (inclusive). +// +// NOTE: |a| and |b| are the order_hint_bits least significant bits of the +// actual values. This function returns the signed difference between the +// actual values. The returned difference is correct as long as the actual +// values are not more than 1 << (order_hint_bits - 1) - 1 apart. +// +// Example: Suppose order_hint_bits is 4 and |order_hint_shift_bits| +// is 28. Then |a| and |b| are in the range [0, 15], and the actual values for +// |a| and |b| must not be more than 7 apart. (If the actual values for |a| and +// |b| are exactly 8 apart, this function cannot tell whether the actual value +// for |a| is before or after the actual value for |b|.) +// +// First, consider the order hints 2 and 6. For this simple case, we have +// GetRelativeDistance(2, 6, 28) = 2 - 6 = -4, and +// GetRelativeDistance(6, 2, 28) = 6 - 2 = 4. +// +// On the other hand, consider the order hints 2 and 14. The order hints are +// 12 (> 7) apart, so we need to use the actual values instead. The actual +// values may be 34 (= 2 mod 16) and 30 (= 14 mod 16), respectively.
Therefore +// we have +// GetRelativeDistance(2, 14, 28) = 34 - 30 = 4, and +// GetRelativeDistance(14, 2, 28) = 30 - 34 = -4. +// +// The following comments apply only to specific CPUs' SIMD implementations, +// such as intrinsics code. +// For the 2 shift operations in this function, if the SIMD packed data is +// 16-bit wide, try to use |order_hint_shift_bits| - 16 as the number of bits to +// shift; if the SIMD packed data is 8-bit wide, try to use +// |order_hint_shift_bits| - 24 as the number of bits to shift. +// |order_hint_shift_bits| - 16 and |order_hint_shift_bits| - 24 could be -16 or +// -24. In these cases diff is 0, and the behavior of left or right shifting -16 +// or -24 bits is defined for x86 SIMD instructions and ARM NEON instructions, +// and the result of shifting 0 is still 0. There is no guarantee that this +// behavior and result apply to other CPUs' SIMD instructions. +inline int GetRelativeDistance(const unsigned int a, const unsigned int b, + const unsigned int order_hint_shift_bits) { + const int diff = static_cast<int>(a) - static_cast<int>(b); + assert(order_hint_shift_bits <= 31); + if (order_hint_shift_bits == 0) { + assert(a == 0); + assert(b == 0); + } else { + assert(order_hint_shift_bits >= 24); // i.e., order_hint_bits <= 8 + assert(a < (1u << (32 - order_hint_shift_bits))); + assert(b < (1u << (32 - order_hint_shift_bits))); + assert(diff < (1 << (32 - order_hint_shift_bits))); + assert(diff >= -(1 << (32 - order_hint_shift_bits))); + } + // Sign extend the result of subtracting the values. + // Cast to unsigned int and then left shift to avoid undefined behavior with + // negative values. Cast to int to do the sign extension through right shift. + // This requires the right shift of a signed integer be an arithmetic shift, + // which is true for clang, gcc, and Visual C++. + // These two casts do not generate extra instructions. + // Don't use LeftShift(diff) since a valid diff may fail its assertions. + // For example, GetRelativeDistance(2, 14, 28), diff equals -12 and is less + // than the minimum allowed value of LeftShift() which is -8. + // The next 3 lines are equivalent to: + // const int order_hint_bits = Mod32(32 - order_hint_shift_bits); + // const int m = (1 << order_hint_bits) >> 1; + // return (diff & (m - 1)) - (diff & m); + return static_cast<int>(static_cast<unsigned int>(diff) + << order_hint_shift_bits) >> + order_hint_shift_bits; +} + +// Applies |sign| (must be 0 or -1) to |value|, i.e., +// return (sign == 0) ? value : -value; +// and does so without a branch. +constexpr int ApplySign(int value, int sign) { return (value ^ sign) - sign; } + +// 7.9.3. (without the clamp for numerator and denominator). +inline void GetMvProjection(const MotionVector& mv, int numerator, + int division_multiplier, + MotionVector* projection_mv) { + // Allow |numerator| and |division_multiplier| to be 0 so that this function + // can be called unconditionally. When numerator is 0, |projection_mv| will + // be 0, and this is what we want. + assert(std::abs(numerator) <= kMaxFrameDistance); + for (int i = 0; i < 2; ++i) { + projection_mv->mv[i] = + Clip3(RightShiftWithRoundingSigned( + mv.mv[i] * numerator * division_multiplier, 14), + -kProjectionMvClamp, kProjectionMvClamp); + } +} + +// 7.9.4.
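+// For example, Project(10, 128, 0) = 10 + 128 / 64 = 12, while +// Project(10, 128, -1) = 10 - 128 / 64 = 8.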
+constexpr int Project(int value, int delta, int dst_sign) { + return value + ApplySign(delta / 64, dst_sign); +} + +inline bool IsBlockSmallerThan8x8(BlockSize size) { + return size < kBlock8x8 && size != kBlock4x16; +} + +// Returns true if either the width or the height of the block is equal to +// four. +inline bool IsBlockDimension4(BlockSize size) { + return size < kBlock8x8 || size == kBlock16x4; +} + +// Converts bitdepth 8, 10, and 12 to array index 0, 1, and 2, respectively. +constexpr int BitdepthToArrayIndex(int bitdepth) { return (bitdepth - 8) >> 1; } + +// Maps a square transform to an index between [0, 4]. kTransformSize4x4 maps +// to 0, kTransformSize8x8 maps to 1 and so on. +inline int TransformSizeToSquareTransformIndex(TransformSize tx_size) { + assert(kTransformWidth[tx_size] == kTransformHeight[tx_size]); + + // The values of the square transform sizes happen to be in the right + // ranges, so we can just divide them by 4 to get the indexes. + static_assert( + std::is_unsigned<std::underlying_type<TransformSize>::type>::value, ""); + static_assert(kTransformSize4x4 < 4, ""); + static_assert(4 <= kTransformSize8x8 && kTransformSize8x8 < 8, ""); + static_assert(8 <= kTransformSize16x16 && kTransformSize16x16 < 12, ""); + static_assert(12 <= kTransformSize32x32 && kTransformSize32x32 < 16, ""); + static_assert(16 <= kTransformSize64x64 && kTransformSize64x64 < 20, ""); + return DivideBy4(tx_size); +} + +// Gets the corresponding Y/U/V position, to set and get filter masks +// in deblock filtering. +// Returns luma_position if it's Y plane, whose subsampling must be 0. +// Returns the odd position for U/V plane, if there is subsampling. +constexpr int GetDeblockPosition(const int luma_position, + const int subsampling) { + return luma_position | subsampling; +} + +// Returns the size of the residual buffer required to hold the residual values +// for a block or frame of size |rows| by |columns| (taking into account +// |subsampling_x|, |subsampling_y| and |residual_size|). |residual_size| is the +// number of bytes required to represent one residual value. +inline size_t GetResidualBufferSize(const int rows, const int columns, + const int subsampling_x, + const int subsampling_y, + const size_t residual_size) { + // The subsampling multipliers are: + // Both x and y are subsampled: 3 / 2. + // Only x or y is subsampled: 2 / 1 (which is equivalent to 4 / 2). + // Both x and y are not subsampled: 3 / 1 (which is equivalent to 6 / 2). + // So we compute the final subsampling multiplier as follows: + // multiplier = (2 + (4 >> subsampling_x >> subsampling_y)) / 2. + // Add 32 * |kResidualPaddingVertical| padding to avoid bottom boundary checks + // when parsing quantized coefficients. + const int subsampling_multiplier_num = + 2 + (4 >> subsampling_x >> subsampling_y); + const int number_elements = + (rows * columns * subsampling_multiplier_num) >> 1; + const int tx_padding = 32 * kResidualPaddingVertical; + return residual_size * (number_elements + tx_padding); +} + +// This function is equivalent to: +// std::min({kTransformWidthLog2[tx_size] - 2, +// kTransformWidthLog2[left_tx_size] - 2, +// 2}); +constexpr LoopFilterTransformSizeId GetTransformSizeIdWidth( + TransformSize tx_size, TransformSize left_tx_size) { + return static_cast<LoopFilterTransformSizeId>( + static_cast<int>(tx_size > kTransformSize4x16 && + left_tx_size > kTransformSize4x16) + + static_cast<int>(tx_size > kTransformSize8x32 && + left_tx_size > kTransformSize8x32)); +} + +// This is used for 7.11.3.4 Block Inter Prediction Process, to select convolve +// filters.
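+// For example (derived from the code below), with block |length| <= 4 the +// 8-tap and 8-tap-sharp filters are remapped to index 4 and 8-tap-smooth to +// index 5 (the 4-tap variants); longer blocks keep |filter_index| unchanged.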
+inline int GetFilterIndex(const int filter_index, const int length) { + if (length <= 4) { + if (filter_index == kInterpolationFilterEightTap || + filter_index == kInterpolationFilterEightTapSharp) { + return 4; + } + if (filter_index == kInterpolationFilterEightTapSmooth) { + return 5; + } + } + return filter_index; +} + +// This has identical results to RightShiftWithRounding since |subsampling| can +// only be 0 or 1. +constexpr int SubsampledValue(int value, int subsampling) { + return (value + subsampling) >> subsampling; +} + +} // namespace libgav1 + +#endif // LIBGAV1_SRC_UTILS_COMMON_H_ diff --git a/src/utils/common_test.cc b/src/utils/common_test.cc new file mode 100644 index 0000000..fdb218d --- /dev/null +++ b/src/utils/common_test.cc @@ -0,0 +1,604 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/utils/common.h" + +#include <cstddef> +#include <cstdint> +#include <cstring> +#include <string> + +#include "absl/base/macros.h" +#include "gtest/gtest.h" +#include "src/utils/constants.h" +#include "src/utils/memory.h" +#include "src/utils/types.h" + +namespace libgav1 { +namespace { + +int BitLength(int64_t n) { + int count = 0; + while (n != 0) { + ++count; + n >>= 1; + } + return count; +} + +TEST(CommonUtilsTest, Align) { + for (int i = 0; i <= 8; ++i) { + const int alignment = 1 << i; + SCOPED_TRACE("alignment: " + std::to_string(alignment)); + EXPECT_EQ(Align(0, alignment), 0); + EXPECT_EQ(Align(1, alignment), alignment); + EXPECT_EQ(Align(alignment + 1, alignment), 2 * alignment); + if (i > 1) { + EXPECT_EQ(Align(alignment - 1, alignment), alignment); + EXPECT_EQ(Align(2 * alignment - 1, alignment), 2 * alignment); + } + } +} + +TEST(CommonUtilsTest, AlignAddr) { + auto buf = MakeAlignedUniquePtr<uint8_t>(/*alignment=*/1024, 512); + ASSERT_NE(buf, nullptr); + auto* const bufptr = buf.get(); + ASSERT_EQ(reinterpret_cast<uintptr_t>(bufptr) % 1024, 0); + + for (int i = 0; i <= 8; ++i) { + const int alignment = 1 << i; + ASSERT_LE(alignment, 1024); + SCOPED_TRACE("alignment: " + std::to_string(alignment)); + EXPECT_EQ(AlignAddr(nullptr, alignment), nullptr); + EXPECT_EQ(AlignAddr(bufptr, alignment), bufptr); + EXPECT_EQ(AlignAddr(bufptr + 1, alignment), bufptr + alignment); + EXPECT_EQ(AlignAddr(bufptr + alignment + 1, alignment), + bufptr + 2 * alignment); + if (i > 1) { + EXPECT_EQ(AlignAddr(bufptr + alignment - 1, alignment), + bufptr + alignment); + EXPECT_EQ(AlignAddr(bufptr + 2 * alignment - 1, alignment), + bufptr + 2 * alignment); + } + } +} + +TEST(CommonUtilsTest, Clip3) { + // Value <= lower boundary. + EXPECT_EQ(Clip3(10, 20, 30), 20); + EXPECT_EQ(Clip3(20, 20, 30), 20); + // Value >= higher boundary. + EXPECT_EQ(Clip3(40, 20, 30), 30); + EXPECT_EQ(Clip3(30, 20, 30), 30); + // Value within boundary. + EXPECT_EQ(Clip3(25, 20, 30), 25); + // Clipping based on bitdepth (clamp between 0 and 2^bitdepth - 1). Make sure + // that the resulting values are always in the pixel range for the + // corresponding bitdepth.
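+ // (For example, with bitdepth 8 the valid range is [0, 255]: 500 clips to + // 255 and -100 clips to 0.)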
+ static constexpr int bitdepths[] = {8, 10, 12}; + static constexpr int pixels[] = {100, 500, 5000, -100, -500, -5000}; + for (const auto& bitdepth : bitdepths) { + for (const auto& pixel : pixels) { + const int clipped_pixel = Clip3(pixel, 0, (1 << bitdepth) - 1); + EXPECT_GE(clipped_pixel, 0) + << "Clip3 mismatch for bitdepth: " << bitdepth << " pixel: " << pixel; + EXPECT_LE(clipped_pixel, (1 << bitdepth) - 1) + << "Clip3 mismatch for bitdepth: " << bitdepth << " pixel: " << pixel; + } + } +} + +template <typename Pixel> +void TestExtendLine(int width, const int left, int right, Pixel left_value, + Pixel right_value) { + constexpr int size = 1000; + ASSERT_LE(width + left + right, size); + Pixel line[size]; + Pixel* line_start = line + left; + line_start[0] = left_value; + line_start[width - 1] = right_value; + ExtendLine<Pixel>(line_start, width, left, right); + for (int x = 0; x < left; x++) { + EXPECT_EQ(left_value, line[x]) << "Left side mismatch at x: " << x; + } + for (int x = 0; x < right; x++) { + EXPECT_EQ(right_value, line[left + width + x]) + << "Right side mismatch at x: " << x; + } +} + +TEST(CommonUtilsTest, ExtendLine) { + TestExtendLine<uint8_t>(300, 0, 0, 31, 13); + TestExtendLine<uint8_t>(100, 10, 20, 31, 13); + TestExtendLine<uint8_t>(257, 31, 77, 59, 255); + TestExtendLine<uint16_t>(600, 0, 0, 1234, 4321); + TestExtendLine<uint16_t>(200, 55, 88, 12345, 54321); + TestExtendLine<uint16_t>(2, 99, 333, 257, 513); +} + +template <typename T> +void TestMemSetBlock(int rows, int columns, ptrdiff_t stride, T value) { + constexpr int size = 1000; + T block[size]; + static_assert(sizeof(T) == 1, ""); + ASSERT_LE(rows * stride, size); + ASSERT_LE(columns, stride); + MemSetBlock(rows, columns, value, block, stride); + for (int y = 0; y < rows; y++) { + for (int x = 0; x < columns; x++) { + EXPECT_EQ(value, block[y * stride + x]) + << "Mismatch at y: " << y << " x: " << x; + } + } +} + +TEST(CommonUtilsTest, MemSetBlock) { + TestMemSetBlock<bool>(15, 28, 29, true); + TestMemSetBlock<bool>(17, 1, 24, false); + TestMemSetBlock<bool>(7, 2, 13, true); + TestMemSetBlock<uint8_t>(35, 17, 19, 123); + TestMemSetBlock<uint8_t>(19, 16, 16, 234); +} + +template <typename T> +void TestSetBlock(int rows, int columns, ptrdiff_t stride, T value) { + constexpr int size = 1000; + T block[size]; + ASSERT_LE(rows * stride, size); + ASSERT_LE(columns, stride); + SetBlock(rows, columns, value, block, stride); + for (int y = 0; y < rows; y++) { + for (int x = 0; x < columns; x++) { + EXPECT_EQ(value, block[y * stride + x]) + << "Mismatch at y: " << y << " x: " << x; + } + } +} + +TEST(CommonUtilsTest, SetBlock) { + // Test 1-byte block set. + TestSetBlock<bool>(15, 28, 29, true); + TestSetBlock<bool>(17, 1, 24, false); + TestSetBlock<bool>(7, 2, 13, true); + TestSetBlock<uint8_t>(35, 17, 19, 123); + TestSetBlock<uint8_t>(19, 16, 16, 234); + // Test 2-byte block set. + TestSetBlock<uint16_t>(23, 27, 28, 1234); + TestSetBlock<uint16_t>(13, 39, 44, 4321); + // Test 4-byte block set. + TestSetBlock<uint32_t>(14, 7, 7, 12345); + TestSetBlock<uint32_t>(33, 4, 15, 54321); + // Test pointer block set.
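+ // (T is deduced as int* from &data below, exercising SetBlock with + // pointer-sized elements.)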
+ int data; + TestSetBlock(23, 8, 25, &data); +} + +TEST(CommonUtilsTest, CountTrailingZeros) { + EXPECT_EQ(CountTrailingZeros(0x1), 0); + EXPECT_EQ(CountTrailingZeros(0x3), 0); + EXPECT_EQ(CountTrailingZeros(0x7), 0); + EXPECT_EQ(CountTrailingZeros(0xF), 0); + EXPECT_EQ(CountTrailingZeros(0x2), 1); + EXPECT_EQ(CountTrailingZeros(0x6), 1); + EXPECT_EQ(CountTrailingZeros(0xE), 1); + EXPECT_EQ(CountTrailingZeros(0x4), 2); + EXPECT_EQ(CountTrailingZeros(0xC), 2); + EXPECT_EQ(CountTrailingZeros(0x8), 3); + EXPECT_EQ(CountTrailingZeros(0x10), 4); + EXPECT_EQ(CountTrailingZeros(0x30), 4); + EXPECT_EQ(CountTrailingZeros(0x70), 4); + EXPECT_EQ(CountTrailingZeros(0xF0), 4); + EXPECT_EQ(CountTrailingZeros(0x20), 5); + EXPECT_EQ(CountTrailingZeros(0x60), 5); + EXPECT_EQ(CountTrailingZeros(0xE0), 5); + EXPECT_EQ(CountTrailingZeros(0x40), 6); + EXPECT_EQ(CountTrailingZeros(0xC0), 6); + EXPECT_EQ(CountTrailingZeros(0x80), 7); + EXPECT_EQ(CountTrailingZeros(0x31), 0); + EXPECT_EQ(CountTrailingZeros(0x32), 1); + EXPECT_EQ(CountTrailingZeros(0x34), 2); + EXPECT_EQ(CountTrailingZeros(0x38), 3); + EXPECT_EQ(CountTrailingZeros(0x310), 4); + EXPECT_EQ(CountTrailingZeros(0x320), 5); + EXPECT_EQ(CountTrailingZeros(0x340), 6); + EXPECT_EQ(CountTrailingZeros(0x380), 7); +} + +TEST(CommonUtilsTest, FloorLog2) { + // Powers of 2. + EXPECT_EQ(FloorLog2(1), 0); + EXPECT_EQ(FloorLog2(2), 1); + EXPECT_EQ(FloorLog2(8), 3); + EXPECT_EQ(FloorLog2(64), 6); + // Powers of 2 +/- 1. + EXPECT_EQ(FloorLog2(9), 3); + EXPECT_EQ(FloorLog2(15), 3); + EXPECT_EQ(FloorLog2(63), 5); + // Large value, smaller than 32 bit. + EXPECT_EQ(FloorLog2(0x7fffffff), 30); + EXPECT_EQ(FloorLog2(0x80000000), 31); + // Larger than 32 bit. + EXPECT_EQ(FloorLog2(uint64_t{0x7fffffffffffffff}), 62); + EXPECT_EQ(FloorLog2(uint64_t{0x8000000000000000}), 63); + EXPECT_EQ(FloorLog2(uint64_t{0xffffffffffffffff}), 63); +} + +TEST(CommonUtilsTest, CeilLog2) { + // Even though log2(0) is -inf, here we explicitly define it to be 0. + EXPECT_EQ(CeilLog2(0), 0); + // Powers of 2. + EXPECT_EQ(CeilLog2(1), 0); + EXPECT_EQ(CeilLog2(2), 1); + EXPECT_EQ(CeilLog2(8), 3); + EXPECT_EQ(CeilLog2(64), 6); + // Powers of 2 +/- 1. + EXPECT_EQ(CeilLog2(9), 4); + EXPECT_EQ(CeilLog2(15), 4); + EXPECT_EQ(CeilLog2(63), 6); + // Large value. + EXPECT_EQ(CeilLog2(0x7fffffff), 31); +} + +TEST(CommonUtilsTest, RightShiftWithCeiling) { + // Shift 1 bit. + EXPECT_EQ(RightShiftWithCeiling(1, 1), 1); + EXPECT_EQ(RightShiftWithCeiling(2, 1), 1); + EXPECT_EQ(RightShiftWithCeiling(3, 1), 2); + EXPECT_EQ(RightShiftWithCeiling(4, 1), 2); + EXPECT_EQ(RightShiftWithCeiling(5, 1), 3); + // Shift 2 bits. + EXPECT_EQ(RightShiftWithCeiling(1, 2), 1); + EXPECT_EQ(RightShiftWithCeiling(2, 2), 1); + EXPECT_EQ(RightShiftWithCeiling(3, 2), 1); + EXPECT_EQ(RightShiftWithCeiling(4, 2), 1); + EXPECT_EQ(RightShiftWithCeiling(5, 2), 2); + // Shift 20 bits. 
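+ // (Every value from 1 through 1 << 20 ceils to 1; (1 << 20) + 1 is the + // first input that ceils to 2.)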
+ EXPECT_EQ(RightShiftWithCeiling(1, 20), 1); + EXPECT_EQ(RightShiftWithCeiling((1 << 20) - 1, 20), 1); + EXPECT_EQ(RightShiftWithCeiling(1 << 20, 20), 1); + EXPECT_EQ(RightShiftWithCeiling((1 << 20) + 1, 20), 2); + EXPECT_EQ(RightShiftWithCeiling((1 << 21) - 1, 20), 2); +} + +template <typename Input, typename Output> +void VerifyRightShiftWithRounding(const Input* const values, + const int* const bits, + const Output* const rounded_values, + size_t count) { + for (size_t i = 0; i < count; ++i) { + const Output rounded_value = RightShiftWithRounding(values[i], bits[i]); + EXPECT_EQ(rounded_value, rounded_values[i]) << "Mismatch at index " << i; + // Rounding reduces the bit length by |bits[i]| - 1. + EXPECT_LE(BitLength(rounded_value), BitLength(values[i]) - (bits[i] - 1)) + << "Mismatch at index " << i; + } +} + +TEST(CommonUtilTest, RightShiftWithRoundingInt32) { + static constexpr int32_t values[] = {5, 203, 204, 255, 40000, 50000}; + static constexpr int bits[] = {0, 3, 3, 3, 12, 12}; + static constexpr int32_t rounded_values[] = {5, 25, 26, 32, 10, 12}; + static_assert(ABSL_ARRAYSIZE(values) == ABSL_ARRAYSIZE(bits), ""); + static_assert(ABSL_ARRAYSIZE(values) == ABSL_ARRAYSIZE(rounded_values), ""); + VerifyRightShiftWithRounding(values, bits, rounded_values, + ABSL_ARRAYSIZE(values)); +} + +TEST(CommonUtilTest, RightShiftWithRoundingUint32) { + static constexpr uint32_t values[] = {5, 203, 204, 255, + 40000, 50000, 0x7fffffff}; + static constexpr int bits[] = {0, 3, 3, 3, 12, 12, 20}; + static constexpr uint32_t rounded_values[] = {5, 25, 26, 32, 10, 12, 2048}; + static_assert(ABSL_ARRAYSIZE(values) == ABSL_ARRAYSIZE(bits), ""); + static_assert(ABSL_ARRAYSIZE(values) == ABSL_ARRAYSIZE(rounded_values), ""); + VerifyRightShiftWithRounding(values, bits, rounded_values, + ABSL_ARRAYSIZE(values)); +} + +TEST(CommonUtilTest, RightShiftWithRoundingInt64) { + static constexpr int64_t values[] = {5, 203, 204, 255, + 40000, 50000, 0x7fffffff, 0x8fffffff}; + static constexpr int bits[] = {0, 3, 3, 3, 12, 12, 20, 20}; + static constexpr int32_t rounded_values[] = {5, 25, 26, 32, + 10, 12, 2048, 2304}; + static_assert(ABSL_ARRAYSIZE(values) == ABSL_ARRAYSIZE(bits), ""); + static_assert(ABSL_ARRAYSIZE(values) == ABSL_ARRAYSIZE(rounded_values), ""); + VerifyRightShiftWithRounding(values, bits, rounded_values, + ABSL_ARRAYSIZE(values)); +} + +template <typename Input> +void VerifyRightShiftWithRoundingSigned(const Input* const values, + const int* const bits, + const int32_t* const rounded_values, + int count) { + for (int i = 0; i < count; ++i) { + int32_t rounded_value = RightShiftWithRoundingSigned(values[i], bits[i]); + EXPECT_EQ(rounded_value, rounded_values[i]) << "Mismatch at index " << i; + rounded_value = RightShiftWithRoundingSigned(-values[i], bits[i]); + EXPECT_EQ(rounded_value, -rounded_values[i]) << "Mismatch at index " << i; + } +} + +TEST(CommonUtilTest, RightShiftWithRoundingSignedInt32) { + static constexpr int32_t values[] = {203, 204, 255, 40000, 50000}; + static constexpr int bits[] = {3, 3, 3, 12, 12}; + static constexpr int32_t rounded_values[] = {25, 26, 32, 10, 12}; + static_assert(ABSL_ARRAYSIZE(values) == ABSL_ARRAYSIZE(bits), ""); + static_assert(ABSL_ARRAYSIZE(values) == ABSL_ARRAYSIZE(rounded_values), ""); + VerifyRightShiftWithRoundingSigned(values, bits, rounded_values, + ABSL_ARRAYSIZE(values)); +} + +TEST(CommonUtilTest, RightShiftWithRoundingSignedInt64) { + static constexpr int64_t values[] = {203, 204, 255, 40000, + 50000, 0x7fffffff, 0x8fffffff}; + static constexpr int bits[] = {3, 3, 3, 12, 12, 20, 20}; + 
static constexpr int32_t rounded_values[] = {25, 26, 32, 10, 12, 2048, 2304}; + static_assert(ABSL_ARRAYSIZE(values) == ABSL_ARRAYSIZE(bits), ""); + static_assert(ABSL_ARRAYSIZE(values) == ABSL_ARRAYSIZE(rounded_values), ""); + VerifyRightShiftWithRoundingSigned(values, bits, rounded_values, + ABSL_ARRAYSIZE(values)); +} + +TEST(CommonUtilTest, GetResidualBufferSize) { + // No subsampling. + EXPECT_EQ(GetResidualBufferSize(64, 64, 0, 0, 2), + /* 2*(64*64*3/1 + 32*4) = */ 24832); + // Only X is subsampled. + EXPECT_EQ(GetResidualBufferSize(64, 64, 1, 0, 2), + /* 2*(64*64*2/1 + 32*4) = */ 16640); + // Only Y is subsampled. + EXPECT_EQ(GetResidualBufferSize(64, 64, 0, 1, 2), + /* 2*(64*64*2/1 + 32*4) = */ 16640); + // Both X and Y are subsampled. + EXPECT_EQ(GetResidualBufferSize(64, 64, 1, 1, 2), + /* 2*(64*64*3/2 + 32*4) = */ 12544); +} + +//------------------------------------------------------------------------------ +// Tests for bitstream util functions + +TEST(BitstreamUtilTest, IsIntraFrame) { + EXPECT_TRUE(IsIntraFrame(kFrameKey)); + EXPECT_TRUE(IsIntraFrame(kFrameIntraOnly)); + EXPECT_FALSE(IsIntraFrame(kFrameInter)); + EXPECT_FALSE(IsIntraFrame(kFrameSwitch)); +} + +TEST(BitstreamUtilTest, GetTransformClass) { + static constexpr TransformClass expected_classes[kNumTransformTypes] = { + kTransformClass2D, kTransformClass2D, + kTransformClass2D, kTransformClass2D, + kTransformClass2D, kTransformClass2D, + kTransformClass2D, kTransformClass2D, + kTransformClass2D, kTransformClass2D, + kTransformClassVertical, kTransformClassHorizontal, + kTransformClassVertical, kTransformClassHorizontal, + kTransformClassVertical, kTransformClassHorizontal, + }; + for (int i = 0; i < kNumTransformTypes; ++i) { + EXPECT_EQ(GetTransformClass(static_cast<TransformType>(i)), + expected_classes[i]) + << "Mismatch at index " << i; + } +} + +TEST(BitstreamUtilTest, RowOrColumn4x4ToPixel) { + EXPECT_EQ(RowOrColumn4x4ToPixel(10, kPlaneY, 0), 40); + EXPECT_EQ(RowOrColumn4x4ToPixel(10, kPlaneY, 1), + 40); // Subsampling should have no effect on Y plane. + EXPECT_EQ(RowOrColumn4x4ToPixel(10, kPlaneU, 0), 40); + EXPECT_EQ(RowOrColumn4x4ToPixel(10, kPlaneU, 1), 20); + EXPECT_EQ(RowOrColumn4x4ToPixel(10, kPlaneV, 0), 40); + EXPECT_EQ(RowOrColumn4x4ToPixel(10, kPlaneV, 1), 20); +} + +TEST(BitstreamUtilTest, GetPlaneType) { + EXPECT_EQ(GetPlaneType(kPlaneY), kPlaneTypeY); + EXPECT_EQ(GetPlaneType(kPlaneU), kPlaneTypeUV); + EXPECT_EQ(GetPlaneType(kPlaneV), kPlaneTypeUV); +} + +TEST(BitstreamUtils, IsDirectionalMode) { + static constexpr bool is_directional_modes[kNumPredictionModes] = { + false, true, true, true, true, true, true, true, true, + false, false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + }; + for (int i = 0; i < kNumPredictionModes; ++i) { + EXPECT_EQ(IsDirectionalMode(static_cast<PredictionMode>(i)), + is_directional_modes[i]) + << "Mismatch at index " << i; + } +} + +TEST(BitstreamUtils, GetRelativeDistance) { + // Both order_hint_bits and order_hint_shift_bits are zero. (a and b must be + // zero.) + EXPECT_EQ(GetRelativeDistance(0, 0, 0), 0); + EXPECT_EQ(GetRelativeDistance(10, 20, 27), -10); + + EXPECT_EQ(GetRelativeDistance(2, 1, 30), 1); + EXPECT_EQ(GetRelativeDistance(2, 1, 29), 1); + + EXPECT_EQ(GetRelativeDistance(1, 2, 30), -1); + EXPECT_EQ(GetRelativeDistance(1, 2, 29), -1); + + // With an order_hint_bits of 4 and an order_hint_shift_bits of 28, 16 is the + // same as 0, 17 is the same as 1, etc.
+  // the most negative distance is -8.
+
+  EXPECT_EQ(GetRelativeDistance(2, 6, 28), -4);
+  EXPECT_EQ(GetRelativeDistance(6, 2, 28), 4);
+  // 18 - 14 = 4.
+  EXPECT_EQ(GetRelativeDistance(2, 14, 28), 4);
+  // 14 - 18 = -4.
+  EXPECT_EQ(GetRelativeDistance(14, 2, 28), -4);
+  // If a and b are exactly 8 apart, GetRelativeDistance() cannot tell whether
+  // a is before or after b. GetRelativeDistance(a, b) and
+  // GetRelativeDistance(b, a) are both -8.
+  // 1 - 9 = -8.
+  EXPECT_EQ(GetRelativeDistance(1, 9, 28), -8);
+  // 9 - 17 = -8.
+  EXPECT_EQ(GetRelativeDistance(9, 1, 28), -8);
+
+  // With an order_hint_bits of 5 and an order_hint_shift_bits of 27, 32 is the
+  // same as 0, 33 is the same as 1, etc. The most positive distance is 15, and
+  // the most negative distance is -16.
+
+  // 31 - 32 = -1.
+  EXPECT_EQ(GetRelativeDistance(31, 0, 27), -1);
+  // 32 - 31 = 1.
+  EXPECT_EQ(GetRelativeDistance(0, 31, 27), 1);
+  // 30 - 33 = -3.
+  EXPECT_EQ(GetRelativeDistance(30, 1, 27), -3);
+  // 33 - 30 = 3.
+  EXPECT_EQ(GetRelativeDistance(1, 30, 27), 3);
+  // 25 - 36 = -11.
+  EXPECT_EQ(GetRelativeDistance(25, 4, 27), -11);
+  // 36 - 25 = 11.
+  EXPECT_EQ(GetRelativeDistance(4, 25, 27), 11);
+  // 15 - 0 = 15.
+  EXPECT_EQ(GetRelativeDistance(15, 0, 27), 15);
+  // If a and b are exactly 16 apart, GetRelativeDistance() cannot tell whether
+  // a is before or after b. GetRelativeDistance(a, b) and
+  // GetRelativeDistance(b, a) are both -16.
+  // 16 - 32 = -16.
+  EXPECT_EQ(GetRelativeDistance(16, 0, 27), -16);
+  // 0 - 16 = -16.
+  EXPECT_EQ(GetRelativeDistance(0, 16, 27), -16);
+}
+
+TEST(BitstreamUtils, ApplySign) {
+  // ApplyPositive(0) = 0
+  EXPECT_EQ(ApplySign(0, 0), 0);
+  // ApplyNegative(0) = 0
+  EXPECT_EQ(ApplySign(0, -1), 0);
+
+  // ApplyPositive(1) = 1
+  EXPECT_EQ(ApplySign(1, 0), 1);
+  // ApplyNegative(1) = -1
+  EXPECT_EQ(ApplySign(1, -1), -1);
+
+  // ApplyPositive(-1) = -1
+  EXPECT_EQ(ApplySign(-1, 0), -1);
+  // ApplyNegative(-1) = 1
+  EXPECT_EQ(ApplySign(-1, -1), 1);
+
+  // ApplyPositive(1234) = 1234
+  EXPECT_EQ(ApplySign(1234, 0), 1234);
+  // ApplyNegative(1234) = -1234
+  EXPECT_EQ(ApplySign(1234, -1), -1234);
+
+  // ApplyPositive(-1234) = -1234
+  EXPECT_EQ(ApplySign(-1234, 0), -1234);
+  // ApplyNegative(-1234) = 1234
+  EXPECT_EQ(ApplySign(-1234, -1), 1234);
+}
+
+// 7.9.3. (without the clamp for numerator and denominator).
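+// An added, hand-worked illustration (not part of the original source): with
+// mv = 100, numerator = 2, and denominator = 8, the lookup table gives
+// kProjectionMvDivisionLookup[8] = 2048, so value = 100 * 2 * 2048 = 409600
+// and (409600 + (1 << 13)) >> 14 = 25.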
+int SpecGetMvProjectionKernel(int mv, int numerator, int denominator) {
+  int value = mv * numerator * kProjectionMvDivisionLookup[denominator];
+  if (value >= 0) {
+    value += 1 << 13;
+    value >>= 14;
+  } else {
+    value = -value;
+    value += 1 << 13;
+    value >>= 14;
+    value = -value;
+  }
+  if (value < (-(1 << 14) + 1)) value = -(1 << 14) + 1;
+  if (value > (1 << 14) - 1) value = (1 << 14) - 1;
+  return value;
+}
+
+void SpecGetMvProjectionNoClamp(const MotionVector& mv, int numerator,
+                                int denominator, MotionVector* projection_mv) {
+  for (int i = 0; i < 2; ++i) {
+    projection_mv->mv[i] =
+        SpecGetMvProjectionKernel(mv.mv[i], numerator, denominator);
+  }
+}
+
+TEST(BitstreamUtils, GetMvProjection) {
+  const int16_t mvs[5][2] = {
+      {0, 0}, {11, 73}, {-84, 272}, {733, -827}, {-472, -697}};
+  for (auto& mv_value : mvs) {
+    for (int numerator = -kMaxFrameDistance; numerator <= kMaxFrameDistance;
+         ++numerator) {
+      for (int denominator = 0; denominator <= kMaxFrameDistance;
+           ++denominator) {
+        MotionVector mv, projection_mv, spec_projection_mv;
+        mv.mv[0] = mv_value[0];
+        mv.mv[1] = mv_value[1];
+        GetMvProjection(mv, numerator, kProjectionMvDivisionLookup[denominator],
+                        &projection_mv);
+        SpecGetMvProjectionNoClamp(mv, numerator, denominator,
+                                   &spec_projection_mv);
+        EXPECT_EQ(projection_mv.mv32, spec_projection_mv.mv32);
+      }
+    }
+  }
+}
+
+// 7.9.4.
+int SpecProject(int value, int delta, int dst_sign) {
+  constexpr int kMiSizeLog2 = 2;
+  const int sign = (dst_sign == 0) ? 1 : dst_sign;
+  int offset;
+  if (delta >= 0) {
+    offset = delta >> (3 + 1 + kMiSizeLog2);
+  } else {
+    offset = -((-delta) >> (3 + 1 + kMiSizeLog2));
+  }
+  return value + sign * offset;
+}
+
+TEST(BitstreamUtils, Project) {
+  for (int value = -10; value <= 10; ++value) {
+    for (int delta = -256; delta <= 256; ++delta) {
+      for (int dst_sign = -1; dst_sign <= 0; ++dst_sign) {
+        EXPECT_EQ(Project(value, delta, dst_sign),
+                  SpecProject(value, delta, dst_sign));
+      }
+    }
+  }
+}
+
+TEST(BitstreamUtils, IsBlockSmallerThan8x8) {
+  static constexpr bool is_block_smaller_than8x8[kMaxBlockSizes] = {
+      true,  true,  false, true,  false, false, false, false,
+      false, false, false, false, false, false, false, false,
+      false, false, false, false, false, false,
+  };
+  for (int i = 0; i < kMaxBlockSizes; ++i) {
+    EXPECT_EQ(IsBlockSmallerThan8x8(static_cast<BlockSize>(i)),
+              is_block_smaller_than8x8[i])
+        << "Mismatch at index " << i;
+  }
+}
+
+TEST(BitstreamUtils, TransformSizeToSquareTransformIndex) {
+  EXPECT_EQ(TransformSizeToSquareTransformIndex(kTransformSize4x4), 0);
+  EXPECT_EQ(TransformSizeToSquareTransformIndex(kTransformSize8x8), 1);
+  EXPECT_EQ(TransformSizeToSquareTransformIndex(kTransformSize16x16), 2);
+  EXPECT_EQ(TransformSizeToSquareTransformIndex(kTransformSize32x32), 3);
+  EXPECT_EQ(TransformSizeToSquareTransformIndex(kTransformSize64x64), 4);
+}
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/utils/compiler_attributes.h b/src/utils/compiler_attributes.h
new file mode 100644
index 0000000..09f0035
--- /dev/null
+++ b/src/utils/compiler_attributes.h
@@ -0,0 +1,181 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_COMPILER_ATTRIBUTES_H_
+#define LIBGAV1_SRC_UTILS_COMPILER_ATTRIBUTES_H_
+
+// A collection of compiler attribute checks and defines to control for
+// compatibility across toolchains.
+
+//------------------------------------------------------------------------------
+// Language version, attribute and feature helpers.
+
+// Detect c++17 support. Visual Studio sets __cplusplus to 199711L by default
+// unless compiled with /Zc:__cplusplus, use the value controlled by /std
+// instead.
+// https://docs.microsoft.com/en-us/cpp/build/reference/zc-cplusplus
+#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)
+#define LIBGAV1_CXX17 1
+#else
+#define LIBGAV1_CXX17 0
+#endif
+
+#if defined(__has_attribute)
+#define LIBGAV1_HAS_ATTRIBUTE __has_attribute
+#else
+#define LIBGAV1_HAS_ATTRIBUTE(x) 0
+#endif
+
+#if defined(__has_feature)
+#define LIBGAV1_HAS_FEATURE __has_feature
+#else
+#define LIBGAV1_HAS_FEATURE(x) 0
+#endif
+
+//------------------------------------------------------------------------------
+// Sanitizer attributes.
+
+#if LIBGAV1_HAS_FEATURE(address_sanitizer) || defined(__SANITIZE_ADDRESS__)
+#define LIBGAV1_ASAN 1
+#else
+#define LIBGAV1_ASAN 0
+#endif
+
+#if LIBGAV1_HAS_FEATURE(memory_sanitizer)
+#define LIBGAV1_MSAN 1
+#else
+#define LIBGAV1_MSAN 0
+#endif
+
+#if LIBGAV1_HAS_FEATURE(thread_sanitizer) || defined(__SANITIZE_THREAD__)
+#define LIBGAV1_TSAN 1
+#else
+#define LIBGAV1_TSAN 0
+#endif
+
+//------------------------------------------------------------------------------
+// AddressSanitizer support.
+
+// Define the macros for AddressSanitizer manual memory poisoning. See
+// https://github.com/google/sanitizers/wiki/AddressSanitizerManualPoisoning.
+#if LIBGAV1_ASAN
+#include <sanitizer/asan_interface.h>
+#else
+#define ASAN_POISON_MEMORY_REGION(addr, size) \
+  (static_cast<void>(addr), static_cast<void>(size))
+#define ASAN_UNPOISON_MEMORY_REGION(addr, size) \
+  (static_cast<void>(addr), static_cast<void>(size))
+#endif
+
+//------------------------------------------------------------------------------
+// Function attributes.
+// GCC: https://gcc.gnu.org/onlinedocs/gcc/Function-Attributes.html
+// Clang: https://clang.llvm.org/docs/AttributeReference.html
+
+#if defined(__GNUC__)
+#define LIBGAV1_ALWAYS_INLINE __attribute__((always_inline)) inline
+#elif defined(_MSC_VER)
+#define LIBGAV1_ALWAYS_INLINE __forceinline
+#else
+#define LIBGAV1_ALWAYS_INLINE inline
+#endif
+
+// LIBGAV1_MUST_USE_RESULT
+//
+// Tells the compiler to warn about unused results.
+//
+// When annotating a function, it must appear as the first part of the
+// declaration or definition. The compiler will warn if the return value from
+// such a function is unused:
+//
+//   LIBGAV1_MUST_USE_RESULT Sprocket* AllocateSprocket();
+//   AllocateSprocket();  // Triggers a warning.
+//
+// When annotating a class, it is equivalent to annotating every function which
+// returns an instance.
+//
+//   class LIBGAV1_MUST_USE_RESULT Sprocket {};
+//   Sprocket();  // Triggers a warning.
+//
+//   Sprocket MakeSprocket();
+//   MakeSprocket();  // Triggers a warning.
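+//
+// (An added usage note, not in the original source: with either definition, a
+// caller can suppress the warning explicitly by discarding the value, e.g.
+// static_cast<void>(MakeSprocket());.)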
+//
+// Note that references and pointers are not instances:
+//
+//   Sprocket* SprocketPointer();
+//   SprocketPointer();  // Does *not* trigger a warning.
+//
+// LIBGAV1_MUST_USE_RESULT allows using cast-to-void to suppress the unused
+// result warning. For that, warn_unused_result is used only for clang but not
+// for gcc. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66425
+#if LIBGAV1_HAS_ATTRIBUTE(nodiscard)
+#define LIBGAV1_MUST_USE_RESULT [[nodiscard]]
+#elif defined(__clang__) && LIBGAV1_HAS_ATTRIBUTE(warn_unused_result)
+#define LIBGAV1_MUST_USE_RESULT __attribute__((warn_unused_result))
+#else
+#define LIBGAV1_MUST_USE_RESULT
+#endif
+
+// LIBGAV1_PRINTF_ATTRIBUTE
+//
+// Tells the compiler to perform `printf` format string checking if the
+// compiler supports it; see the 'format' attribute in
+// <https://gcc.gnu.org/onlinedocs/gcc/Function-Attributes.html>.
+//
+// Note: As the GCC manual states, "[s]ince non-static C++ methods
+// have an implicit 'this' argument, the arguments of such methods
+// should be counted from two, not one."
+#if LIBGAV1_HAS_ATTRIBUTE(format) || (defined(__GNUC__) && !defined(__clang__))
+#define LIBGAV1_PRINTF_ATTRIBUTE(string_index, first_to_check) \
+  __attribute__((__format__(__printf__, string_index, first_to_check)))
+#else
+#define LIBGAV1_PRINTF_ATTRIBUTE(string_index, first_to_check)
+#endif
+
+//------------------------------------------------------------------------------
+// Thread annotations.
+
+// LIBGAV1_GUARDED_BY()
+//
+// Documents if a shared field or global variable needs to be protected by a
+// mutex. LIBGAV1_GUARDED_BY() allows the user to specify a particular mutex
+// that should be held when accessing the annotated variable.
+//
+// Although this annotation cannot be applied to local variables, a local
+// variable and its associated mutex can often be combined into a small class
+// or struct, thereby allowing the annotation.
+//
+// Example:
+//
+//   class Foo {
+//     Mutex mu_;
+//     int p1_ LIBGAV1_GUARDED_BY(mu_);
+//     ...
+//   };
+// TODO(b/133245043): this can be reenabled after a local MutexLock
+// implementation is added with proper thread annotations.
+#if 0  // LIBGAV1_HAS_ATTRIBUTE(guarded_by)
+#define LIBGAV1_GUARDED_BY(x) __attribute__((guarded_by(x)))
+#else
+#define LIBGAV1_GUARDED_BY(x)
+#endif
+
+//------------------------------------------------------------------------------
+
+#undef LIBGAV1_HAS_ATTRIBUTE
+#undef LIBGAV1_HAS_FEATURE
+
+#endif  // LIBGAV1_SRC_UTILS_COMPILER_ATTRIBUTES_H_
diff --git a/src/utils/constants.cc b/src/utils/constants.cc
new file mode 100644
index 0000000..80d7acb
--- /dev/null
+++ b/src/utils/constants.cc
@@ -0,0 +1,874 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "src/utils/constants.h" + +namespace libgav1 { + +const uint8_t k4x4WidthLog2[kMaxBlockSizes] = {0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, + 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5}; + +const uint8_t k4x4HeightLog2[kMaxBlockSizes] = { + 0, 1, 2, 0, 1, 2, 3, 0, 1, 2, 3, 4, 1, 2, 3, 4, 2, 3, 4, 5, 4, 5}; + +const uint8_t kNum4x4BlocksWide[kMaxBlockSizes] = { + 1, 1, 1, 2, 2, 2, 2, 4, 4, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, 32, 32}; + +const uint8_t kNum4x4BlocksHigh[kMaxBlockSizes] = { + 1, 2, 4, 1, 2, 4, 8, 1, 2, 4, 8, 16, 2, 4, 8, 16, 4, 8, 16, 32, 16, 32}; + +const uint8_t kBlockWidthPixels[kMaxBlockSizes] = { + 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, + 16, 32, 32, 32, 32, 64, 64, 64, 64, 128, 128}; + +const uint8_t kBlockHeightPixels[kMaxBlockSizes] = { + 4, 8, 16, 4, 8, 16, 32, 4, 8, 16, 32, + 64, 8, 16, 32, 64, 16, 32, 64, 128, 64, 128}; + +// 9.3 -- Partition_Subsize[] +const BlockSize kSubSize[kMaxPartitionTypes][kMaxBlockSizes] = { + // kPartitionNone + {kBlock4x4, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x8, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x16, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x32, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock64x64, kBlockInvalid, + kBlockInvalid, kBlock128x128}, + // kPartitionHorizontal + {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x4, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x8, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x16, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock64x32, kBlockInvalid, + kBlockInvalid, kBlock128x64}, + // kPartitionVertical + {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock4x8, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x16, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x32, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x64, kBlockInvalid, + kBlockInvalid, kBlock64x128}, + // kPartitionSplit + {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock4x4, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x8, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x16, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x32, kBlockInvalid, + kBlockInvalid, kBlock64x64}, + // kPartitionHorizontalWithTopSplit + {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x4, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x8, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x16, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock64x32, kBlockInvalid, + kBlockInvalid, kBlock128x64}, + // kPartitionHorizontalWithBottomSplit + {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x4, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x8, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x16, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock64x32, kBlockInvalid, + kBlockInvalid, kBlock128x64}, + // kPartitionVerticalWithLeftSplit + {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock4x8, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x16, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x32, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x64, kBlockInvalid, + kBlockInvalid, kBlock64x128}, + // kPartitionVerticalWithRightSplit + {kBlockInvalid, kBlockInvalid, kBlockInvalid, 
kBlockInvalid, kBlock4x8, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x16, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x32, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x64, kBlockInvalid, + kBlockInvalid, kBlock64x128}, + // kPartitionHorizontal4 + {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x4, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x8, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock64x16, kBlockInvalid, + kBlockInvalid, kBlockInvalid}, + // kPartitionVertical4 + {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock4x16, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x32, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x64, kBlockInvalid, + kBlockInvalid, kBlockInvalid}}; + +// 5.11.38 (implemented as a simple look up. first dimension is block size, +// second and third are subsampling_x and subsampling_y). +const BlockSize kPlaneResidualSize[kMaxBlockSizes][2][2] = { + {{kBlock4x4, kBlock4x4}, {kBlock4x4, kBlock4x4}}, + {{kBlock4x8, kBlock4x4}, {kBlockInvalid, kBlock4x4}}, + {{kBlock4x16, kBlock4x8}, {kBlockInvalid, kBlock4x8}}, + {{kBlock8x4, kBlockInvalid}, {kBlock4x4, kBlock4x4}}, + {{kBlock8x8, kBlock8x4}, {kBlock4x8, kBlock4x4}}, + {{kBlock8x16, kBlock8x8}, {kBlockInvalid, kBlock4x8}}, + {{kBlock8x32, kBlock8x16}, {kBlockInvalid, kBlock4x16}}, + {{kBlock16x4, kBlockInvalid}, {kBlock8x4, kBlock8x4}}, + {{kBlock16x8, kBlockInvalid}, {kBlock8x8, kBlock8x4}}, + {{kBlock16x16, kBlock16x8}, {kBlock8x16, kBlock8x8}}, + {{kBlock16x32, kBlock16x16}, {kBlockInvalid, kBlock8x16}}, + {{kBlock16x64, kBlock16x32}, {kBlockInvalid, kBlock8x32}}, + {{kBlock32x8, kBlockInvalid}, {kBlock16x8, kBlock16x4}}, + {{kBlock32x16, kBlockInvalid}, {kBlock16x16, kBlock16x8}}, + {{kBlock32x32, kBlock32x16}, {kBlock16x32, kBlock16x16}}, + {{kBlock32x64, kBlock32x32}, {kBlockInvalid, kBlock16x32}}, + {{kBlock64x16, kBlockInvalid}, {kBlock32x16, kBlock32x8}}, + {{kBlock64x32, kBlockInvalid}, {kBlock32x32, kBlock32x16}}, + {{kBlock64x64, kBlock64x32}, {kBlock32x64, kBlock32x32}}, + {{kBlock64x128, kBlock64x64}, {kBlockInvalid, kBlock32x64}}, + {{kBlock128x64, kBlockInvalid}, {kBlock64x64, kBlock64x32}}, + {{kBlock128x128, kBlock128x64}, {kBlock64x128, kBlock64x64}}}; + +const int16_t kProjectionMvDivisionLookup[kMaxFrameDistance + 1] = { + 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340, 2048, 1820, 1638, + 1489, 1365, 1260, 1170, 1092, 1024, 963, 910, 862, 819, 780, + 744, 712, 682, 655, 630, 606, 585, 564, 546, 528}; + +const uint8_t kTransformWidth[kNumTransformSizes] = { + 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, 16, 32, 32, 32, 32, 64, 64, 64}; + +const uint8_t kTransformHeight[kNumTransformSizes] = { + 4, 8, 16, 4, 8, 16, 32, 4, 8, 16, 32, 64, 8, 16, 32, 64, 16, 32, 64}; + +const uint8_t kTransformWidth4x4[kNumTransformSizes] = { + 1, 1, 1, 2, 2, 2, 2, 4, 4, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16}; + +const uint8_t kTransformHeight4x4[kNumTransformSizes] = { + 1, 2, 4, 1, 2, 4, 8, 1, 2, 4, 8, 16, 2, 4, 8, 16, 4, 8, 16}; + +const uint8_t kTransformWidthLog2[kNumTransformSizes] = { + 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6}; + +const uint8_t kTransformHeightLog2[kNumTransformSizes] = { + 2, 3, 4, 2, 3, 4, 5, 2, 3, 4, 5, 6, 3, 4, 5, 6, 4, 5, 6}; + +// 9.3 -- Split_Tx_Size[] +const TransformSize 
kSplitTransformSize[kNumTransformSizes] = { + kTransformSize4x4, kTransformSize4x4, kTransformSize4x8, + kTransformSize4x4, kTransformSize4x4, kTransformSize8x8, + kTransformSize8x16, kTransformSize8x4, kTransformSize8x8, + kTransformSize8x8, kTransformSize16x16, kTransformSize16x32, + kTransformSize16x8, kTransformSize16x16, kTransformSize16x16, + kTransformSize32x32, kTransformSize32x16, kTransformSize32x32, + kTransformSize32x32}; + +// Square transform of size min(w,h). +const TransformSize kTransformSizeSquareMin[kNumTransformSizes] = { + kTransformSize4x4, kTransformSize4x4, kTransformSize4x4, + kTransformSize4x4, kTransformSize8x8, kTransformSize8x8, + kTransformSize8x8, kTransformSize4x4, kTransformSize8x8, + kTransformSize16x16, kTransformSize16x16, kTransformSize16x16, + kTransformSize8x8, kTransformSize16x16, kTransformSize32x32, + kTransformSize32x32, kTransformSize16x16, kTransformSize32x32, + kTransformSize64x64}; + +// Square transform of size max(w,h). +const TransformSize kTransformSizeSquareMax[kNumTransformSizes] = { + kTransformSize4x4, kTransformSize8x8, kTransformSize16x16, + kTransformSize8x8, kTransformSize8x8, kTransformSize16x16, + kTransformSize32x32, kTransformSize16x16, kTransformSize16x16, + kTransformSize16x16, kTransformSize32x32, kTransformSize64x64, + kTransformSize32x32, kTransformSize32x32, kTransformSize32x32, + kTransformSize64x64, kTransformSize64x64, kTransformSize64x64, + kTransformSize64x64}; + +const uint8_t kNumTransformTypesInSet[kNumTransformSets] = {1, 7, 5, 16, 12, 2}; + +const uint8_t kSgrProjParams[1 << kSgrProjParamsBits][4] = { + {2, 12, 1, 4}, {2, 15, 1, 6}, {2, 18, 1, 8}, {2, 21, 1, 9}, + {2, 24, 1, 10}, {2, 29, 1, 11}, {2, 36, 1, 12}, {2, 45, 1, 13}, + {2, 56, 1, 14}, {2, 68, 1, 15}, {0, 0, 1, 5}, {0, 0, 1, 8}, + {0, 0, 1, 11}, {0, 0, 1, 14}, {2, 30, 0, 0}, {2, 75, 0, 0}}; + +const int8_t kSgrProjMultiplierMin[2] = {-96, -32}; + +const int8_t kSgrProjMultiplierMax[2] = {31, 95}; + +const int8_t kWienerTapsMin[3] = {-5, -23, -17}; + +const int8_t kWienerTapsMax[3] = {10, 8, 46}; + +// This was modified from Upscale_Filter as defined in AV1 Section 7.16, in +// order to support 16-bit packed NEON operations. 
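+// (An added observation, not in the original source: with the signs below
+// applied, each row of taps sums to 128, i.e. 1 << kFilterBits, so the
+// upscale filter stays normalized to 7-bit precision.)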
+// The sign of each tap is: - + - + + - + - +alignas(16) const uint8_t + kUpscaleFilterUnsigned[kSuperResFilterShifts][kSuperResFilterTaps] = { + {0, 0, 0, 128, 0, 0, 0, 0}, {0, 0, 1, 128, 2, 1, 0, 0}, + {0, 1, 3, 127, 4, 2, 1, 0}, {0, 1, 4, 127, 6, 3, 1, 0}, + {0, 2, 6, 126, 8, 3, 1, 0}, {0, 2, 7, 125, 11, 4, 1, 0}, + {1, 2, 8, 125, 13, 5, 2, 0}, {1, 3, 9, 124, 15, 6, 2, 0}, + {1, 3, 10, 123, 18, 6, 2, 1}, {1, 3, 11, 122, 20, 7, 3, 1}, + {1, 4, 12, 121, 22, 8, 3, 1}, {1, 4, 13, 120, 25, 9, 3, 1}, + {1, 4, 14, 118, 28, 9, 3, 1}, {1, 4, 15, 117, 30, 10, 4, 1}, + {1, 5, 16, 116, 32, 11, 4, 1}, {1, 5, 16, 114, 35, 12, 4, 1}, + {1, 5, 17, 112, 38, 12, 4, 1}, {1, 5, 18, 111, 40, 13, 5, 1}, + {1, 5, 18, 109, 43, 14, 5, 1}, {1, 6, 19, 107, 45, 14, 5, 1}, + {1, 6, 19, 105, 48, 15, 5, 1}, {1, 6, 19, 103, 51, 16, 5, 1}, + {1, 6, 20, 101, 53, 16, 6, 1}, {1, 6, 20, 99, 56, 17, 6, 1}, + {1, 6, 20, 97, 58, 17, 6, 1}, {1, 6, 20, 95, 61, 18, 6, 1}, + {2, 7, 20, 93, 64, 18, 6, 2}, {2, 7, 20, 91, 66, 19, 6, 1}, + {2, 7, 20, 88, 69, 19, 6, 1}, {2, 7, 20, 86, 71, 19, 6, 1}, + {2, 7, 20, 84, 74, 20, 7, 2}, {2, 7, 20, 81, 76, 20, 7, 1}, + {2, 7, 20, 79, 79, 20, 7, 2}, {1, 7, 20, 76, 81, 20, 7, 2}, + {2, 7, 20, 74, 84, 20, 7, 2}, {1, 6, 19, 71, 86, 20, 7, 2}, + {1, 6, 19, 69, 88, 20, 7, 2}, {1, 6, 19, 66, 91, 20, 7, 2}, + {2, 6, 18, 64, 93, 20, 7, 2}, {1, 6, 18, 61, 95, 20, 6, 1}, + {1, 6, 17, 58, 97, 20, 6, 1}, {1, 6, 17, 56, 99, 20, 6, 1}, + {1, 6, 16, 53, 101, 20, 6, 1}, {1, 5, 16, 51, 103, 19, 6, 1}, + {1, 5, 15, 48, 105, 19, 6, 1}, {1, 5, 14, 45, 107, 19, 6, 1}, + {1, 5, 14, 43, 109, 18, 5, 1}, {1, 5, 13, 40, 111, 18, 5, 1}, + {1, 4, 12, 38, 112, 17, 5, 1}, {1, 4, 12, 35, 114, 16, 5, 1}, + {1, 4, 11, 32, 116, 16, 5, 1}, {1, 4, 10, 30, 117, 15, 4, 1}, + {1, 3, 9, 28, 118, 14, 4, 1}, {1, 3, 9, 25, 120, 13, 4, 1}, + {1, 3, 8, 22, 121, 12, 4, 1}, {1, 3, 7, 20, 122, 11, 3, 1}, + {1, 2, 6, 18, 123, 10, 3, 1}, {0, 2, 6, 15, 124, 9, 3, 1}, + {0, 2, 5, 13, 125, 8, 2, 1}, {0, 1, 4, 11, 125, 7, 2, 0}, + {0, 1, 3, 8, 126, 6, 2, 0}, {0, 1, 3, 6, 127, 4, 1, 0}, + {0, 1, 2, 4, 127, 3, 1, 0}, {0, 0, 1, 2, 128, 1, 0, 0}, +}; + +alignas(8) const int8_t + kWarpedFilters8[3 * kWarpedPixelPrecisionShifts + 1][8] = { + // [-1, 0). 
+ {0, 0, 127, 1, 0, 0, 0, 0}, + {0, -1, 127, 2, 0, 0, 0, 0}, + {1, -3, 127, 4, -1, 0, 0, 0}, + {1, -4, 126, 6, -2, 1, 0, 0}, + {1, -5, 126, 8, -3, 1, 0, 0}, + {1, -6, 125, 11, -4, 1, 0, 0}, + {1, -7, 124, 13, -4, 1, 0, 0}, + {2, -8, 123, 15, -5, 1, 0, 0}, + {2, -9, 122, 18, -6, 1, 0, 0}, + {2, -10, 121, 20, -6, 1, 0, 0}, + {2, -11, 120, 22, -7, 2, 0, 0}, + {2, -12, 119, 25, -8, 2, 0, 0}, + {3, -13, 117, 27, -8, 2, 0, 0}, + {3, -13, 116, 29, -9, 2, 0, 0}, + {3, -14, 114, 32, -10, 3, 0, 0}, + {3, -15, 113, 35, -10, 2, 0, 0}, + {3, -15, 111, 37, -11, 3, 0, 0}, + {3, -16, 109, 40, -11, 3, 0, 0}, + {3, -16, 108, 42, -12, 3, 0, 0}, + {4, -17, 106, 45, -13, 3, 0, 0}, + {4, -17, 104, 47, -13, 3, 0, 0}, + {4, -17, 102, 50, -14, 3, 0, 0}, + {4, -17, 100, 52, -14, 3, 0, 0}, + {4, -18, 98, 55, -15, 4, 0, 0}, + {4, -18, 96, 58, -15, 3, 0, 0}, + {4, -18, 94, 60, -16, 4, 0, 0}, + {4, -18, 91, 63, -16, 4, 0, 0}, + {4, -18, 89, 65, -16, 4, 0, 0}, + {4, -18, 87, 68, -17, 4, 0, 0}, + {4, -18, 85, 70, -17, 4, 0, 0}, + {4, -18, 82, 73, -17, 4, 0, 0}, + {4, -18, 80, 75, -17, 4, 0, 0}, + {4, -18, 78, 78, -18, 4, 0, 0}, + {4, -17, 75, 80, -18, 4, 0, 0}, + {4, -17, 73, 82, -18, 4, 0, 0}, + {4, -17, 70, 85, -18, 4, 0, 0}, + {4, -17, 68, 87, -18, 4, 0, 0}, + {4, -16, 65, 89, -18, 4, 0, 0}, + {4, -16, 63, 91, -18, 4, 0, 0}, + {4, -16, 60, 94, -18, 4, 0, 0}, + {3, -15, 58, 96, -18, 4, 0, 0}, + {4, -15, 55, 98, -18, 4, 0, 0}, + {3, -14, 52, 100, -17, 4, 0, 0}, + {3, -14, 50, 102, -17, 4, 0, 0}, + {3, -13, 47, 104, -17, 4, 0, 0}, + {3, -13, 45, 106, -17, 4, 0, 0}, + {3, -12, 42, 108, -16, 3, 0, 0}, + {3, -11, 40, 109, -16, 3, 0, 0}, + {3, -11, 37, 111, -15, 3, 0, 0}, + {2, -10, 35, 113, -15, 3, 0, 0}, + {3, -10, 32, 114, -14, 3, 0, 0}, + {2, -9, 29, 116, -13, 3, 0, 0}, + {2, -8, 27, 117, -13, 3, 0, 0}, + {2, -8, 25, 119, -12, 2, 0, 0}, + {2, -7, 22, 120, -11, 2, 0, 0}, + {1, -6, 20, 121, -10, 2, 0, 0}, + {1, -6, 18, 122, -9, 2, 0, 0}, + {1, -5, 15, 123, -8, 2, 0, 0}, + {1, -4, 13, 124, -7, 1, 0, 0}, + {1, -4, 11, 125, -6, 1, 0, 0}, + {1, -3, 8, 126, -5, 1, 0, 0}, + {1, -2, 6, 126, -4, 1, 0, 0}, + {0, -1, 4, 127, -3, 1, 0, 0}, + {0, 0, 2, 127, -1, 0, 0, 0}, + // [0, 1). 
+ {0, 0, 0, 127, 1, 0, 0, 0}, + {0, 0, -1, 127, 2, 0, 0, 0}, + {0, 1, -3, 127, 4, -2, 1, 0}, + {0, 1, -5, 127, 6, -2, 1, 0}, + {0, 2, -6, 126, 8, -3, 1, 0}, + {-1, 2, -7, 126, 11, -4, 2, -1}, + {-1, 3, -8, 125, 13, -5, 2, -1}, + {-1, 3, -10, 124, 16, -6, 3, -1}, + {-1, 4, -11, 123, 18, -7, 3, -1}, + {-1, 4, -12, 122, 20, -7, 3, -1}, + {-1, 4, -13, 121, 23, -8, 3, -1}, + {-2, 5, -14, 120, 25, -9, 4, -1}, + {-1, 5, -15, 119, 27, -10, 4, -1}, + {-1, 5, -16, 118, 30, -11, 4, -1}, + {-2, 6, -17, 116, 33, -12, 5, -1}, + {-2, 6, -17, 114, 35, -12, 5, -1}, + {-2, 6, -18, 113, 38, -13, 5, -1}, + {-2, 7, -19, 111, 41, -14, 6, -2}, + {-2, 7, -19, 110, 43, -15, 6, -2}, + {-2, 7, -20, 108, 46, -15, 6, -2}, + {-2, 7, -20, 106, 49, -16, 6, -2}, + {-2, 7, -21, 104, 51, -16, 7, -2}, + {-2, 7, -21, 102, 54, -17, 7, -2}, + {-2, 8, -21, 100, 56, -18, 7, -2}, + {-2, 8, -22, 98, 59, -18, 7, -2}, + {-2, 8, -22, 96, 62, -19, 7, -2}, + {-2, 8, -22, 94, 64, -19, 7, -2}, + {-2, 8, -22, 91, 67, -20, 8, -2}, + {-2, 8, -22, 89, 69, -20, 8, -2}, + {-2, 8, -22, 87, 72, -21, 8, -2}, + {-2, 8, -21, 84, 74, -21, 8, -2}, + {-2, 8, -22, 82, 77, -21, 8, -2}, + {-2, 8, -21, 79, 79, -21, 8, -2}, + {-2, 8, -21, 77, 82, -22, 8, -2}, + {-2, 8, -21, 74, 84, -21, 8, -2}, + {-2, 8, -21, 72, 87, -22, 8, -2}, + {-2, 8, -20, 69, 89, -22, 8, -2}, + {-2, 8, -20, 67, 91, -22, 8, -2}, + {-2, 7, -19, 64, 94, -22, 8, -2}, + {-2, 7, -19, 62, 96, -22, 8, -2}, + {-2, 7, -18, 59, 98, -22, 8, -2}, + {-2, 7, -18, 56, 100, -21, 8, -2}, + {-2, 7, -17, 54, 102, -21, 7, -2}, + {-2, 7, -16, 51, 104, -21, 7, -2}, + {-2, 6, -16, 49, 106, -20, 7, -2}, + {-2, 6, -15, 46, 108, -20, 7, -2}, + {-2, 6, -15, 43, 110, -19, 7, -2}, + {-2, 6, -14, 41, 111, -19, 7, -2}, + {-1, 5, -13, 38, 113, -18, 6, -2}, + {-1, 5, -12, 35, 114, -17, 6, -2}, + {-1, 5, -12, 33, 116, -17, 6, -2}, + {-1, 4, -11, 30, 118, -16, 5, -1}, + {-1, 4, -10, 27, 119, -15, 5, -1}, + {-1, 4, -9, 25, 120, -14, 5, -2}, + {-1, 3, -8, 23, 121, -13, 4, -1}, + {-1, 3, -7, 20, 122, -12, 4, -1}, + {-1, 3, -7, 18, 123, -11, 4, -1}, + {-1, 3, -6, 16, 124, -10, 3, -1}, + {-1, 2, -5, 13, 125, -8, 3, -1}, + {-1, 2, -4, 11, 126, -7, 2, -1}, + {0, 1, -3, 8, 126, -6, 2, 0}, + {0, 1, -2, 6, 127, -5, 1, 0}, + {0, 1, -2, 4, 127, -3, 1, 0}, + {0, 0, 0, 2, 127, -1, 0, 0}, + // [1, 2). 
+ {0, 0, 0, 1, 127, 0, 0, 0}, + {0, 0, 0, -1, 127, 2, 0, 0}, + {0, 0, 1, -3, 127, 4, -1, 0}, + {0, 0, 1, -4, 126, 6, -2, 1}, + {0, 0, 1, -5, 126, 8, -3, 1}, + {0, 0, 1, -6, 125, 11, -4, 1}, + {0, 0, 1, -7, 124, 13, -4, 1}, + {0, 0, 2, -8, 123, 15, -5, 1}, + {0, 0, 2, -9, 122, 18, -6, 1}, + {0, 0, 2, -10, 121, 20, -6, 1}, + {0, 0, 2, -11, 120, 22, -7, 2}, + {0, 0, 2, -12, 119, 25, -8, 2}, + {0, 0, 3, -13, 117, 27, -8, 2}, + {0, 0, 3, -13, 116, 29, -9, 2}, + {0, 0, 3, -14, 114, 32, -10, 3}, + {0, 0, 3, -15, 113, 35, -10, 2}, + {0, 0, 3, -15, 111, 37, -11, 3}, + {0, 0, 3, -16, 109, 40, -11, 3}, + {0, 0, 3, -16, 108, 42, -12, 3}, + {0, 0, 4, -17, 106, 45, -13, 3}, + {0, 0, 4, -17, 104, 47, -13, 3}, + {0, 0, 4, -17, 102, 50, -14, 3}, + {0, 0, 4, -17, 100, 52, -14, 3}, + {0, 0, 4, -18, 98, 55, -15, 4}, + {0, 0, 4, -18, 96, 58, -15, 3}, + {0, 0, 4, -18, 94, 60, -16, 4}, + {0, 0, 4, -18, 91, 63, -16, 4}, + {0, 0, 4, -18, 89, 65, -16, 4}, + {0, 0, 4, -18, 87, 68, -17, 4}, + {0, 0, 4, -18, 85, 70, -17, 4}, + {0, 0, 4, -18, 82, 73, -17, 4}, + {0, 0, 4, -18, 80, 75, -17, 4}, + {0, 0, 4, -18, 78, 78, -18, 4}, + {0, 0, 4, -17, 75, 80, -18, 4}, + {0, 0, 4, -17, 73, 82, -18, 4}, + {0, 0, 4, -17, 70, 85, -18, 4}, + {0, 0, 4, -17, 68, 87, -18, 4}, + {0, 0, 4, -16, 65, 89, -18, 4}, + {0, 0, 4, -16, 63, 91, -18, 4}, + {0, 0, 4, -16, 60, 94, -18, 4}, + {0, 0, 3, -15, 58, 96, -18, 4}, + {0, 0, 4, -15, 55, 98, -18, 4}, + {0, 0, 3, -14, 52, 100, -17, 4}, + {0, 0, 3, -14, 50, 102, -17, 4}, + {0, 0, 3, -13, 47, 104, -17, 4}, + {0, 0, 3, -13, 45, 106, -17, 4}, + {0, 0, 3, -12, 42, 108, -16, 3}, + {0, 0, 3, -11, 40, 109, -16, 3}, + {0, 0, 3, -11, 37, 111, -15, 3}, + {0, 0, 2, -10, 35, 113, -15, 3}, + {0, 0, 3, -10, 32, 114, -14, 3}, + {0, 0, 2, -9, 29, 116, -13, 3}, + {0, 0, 2, -8, 27, 117, -13, 3}, + {0, 0, 2, -8, 25, 119, -12, 2}, + {0, 0, 2, -7, 22, 120, -11, 2}, + {0, 0, 1, -6, 20, 121, -10, 2}, + {0, 0, 1, -6, 18, 122, -9, 2}, + {0, 0, 1, -5, 15, 123, -8, 2}, + {0, 0, 1, -4, 13, 124, -7, 1}, + {0, 0, 1, -4, 11, 125, -6, 1}, + {0, 0, 1, -3, 8, 126, -5, 1}, + {0, 0, 1, -2, 6, 126, -4, 1}, + {0, 0, 0, -1, 4, 127, -3, 1}, + {0, 0, 0, 0, 2, 127, -1, 0}, + // dummy, replicate row index 191. + {0, 0, 0, 0, 2, 127, -1, 0}}; + +alignas(16) const int16_t + kWarpedFilters[3 * kWarpedPixelPrecisionShifts + 1][8] = { + // [-1, 0). 
+ {0, 0, 127, 1, 0, 0, 0, 0}, + {0, -1, 127, 2, 0, 0, 0, 0}, + {1, -3, 127, 4, -1, 0, 0, 0}, + {1, -4, 126, 6, -2, 1, 0, 0}, + {1, -5, 126, 8, -3, 1, 0, 0}, + {1, -6, 125, 11, -4, 1, 0, 0}, + {1, -7, 124, 13, -4, 1, 0, 0}, + {2, -8, 123, 15, -5, 1, 0, 0}, + {2, -9, 122, 18, -6, 1, 0, 0}, + {2, -10, 121, 20, -6, 1, 0, 0}, + {2, -11, 120, 22, -7, 2, 0, 0}, + {2, -12, 119, 25, -8, 2, 0, 0}, + {3, -13, 117, 27, -8, 2, 0, 0}, + {3, -13, 116, 29, -9, 2, 0, 0}, + {3, -14, 114, 32, -10, 3, 0, 0}, + {3, -15, 113, 35, -10, 2, 0, 0}, + {3, -15, 111, 37, -11, 3, 0, 0}, + {3, -16, 109, 40, -11, 3, 0, 0}, + {3, -16, 108, 42, -12, 3, 0, 0}, + {4, -17, 106, 45, -13, 3, 0, 0}, + {4, -17, 104, 47, -13, 3, 0, 0}, + {4, -17, 102, 50, -14, 3, 0, 0}, + {4, -17, 100, 52, -14, 3, 0, 0}, + {4, -18, 98, 55, -15, 4, 0, 0}, + {4, -18, 96, 58, -15, 3, 0, 0}, + {4, -18, 94, 60, -16, 4, 0, 0}, + {4, -18, 91, 63, -16, 4, 0, 0}, + {4, -18, 89, 65, -16, 4, 0, 0}, + {4, -18, 87, 68, -17, 4, 0, 0}, + {4, -18, 85, 70, -17, 4, 0, 0}, + {4, -18, 82, 73, -17, 4, 0, 0}, + {4, -18, 80, 75, -17, 4, 0, 0}, + {4, -18, 78, 78, -18, 4, 0, 0}, + {4, -17, 75, 80, -18, 4, 0, 0}, + {4, -17, 73, 82, -18, 4, 0, 0}, + {4, -17, 70, 85, -18, 4, 0, 0}, + {4, -17, 68, 87, -18, 4, 0, 0}, + {4, -16, 65, 89, -18, 4, 0, 0}, + {4, -16, 63, 91, -18, 4, 0, 0}, + {4, -16, 60, 94, -18, 4, 0, 0}, + {3, -15, 58, 96, -18, 4, 0, 0}, + {4, -15, 55, 98, -18, 4, 0, 0}, + {3, -14, 52, 100, -17, 4, 0, 0}, + {3, -14, 50, 102, -17, 4, 0, 0}, + {3, -13, 47, 104, -17, 4, 0, 0}, + {3, -13, 45, 106, -17, 4, 0, 0}, + {3, -12, 42, 108, -16, 3, 0, 0}, + {3, -11, 40, 109, -16, 3, 0, 0}, + {3, -11, 37, 111, -15, 3, 0, 0}, + {2, -10, 35, 113, -15, 3, 0, 0}, + {3, -10, 32, 114, -14, 3, 0, 0}, + {2, -9, 29, 116, -13, 3, 0, 0}, + {2, -8, 27, 117, -13, 3, 0, 0}, + {2, -8, 25, 119, -12, 2, 0, 0}, + {2, -7, 22, 120, -11, 2, 0, 0}, + {1, -6, 20, 121, -10, 2, 0, 0}, + {1, -6, 18, 122, -9, 2, 0, 0}, + {1, -5, 15, 123, -8, 2, 0, 0}, + {1, -4, 13, 124, -7, 1, 0, 0}, + {1, -4, 11, 125, -6, 1, 0, 0}, + {1, -3, 8, 126, -5, 1, 0, 0}, + {1, -2, 6, 126, -4, 1, 0, 0}, + {0, -1, 4, 127, -3, 1, 0, 0}, + {0, 0, 2, 127, -1, 0, 0, 0}, + // [0, 1). 
+ {0, 0, 0, 127, 1, 0, 0, 0}, + {0, 0, -1, 127, 2, 0, 0, 0}, + {0, 1, -3, 127, 4, -2, 1, 0}, + {0, 1, -5, 127, 6, -2, 1, 0}, + {0, 2, -6, 126, 8, -3, 1, 0}, + {-1, 2, -7, 126, 11, -4, 2, -1}, + {-1, 3, -8, 125, 13, -5, 2, -1}, + {-1, 3, -10, 124, 16, -6, 3, -1}, + {-1, 4, -11, 123, 18, -7, 3, -1}, + {-1, 4, -12, 122, 20, -7, 3, -1}, + {-1, 4, -13, 121, 23, -8, 3, -1}, + {-2, 5, -14, 120, 25, -9, 4, -1}, + {-1, 5, -15, 119, 27, -10, 4, -1}, + {-1, 5, -16, 118, 30, -11, 4, -1}, + {-2, 6, -17, 116, 33, -12, 5, -1}, + {-2, 6, -17, 114, 35, -12, 5, -1}, + {-2, 6, -18, 113, 38, -13, 5, -1}, + {-2, 7, -19, 111, 41, -14, 6, -2}, + {-2, 7, -19, 110, 43, -15, 6, -2}, + {-2, 7, -20, 108, 46, -15, 6, -2}, + {-2, 7, -20, 106, 49, -16, 6, -2}, + {-2, 7, -21, 104, 51, -16, 7, -2}, + {-2, 7, -21, 102, 54, -17, 7, -2}, + {-2, 8, -21, 100, 56, -18, 7, -2}, + {-2, 8, -22, 98, 59, -18, 7, -2}, + {-2, 8, -22, 96, 62, -19, 7, -2}, + {-2, 8, -22, 94, 64, -19, 7, -2}, + {-2, 8, -22, 91, 67, -20, 8, -2}, + {-2, 8, -22, 89, 69, -20, 8, -2}, + {-2, 8, -22, 87, 72, -21, 8, -2}, + {-2, 8, -21, 84, 74, -21, 8, -2}, + {-2, 8, -22, 82, 77, -21, 8, -2}, + {-2, 8, -21, 79, 79, -21, 8, -2}, + {-2, 8, -21, 77, 82, -22, 8, -2}, + {-2, 8, -21, 74, 84, -21, 8, -2}, + {-2, 8, -21, 72, 87, -22, 8, -2}, + {-2, 8, -20, 69, 89, -22, 8, -2}, + {-2, 8, -20, 67, 91, -22, 8, -2}, + {-2, 7, -19, 64, 94, -22, 8, -2}, + {-2, 7, -19, 62, 96, -22, 8, -2}, + {-2, 7, -18, 59, 98, -22, 8, -2}, + {-2, 7, -18, 56, 100, -21, 8, -2}, + {-2, 7, -17, 54, 102, -21, 7, -2}, + {-2, 7, -16, 51, 104, -21, 7, -2}, + {-2, 6, -16, 49, 106, -20, 7, -2}, + {-2, 6, -15, 46, 108, -20, 7, -2}, + {-2, 6, -15, 43, 110, -19, 7, -2}, + {-2, 6, -14, 41, 111, -19, 7, -2}, + {-1, 5, -13, 38, 113, -18, 6, -2}, + {-1, 5, -12, 35, 114, -17, 6, -2}, + {-1, 5, -12, 33, 116, -17, 6, -2}, + {-1, 4, -11, 30, 118, -16, 5, -1}, + {-1, 4, -10, 27, 119, -15, 5, -1}, + {-1, 4, -9, 25, 120, -14, 5, -2}, + {-1, 3, -8, 23, 121, -13, 4, -1}, + {-1, 3, -7, 20, 122, -12, 4, -1}, + {-1, 3, -7, 18, 123, -11, 4, -1}, + {-1, 3, -6, 16, 124, -10, 3, -1}, + {-1, 2, -5, 13, 125, -8, 3, -1}, + {-1, 2, -4, 11, 126, -7, 2, -1}, + {0, 1, -3, 8, 126, -6, 2, 0}, + {0, 1, -2, 6, 127, -5, 1, 0}, + {0, 1, -2, 4, 127, -3, 1, 0}, + {0, 0, 0, 2, 127, -1, 0, 0}, + // [1, 2). 
+        {0, 0, 0, 1, 127, 0, 0, 0},
+        {0, 0, 0, -1, 127, 2, 0, 0},
+        {0, 0, 1, -3, 127, 4, -1, 0},
+        {0, 0, 1, -4, 126, 6, -2, 1},
+        {0, 0, 1, -5, 126, 8, -3, 1},
+        {0, 0, 1, -6, 125, 11, -4, 1},
+        {0, 0, 1, -7, 124, 13, -4, 1},
+        {0, 0, 2, -8, 123, 15, -5, 1},
+        {0, 0, 2, -9, 122, 18, -6, 1},
+        {0, 0, 2, -10, 121, 20, -6, 1},
+        {0, 0, 2, -11, 120, 22, -7, 2},
+        {0, 0, 2, -12, 119, 25, -8, 2},
+        {0, 0, 3, -13, 117, 27, -8, 2},
+        {0, 0, 3, -13, 116, 29, -9, 2},
+        {0, 0, 3, -14, 114, 32, -10, 3},
+        {0, 0, 3, -15, 113, 35, -10, 2},
+        {0, 0, 3, -15, 111, 37, -11, 3},
+        {0, 0, 3, -16, 109, 40, -11, 3},
+        {0, 0, 3, -16, 108, 42, -12, 3},
+        {0, 0, 4, -17, 106, 45, -13, 3},
+        {0, 0, 4, -17, 104, 47, -13, 3},
+        {0, 0, 4, -17, 102, 50, -14, 3},
+        {0, 0, 4, -17, 100, 52, -14, 3},
+        {0, 0, 4, -18, 98, 55, -15, 4},
+        {0, 0, 4, -18, 96, 58, -15, 3},
+        {0, 0, 4, -18, 94, 60, -16, 4},
+        {0, 0, 4, -18, 91, 63, -16, 4},
+        {0, 0, 4, -18, 89, 65, -16, 4},
+        {0, 0, 4, -18, 87, 68, -17, 4},
+        {0, 0, 4, -18, 85, 70, -17, 4},
+        {0, 0, 4, -18, 82, 73, -17, 4},
+        {0, 0, 4, -18, 80, 75, -17, 4},
+        {0, 0, 4, -18, 78, 78, -18, 4},
+        {0, 0, 4, -17, 75, 80, -18, 4},
+        {0, 0, 4, -17, 73, 82, -18, 4},
+        {0, 0, 4, -17, 70, 85, -18, 4},
+        {0, 0, 4, -17, 68, 87, -18, 4},
+        {0, 0, 4, -16, 65, 89, -18, 4},
+        {0, 0, 4, -16, 63, 91, -18, 4},
+        {0, 0, 4, -16, 60, 94, -18, 4},
+        {0, 0, 3, -15, 58, 96, -18, 4},
+        {0, 0, 4, -15, 55, 98, -18, 4},
+        {0, 0, 3, -14, 52, 100, -17, 4},
+        {0, 0, 3, -14, 50, 102, -17, 4},
+        {0, 0, 3, -13, 47, 104, -17, 4},
+        {0, 0, 3, -13, 45, 106, -17, 4},
+        {0, 0, 3, -12, 42, 108, -16, 3},
+        {0, 0, 3, -11, 40, 109, -16, 3},
+        {0, 0, 3, -11, 37, 111, -15, 3},
+        {0, 0, 2, -10, 35, 113, -15, 3},
+        {0, 0, 3, -10, 32, 114, -14, 3},
+        {0, 0, 2, -9, 29, 116, -13, 3},
+        {0, 0, 2, -8, 27, 117, -13, 3},
+        {0, 0, 2, -8, 25, 119, -12, 2},
+        {0, 0, 2, -7, 22, 120, -11, 2},
+        {0, 0, 1, -6, 20, 121, -10, 2},
+        {0, 0, 1, -6, 18, 122, -9, 2},
+        {0, 0, 1, -5, 15, 123, -8, 2},
+        {0, 0, 1, -4, 13, 124, -7, 1},
+        {0, 0, 1, -4, 11, 125, -6, 1},
+        {0, 0, 1, -3, 8, 126, -5, 1},
+        {0, 0, 1, -2, 6, 126, -4, 1},
+        {0, 0, 0, -1, 4, 127, -3, 1},
+        {0, 0, 0, 0, 2, 127, -1, 0},
+        // dummy, replicate row index 191.
+        {0, 0, 0, 0, 2, 127, -1, 0}};
+
+// Every value in |kSubPixelFilters| is even. Divide by 2 to simplify
+// calculations by reducing the range by 1 bit.
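+// (An added hand-check, not in the original source: after halving, the signed
+// taps in each row sum to 64; e.g. the second row of the first filter below
+// is 0 + 1 - 3 + 63 + 4 - 1 + 0 + 0 == 64.)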
+alignas(8) const int8_t kHalfSubPixelFilters[6][16][8] = { + {{0, 0, 0, 64, 0, 0, 0, 0}, + {0, 1, -3, 63, 4, -1, 0, 0}, + {0, 1, -5, 61, 9, -2, 0, 0}, + {0, 1, -6, 58, 14, -4, 1, 0}, + {0, 1, -7, 55, 19, -5, 1, 0}, + {0, 1, -7, 51, 24, -6, 1, 0}, + {0, 1, -8, 47, 29, -6, 1, 0}, + {0, 1, -7, 42, 33, -6, 1, 0}, + {0, 1, -7, 38, 38, -7, 1, 0}, + {0, 1, -6, 33, 42, -7, 1, 0}, + {0, 1, -6, 29, 47, -8, 1, 0}, + {0, 1, -6, 24, 51, -7, 1, 0}, + {0, 1, -5, 19, 55, -7, 1, 0}, + {0, 1, -4, 14, 58, -6, 1, 0}, + {0, 0, -2, 9, 61, -5, 1, 0}, + {0, 0, -1, 4, 63, -3, 1, 0}}, + {{0, 0, 0, 64, 0, 0, 0, 0}, + {0, 1, 14, 31, 17, 1, 0, 0}, + {0, 0, 13, 31, 18, 2, 0, 0}, + {0, 0, 11, 31, 20, 2, 0, 0}, + {0, 0, 10, 30, 21, 3, 0, 0}, + {0, 0, 9, 29, 22, 4, 0, 0}, + {0, 0, 8, 28, 23, 5, 0, 0}, + {0, -1, 8, 27, 24, 6, 0, 0}, + {0, -1, 7, 26, 26, 7, -1, 0}, + {0, 0, 6, 24, 27, 8, -1, 0}, + {0, 0, 5, 23, 28, 8, 0, 0}, + {0, 0, 4, 22, 29, 9, 0, 0}, + {0, 0, 3, 21, 30, 10, 0, 0}, + {0, 0, 2, 20, 31, 11, 0, 0}, + {0, 0, 2, 18, 31, 13, 0, 0}, + {0, 0, 1, 17, 31, 14, 1, 0}}, + {{0, 0, 0, 64, 0, 0, 0, 0}, + {-1, 1, -3, 63, 4, -1, 1, 0}, + {-1, 3, -6, 62, 8, -3, 2, -1}, + {-1, 4, -9, 60, 13, -5, 3, -1}, + {-2, 5, -11, 58, 19, -7, 3, -1}, + {-2, 5, -11, 54, 24, -9, 4, -1}, + {-2, 5, -12, 50, 30, -10, 4, -1}, + {-2, 5, -12, 45, 35, -11, 5, -1}, + {-2, 6, -12, 40, 40, -12, 6, -2}, + {-1, 5, -11, 35, 45, -12, 5, -2}, + {-1, 4, -10, 30, 50, -12, 5, -2}, + {-1, 4, -9, 24, 54, -11, 5, -2}, + {-1, 3, -7, 19, 58, -11, 5, -2}, + {-1, 3, -5, 13, 60, -9, 4, -1}, + {-1, 2, -3, 8, 62, -6, 3, -1}, + {0, 1, -1, 4, 63, -3, 1, -1}}, + {{0, 0, 0, 64, 0, 0, 0, 0}, + {0, 0, 0, 60, 4, 0, 0, 0}, + {0, 0, 0, 56, 8, 0, 0, 0}, + {0, 0, 0, 52, 12, 0, 0, 0}, + {0, 0, 0, 48, 16, 0, 0, 0}, + {0, 0, 0, 44, 20, 0, 0, 0}, + {0, 0, 0, 40, 24, 0, 0, 0}, + {0, 0, 0, 36, 28, 0, 0, 0}, + {0, 0, 0, 32, 32, 0, 0, 0}, + {0, 0, 0, 28, 36, 0, 0, 0}, + {0, 0, 0, 24, 40, 0, 0, 0}, + {0, 0, 0, 20, 44, 0, 0, 0}, + {0, 0, 0, 16, 48, 0, 0, 0}, + {0, 0, 0, 12, 52, 0, 0, 0}, + {0, 0, 0, 8, 56, 0, 0, 0}, + {0, 0, 0, 4, 60, 0, 0, 0}}, + {{0, 0, 0, 64, 0, 0, 0, 0}, + {0, 0, -2, 63, 4, -1, 0, 0}, + {0, 0, -4, 61, 9, -2, 0, 0}, + {0, 0, -5, 58, 14, -3, 0, 0}, + {0, 0, -6, 55, 19, -4, 0, 0}, + {0, 0, -6, 51, 24, -5, 0, 0}, + {0, 0, -7, 47, 29, -5, 0, 0}, + {0, 0, -6, 42, 33, -5, 0, 0}, + {0, 0, -6, 38, 38, -6, 0, 0}, + {0, 0, -5, 33, 42, -6, 0, 0}, + {0, 0, -5, 29, 47, -7, 0, 0}, + {0, 0, -5, 24, 51, -6, 0, 0}, + {0, 0, -4, 19, 55, -6, 0, 0}, + {0, 0, -3, 14, 58, -5, 0, 0}, + {0, 0, -2, 9, 61, -4, 0, 0}, + {0, 0, -1, 4, 63, -2, 0, 0}}, + {{0, 0, 0, 64, 0, 0, 0, 0}, + {0, 0, 15, 31, 17, 1, 0, 0}, + {0, 0, 13, 31, 18, 2, 0, 0}, + {0, 0, 11, 31, 20, 2, 0, 0}, + {0, 0, 10, 30, 21, 3, 0, 0}, + {0, 0, 9, 29, 22, 4, 0, 0}, + {0, 0, 8, 28, 23, 5, 0, 0}, + {0, 0, 7, 27, 24, 6, 0, 0}, + {0, 0, 6, 26, 26, 6, 0, 0}, + {0, 0, 6, 24, 27, 7, 0, 0}, + {0, 0, 5, 23, 28, 8, 0, 0}, + {0, 0, 4, 22, 29, 9, 0, 0}, + {0, 0, 3, 21, 30, 10, 0, 0}, + {0, 0, 2, 20, 31, 11, 0, 0}, + {0, 0, 2, 18, 31, 13, 0, 0}, + {0, 0, 1, 17, 31, 15, 0, 0}}}; + +// Absolute values of |kHalfSubPixelFilters|. Used in situations where we know +// the pattern of the signs and account for it in other ways. 
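+// (An added illustration, not in the original source: kHalfSubPixelFilters[2][2]
+// above is {-1, 3, -6, 62, 8, -3, 2, -1}; it is stored below as
+// {1, 3, 6, 62, 8, 3, 2, 1}, and a consumer that knows the - + - + + - + -
+// sign pattern can re-apply it after unsigned arithmetic.)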
+const uint8_t kAbsHalfSubPixelFilters[6][16][8] = { + {{0, 0, 0, 64, 0, 0, 0, 0}, + {0, 1, 3, 63, 4, 1, 0, 0}, + {0, 1, 5, 61, 9, 2, 0, 0}, + {0, 1, 6, 58, 14, 4, 1, 0}, + {0, 1, 7, 55, 19, 5, 1, 0}, + {0, 1, 7, 51, 24, 6, 1, 0}, + {0, 1, 8, 47, 29, 6, 1, 0}, + {0, 1, 7, 42, 33, 6, 1, 0}, + {0, 1, 7, 38, 38, 7, 1, 0}, + {0, 1, 6, 33, 42, 7, 1, 0}, + {0, 1, 6, 29, 47, 8, 1, 0}, + {0, 1, 6, 24, 51, 7, 1, 0}, + {0, 1, 5, 19, 55, 7, 1, 0}, + {0, 1, 4, 14, 58, 6, 1, 0}, + {0, 0, 2, 9, 61, 5, 1, 0}, + {0, 0, 1, 4, 63, 3, 1, 0}}, + {{0, 0, 0, 64, 0, 0, 0, 0}, + {0, 1, 14, 31, 17, 1, 0, 0}, + {0, 0, 13, 31, 18, 2, 0, 0}, + {0, 0, 11, 31, 20, 2, 0, 0}, + {0, 0, 10, 30, 21, 3, 0, 0}, + {0, 0, 9, 29, 22, 4, 0, 0}, + {0, 0, 8, 28, 23, 5, 0, 0}, + {0, 1, 8, 27, 24, 6, 0, 0}, + {0, 1, 7, 26, 26, 7, 1, 0}, + {0, 0, 6, 24, 27, 8, 1, 0}, + {0, 0, 5, 23, 28, 8, 0, 0}, + {0, 0, 4, 22, 29, 9, 0, 0}, + {0, 0, 3, 21, 30, 10, 0, 0}, + {0, 0, 2, 20, 31, 11, 0, 0}, + {0, 0, 2, 18, 31, 13, 0, 0}, + {0, 0, 1, 17, 31, 14, 1, 0}}, + {{0, 0, 0, 64, 0, 0, 0, 0}, + {1, 1, 3, 63, 4, 1, 1, 0}, + {1, 3, 6, 62, 8, 3, 2, 1}, + {1, 4, 9, 60, 13, 5, 3, 1}, + {2, 5, 11, 58, 19, 7, 3, 1}, + {2, 5, 11, 54, 24, 9, 4, 1}, + {2, 5, 12, 50, 30, 10, 4, 1}, + {2, 5, 12, 45, 35, 11, 5, 1}, + {2, 6, 12, 40, 40, 12, 6, 2}, + {1, 5, 11, 35, 45, 12, 5, 2}, + {1, 4, 10, 30, 50, 12, 5, 2}, + {1, 4, 9, 24, 54, 11, 5, 2}, + {1, 3, 7, 19, 58, 11, 5, 2}, + {1, 3, 5, 13, 60, 9, 4, 1}, + {1, 2, 3, 8, 62, 6, 3, 1}, + {0, 1, 1, 4, 63, 3, 1, 1}}, + {{0, 0, 0, 64, 0, 0, 0, 0}, + {0, 0, 0, 60, 4, 0, 0, 0}, + {0, 0, 0, 56, 8, 0, 0, 0}, + {0, 0, 0, 52, 12, 0, 0, 0}, + {0, 0, 0, 48, 16, 0, 0, 0}, + {0, 0, 0, 44, 20, 0, 0, 0}, + {0, 0, 0, 40, 24, 0, 0, 0}, + {0, 0, 0, 36, 28, 0, 0, 0}, + {0, 0, 0, 32, 32, 0, 0, 0}, + {0, 0, 0, 28, 36, 0, 0, 0}, + {0, 0, 0, 24, 40, 0, 0, 0}, + {0, 0, 0, 20, 44, 0, 0, 0}, + {0, 0, 0, 16, 48, 0, 0, 0}, + {0, 0, 0, 12, 52, 0, 0, 0}, + {0, 0, 0, 8, 56, 0, 0, 0}, + {0, 0, 0, 4, 60, 0, 0, 0}}, + {{0, 0, 0, 64, 0, 0, 0, 0}, + {0, 0, 2, 63, 4, 1, 0, 0}, + {0, 0, 4, 61, 9, 2, 0, 0}, + {0, 0, 5, 58, 14, 3, 0, 0}, + {0, 0, 6, 55, 19, 4, 0, 0}, + {0, 0, 6, 51, 24, 5, 0, 0}, + {0, 0, 7, 47, 29, 5, 0, 0}, + {0, 0, 6, 42, 33, 5, 0, 0}, + {0, 0, 6, 38, 38, 6, 0, 0}, + {0, 0, 5, 33, 42, 6, 0, 0}, + {0, 0, 5, 29, 47, 7, 0, 0}, + {0, 0, 5, 24, 51, 6, 0, 0}, + {0, 0, 4, 19, 55, 6, 0, 0}, + {0, 0, 3, 14, 58, 5, 0, 0}, + {0, 0, 2, 9, 61, 4, 0, 0}, + {0, 0, 1, 4, 63, 2, 0, 0}}, + {{0, 0, 0, 64, 0, 0, 0, 0}, + {0, 0, 15, 31, 17, 1, 0, 0}, + {0, 0, 13, 31, 18, 2, 0, 0}, + {0, 0, 11, 31, 20, 2, 0, 0}, + {0, 0, 10, 30, 21, 3, 0, 0}, + {0, 0, 9, 29, 22, 4, 0, 0}, + {0, 0, 8, 28, 23, 5, 0, 0}, + {0, 0, 7, 27, 24, 6, 0, 0}, + {0, 0, 6, 26, 26, 6, 0, 0}, + {0, 0, 6, 24, 27, 7, 0, 0}, + {0, 0, 5, 23, 28, 8, 0, 0}, + {0, 0, 4, 22, 29, 9, 0, 0}, + {0, 0, 3, 21, 30, 10, 0, 0}, + {0, 0, 2, 20, 31, 11, 0, 0}, + {0, 0, 2, 18, 31, 13, 0, 0}, + {0, 0, 1, 17, 31, 15, 0, 0}}}; + +// 9.3 -- Dr_Intra_Derivative[] +// This is a more compact version of the table from the spec. angle / 2 - 1 is +// used as the lookup. Note angle / 3 - 1 would work too, but the calculation +// becomes more costly. +const int16_t kDirectionalIntraPredictorDerivative[44] = { + // Approx angle + 1023, 0, // 3, ... + 547, // 6, ... + 372, 0, 0, // 9, ... + 273, // 14, ... + 215, 0, // 17, ... + 178, // 20, ... + 151, 0, // 23, ... (113 & 203 are base angles) + 132, // 26, ... + 116, 0, // 29, ... + 102, 0, // 32, ... + 90, // 36, ... + 80, 0, // 39, ... + 71, // 42, ... + 64, 0, // 45, ... 
(45 & 135 are base angles)
+    57,     // 48, ...
+    51, 0,  // 51, ...
+    45, 0,  // 54, ...
+    40,     // 58, ...
+    35, 0,  // 61, ...
+    31,     // 64, ...
+    27, 0,  // 67, ... (67 & 157 are base angles)
+    23,     // 70, ...
+    19, 0,  // 73, ...
+    15, 0,  // 76, ...
+    11, 0,  // 81, ...
+    7,      // 84, ...
+    3,      // 87, ...
+};
+
+const uint8_t kDeblockFilterLevelIndex[kMaxPlanes][kNumLoopFilterTypes] = {
+    {0, 1}, {2, 2}, {3, 3}};
+
+}  // namespace libgav1
diff --git a/src/utils/constants.h b/src/utils/constants.h
new file mode 100644
index 0000000..1126ad6
--- /dev/null
+++ b/src/utils/constants.h
@@ -0,0 +1,795 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_CONSTANTS_H_
+#define LIBGAV1_SRC_UTILS_CONSTANTS_H_
+
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/utils/bit_mask_set.h"
+
+namespace libgav1 {
+
+// Returns the number of elements between begin (inclusive) and end (inclusive).
+constexpr int EnumRangeLength(int begin, int end) { return end - begin + 1; }
+
+enum {
+// Maximum number of threads that the library will ever create.
+#if defined(LIBGAV1_MAX_THREADS) && LIBGAV1_MAX_THREADS > 0
+  kMaxThreads = LIBGAV1_MAX_THREADS
+#else
+  kMaxThreads = 128
+#endif
+};  // anonymous enum
+
+enum {
+  kInvalidMvValue = -32768,
+  kCdfMaxProbability = 32768,
+  kBlockWidthCount = 5,
+  kMaxSegments = 8,
+  kMinQuantizer = 0,
+  kMinLossyQuantizer = 1,
+  kMaxQuantizer = 255,
+  // Quantizer matrix is used only when level < 15.
+  kNumQuantizerLevelsForQuantizerMatrix = 15,
+  kFrameLfCount = 4,
+  kMaxLoopFilterValue = 63,
+  kNum4x4In64x64 = 256,
+  kMaxAngleDelta = 3,
+  kDirectionalIntraModes = 8,
+  kMaxSuperBlockSizeLog2 = 7,
+  kMinSuperBlockSizeLog2 = 6,
+  kGlobalMotionReadControl = 3,
+  kSuperResScaleNumerator = 8,
+  kBooleanSymbolCount = 2,
+  kRestorationTypeSymbolCount = 3,
+  kSgrProjParamsBits = 4,
+  kSgrProjPrecisionBits = 7,
+  // Padding on left and right side of a restoration block.
+  // 3 is enough, but padding to 4 is more efficient, and makes the temporary
+  // source buffer 8-pixel aligned.
+  kRestorationHorizontalBorder = 4,
+  // Padding on top and bottom side of a restoration block.
+  kRestorationVerticalBorder = 2,
+  kCdefBorder = 2,             // Padding on each side of a cdef block.
+  kConvolveBorderLeftTop = 3,  // Left/top padding of a convolve block.
+  // Right/bottom padding of a convolve block. This needs to be 4 at minimum,
+  // but was increased to simplify the SIMD loads in
+  // ConvolveCompoundScale2D_NEON() and ConvolveScale2D_NEON().
+  kConvolveBorderRight = 8,
+  kConvolveScaleBorderRight = 15,
+  kConvolveBorderBottom = 4,
+  kSubPixelTaps = 8,
+  kWienerFilterBits = 7,
+  kWienerFilterTaps = 7,
+  kMaxPaletteSize = 8,
+  kMinPaletteSize = 2,
+  kMaxPaletteSquare = 64,
+  kBorderPixels = 64,
+  // The final blending process for film grain needs room to overwrite and read
+  // with SIMD instructions. The maximum overwrite is 7 pixels, but the border
+  // is required to be a multiple of 32 by YuvBuffer::Realloc, so that
+  // subsampled chroma borders are 16-aligned.
+  kBorderPixelsFilmGrain = 32,
+  // These constants are the minimum left, right, top, and bottom border sizes
+  // in pixels as an extension of the frame boundary. The minimum border sizes
+  // are derived from the following requirements:
+  // - Warp_C() may read up to 13 pixels before or after a row.
+  // - Warp_NEON() may read up to 13 pixels before a row. It may read up to 14
+  //   pixels after a row, but the value of the last read pixel is not used.
+  // - Warp_C() and Warp_NEON() may read up to 13 pixels above the top row and
+  //   13 pixels below the bottom row.
+  kMinLeftBorderPixels = 13,
+  kMinRightBorderPixels = 13,
+  kMinTopBorderPixels = 13,
+  kMinBottomBorderPixels = 13,
+  kWarpedModelPrecisionBits = 16,
+  kMaxRefMvStackSize = 8,
+  kMaxLeastSquaresSamples = 8,
+  kMaxTemporalMvCandidates = 19,
+  // The SIMD implementations of motion vector projection functions always
+  // process 2 or 4 elements together, so we pad the corresponding buffers to
+  // size 20.
+  kMaxTemporalMvCandidatesWithPadding = 20,
+  kMaxSuperBlockSizeInPixels = 128,
+  kMaxScaledSuperBlockSizeInPixels = 128 * 2,
+  kMaxSuperBlockSizeSquareInPixels = 128 * 128,
+  kNum4x4InLoopFilterUnit = 16,
+  kNum4x4InLoopRestorationUnit = 16,
+  kProjectionMvClamp = (1 << 14) - 1,  // == 16383
+  kProjectionMvMaxHorizontalOffset = 8,
+  kCdefUnitSize = 64,
+  kCdefUnitSizeWithBorders = kCdefUnitSize + 2 * kCdefBorder,
+  kRestorationUnitOffset = 8,
+  // Loop restoration's processing unit size is fixed as 64x64.
+  kRestorationUnitHeight = 64,
+  kRestorationUnitWidth = 256,
+  kRestorationUnitHeightWithBorders =
+      kRestorationUnitHeight + 2 * kRestorationVerticalBorder,
+  kRestorationUnitWidthWithBorders =
+      kRestorationUnitWidth + 2 * kRestorationHorizontalBorder,
+  kSuperResFilterBits = 6,
+  kSuperResFilterShifts = 1 << kSuperResFilterBits,
+  kSuperResFilterTaps = 8,
+  kSuperResScaleBits = 14,
+  kSuperResExtraBits = kSuperResScaleBits - kSuperResFilterBits,
+  kSuperResScaleMask = (1 << 14) - 1,
+  kSuperResHorizontalBorder = 4,
+  kSuperResVerticalBorder = 1,
+  // The SIMD implementations of superres calculate up to 15 extra upscaled
+  // pixels which will over-read up to 15 downscaled pixels in the end of each
+  // row. Set the padding to 16 for alignment purposes.
+  kSuperResHorizontalPadding = 16,
+  // TODO(chengchen): consider merging these constants:
+  // kFilterBits, kWienerFilterBits, and kSgrProjPrecisionBits, which are all
+  // 7. They are designed to match AV1 convolution, which increases coeff
+  // values up to 7 bits. We could consider combining them and using
+  // kFilterBits only.
+  kFilterBits = 7,
+  // Sub pixel is used in AV1 to represent a pixel location that is not at
+  // integer position. Sub pixel is in 1/16 (1 << kSubPixelBits) unit of
+  // integer pixel. Sub pixel values are interpolated using adjacent integer
+  // pixel values. The interpolation is a filtering process.
+  kSubPixelBits = 4,
+  kSubPixelMask = (1 << kSubPixelBits) - 1,
+  // Precision bits when computing inter prediction locations.
+  kScaleSubPixelBits = 10,
+  kWarpParamRoundingBits = 6,
+  // Number of fractional bits of lookup in divisor lookup table.
+  kDivisorLookupBits = 8,
+  // Number of fractional bits of entries in divisor lookup table.
+  kDivisorLookupPrecisionBits = 14,
+  // Number of phases used in warped filtering.
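+  // (An added note, not in the original source: 1 << 6 gives 64 fractional
+  // positions, which is why the kWarpedFilters tables above carry
+  // 3 * 64 + 1 = 193 rows.)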
+ kWarpedPixelPrecisionShifts = 1 << 6, + kResidualPaddingVertical = 4, + kWedgeMaskMasterSize = 64, + kMaxFrameDistance = 31, + kReferenceFrameScalePrecision = 14, + kNumWienerCoefficients = 3, + kLoopFilterMaxModeDeltas = 2, + kMaxCdefStrengths = 8, + kCdefLargeValue = 0x4000, // Used to indicate where CDEF is not available. + kMaxTileColumns = 64, + kMaxTileRows = 64, + kMaxOperatingPoints = 32, + // There can be a maximum of 4 spatial layers and 8 temporal layers. + kMaxLayers = 32, + // The cache line size should ideally be queried at run time. 64 is a common + // cache line size of x86 CPUs. Web searches showed the cache line size of ARM + // CPUs is 32 or 64 bytes. So aligning to 64-byte boundary will work for all + // CPUs that we care about, even though it is excessive for some ARM + // CPUs. + // + // On Linux, the cache line size can be looked up with the command: + // getconf LEVEL1_DCACHE_LINESIZE + kCacheLineSize = 64, +}; // anonymous enum + +enum FrameType : uint8_t { + kFrameKey, + kFrameInter, + kFrameIntraOnly, + kFrameSwitch +}; + +enum Plane : uint8_t { kPlaneY, kPlaneU, kPlaneV }; +enum : uint8_t { kMaxPlanesMonochrome = kPlaneY + 1, kMaxPlanes = kPlaneV + 1 }; + +// The plane types, called luma and chroma in the spec. +enum PlaneType : uint8_t { kPlaneTypeY, kPlaneTypeUV, kNumPlaneTypes }; + +enum ReferenceFrameType : int8_t { + kReferenceFrameNone = -1, + kReferenceFrameIntra, + kReferenceFrameLast, + kReferenceFrameLast2, + kReferenceFrameLast3, + kReferenceFrameGolden, + kReferenceFrameBackward, + kReferenceFrameAlternate2, + kReferenceFrameAlternate, + kNumReferenceFrameTypes, + kNumInterReferenceFrameTypes = + EnumRangeLength(kReferenceFrameLast, kReferenceFrameAlternate), + kNumForwardReferenceTypes = + EnumRangeLength(kReferenceFrameLast, kReferenceFrameGolden), + kNumBackwardReferenceTypes = + EnumRangeLength(kReferenceFrameBackward, kReferenceFrameAlternate) +}; + +enum { + // Unidirectional compound reference pairs that are signaled explicitly: + // {kReferenceFrameLast, kReferenceFrameLast2}, + // {kReferenceFrameLast, kReferenceFrameLast3}, + // {kReferenceFrameLast, kReferenceFrameGolden}, + // {kReferenceFrameBackward, kReferenceFrameAlternate} + kExplicitUnidirectionalCompoundReferences = 4, + // Other unidirectional compound reference pairs: + // {kReferenceFrameLast2, kReferenceFrameLast3}, + // {kReferenceFrameLast2, kReferenceFrameGolden}, + // {kReferenceFrameLast3, kReferenceFrameGolden}, + // {kReferenceFrameBackward, kReferenceFrameAlternate2}, + // {kReferenceFrameAlternate2, kReferenceFrameAlternate} + kUnidirectionalCompoundReferences = + kExplicitUnidirectionalCompoundReferences + 5, +}; // anonymous enum + +enum BlockSize : uint8_t { + kBlock4x4, + kBlock4x8, + kBlock4x16, + kBlock8x4, + kBlock8x8, + kBlock8x16, + kBlock8x32, + kBlock16x4, + kBlock16x8, + kBlock16x16, + kBlock16x32, + kBlock16x64, + kBlock32x8, + kBlock32x16, + kBlock32x32, + kBlock32x64, + kBlock64x16, + kBlock64x32, + kBlock64x64, + kBlock64x128, + kBlock128x64, + kBlock128x128, + kMaxBlockSizes, + kBlockInvalid +}; + +// Partition types. 
R: Recursive +// +// None Horizontal Vertical Split +// +-------+ +-------+ +---+---+ +---+---+ +// | | | | | | | | R | R | +// | | +-------+ | | | +---+---+ +// | | | | | | | | R | R | +// +-------+ +-------+ +---+---+ +---+---+ +// +// Horizontal Horizontal Vertical Vertical +// with top with bottom with left with right +// split split split split +// +---+---+ +-------+ +---+---+ +---+---+ +// | | | | | | | | | | | +// +---+---+ +---+---+ +---+ | | +---+ +// | | | | | | | | | | | +// +-------+ +---+---+ +---+---+ +---+---+ +// +// Horizontal4 Vertical4 +// +-----+ +-+-+-+ +// +-----+ | | | | +// +-----+ | | | | +// +-----+ +-+-+-+ +enum Partition : uint8_t { + kPartitionNone, + kPartitionHorizontal, + kPartitionVertical, + kPartitionSplit, + kPartitionHorizontalWithTopSplit, + kPartitionHorizontalWithBottomSplit, + kPartitionVerticalWithLeftSplit, + kPartitionVerticalWithRightSplit, + kPartitionHorizontal4, + kPartitionVertical4 +}; +enum : uint8_t { kMaxPartitionTypes = kPartitionVertical4 + 1 }; + +enum PredictionMode : uint8_t { + // Intra prediction modes. + kPredictionModeDc, + kPredictionModeVertical, + kPredictionModeHorizontal, + kPredictionModeD45, + kPredictionModeD135, + kPredictionModeD113, + kPredictionModeD157, + kPredictionModeD203, + kPredictionModeD67, + kPredictionModeSmooth, + kPredictionModeSmoothVertical, + kPredictionModeSmoothHorizontal, + kPredictionModePaeth, + kPredictionModeChromaFromLuma, + // Single inter prediction modes. + kPredictionModeNearestMv, + kPredictionModeNearMv, + kPredictionModeGlobalMv, + kPredictionModeNewMv, + // Compound inter prediction modes. + kPredictionModeNearestNearestMv, + kPredictionModeNearNearMv, + kPredictionModeNearestNewMv, + kPredictionModeNewNearestMv, + kPredictionModeNearNewMv, + kPredictionModeNewNearMv, + kPredictionModeGlobalGlobalMv, + kPredictionModeNewNewMv, + kNumPredictionModes, + kNumCompoundInterPredictionModes = + EnumRangeLength(kPredictionModeNearestNearestMv, kPredictionModeNewNewMv), + kIntraPredictionModesY = + EnumRangeLength(kPredictionModeDc, kPredictionModePaeth), + kIntraPredictionModesUV = + EnumRangeLength(kPredictionModeDc, kPredictionModeChromaFromLuma), + kPredictionModeInvalid = 255 +}; + +enum InterIntraMode : uint8_t { + kInterIntraModeDc, + kInterIntraModeVertical, + kInterIntraModeHorizontal, + kInterIntraModeSmooth, + kNumInterIntraModes +}; + +enum MotionMode : uint8_t { + kMotionModeSimple, + kMotionModeObmc, // Overlapped block motion compensation. + kMotionModeLocalWarp, + kNumMotionModes +}; + +enum TxMode : uint8_t { + kTxModeOnly4x4, + kTxModeLargest, + kTxModeSelect, + kNumTxModes +}; + +// These enums are named as kType1Type2 where Type1 is the transform type for +// the rows and Type2 is the transform type for the columns. 
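+// For example (an added illustration, not in the original source):
+// kTransformTypeAdstDct applies an ADST to each row and a DCT to each column,
+// and kTransformTypeDctAdst is the opposite pairing.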
+enum TransformType : uint8_t { + kTransformTypeDctDct, + kTransformTypeAdstDct, + kTransformTypeDctAdst, + kTransformTypeAdstAdst, + kTransformTypeFlipadstDct, + kTransformTypeDctFlipadst, + kTransformTypeFlipadstFlipadst, + kTransformTypeAdstFlipadst, + kTransformTypeFlipadstAdst, + kTransformTypeIdentityIdentity, + kTransformTypeIdentityDct, + kTransformTypeDctIdentity, + kTransformTypeIdentityAdst, + kTransformTypeAdstIdentity, + kTransformTypeIdentityFlipadst, + kTransformTypeFlipadstIdentity, + kNumTransformTypes +}; + +constexpr BitMaskSet kTransformFlipColumnsMask(kTransformTypeFlipadstDct, + kTransformTypeFlipadstAdst, + kTransformTypeFlipadstIdentity, + kTransformTypeFlipadstFlipadst); +constexpr BitMaskSet kTransformFlipRowsMask(kTransformTypeDctFlipadst, + kTransformTypeAdstFlipadst, + kTransformTypeIdentityFlipadst, + kTransformTypeFlipadstFlipadst); + +enum TransformSize : uint8_t { + kTransformSize4x4, + kTransformSize4x8, + kTransformSize4x16, + kTransformSize8x4, + kTransformSize8x8, + kTransformSize8x16, + kTransformSize8x32, + kTransformSize16x4, + kTransformSize16x8, + kTransformSize16x16, + kTransformSize16x32, + kTransformSize16x64, + kTransformSize32x8, + kTransformSize32x16, + kTransformSize32x32, + kTransformSize32x64, + kTransformSize64x16, + kTransformSize64x32, + kTransformSize64x64, + kNumTransformSizes +}; + +enum TransformSet : uint8_t { + // DCT Only (1). + kTransformSetDctOnly, + // 2D-DCT and 2D-ADST without flip (4) + Identity (1) + 1D Horizontal/Vertical + // DCT (2) = Total (7). + kTransformSetIntra1, + // 2D-DCT and 2D-ADST without flip (4) + Identity (1) = Total (5). + kTransformSetIntra2, + // All transforms = Total (16). + kTransformSetInter1, + // 2D-DCT and 2D-ADST with flip (9) + Identity (1) + 1D Horizontal/Vertical + // DCT (2) = Total (12). + kTransformSetInter2, + // DCT (1) + Identity (1) = Total (2). + kTransformSetInter3, + kNumTransformSets +}; + +enum TransformClass : uint8_t { + kTransformClass2D, + kTransformClassHorizontal, + kTransformClassVertical, + kNumTransformClasses +}; + +enum FilterIntraPredictor : uint8_t { + kFilterIntraPredictorDc, + kFilterIntraPredictorVertical, + kFilterIntraPredictorHorizontal, + kFilterIntraPredictorD157, + kFilterIntraPredictorPaeth, + kNumFilterIntraPredictors +}; + +enum ObmcDirection : uint8_t { + kObmcDirectionVertical, + kObmcDirectionHorizontal, + kNumObmcDirections +}; + +// In AV1 the name of the filter refers to the direction of filter application. +// Horizontal refers to the column edge and vertical the row edge. +enum LoopFilterType : uint8_t { + kLoopFilterTypeVertical, + kLoopFilterTypeHorizontal, + kNumLoopFilterTypes +}; + +enum LoopFilterTransformSizeId : uint8_t { + kLoopFilterTransformSizeId4x4, + kLoopFilterTransformSizeId8x8, + kLoopFilterTransformSizeId16x16, + kNumLoopFilterTransformSizeIds +}; + +enum LoopRestorationType : uint8_t { + kLoopRestorationTypeNone, + kLoopRestorationTypeSwitchable, + kLoopRestorationTypeWiener, + kLoopRestorationTypeSgrProj, // self guided projection filter. 
+ kNumLoopRestorationTypes +}; + +enum CompoundReferenceType : uint8_t { + kCompoundReferenceUnidirectional, + kCompoundReferenceBidirectional, + kNumCompoundReferenceTypes +}; + +enum CompoundPredictionType : uint8_t { + kCompoundPredictionTypeWedge, + kCompoundPredictionTypeDiffWeighted, + kCompoundPredictionTypeAverage, + kCompoundPredictionTypeIntra, + kCompoundPredictionTypeDistance, + kNumCompoundPredictionTypes, + // Number of compound prediction types that are explicitly signaled in the + // bitstream (in the compound_type syntax element). + kNumExplicitCompoundPredictionTypes = 2 +}; + +enum InterpolationFilter : uint8_t { + kInterpolationFilterEightTap, + kInterpolationFilterEightTapSmooth, + kInterpolationFilterEightTapSharp, + kInterpolationFilterBilinear, + kInterpolationFilterSwitchable, + kNumInterpolationFilters, + // Number of interpolation filters that can be explicitly signaled in the + // compressed headers (when the uncompressed headers allow switchable + // interpolation filters) of the bitstream. + kNumExplicitInterpolationFilters = EnumRangeLength( + kInterpolationFilterEightTap, kInterpolationFilterEightTapSharp) +}; + +enum MvJointType : uint8_t { + kMvJointTypeZero, + kMvJointTypeHorizontalNonZeroVerticalZero, + kMvJointTypeHorizontalZeroVerticalNonZero, + kMvJointTypeNonZero, + kNumMvJointTypes +}; + +enum ObuType : int8_t { + kObuInvalid = -1, + kObuSequenceHeader = 1, + kObuTemporalDelimiter = 2, + kObuFrameHeader = 3, + kObuTileGroup = 4, + kObuMetadata = 5, + kObuFrame = 6, + kObuRedundantFrameHeader = 7, + kObuTileList = 8, + kObuPadding = 15, +}; + +constexpr BitMaskSet kPredictionModeSmoothMask(kPredictionModeSmooth, + kPredictionModeSmoothHorizontal, + kPredictionModeSmoothVertical); + +//------------------------------------------------------------------------------ +// ToString() +// +// These functions are meant to be used only in debug logging and within tests. +// They are defined inline to avoid including the strings in the release +// library when logging is disabled; unreferenced functions will not be added to +// any object file in that case. 
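+//
+// For example, a debug log call site (hypothetical, not part of this header)
+// might look like:
+//
+//   LIBGAV1_DLOG(INFO, "loop restoration type: %s", ToString(type));
+//
+// where |type| is a LoopRestorationType.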
+ +inline const char* ToString(const BlockSize size) { + switch (size) { + case kBlock4x4: + return "kBlock4x4"; + case kBlock4x8: + return "kBlock4x8"; + case kBlock4x16: + return "kBlock4x16"; + case kBlock8x4: + return "kBlock8x4"; + case kBlock8x8: + return "kBlock8x8"; + case kBlock8x16: + return "kBlock8x16"; + case kBlock8x32: + return "kBlock8x32"; + case kBlock16x4: + return "kBlock16x4"; + case kBlock16x8: + return "kBlock16x8"; + case kBlock16x16: + return "kBlock16x16"; + case kBlock16x32: + return "kBlock16x32"; + case kBlock16x64: + return "kBlock16x64"; + case kBlock32x8: + return "kBlock32x8"; + case kBlock32x16: + return "kBlock32x16"; + case kBlock32x32: + return "kBlock32x32"; + case kBlock32x64: + return "kBlock32x64"; + case kBlock64x16: + return "kBlock64x16"; + case kBlock64x32: + return "kBlock64x32"; + case kBlock64x64: + return "kBlock64x64"; + case kBlock64x128: + return "kBlock64x128"; + case kBlock128x64: + return "kBlock128x64"; + case kBlock128x128: + return "kBlock128x128"; + case kMaxBlockSizes: + return "kMaxBlockSizes"; + case kBlockInvalid: + return "kBlockInvalid"; + } + abort(); +} + +inline const char* ToString(const InterIntraMode mode) { + switch (mode) { + case kInterIntraModeDc: + return "kInterIntraModeDc"; + case kInterIntraModeVertical: + return "kInterIntraModeVertical"; + case kInterIntraModeHorizontal: + return "kInterIntraModeHorizontal"; + case kInterIntraModeSmooth: + return "kInterIntraModeSmooth"; + case kNumInterIntraModes: + return "kNumInterIntraModes"; + } + abort(); +} + +inline const char* ToString(const ObmcDirection direction) { + switch (direction) { + case kObmcDirectionVertical: + return "kObmcDirectionVertical"; + case kObmcDirectionHorizontal: + return "kObmcDirectionHorizontal"; + case kNumObmcDirections: + return "kNumObmcDirections"; + } + abort(); +} + +inline const char* ToString(const LoopRestorationType type) { + switch (type) { + case kLoopRestorationTypeNone: + return "kLoopRestorationTypeNone"; + case kLoopRestorationTypeSwitchable: + return "kLoopRestorationTypeSwitchable"; + case kLoopRestorationTypeWiener: + return "kLoopRestorationTypeWiener"; + case kLoopRestorationTypeSgrProj: + return "kLoopRestorationTypeSgrProj"; + case kNumLoopRestorationTypes: + return "kNumLoopRestorationTypes"; + } + abort(); +} + +inline const char* ToString(const TransformSize size) { + switch (size) { + case kTransformSize4x4: + return "kTransformSize4x4"; + case kTransformSize4x8: + return "kTransformSize4x8"; + case kTransformSize4x16: + return "kTransformSize4x16"; + case kTransformSize8x4: + return "kTransformSize8x4"; + case kTransformSize8x8: + return "kTransformSize8x8"; + case kTransformSize8x16: + return "kTransformSize8x16"; + case kTransformSize8x32: + return "kTransformSize8x32"; + case kTransformSize16x4: + return "kTransformSize16x4"; + case kTransformSize16x8: + return "kTransformSize16x8"; + case kTransformSize16x16: + return "kTransformSize16x16"; + case kTransformSize16x32: + return "kTransformSize16x32"; + case kTransformSize16x64: + return "kTransformSize16x64"; + case kTransformSize32x8: + return "kTransformSize32x8"; + case kTransformSize32x16: + return "kTransformSize32x16"; + case kTransformSize32x32: + return "kTransformSize32x32"; + case kTransformSize32x64: + return "kTransformSize32x64"; + case kTransformSize64x16: + return "kTransformSize64x16"; + case kTransformSize64x32: + return "kTransformSize64x32"; + case kTransformSize64x64: + return "kTransformSize64x64"; + case kNumTransformSizes: + return 
"kNumTransformSizes"; + } + abort(); +} + +inline const char* ToString(const TransformType type) { + switch (type) { + case kTransformTypeDctDct: + return "kTransformTypeDctDct"; + case kTransformTypeAdstDct: + return "kTransformTypeAdstDct"; + case kTransformTypeDctAdst: + return "kTransformTypeDctAdst"; + case kTransformTypeAdstAdst: + return "kTransformTypeAdstAdst"; + case kTransformTypeFlipadstDct: + return "kTransformTypeFlipadstDct"; + case kTransformTypeDctFlipadst: + return "kTransformTypeDctFlipadst"; + case kTransformTypeFlipadstFlipadst: + return "kTransformTypeFlipadstFlipadst"; + case kTransformTypeAdstFlipadst: + return "kTransformTypeAdstFlipadst"; + case kTransformTypeFlipadstAdst: + return "kTransformTypeFlipadstAdst"; + case kTransformTypeIdentityIdentity: + return "kTransformTypeIdentityIdentity"; + case kTransformTypeIdentityDct: + return "kTransformTypeIdentityDct"; + case kTransformTypeDctIdentity: + return "kTransformTypeDctIdentity"; + case kTransformTypeIdentityAdst: + return "kTransformTypeIdentityAdst"; + case kTransformTypeAdstIdentity: + return "kTransformTypeAdstIdentity"; + case kTransformTypeIdentityFlipadst: + return "kTransformTypeIdentityFlipadst"; + case kTransformTypeFlipadstIdentity: + return "kTransformTypeFlipadstIdentity"; + // case to quiet compiler + case kNumTransformTypes: + return "kNumTransformTypes"; + } + abort(); +} + +//------------------------------------------------------------------------------ + +extern const uint8_t k4x4WidthLog2[kMaxBlockSizes]; + +extern const uint8_t k4x4HeightLog2[kMaxBlockSizes]; + +extern const uint8_t kNum4x4BlocksWide[kMaxBlockSizes]; + +extern const uint8_t kNum4x4BlocksHigh[kMaxBlockSizes]; + +extern const uint8_t kBlockWidthPixels[kMaxBlockSizes]; + +extern const uint8_t kBlockHeightPixels[kMaxBlockSizes]; + +extern const BlockSize kSubSize[kMaxPartitionTypes][kMaxBlockSizes]; + +extern const BlockSize kPlaneResidualSize[kMaxBlockSizes][2][2]; + +extern const int16_t kProjectionMvDivisionLookup[kMaxFrameDistance + 1]; + +extern const uint8_t kTransformWidth[kNumTransformSizes]; + +extern const uint8_t kTransformHeight[kNumTransformSizes]; + +extern const uint8_t kTransformWidth4x4[kNumTransformSizes]; + +extern const uint8_t kTransformHeight4x4[kNumTransformSizes]; + +extern const uint8_t kTransformWidthLog2[kNumTransformSizes]; + +extern const uint8_t kTransformHeightLog2[kNumTransformSizes]; + +extern const TransformSize kSplitTransformSize[kNumTransformSizes]; + +// Square transform of size min(w,h). +extern const TransformSize kTransformSizeSquareMin[kNumTransformSizes]; + +// Square transform of size max(w,h). +extern const TransformSize kTransformSizeSquareMax[kNumTransformSizes]; + +extern const uint8_t kNumTransformTypesInSet[kNumTransformSets]; + +extern const uint8_t kSgrProjParams[1 << kSgrProjParamsBits][4]; + +extern const int8_t kSgrProjMultiplierMin[2]; + +extern const int8_t kSgrProjMultiplierMax[2]; + +extern const int8_t kWienerTapsMin[3]; + +extern const int8_t kWienerTapsMax[3]; + +extern const uint8_t kUpscaleFilterUnsigned[kSuperResFilterShifts] + [kSuperResFilterTaps]; + +// An int8_t version of the kWarpedFilters array. +// Note: The array could be removed with a performance penalty. 
+extern const int8_t kWarpedFilters8[3 * kWarpedPixelPrecisionShifts + 1][8];
+
+extern const int16_t kWarpedFilters[3 * kWarpedPixelPrecisionShifts + 1][8];
+
+extern const int8_t kHalfSubPixelFilters[6][16][8];
+
+extern const uint8_t kAbsHalfSubPixelFilters[6][16][8];
+
+extern const int16_t kDirectionalIntraPredictorDerivative[44];
+
+extern const uint8_t kDeblockFilterLevelIndex[kMaxPlanes][kNumLoopFilterTypes];
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_CONSTANTS_H_
diff --git a/src/utils/cpu.cc b/src/utils/cpu.cc
new file mode 100644
index 0000000..b3c51da
--- /dev/null
+++ b/src/utils/cpu.cc
@@ -0,0 +1,84 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/cpu.h"
+
+#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+#include <cpuid.h>
+#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
+#include <immintrin.h>  // _xgetbv
+#include <intrin.h>
+#endif
+
+namespace libgav1 {
+
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \
+    defined(_M_X64)
+namespace {
+
+#if defined(__GNUC__)
+void CpuId(int leaf, uint32_t info[4]) {
+  __cpuid_count(leaf, 0 /*ecx=subleaf*/, info[0], info[1], info[2], info[3]);
+}
+
+uint64_t Xgetbv() {
+  const uint32_t ecx = 0;  // ecx specifies the extended control register
+  uint32_t eax;
+  uint32_t edx;
+  __asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(ecx));
+  return (static_cast<uint64_t>(edx) << 32) | eax;
+}
+#else  // _MSC_VER
+void CpuId(int leaf, uint32_t info[4]) {
+  __cpuidex(reinterpret_cast<int*>(info), leaf, 0 /*ecx=subleaf*/);
+}
+
+uint64_t Xgetbv() { return _xgetbv(0); }
+#endif  // __GNUC__
+
+}  // namespace
+
+uint32_t GetCpuInfo() {
+  uint32_t info[4];
+
+  // Get the highest feature value cpuid supports
+  CpuId(0, info);
+  const int max_cpuid_value = info[0];
+  if (max_cpuid_value < 1) return 0;
+
+  CpuId(1, info);
+  uint32_t features = 0;
+  if ((info[3] & (1 << 26)) != 0) features |= kSSE2;
+  if ((info[2] & (1 << 9)) != 0) features |= kSSSE3;
+  if ((info[2] & (1 << 19)) != 0) features |= kSSE4_1;
+
+  // Bits 27 (OSXSAVE) & 28 (256-bit AVX)
+  if ((info[2] & (3 << 27)) == (3 << 27)) {
+    // XMM state and YMM state enabled by the OS
+    if ((Xgetbv() & 0x6) == 0x6) {
+      features |= kAVX;
+      if (max_cpuid_value >= 7) {
+        CpuId(7, info);
+        if ((info[1] & (1 << 5)) != 0) features |= kAVX2;
+      }
+    }
+  }
+
+  return features;
+}
+#else
+uint32_t GetCpuInfo() { return 0; }
+#endif  // x86 || x86_64
+
+}  // namespace libgav1
diff --git a/src/utils/cpu.h b/src/utils/cpu.h
new file mode 100644
index 0000000..aefc2df
--- /dev/null
+++ b/src/utils/cpu.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_CPU_H_
+#define LIBGAV1_SRC_UTILS_CPU_H_
+
+#include <cstdint>
+
+namespace libgav1 {
+
+#if defined(__i386__) || defined(__x86_64__)
+#define LIBGAV1_X86
+#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
+#define LIBGAV1_X86
+#define LIBGAV1_X86_MSVC
+#endif
+
+#if defined(LIBGAV1_X86)
+
+#if !defined(LIBGAV1_ENABLE_SSE4_1)
+#define LIBGAV1_ENABLE_SSE4_1 1
+#endif
+
+#if LIBGAV1_ENABLE_SSE4_1
+#if !defined(LIBGAV1_ENABLE_AVX2)
+#define LIBGAV1_ENABLE_AVX2 1
+#endif  // !defined(LIBGAV1_ENABLE_AVX2)
+#else  // !LIBGAV1_ENABLE_SSE4_1
+// Disable AVX2 when SSE4.1 is disabled as it may rely on shared components.
+#undef LIBGAV1_ENABLE_AVX2
+#define LIBGAV1_ENABLE_AVX2 0
+#endif  // LIBGAV1_ENABLE_SSE4_1
+
+#else  // !LIBGAV1_X86
+
+#undef LIBGAV1_ENABLE_AVX2
+#define LIBGAV1_ENABLE_AVX2 0
+#undef LIBGAV1_ENABLE_SSE4_1
+#define LIBGAV1_ENABLE_SSE4_1 0
+
+#endif  // LIBGAV1_X86
+
+// For x86 LIBGAV1_TARGETING_* indicate the source being built is targeting
+// (at least) that instruction set. This prevents disabling other instruction
+// sets if the current instruction set isn't a global target, e.g., building
+// *_avx2.cc w/-mavx2, but the remaining files without the flag.
+#if LIBGAV1_ENABLE_AVX2 && defined(__AVX2__)
+#define LIBGAV1_TARGETING_AVX2 1
+#else
+#define LIBGAV1_TARGETING_AVX2 0
+#endif
+
+// Note: LIBGAV1_X86_MSVC isn't completely correct for Visual Studio, but there
+// is no equivalent to __SSE4_1__. LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS will be
+// enabled in dsp.h to compensate for this.
+#if LIBGAV1_ENABLE_SSE4_1 && (defined(__SSE4_1__) || defined(LIBGAV1_X86_MSVC))
+#define LIBGAV1_TARGETING_SSE4_1 1
+#else
+#define LIBGAV1_TARGETING_SSE4_1 0
+#endif
+
+#undef LIBGAV1_X86
+
+#if !defined(LIBGAV1_ENABLE_NEON)
+// TODO(jzern): add support for _M_ARM64.
+#if defined(__ARM_NEON__) || defined(__aarch64__) || \
+    (defined(_MSC_VER) && defined(_M_ARM))
+#define LIBGAV1_ENABLE_NEON 1
+#else
+#define LIBGAV1_ENABLE_NEON 0
+#endif
+#endif  // !defined(LIBGAV1_ENABLE_NEON)
+
+enum CpuFeatures : uint8_t {
+  kSSE2 = 1 << 0,
+#define LIBGAV1_CPU_SSE2 (1 << 0)
+  kSSSE3 = 1 << 1,
+#define LIBGAV1_CPU_SSSE3 (1 << 1)
+  kSSE4_1 = 1 << 2,
+#define LIBGAV1_CPU_SSE4_1 (1 << 2)
+  kAVX = 1 << 3,
+#define LIBGAV1_CPU_AVX (1 << 3)
+  kAVX2 = 1 << 4,
+#define LIBGAV1_CPU_AVX2 (1 << 4)
+  kNEON = 1 << 5,
+#define LIBGAV1_CPU_NEON (1 << 5)
+};
+
+// Returns a bit-wise OR of CpuFeatures supported by this platform.
+uint32_t GetCpuInfo();
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_CPU_H_
diff --git a/src/utils/cpu_test.cc b/src/utils/cpu_test.cc
new file mode 100644
index 0000000..3a01b33
--- /dev/null
+++ b/src/utils/cpu_test.cc
@@ -0,0 +1,248 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/cpu.h"
+
+#if defined(__linux__)
+#include <unistd.h>
+
+#include <cerrno>
+#include <climits>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#endif  // defined(__linux__)
+
+#include "gtest/gtest.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+namespace {
+
+#if defined(__linux__)
+
+// Sample code for getting the number of performance CPU cores. The following
+// sources were consulted:
+// * https://www.kernel.org/doc/html/latest/admin-guide/cputopology.html
+// * cpu-hotplug.txt: CPU hotplug Support in Linux(tm) Kernel
+//   https://lwn.net/Articles/537570/
+// * https://www.kernel.org/doc/Documentation/ABI/testing/sysfs-devices-system-cpu
+// * Android bionic source code of get_nprocs():
+//     libc/bionic/sysinfo.cpp
+// * glibc 2.30 source code of get_nprocs():
+//     sysdeps/unix/sysv/linux/getsysstats.c
+//
+// Tested on:
+// * Asus Nexus 7 2013: Qualcomm Snapdragon 600, 32-bit Android 6.0.1
+//   (Marshmallow). Brings cores online and offline dynamically. (The tablet
+//   has 4 cores. "0", "0-1", "0-2", and "0-3" have all been observed in the
+//   /sys/devices/system/cpu/online file.) This causes the number of cores
+//   currently online to potentially be lower than the number of cores that can
+//   be brought online quickly.
+// * General Mobile 4G: Qualcomm Snapdragon 410, 32-bit Android 7.1.1 (Nougat).
+// * Motorola Moto G5 Plus: Qualcomm Snapdragon 625, 32-bit Android 8.1.0
+//   (Oreo).
+// * Motorola Moto G7 Play: Qualcomm Snapdragon 632, 32-bit Android 9 (Pie).
+//   All 8 cores have the same cpuinfo_max_freq (1804800), but there are two
+//   values of cpuinfo_min_freq: cores 0-3 have 614400 and cores 4-7 have
+//   633600. We would need to check cpuinfo_min_freq to differentiate the two
+//   kinds of cores (Qualcomm Kryo 250 Gold and Qualcomm Kryo 250 Silver).
+// * Pixel 2 XL: Qualcomm Snapdragon 835, 64-bit Android 9 (Pie).
+// * Pixel 3: Qualcomm Snapdragon 845, 64-bit Android 9 (Pie).
+// * Pixel 3a: Qualcomm Snapdragon 670, 64-bit Android 9 (Pie).
+// * Samsung Galaxy S6: Samsung Exynos 7 Octa (7420), 64-bit Android 7.0
+//   (Nougat).
+// * Samsung Galaxy S8+ (SM-G955FD): Samsung Exynos 8895, 64-bit Android 8.0.0.
+//
+// Note: The sample code needs to use the 'long' type because it is the return
+// type of the Standard C Library function strtol(). The ClangTidy warnings are
+// suppressed with NOLINT(google-runtime-int) comments.
+
+// Returns the number of online processor cores.
+int GetNumberOfProcessorsOnline() {
+  // See https://developer.android.com/ndk/guides/cpu-features.
+  long num_cpus = sysconf(_SC_NPROCESSORS_ONLN);  // NOLINT(google-runtime-int)
+  if (num_cpus < 0) {
+    LIBGAV1_DLOG(ERROR, "sysconf(_SC_NPROCESSORS_ONLN) failed: %s.",
+                 strerror(errno));
+    return 0;
+  }
+  // It is safe to cast num_cpus to int. sysconf(_SC_NPROCESSORS_ONLN) returns
+  // the return value of get_nprocs(), which is an int.
+  return static_cast<int>(num_cpus);
+}
+
+// These CPUs support heterogeneous multiprocessing.
+#if defined(__arm__) || defined(__aarch64__)
+
+// A helper function used by GetNumberOfPerformanceCoresOnline().
+//
+// Returns the cpuinfo_max_freq value (in kHz) of the given CPU.
Returns 0 on +// failure. +long GetCpuinfoMaxFreq(int cpu_index) { // NOLINT(google-runtime-int) + char buffer[128]; + const int rv = snprintf( + buffer, sizeof(buffer), + "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpu_index); + if (rv < 0 || rv >= sizeof(buffer)) { + LIBGAV1_DLOG(ERROR, "snprintf failed, or |buffer| is too small."); + return 0; + } + FILE* file = fopen(buffer, "r"); + if (file == nullptr) { + LIBGAV1_DLOG(ERROR, "fopen(\"%s\", \"r\") failed: %s.", buffer, + strerror(errno)); + return 0; + } + char* const str = fgets(buffer, sizeof(buffer), file); + fclose(file); + if (str == nullptr) { + LIBGAV1_DLOG(ERROR, "fgets failed."); + return 0; + } + const long freq = strtol(str, nullptr, 10); // NOLINT(google-runtime-int) + if (freq <= 0 || freq == LONG_MAX) { + LIBGAV1_DLOG(ERROR, + "No conversion can be performed, or the converted value is " + "invalid: %ld.", + freq); + return 0; + } + return freq; +} + +// Returns the number of performance CPU cores that are online. The number of +// efficiency CPU cores is subtracted from the total number of CPU cores. Uses +// cpuinfo_max_freq to determine whether a CPU is a performance core or an +// efficiency core. +// +// This function is not perfect. For example, the Snapdragon 632 SoC used in +// Motorola Moto G7 has performance and efficiency cores with the same +// cpuinfo_max_freq but different cpuinfo_min_freq. This function fails to +// differentiate the two kinds of cores and reports all the cores as +// performance cores. +int GetNumberOfPerformanceCoresOnline() { + // Get the online CPU list. Some examples of the online CPU list are: + // "0-7" + // "0" + // "0-1,2,3,4-7" + char online[512]; + FILE* file = fopen("/sys/devices/system/cpu/online", "r"); + if (file == nullptr) { + LIBGAV1_DLOG(ERROR, + "fopen(\"/sys/devices/system/cpu/online\", \"r\") failed: %s.", + strerror(errno)); + return 0; + } + char* const str = fgets(online, sizeof(online), file); + fclose(file); + file = nullptr; + if (str == nullptr) { + LIBGAV1_DLOG(ERROR, "fgets failed."); + return 0; + } + LIBGAV1_DLOG(INFO, "The online CPU list is %s", online); + + // Count the number of the slowest CPUs. Some SoCs such as Snapdragon 855 + // have performance cores with different max frequencies, so only the slowest + // CPUs are efficiency cores. If we count the number of the fastest CPUs, we + // will fail to count the second fastest performance cores. 
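+  // As an illustration with made-up numbers: on a hypothetical SoC reporting
+  // online CPUs "0-7" with cpuinfo_max_freq values {2841600, 2419200, 2419200,
+  // 2419200, 1785600, 1785600, 1785600, 1785600} kHz, the four 1785600 kHz
+  // cores are the slowest and are treated as efficiency cores, so this
+  // function returns 8 - 4 = 4 even though the performance cores span two
+  // frequency tiers (2841600 and 2419200 kHz).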
+  long slowest_cpu_freq = LONG_MAX;  // NOLINT(google-runtime-int)
+  int num_slowest_cpus = 0;
+  int num_cpus = 0;
+  const char* cp = online;
+  int range_begin = -1;
+  while (true) {
+    char* str_end;
+    const int cpu = static_cast<int>(strtol(cp, &str_end, 10));
+    if (str_end == cp) {
+      break;
+    }
+    cp = str_end;
+    if (*cp == '-') {
+      range_begin = cpu;
+    } else {
+      if (range_begin == -1) {
+        range_begin = cpu;
+      }
+
+      num_cpus += cpu - range_begin + 1;
+      for (int i = range_begin; i <= cpu; ++i) {
+        const long freq = GetCpuinfoMaxFreq(i);  // NOLINT(google-runtime-int)
+        if (freq <= 0) {
+          return 0;
+        }
+        LIBGAV1_DLOG(INFO, "cpu%d max frequency is %ld kHz.", i, freq);
+        if (freq < slowest_cpu_freq) {
+          slowest_cpu_freq = freq;
+          num_slowest_cpus = 0;
+        }
+        if (freq == slowest_cpu_freq) {
+          ++num_slowest_cpus;
+        }
+      }
+
+      range_begin = -1;
+    }
+    if (*cp == '\0') {
+      break;
+    }
+    ++cp;
+  }
+
+  LIBGAV1_DLOG(INFO, "There are %d CPU cores.", num_cpus);
+  LIBGAV1_DLOG(INFO,
+               "%d CPU cores are the slowest, with max frequency %ld kHz.",
+               num_slowest_cpus, slowest_cpu_freq);
+  // If there are faster CPU cores than the slowest CPU cores, exclude the
+  // slowest CPU cores.
+  if (num_slowest_cpus < num_cpus) {
+    num_cpus -= num_slowest_cpus;
+  }
+  return num_cpus;
+}
+
+#else
+
+// Assume symmetric multiprocessing.
+int GetNumberOfPerformanceCoresOnline() {
+  return GetNumberOfProcessorsOnline();
+}
+
+#endif
+
+#endif  // defined(__linux__)
+
+/*
+  Run this test with logging enabled on an Android device:
+  64-bit Android:
+    tests/run_android_test.sh --test cpu --enable_asserts
+  32-bit Android:
+    tests/run_android_test.sh --test cpu --arch arm \
+        --enable_asserts
+*/
+TEST(CpuTest, GetNumberOfPerformanceCoresOnline) {
+#if defined(__linux__)
+  const int num_cpus = GetNumberOfProcessorsOnline();
+  ASSERT_NE(num_cpus, 0);
+  LIBGAV1_DLOG(INFO, "There are %d cores online.", num_cpus);
+  const int num_performance_cpus = GetNumberOfPerformanceCoresOnline();
+  ASSERT_NE(num_performance_cpus, 0);
+  LIBGAV1_DLOG(INFO, "There are %d performance cores online.",
+               num_performance_cpus);
+#endif  // defined(__linux__)
+}
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/utils/dynamic_buffer.h b/src/utils/dynamic_buffer.h
new file mode 100644
index 0000000..0694980
--- /dev/null
+++ b/src/utils/dynamic_buffer.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_DYNAMIC_BUFFER_H_
+#define LIBGAV1_SRC_UTILS_DYNAMIC_BUFFER_H_
+
+#include <cstddef>
+#include <memory>
+#include <new>
+
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+
+template <typename T>
+class DynamicBuffer {
+ public:
+  T* get() { return buffer_.get(); }
+  const T* get() const { return buffer_.get(); }
+
+  // Resizes the buffer so that it can hold at least |size| elements. Existing
+  // contents will be destroyed when resizing to a larger size.
+  //
+  // Returns true on success. If Resize() returns false, then subsequent calls
+  // to get() will return nullptr.
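+  //
+  // A typical call pattern (illustrative only) is:
+  //
+  //   DynamicBuffer<int16_t> buffer;
+  //   if (!buffer.Resize(element_count)) return false;  // Allocation failed.
+  //   int16_t* const data = buffer.get();
+  //
+  // where |element_count| is whatever capacity the caller needs.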
+  bool Resize(size_t size) {
+    if (size <= size_) return true;
+    buffer_.reset(new (std::nothrow) T[size]);
+    if (buffer_ == nullptr) {
+      size_ = 0;
+      return false;
+    }
+    size_ = size;
+    return true;
+  }
+
+  size_t size() const { return size_; }
+
+ private:
+  std::unique_ptr<T[]> buffer_;
+  size_t size_ = 0;
+};
+
+template <typename T, int alignment>
+class AlignedDynamicBuffer {
+ public:
+  T* get() { return buffer_.get(); }
+
+  // Resizes the buffer so that it can hold at least |size| elements. Existing
+  // contents will be destroyed when resizing to a larger size.
+  //
+  // Returns true on success. If Resize() returns false, then subsequent calls
+  // to get() will return nullptr.
+  bool Resize(size_t size) {
+    if (size <= size_) return true;
+    buffer_ = MakeAlignedUniquePtr<T>(alignment, size);
+    if (buffer_ == nullptr) {
+      size_ = 0;
+      return false;
+    }
+    size_ = size;
+    return true;
+  }
+
+ private:
+  AlignedUniquePtr<T> buffer_;
+  size_t size_ = 0;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_DYNAMIC_BUFFER_H_
diff --git a/src/utils/entropy_decoder.cc b/src/utils/entropy_decoder.cc
new file mode 100644
index 0000000..3d97e69
--- /dev/null
+++ b/src/utils/entropy_decoder.cc
@@ -0,0 +1,1120 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/entropy_decoder.h"
+
+#include <cassert>
+#include <cstring>
+
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+
+#if defined(__ARM_NEON__) || defined(__aarch64__) || \
+    (defined(_MSC_VER) && defined(_M_ARM))
+#define LIBGAV1_ENTROPY_DECODER_ENABLE_NEON 1
+#else
+#define LIBGAV1_ENTROPY_DECODER_ENABLE_NEON 0
+#endif
+
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+#include <arm_neon.h>
+#endif
+
+#if defined(__SSE2__) || defined(LIBGAV1_X86_MSVC)
+#define LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2 1
+#else
+#define LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2 0
+#endif
+
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+#include <emmintrin.h>
+#endif
+
+namespace libgav1 {
+namespace {
+
+constexpr uint32_t kReadBitMask = ~255;
+constexpr int kCdfPrecision = 6;
+constexpr int kMinimumProbabilityPerSymbol = 4;
+
+// This function computes the "cur" variable as specified inside the do-while
+// loop in Section 8.2.6 of the spec. This function is monotonically
+// decreasing as the value of index increases (note that the |cdf| array is
+// sorted in decreasing order).
+uint32_t ScaleCdf(uint32_t values_in_range_shifted, const uint16_t* const cdf,
+                  int index, int symbol_count) {
+  return ((values_in_range_shifted * (cdf[index] >> kCdfPrecision)) >> 1) +
+         (kMinimumProbabilityPerSymbol * (symbol_count - index));
+}
+
+void UpdateCdf(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol_count,
+               const int symbol) {
+  const uint16_t count = cdf[symbol_count];
+  // rate is computed in the spec as:
+  //  3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2)
+  // In this case cdf[N] is |count|.
+  // Min(FloorLog2(N), 2) is 1 for symbol_count == {2, 3} and 2 for all
+  // symbol_count > 3. So the equation becomes:
+  //  4 + (count > 15) + (count > 31) + (symbol_count > 3).
+  // Note that the largest value for count is 32 (it is not incremented beyond
+  // 32). So using that information:
+  //  count >> 4 is 0 for count from 0 to 15.
+  //  count >> 4 is 1 for count from 16 to 31.
+  //  count >> 4 is 2 for count == 32.
+  // Now, the equation becomes:
+  //  4 + (count >> 4) + (symbol_count > 3).
+  // Since (count >> 4) can only be 0 or 1 or 2, the addition could be replaced
+  // with bitwise or:
+  //  (4 | (count >> 4)) + (symbol_count > 3).
+  // but using addition will allow the compiler to eliminate an operation when
+  // symbol_count is known and this function is inlined.
+  const int rate = (count >> 4) + 4 + static_cast<int>(symbol_count > 3);
+  // Hints for further optimizations:
+  //
+  // 1. clang can vectorize this for loop with width 4, even though the loop
+  // contains an if-else statement. Therefore, it may be advantageous to use
+  // "i < symbol_count" as the loop condition when symbol_count is 8, 12, or 16
+  // (a multiple of 4 that's not too small).
+  //
+  // 2. The for loop can be rewritten in the following form, which would enable
+  // clang to vectorize the loop with width 8:
+  //
+  //   const int rounding = (1 << rate) - 1;
+  //   for (int i = 0; i < symbol_count - 1; ++i) {
+  //     const uint16_t a = (i < symbol) ? kCdfMaxProbability : rounding;
+  //     cdf[i] += static_cast<int16_t>(a - cdf[i]) >> rate;
+  //   }
+  //
+  // The subtraction (a - cdf[i]) relies on the overflow semantics of unsigned
+  // integer arithmetic. The result of the unsigned subtraction is cast to a
+  // signed integer and right-shifted. This requires the right shift of a
+  // signed integer be an arithmetic shift, which is true for clang, gcc, and
+  // Visual C++.
+  assert(symbol_count - 1 > 0);
+  int i = 0;
+  do {
+    if (i < symbol) {
+      cdf[i] += (kCdfMaxProbability - cdf[i]) >> rate;
+    } else {
+      cdf[i] -= cdf[i] >> rate;
+    }
+  } while (++i < symbol_count - 1);
+  cdf[symbol_count] += static_cast<uint16_t>(count < 32);
+}
+
+// Define the UpdateCdfN functions. UpdateCdfN is a specialized implementation
+// of UpdateCdf based on the fact that symbol_count == N. UpdateCdfN uses the
+// SIMD instruction sets if available.
+
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+
+// The UpdateCdf() method contains the following for loop:
+//
+//   for (int i = 0; i < symbol_count - 1; ++i) {
+//     if (i < symbol) {
+//       cdf[i] += (kCdfMaxProbability - cdf[i]) >> rate;
+//     } else {
+//       cdf[i] -= cdf[i] >> rate;
+//     }
+//   }
+//
+// It can be rewritten in the following two forms, which are amenable to SIMD
+// implementations:
+//
+//   const int rounding = (1 << rate) - 1;
+//   for (int i = 0; i < symbol_count - 1; ++i) {
+//     const uint16_t a = (i < symbol) ? kCdfMaxProbability : rounding;
+//     cdf[i] += static_cast<int16_t>(a - cdf[i]) >> rate;
+//   }
+//
+// or:
+//
+//   const int rounding = (1 << rate) - 1;
+//   for (int i = 0; i < symbol_count - 1; ++i) {
+//     const uint16_t a = (i < symbol) ? (kCdfMaxProbability - rounding) : 0;
+//     cdf[i] -= static_cast<int16_t>(cdf[i] - a) >> rate;
+//   }
+//
+// The following ARM NEON implementations use a modified version of the first
+// form, using the comparison mask and unsigned rollover to avoid the need to
+// calculate rounding.
+//
+// The cdf array has symbol_count + 1 elements. The first symbol_count elements
+// are the CDF. The last element is a count that is initialized to 0 and may
+// grow up to 32.
The for loop in UpdateCdf updates the CDF in the array. Since +// cdf[symbol_count - 1] is always 0, the for loop does not update +// cdf[symbol_count - 1]. However, it would be correct to have the for loop +// update cdf[symbol_count - 1] anyway: since symbol_count - 1 >= symbol, the +// for loop would take the else branch when i is symbol_count - 1: +// cdf[i] -= cdf[i] >> rate; +// Since cdf[symbol_count - 1] is 0, cdf[symbol_count - 1] would still be 0 +// after the update. The ARM NEON implementations take advantage of this in the +// following two cases: +// 1. When symbol_count is 8 or 16, the vectorized code updates the first +// symbol_count elements in the array. +// 2. When symbol_count is 7, the vectorized code updates all the 8 elements in +// the cdf array. Since an invalid CDF value is written into cdf[7], the +// count in cdf[7] needs to be fixed up after the vectorized code. + +void UpdateCdf5(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) { + uint16x4_t cdf_vec = vld1_u16(cdf); + const uint16_t count = cdf[5]; + const int rate = (count >> 4) + 5; + const uint16x4_t cdf_max_probability = vdup_n_u16(kCdfMaxProbability); + const uint16x4_t index = vcreate_u16(0x0003000200010000); + const uint16x4_t symbol_vec = vdup_n_u16(symbol); + const uint16x4_t mask = vcge_u16(index, symbol_vec); + // i < symbol: 32768, i >= symbol: 65535. + const uint16x4_t a = vorr_u16(mask, cdf_max_probability); + // i < symbol: 32768 - cdf, i >= symbol: 65535 - cdf. + const int16x4_t diff = vreinterpret_s16_u16(vsub_u16(a, cdf_vec)); + // i < symbol: cdf - 0, i >= symbol: cdf - 65535. + const uint16x4_t cdf_offset = vsub_u16(cdf_vec, mask); + const int16x4_t negative_rate = vdup_n_s16(-rate); + // i < symbol: (32768 - cdf) >> rate, i >= symbol: (65535 (-1) - cdf) >> rate. + const uint16x4_t delta = vreinterpret_u16_s16(vshl_s16(diff, negative_rate)); + // i < symbol: (cdf - 0) + ((32768 - cdf) >> rate). + // i >= symbol: (cdf - 65535) + ((65535 - cdf) >> rate). + cdf_vec = vadd_u16(cdf_offset, delta); + vst1_u16(cdf, cdf_vec); + cdf[5] = count + static_cast(count < 32); +} + +// This version works for |symbol_count| = 7, 8, or 9. +// See UpdateCdf5 for implementation details. 
+template <int symbol_count>
+void UpdateCdf7To9(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
+  static_assert(symbol_count >= 7 && symbol_count <= 9, "");
+  uint16x8_t cdf_vec = vld1q_u16(cdf);
+  const uint16_t count = cdf[symbol_count];
+  const int rate = (count >> 4) + 5;
+  const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
+  const uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
+                                        vcreate_u16(0x0007000600050004));
+  const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
+  const uint16x8_t mask = vcgeq_u16(index, symbol_vec);
+  const uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
+  const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
+  const uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask);
+  const int16x8_t negative_rate = vdupq_n_s16(-rate);
+  const uint16x8_t delta =
+      vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+  cdf_vec = vaddq_u16(cdf_offset, delta);
+  vst1q_u16(cdf, cdf_vec);
+  cdf[symbol_count] = count + static_cast<uint16_t>(count < 32);
+}
+
+void UpdateCdf7(uint16_t* const cdf, const int symbol) {
+  UpdateCdf7To9<7>(cdf, symbol);
+}
+
+void UpdateCdf8(uint16_t* const cdf, const int symbol) {
+  UpdateCdf7To9<8>(cdf, symbol);
+}
+
+void UpdateCdf9(uint16_t* const cdf, const int symbol) {
+  UpdateCdf7To9<9>(cdf, symbol);
+}
+
+// See UpdateCdf5 for implementation details.
+void UpdateCdf11(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
+  uint16x8_t cdf_vec = vld1q_u16(cdf + 2);
+  const uint16_t count = cdf[11];
+  cdf[11] = count + static_cast<uint16_t>(count < 32);
+  const int rate = (count >> 4) + 5;
+  if (symbol > 1) {
+    cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+    cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
+    const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
+    const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
+    const int16x8_t negative_rate = vdupq_n_s16(-rate);
+    const uint16x8_t index = vcombine_u16(vcreate_u16(0x0005000400030002),
+                                          vcreate_u16(0x0009000800070006));
+    const uint16x8_t mask = vcgeq_u16(index, symbol_vec);
+    const uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
+    const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
+    const uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask);
+    const uint16x8_t delta =
+        vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+    cdf_vec = vaddq_u16(cdf_offset, delta);
+    vst1q_u16(cdf + 2, cdf_vec);
+  } else {
+    if (symbol != 0) {
+      cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+      cdf[1] -= cdf[1] >> rate;
+    } else {
+      cdf[0] -= cdf[0] >> rate;
+      cdf[1] -= cdf[1] >> rate;
+    }
+    const int16x8_t negative_rate = vdupq_n_s16(-rate);
+    const uint16x8_t delta = vshlq_u16(cdf_vec, negative_rate);
+    cdf_vec = vsubq_u16(cdf_vec, delta);
+    vst1q_u16(cdf + 2, cdf_vec);
+  }
+}
+
+// See UpdateCdf5 for implementation details.
+void UpdateCdf13(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
+  uint16x8_t cdf_vec0 = vld1q_u16(cdf);
+  uint16x8_t cdf_vec1 = vld1q_u16(cdf + 4);
+  const uint16_t count = cdf[13];
+  const int rate = (count >> 4) + 5;
+  const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
+  const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
+  const int16x8_t negative_rate = vdupq_n_s16(-rate);
+
+  uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
+                                  vcreate_u16(0x0007000600050004));
+  uint16x8_t mask = vcgeq_u16(index, symbol_vec);
+  uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
+  int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec0));
+  uint16x8_t cdf_offset = vsubq_u16(cdf_vec0, mask);
+  uint16x8_t delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+  cdf_vec0 = vaddq_u16(cdf_offset, delta);
+  vst1q_u16(cdf, cdf_vec0);
+
+  index = vcombine_u16(vcreate_u16(0x0007000600050004),
+                       vcreate_u16(0x000b000a00090008));
+  mask = vcgeq_u16(index, symbol_vec);
+  a = vorrq_u16(mask, cdf_max_probability);
+  diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec1));
+  cdf_offset = vsubq_u16(cdf_vec1, mask);
+  delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+  cdf_vec1 = vaddq_u16(cdf_offset, delta);
+  vst1q_u16(cdf + 4, cdf_vec1);
+
+  cdf[13] = count + static_cast<uint16_t>(count < 32);
+}
+
+// See UpdateCdf5 for implementation details.
+void UpdateCdf16(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
+  uint16x8_t cdf_vec = vld1q_u16(cdf);
+  const uint16_t count = cdf[16];
+  const int rate = (count >> 4) + 5;
+  const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
+  const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
+  const int16x8_t negative_rate = vdupq_n_s16(-rate);
+
+  uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
+                                  vcreate_u16(0x0007000600050004));
+  uint16x8_t mask = vcgeq_u16(index, symbol_vec);
+  uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
+  int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
+  uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask);
+  uint16x8_t delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+  cdf_vec = vaddq_u16(cdf_offset, delta);
+  vst1q_u16(cdf, cdf_vec);
+
+  cdf_vec = vld1q_u16(cdf + 8);
+  index = vcombine_u16(vcreate_u16(0x000b000a00090008),
+                       vcreate_u16(0x000f000e000d000c));
+  mask = vcgeq_u16(index, symbol_vec);
+  a = vorrq_u16(mask, cdf_max_probability);
+  diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
+  cdf_offset = vsubq_u16(cdf_vec, mask);
+  delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+  cdf_vec = vaddq_u16(cdf_offset, delta);
+  vst1q_u16(cdf + 8, cdf_vec);
+
+  cdf[16] = count + static_cast<uint16_t>(count < 32);
+}
+
+#else  // !LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+
+inline __m128i LoadLo8(const void* a) {
+  return _mm_loadl_epi64(static_cast<const __m128i*>(a));
+}
+
+inline __m128i LoadUnaligned16(const void* a) {
+  return _mm_loadu_si128(static_cast<const __m128i*>(a));
+}
+
+inline void StoreLo8(void* a, const __m128i v) {
+  _mm_storel_epi64(static_cast<__m128i*>(a), v);
+}
+
+inline void StoreUnaligned16(void* a, const __m128i v) {
+  _mm_storeu_si128(static_cast<__m128i*>(a), v);
+}
+
+void UpdateCdf5(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
+  __m128i cdf_vec = LoadLo8(cdf);
+  const uint16_t count = cdf[5];
+  const int rate = (count >> 4) + 5;
+  const __m128i cdf_max_probability =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(kCdfMaxProbability), 0);
+  const __m128i index = _mm_set_epi32(0x0,
0x0, 0x00040003, 0x00020001);
+  const __m128i symbol_vec = _mm_shufflelo_epi16(_mm_cvtsi32_si128(symbol), 0);
+  // i >= symbol.
+  const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+  // i < symbol: 32768, i >= symbol: 65535.
+  const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+  // i < symbol: 32768 - cdf, i >= symbol: 65535 - cdf.
+  const __m128i diff = _mm_sub_epi16(a, cdf_vec);
+  // i < symbol: cdf - 0, i >= symbol: cdf - 65535.
+  const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask);
+  // i < symbol: (32768 - cdf) >> rate, i >= symbol: (65535 (-1) - cdf) >> rate.
+  const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+  // i < symbol: (cdf - 0) + ((32768 - cdf) >> rate).
+  // i >= symbol: (cdf - 65535) + ((65535 - cdf) >> rate).
+  cdf_vec = _mm_add_epi16(cdf_offset, delta);
+  StoreLo8(cdf, cdf_vec);
+  cdf[5] = count + static_cast<uint16_t>(count < 32);
+}
+
+// This version works for |symbol_count| = 7, 8, or 9.
+// See UpdateCdf5 for implementation details.
+template <int symbol_count>
+void UpdateCdf7To9(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
+  static_assert(symbol_count >= 7 && symbol_count <= 9, "");
+  __m128i cdf_vec = LoadUnaligned16(cdf);
+  const uint16_t count = cdf[symbol_count];
+  const int rate = (count >> 4) + 5;
+  const __m128i cdf_max_probability =
+      _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
+  const __m128i index =
+      _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+  const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
+  const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+  const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+  const __m128i diff = _mm_sub_epi16(a, cdf_vec);
+  const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask);
+  const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+  cdf_vec = _mm_add_epi16(cdf_offset, delta);
+  StoreUnaligned16(cdf, cdf_vec);
+  cdf[symbol_count] = count + static_cast<uint16_t>(count < 32);
+}
+
+void UpdateCdf7(uint16_t* const cdf, const int symbol) {
+  UpdateCdf7To9<7>(cdf, symbol);
+}
+
+void UpdateCdf8(uint16_t* const cdf, const int symbol) {
+  UpdateCdf7To9<8>(cdf, symbol);
+}
+
+void UpdateCdf9(uint16_t* const cdf, const int symbol) {
+  UpdateCdf7To9<9>(cdf, symbol);
+}
+
+// See UpdateCdf5 for implementation details.
+void UpdateCdf11(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) { + __m128i cdf_vec = LoadUnaligned16(cdf + 2); + const uint16_t count = cdf[11]; + cdf[11] = count + static_cast(count < 32); + const int rate = (count >> 4) + 5; + if (symbol > 1) { + cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate; + cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate; + const __m128i cdf_max_probability = + _mm_set1_epi16(static_cast(kCdfMaxProbability)); + const __m128i index = + _mm_set_epi32(0x000a0009, 0x00080007, 0x00060005, 0x00040003); + const __m128i symbol_vec = _mm_set1_epi16(static_cast(symbol)); + const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec); + const __m128i a = _mm_or_si128(mask, cdf_max_probability); + const __m128i diff = _mm_sub_epi16(a, cdf_vec); + const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask); + const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate)); + cdf_vec = _mm_add_epi16(cdf_offset, delta); + StoreUnaligned16(cdf + 2, cdf_vec); + } else { + if (symbol != 0) { + cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate; + cdf[1] -= cdf[1] >> rate; + } else { + cdf[0] -= cdf[0] >> rate; + cdf[1] -= cdf[1] >> rate; + } + const __m128i delta = _mm_sra_epi16(cdf_vec, _mm_cvtsi32_si128(rate)); + cdf_vec = _mm_sub_epi16(cdf_vec, delta); + StoreUnaligned16(cdf + 2, cdf_vec); + } +} + +// See UpdateCdf5 for implementation details. +void UpdateCdf13(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) { + __m128i cdf_vec0 = LoadLo8(cdf); + __m128i cdf_vec1 = LoadUnaligned16(cdf + 4); + const uint16_t count = cdf[13]; + const int rate = (count >> 4) + 5; + const __m128i cdf_max_probability = + _mm_set1_epi16(static_cast(kCdfMaxProbability)); + const __m128i symbol_vec = _mm_set1_epi16(static_cast(symbol)); + + const __m128i index = _mm_set_epi32(0x0, 0x0, 0x00040003, 0x00020001); + const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec); + const __m128i a = _mm_or_si128(mask, cdf_max_probability); + const __m128i diff = _mm_sub_epi16(a, cdf_vec0); + const __m128i cdf_offset = _mm_sub_epi16(cdf_vec0, mask); + const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate)); + cdf_vec0 = _mm_add_epi16(cdf_offset, delta); + StoreLo8(cdf, cdf_vec0); + + const __m128i index1 = + _mm_set_epi32(0x000c000b, 0x000a0009, 0x00080007, 0x00060005); + const __m128i mask1 = _mm_cmpgt_epi16(index1, symbol_vec); + const __m128i a1 = _mm_or_si128(mask1, cdf_max_probability); + const __m128i diff1 = _mm_sub_epi16(a1, cdf_vec1); + const __m128i cdf_offset1 = _mm_sub_epi16(cdf_vec1, mask1); + const __m128i delta1 = _mm_sra_epi16(diff1, _mm_cvtsi32_si128(rate)); + cdf_vec1 = _mm_add_epi16(cdf_offset1, delta1); + StoreUnaligned16(cdf + 4, cdf_vec1); + + cdf[13] = count + static_cast(count < 32); +} + +void UpdateCdf16(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) { + __m128i cdf_vec0 = LoadUnaligned16(cdf); + const uint16_t count = cdf[16]; + const int rate = (count >> 4) + 5; + const __m128i cdf_max_probability = + _mm_set1_epi16(static_cast(kCdfMaxProbability)); + const __m128i symbol_vec = _mm_set1_epi16(static_cast(symbol)); + + const __m128i index = + _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001); + const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec); + const __m128i a = _mm_or_si128(mask, cdf_max_probability); + const __m128i diff = _mm_sub_epi16(a, cdf_vec0); + const __m128i cdf_offset = _mm_sub_epi16(cdf_vec0, mask); + const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate)); + cdf_vec0 = _mm_add_epi16(cdf_offset, delta); + 
StoreUnaligned16(cdf, cdf_vec0);
+
+  __m128i cdf_vec1 = LoadUnaligned16(cdf + 8);
+  const __m128i index1 =
+      _mm_set_epi32(0x0010000f, 0x000e000d, 0x000c000b, 0x000a0009);
+  const __m128i mask1 = _mm_cmpgt_epi16(index1, symbol_vec);
+  const __m128i a1 = _mm_or_si128(mask1, cdf_max_probability);
+  const __m128i diff1 = _mm_sub_epi16(a1, cdf_vec1);
+  const __m128i cdf_offset1 = _mm_sub_epi16(cdf_vec1, mask1);
+  const __m128i delta1 = _mm_sra_epi16(diff1, _mm_cvtsi32_si128(rate));
+  cdf_vec1 = _mm_add_epi16(cdf_offset1, delta1);
+  StoreUnaligned16(cdf + 8, cdf_vec1);
+
+  cdf[16] = count + static_cast<uint16_t>(count < 32);
+}
+
+#else  // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+
+void UpdateCdf5(uint16_t* const cdf, const int symbol) {
+  UpdateCdf(cdf, 5, symbol);
+}
+
+void UpdateCdf7(uint16_t* const cdf, const int symbol) {
+  UpdateCdf(cdf, 7, symbol);
+}
+
+void UpdateCdf8(uint16_t* const cdf, const int symbol) {
+  UpdateCdf(cdf, 8, symbol);
+}
+
+void UpdateCdf9(uint16_t* const cdf, const int symbol) {
+  UpdateCdf(cdf, 9, symbol);
+}
+
+void UpdateCdf11(uint16_t* const cdf, const int symbol) {
+  UpdateCdf(cdf, 11, symbol);
+}
+
+void UpdateCdf13(uint16_t* const cdf, const int symbol) {
+  UpdateCdf(cdf, 13, symbol);
+}
+
+void UpdateCdf16(uint16_t* const cdf, const int symbol) {
+  UpdateCdf(cdf, 16, symbol);
+}
+
+#endif  // LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+#endif  // LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+
+inline EntropyDecoder::WindowSize HostToBigEndian(
+    const EntropyDecoder::WindowSize x) {
+  static_assert(sizeof(x) == 4 || sizeof(x) == 8, "");
+#if defined(__GNUC__)
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+  return (sizeof(x) == 8) ? __builtin_bswap64(x) : __builtin_bswap32(x);
+#else
+  return x;
+#endif
+#elif defined(_WIN32)
+  // Note Windows targets are assumed to be little endian.
+  return static_cast<EntropyDecoder::WindowSize>(
+      (sizeof(x) == 8) ? _byteswap_uint64(static_cast<uint64_t>(x))
+                       : _byteswap_ulong(static_cast<unsigned long>(x)));
+#else
+#error Unknown compiler!
+#endif  // defined(__GNUC__)
+}
+
+}  // namespace
+
+#if !LIBGAV1_CXX17
+constexpr int EntropyDecoder::kWindowSize;  // static.
+#endif
+
+EntropyDecoder::EntropyDecoder(const uint8_t* data, size_t size,
+                               bool allow_update_cdf)
+    : data_(data),
+      data_end_(data + size),
+      data_memcpy_end_((size >= sizeof(WindowSize))
+                           ? data + size - sizeof(WindowSize) + 1
+                           : data),
+      allow_update_cdf_(allow_update_cdf),
+      values_in_range_(kCdfMaxProbability) {
+  if (data_ < data_memcpy_end_) {
+    // This is a simplified version of PopulateBits() which loads 8 extra bits
+    // and skips the unnecessary shifts of value and window_diff_.
+    WindowSize value;
+    memcpy(&value, data_, sizeof(value));
+    data_ += sizeof(value);
+    window_diff_ = HostToBigEndian(value) ^ -1;
+    // Note the initial value of bits_ is larger than kMaxCachedBits as it's
+    // used to restore the most significant 0 bit that would be present after
+    // PopulateBits() when we extract the first symbol value.
+    // As shown in Section 8.2.2 Initialization process for symbol decoder,
+    // which uses a fixed offset to read the symbol values, the most
+    // significant bit is always 0:
+    //   The variable numBits is set equal to Min( sz * 8, 15).
+    //   The variable buf is read using the f(numBits) parsing process.
+    //   The variable paddedBuf is set equal to ( buf << (15 - numBits) ).
+    //   The variable SymbolValue is set to ((1 << 15) - 1) ^ paddedBuf.
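+    // For instance (assuming a 64-bit WindowSize, so kWindowSize == 64),
+    // bits_ starts at 64 - 15 = 49, and the first symbol value is read as
+    // the top 15 bits of window_diff_ via window_diff_ >> bits_.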
+ bits_ = kWindowSize - 15; + return; + } + window_diff_ = 0; + bits_ = -15; + PopulateBits(); +} + +// This is similar to the ReadSymbol() implementation but it is optimized based +// on the following facts: +// * The probability is fixed at half. So some multiplications can be replaced +// with bit operations. +// * Symbol count is fixed at 2. +int EntropyDecoder::ReadBit() { + const uint32_t curr = + ((values_in_range_ & kReadBitMask) >> 1) + kMinimumProbabilityPerSymbol; + const auto symbol_value = static_cast(window_diff_ >> bits_); + int bit = 1; + if (symbol_value >= curr) { + values_in_range_ -= curr; + window_diff_ -= static_cast(curr) << bits_; + bit = 0; + } else { + values_in_range_ = curr; + } + NormalizeRange(); + return bit; +} + +int64_t EntropyDecoder::ReadLiteral(int num_bits) { + assert(num_bits <= 32); + assert(num_bits > 0); + uint32_t literal = 0; + int bit = num_bits - 1; + do { + // ARM can combine a shift operation with a constant number of bits with + // some other operations, such as the OR operation. + // Here is an ARM disassembly example: + // orr w1, w0, w1, lsl #1 + // which left shifts register w1 by 1 bit and OR the shift result with + // register w0. + // The next 2 lines are equivalent to: + // literal |= static_cast(ReadBit()) << bit; + literal <<= 1; + literal |= static_cast(ReadBit()); + } while (--bit >= 0); + return literal; +} + +int EntropyDecoder::ReadSymbol(uint16_t* LIBGAV1_RESTRICT const cdf, + int symbol_count) { + const int symbol = ReadSymbolImpl(cdf, symbol_count); + if (allow_update_cdf_) { + UpdateCdf(cdf, symbol_count, symbol); + } + return symbol; +} + +bool EntropyDecoder::ReadSymbol(uint16_t* LIBGAV1_RESTRICT cdf) { + assert(cdf[1] == 0); + const bool symbol = ReadSymbolImpl(cdf[0]) != 0; + if (allow_update_cdf_) { + const uint16_t count = cdf[2]; + // rate is computed in the spec as: + // 3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2) + // In this case N is 2 and cdf[N] is |count|. So the equation becomes: + // 4 + (count > 15) + (count > 31) + // Note that the largest value for count is 32 (it is not incremented beyond + // 32). So using that information: + // count >> 4 is 0 for count from 0 to 15. + // count >> 4 is 1 for count from 16 to 31. + // count >> 4 is 2 for count == 32. + // Now, the equation becomes: + // 4 + (count >> 4). + // Since (count >> 4) can only be 0 or 1 or 2, the addition can be replaced + // with bitwise or. So the final equation is: + // 4 | (count >> 4). 
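+  // As a worked example, count == 20 gives rate = 4 | (20 >> 4) = 4 | 1 = 5,
+  // the same as the 4 + 1 produced by the addition form; with cdf[0] == 16384
+  // and |symbol| true, cdf[0] becomes 16384 + ((32768 - 16384) >> 5) = 16896.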
+  const int rate = 4 | (count >> 4);
+  if (symbol) {
+    cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+  } else {
+    cdf[0] -= cdf[0] >> rate;
+  }
+  cdf[2] += static_cast<uint16_t>(count < 32);
+  }
+  return symbol;
+}
+
+bool EntropyDecoder::ReadSymbolWithoutCdfUpdate(uint16_t cdf) {
+  return ReadSymbolImpl(cdf) != 0;
+}
+
+template <int symbol_count>
+int EntropyDecoder::ReadSymbol(uint16_t* LIBGAV1_RESTRICT const cdf) {
+  static_assert(symbol_count >= 3 && symbol_count <= 16, "");
+  if (symbol_count == 3 || symbol_count == 4) {
+    return ReadSymbol3Or4(cdf, symbol_count);
+  }
+  int symbol;
+  if (symbol_count == 8) {
+    symbol = ReadSymbolImpl8(cdf);
+  } else if (symbol_count <= 13) {
+    symbol = ReadSymbolImpl(cdf, symbol_count);
+  } else {
+    symbol = ReadSymbolImplBinarySearch(cdf, symbol_count);
+  }
+  if (allow_update_cdf_) {
+    if (symbol_count == 5) {
+      UpdateCdf5(cdf, symbol);
+    } else if (symbol_count == 7) {
+      UpdateCdf7(cdf, symbol);
+    } else if (symbol_count == 8) {
+      UpdateCdf8(cdf, symbol);
+    } else if (symbol_count == 9) {
+      UpdateCdf9(cdf, symbol);
+    } else if (symbol_count == 11) {
+      UpdateCdf11(cdf, symbol);
+    } else if (symbol_count == 13) {
+      UpdateCdf13(cdf, symbol);
+    } else if (symbol_count == 16) {
+      UpdateCdf16(cdf, symbol);
+    } else {
+      UpdateCdf(cdf, symbol_count, symbol);
+    }
+  }
+  return symbol;
+}
+
+int EntropyDecoder::ReadSymbolImpl(const uint16_t* LIBGAV1_RESTRICT const cdf,
+                                   int symbol_count) {
+  assert(cdf[symbol_count - 1] == 0);
+  --symbol_count;
+  uint32_t curr = values_in_range_;
+  int symbol = -1;
+  uint32_t prev;
+  const auto symbol_value = static_cast<uint32_t>(window_diff_ >> bits_);
+  uint32_t delta = kMinimumProbabilityPerSymbol * symbol_count;
+  // Search through the |cdf| array to determine where the scaled cdf value and
+  // |symbol_value| cross over.
+  do {
+    prev = curr;
+    curr = (((values_in_range_ >> 8) * (cdf[++symbol] >> kCdfPrecision)) >> 1) +
+           delta;
+    delta -= kMinimumProbabilityPerSymbol;
+  } while (symbol_value < curr);
+  values_in_range_ = prev - curr;
+  window_diff_ -= static_cast<WindowSize>(curr) << bits_;
+  NormalizeRange();
+  return symbol;
+}
+
+int EntropyDecoder::ReadSymbolImplBinarySearch(
+    const uint16_t* LIBGAV1_RESTRICT const cdf, int symbol_count) {
+  assert(cdf[symbol_count - 1] == 0);
+  assert(symbol_count > 1 && symbol_count <= 16);
+  --symbol_count;
+  const auto symbol_value = static_cast<uint32_t>(window_diff_ >> bits_);
+  // Search through the |cdf| array to determine where the scaled cdf value and
+  // |symbol_value| cross over. Since the CDFs are sorted, we can use binary
+  // search to do this. Let |symbol| be the index of the first |cdf| array
+  // entry whose scaled cdf value is less than or equal to |symbol_value|. The
+  // binary search maintains the invariant:
+  //   low <= symbol <= high + 1
+  // and terminates when low == high + 1.
+  int low = 0;
+  int high = symbol_count - 1;
+  // The binary search maintains the invariants that |prev| is the scaled cdf
+  // value for low - 1 and |curr| is the scaled cdf value for high + 1. (By
+  // convention, the scaled cdf value for -1 is values_in_range_.) When the
+  // binary search terminates, |prev| is the scaled cdf value for symbol - 1
+  // and |curr| is the scaled cdf value for |symbol|.
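+  // As an illustration, for symbol_count == 16 the search starts with low == 0
+  // and high == 14 (after the decrement above), probes mid == 7 first, and
+  // needs at most four probes before reaching low == high + 1.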
+  uint32_t prev = values_in_range_;
+  uint32_t curr = 0;
+  const uint32_t values_in_range_shifted = values_in_range_ >> 8;
+  do {
+    const int mid = DivideBy2(low + high);
+    const uint32_t scaled_cdf =
+        ScaleCdf(values_in_range_shifted, cdf, mid, symbol_count);
+    if (symbol_value < scaled_cdf) {
+      low = mid + 1;
+      prev = scaled_cdf;
+    } else {
+      high = mid - 1;
+      curr = scaled_cdf;
+    }
+  } while (low <= high);
+  assert(low == high + 1);
+  // At this point, |low| is the symbol that has been decoded.
+  values_in_range_ = prev - curr;
+  window_diff_ -= static_cast<WindowSize>(curr) << bits_;
+  NormalizeRange();
+  return low;
+}
+
+int EntropyDecoder::ReadSymbolImpl(uint16_t cdf) {
+  const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+  const uint32_t curr =
+      (((values_in_range_ >> 8) * (cdf >> kCdfPrecision)) >> 1) +
+      kMinimumProbabilityPerSymbol;
+  const int symbol = static_cast<int>(symbol_value < curr);
+  if (symbol == 1) {
+    values_in_range_ = curr;
+  } else {
+    values_in_range_ -= curr;
+    window_diff_ -= static_cast<WindowSize>(curr) << bits_;
+  }
+  NormalizeRange();
+  return symbol;
+}
+
+// Equivalent to ReadSymbol(cdf, [3,4]), with the ReadSymbolImpl and UpdateCdf
+// calls inlined.
+int EntropyDecoder::ReadSymbol3Or4(uint16_t* LIBGAV1_RESTRICT const cdf,
+                                   const int symbol_count) {
+  assert(cdf[symbol_count - 1] == 0);
+  uint32_t curr = values_in_range_;
+  uint32_t prev;
+  const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+  uint32_t delta = kMinimumProbabilityPerSymbol * (symbol_count - 1);
+  const uint32_t values_in_range_shifted = values_in_range_ >> 8;
+
+  // Search through the |cdf| array to determine where the scaled cdf value
+  // and |symbol_value| cross over. If allow_update_cdf_ is true, update the
+  // |cdf| array.
+  //
+  // The original code is:
+  //
+  //  int symbol = -1;
+  //  do {
+  //    prev = curr;
+  //    curr =
+  //        ((values_in_range_shifted * (cdf[++symbol] >> kCdfPrecision)) >> 1)
+  //        + delta;
+  //    delta -= kMinimumProbabilityPerSymbol;
+  //  } while (symbol_value < curr);
+  //  if (allow_update_cdf_) {
+  //    UpdateCdf(cdf, [3,4], symbol);
+  //  }
+  //
+  // The do-while loop is unrolled with three or four iterations, and the
+  // UpdateCdf call is inlined and merged into the iterations.
+  int symbol = 0;
+  // Iteration 0.
+  prev = curr;
+  curr =
+      ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta;
+  if (symbol_value >= curr) {
+    // symbol == 0.
+    if (allow_update_cdf_) {
+      // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/0).
+      const uint16_t count = cdf[symbol_count];
+      cdf[symbol_count] += static_cast<uint16_t>(count < 32);
+      const int rate = (count >> 4) + 4 + static_cast<int>(symbol_count == 4);
+      if (symbol_count == 4) {
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+        // 1. On Motorola Moto G5 Plus (running 32-bit Android 8.1.0), the ARM
+        //    NEON code is slower. Consider using the C version if __arm__ is
+        //    defined.
+        // 2. The ARM NEON code (compiled for arm64) is slightly slower on
+        //    Samsung Galaxy S8+ (SM-G955FD).
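+        // The vector path below computes cdf[i] -= cdf[i] >> rate for
+        // cdf[0..3] in one shot; since cdf[3] is the 0 terminator, its lane
+        // is unchanged.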
+        uint16x4_t cdf_vec = vld1_u16(cdf);
+        const int16x4_t negative_rate = vdup_n_s16(-rate);
+        const uint16x4_t delta = vshl_u16(cdf_vec, negative_rate);
+        cdf_vec = vsub_u16(cdf_vec, delta);
+        vst1_u16(cdf, cdf_vec);
+#elif LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+        __m128i cdf_vec = LoadLo8(cdf);
+        const __m128i delta = _mm_sra_epi16(cdf_vec, _mm_cvtsi32_si128(rate));
+        cdf_vec = _mm_sub_epi16(cdf_vec, delta);
+        StoreLo8(cdf, cdf_vec);
+#else   // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+        cdf[0] -= cdf[0] >> rate;
+        cdf[1] -= cdf[1] >> rate;
+        cdf[2] -= cdf[2] >> rate;
+#endif
+      } else {  // symbol_count == 3.
+        cdf[0] -= cdf[0] >> rate;
+        cdf[1] -= cdf[1] >> rate;
+      }
+    }
+    goto found;
+  }
+  ++symbol;
+  delta -= kMinimumProbabilityPerSymbol;
+  // Iteration 1.
+  prev = curr;
+  curr =
+      ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta;
+  if (symbol_value >= curr) {
+    // symbol == 1.
+    if (allow_update_cdf_) {
+      // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/1).
+      const uint16_t count = cdf[symbol_count];
+      cdf[symbol_count] += static_cast<uint16_t>(count < 32);
+      const int rate = (count >> 4) + 4 + static_cast<int>(symbol_count == 4);
+      cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+      cdf[1] -= cdf[1] >> rate;
+      if (symbol_count == 4) cdf[2] -= cdf[2] >> rate;
+    }
+    goto found;
+  }
+  ++symbol;
+  if (symbol_count == 4) {
+    delta -= kMinimumProbabilityPerSymbol;
+    // Iteration 2.
+    prev = curr;
+    curr = ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) +
+           delta;
+    if (symbol_value >= curr) {
+      // symbol == 2.
+      if (allow_update_cdf_) {
+        // Inlined version of UpdateCdf(cdf, 4, /*symbol=*/2).
+        const uint16_t count = cdf[4];
+        cdf[4] += static_cast<uint16_t>(count < 32);
+        const int rate = (count >> 4) + 5;
+        cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+        cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
+        cdf[2] -= cdf[2] >> rate;
+      }
+      goto found;
+    }
+    ++symbol;
+  }
+  // |delta| is 0 for the last iteration.
+  // Iteration 2 (symbol_count == 3) or 3 (symbol_count == 4).
+  prev = curr;
+  // Since cdf[symbol_count - 1] is 0 and |delta| is 0, |curr| is also 0.
+  curr = 0;
+  // symbol == [2,3].
+  if (allow_update_cdf_) {
+    // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/[2,3]).
+    const uint16_t count = cdf[symbol_count];
+    cdf[symbol_count] += static_cast<uint16_t>(count < 32);
+    const int rate = (4 | (count >> 4)) + static_cast<int>(symbol_count == 4);
+    if (symbol_count == 4) {
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+      // On Motorola Moto G5 Plus (running 32-bit Android 8.1.0), the ARM NEON
+      // code is a tiny bit slower. Consider using the C version if __arm__ is
+      // defined.
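+      // Vector form of cdf[i] += (kCdfMaxProbability - cdf[i]) >> rate for
+      // cdf[0..3]; lane 3 starts at 0 and would become nonzero, which is why
+      // cdf[3] is reset to 0 after the store below.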
+      uint16x4_t cdf_vec = vld1_u16(cdf);
+      const uint16x4_t cdf_max_probability = vdup_n_u16(kCdfMaxProbability);
+      const int16x4_t diff =
+          vreinterpret_s16_u16(vsub_u16(cdf_max_probability, cdf_vec));
+      const int16x4_t negative_rate = vdup_n_s16(-rate);
+      const uint16x4_t delta =
+          vreinterpret_u16_s16(vshl_s16(diff, negative_rate));
+      cdf_vec = vadd_u16(cdf_vec, delta);
+      vst1_u16(cdf, cdf_vec);
+      cdf[3] = 0;
+#elif LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+      __m128i cdf_vec = LoadLo8(cdf);
+      const __m128i cdf_max_probability =
+          _mm_shufflelo_epi16(_mm_cvtsi32_si128(kCdfMaxProbability), 0);
+      const __m128i diff = _mm_sub_epi16(cdf_max_probability, cdf_vec);
+      const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+      cdf_vec = _mm_add_epi16(cdf_vec, delta);
+      StoreLo8(cdf, cdf_vec);
+      cdf[3] = 0;
+#else   // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+      cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+      cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
+      cdf[2] += (kCdfMaxProbability - cdf[2]) >> rate;
+#endif
+    } else {  // symbol_count == 3.
+      cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+      cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
+    }
+  }
+found:
+  // End of unrolled do-while loop.
+
+  values_in_range_ = prev - curr;
+  window_diff_ -= static_cast<WindowSize>(curr) << bits_;
+  NormalizeRange();
+  return symbol;
+}
+
+int EntropyDecoder::ReadSymbolImpl8(
+    const uint16_t* LIBGAV1_RESTRICT const cdf) {
+  assert(cdf[7] == 0);
+  uint32_t curr = values_in_range_;
+  uint32_t prev;
+  const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+  uint32_t delta = kMinimumProbabilityPerSymbol * 7;
+  // Search through the |cdf| array to determine where the scaled cdf value
+  // and |symbol_value| cross over.
+  //
+  // The original code is:
+  //
+  //  int symbol = -1;
+  //  do {
+  //    prev = curr;
+  //    curr =
+  //        (((values_in_range_ >> 8) * (cdf[++symbol] >> kCdfPrecision)) >> 1)
+  //        + delta;
+  //    delta -= kMinimumProbabilityPerSymbol;
+  //  } while (symbol_value < curr);
+  //
+  // The do-while loop is unrolled with eight iterations.
+  int symbol = 0;
+
+#define READ_SYMBOL_ITERATION                                                \
+  prev = curr;                                                               \
+  curr = (((values_in_range_ >> 8) * (cdf[symbol] >> kCdfPrecision)) >> 1) + \
+         delta;                                                              \
+  if (symbol_value >= curr) goto found;                                      \
+  ++symbol;                                                                  \
+  delta -= kMinimumProbabilityPerSymbol
+
+  READ_SYMBOL_ITERATION;  // Iteration 0.
+  READ_SYMBOL_ITERATION;  // Iteration 1.
+  READ_SYMBOL_ITERATION;  // Iteration 2.
+  READ_SYMBOL_ITERATION;  // Iteration 3.
+  READ_SYMBOL_ITERATION;  // Iteration 4.
+  READ_SYMBOL_ITERATION;  // Iteration 5.
+
+  // The last two iterations can be simplified, so they don't use the
+  // READ_SYMBOL_ITERATION macro.
+#undef READ_SYMBOL_ITERATION
+
+  // Iteration 6.
+  prev = curr;
+  curr =
+      (((values_in_range_ >> 8) * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta;
+  if (symbol_value >= curr) goto found;  // symbol == 6.
+  ++symbol;
+  // |delta| is 0 for the last iteration.
+  // Iteration 7.
+  prev = curr;
+  // Since cdf[7] is 0 and |delta| is 0, |curr| is also 0.
+  curr = 0;
+  // symbol == 7.
+found:
+  // End of unrolled do-while loop.
+
+  values_in_range_ = prev - curr;
+  window_diff_ -= static_cast<WindowSize>(curr) << bits_;
+  NormalizeRange();
+  return symbol;
+}
+
+void EntropyDecoder::PopulateBits() {
+  constexpr int kMaxCachedBits = kWindowSize - 16;
+#if defined(__aarch64__)
+  // Fast path: read eight bytes and add the first six bytes to window_diff_.
+  // This fast path makes the following assumptions.
+  // 1. We assume that unaligned load of uint64_t is fast.
+  // 2. When there are enough bytes in data_, the for loop below reads 6 or 7
+  //    bytes depending on the value of bits_. This fast path always reads 6
+  //    bytes, which results in more calls to PopulateBits(). We assume that
+  //    making more calls to a faster PopulateBits() is overall a win.
+  // NOTE: Although this fast path could also be used on x86_64, it hurts
+  // performance (measured on Lenovo ThinkStation P920 running Linux). (The
+  // reason is still unknown.) Therefore this fast path is only used on arm64.
+  static_assert(kWindowSize == 64, "");
+  if (data_ < data_memcpy_end_) {
+    uint64_t value;
+    // arm64 supports unaligned loads, so this memcpy call is compiled to a
+    // single ldr instruction.
+    memcpy(&value, data_, sizeof(value));
+    data_ += kMaxCachedBits >> 3;
+    value = HostToBigEndian(value) ^ -1;
+    value >>= kWindowSize - kMaxCachedBits;
+    window_diff_ = value | (window_diff_ << kMaxCachedBits);
+    bits_ += kMaxCachedBits;
+    return;
+  }
+#endif
+
+  const uint8_t* data = data_;
+  int bits = bits_;
+  WindowSize window_diff = window_diff_;
+
+  int count = kWindowSize - 9 - (bits + 15);
+  // The fast path above, if compiled, would cause clang 8.0.7 to vectorize
+  // this loop. Since -15 <= bits_ <= -1, this loop has at most 6 or 7
+  // iterations when WindowSize is 64 bits. So it is not profitable to
+  // vectorize this loop. Note that clang 8.0.7 does not vectorize this loop
+  // if the fast path above is not compiled.
+
+#ifdef __clang__
+#pragma clang loop vectorize(disable) interleave(disable)
+#endif
+  for (; count >= 0 && data < data_end_; count -= 8) {
+    const uint8_t value = *data++ ^ -1;
+    window_diff = static_cast<WindowSize>(value) | (window_diff << 8);
+    bits += 8;
+  }
+  assert(bits <= kMaxCachedBits);
+  if (data == data_end_) {
+    // Shift in some 1s. This is equivalent to providing fake 0 data bits.
+    window_diff = ((window_diff + 1) << (kMaxCachedBits - bits)) - 1;
+    bits = kMaxCachedBits;
+  }
+
+  data_ = data;
+  bits_ = bits;
+  window_diff_ = window_diff;
+}
+
+void EntropyDecoder::NormalizeRange() {
+  const int bits_used = 15 ^ FloorLog2(values_in_range_);
+  bits_ -= bits_used;
+  values_in_range_ <<= bits_used;
+  if (bits_ < 0) PopulateBits();
+}
+
+// Explicit instantiations.
+template int EntropyDecoder::ReadSymbol<3>(uint16_t* cdf);
+template int EntropyDecoder::ReadSymbol<4>(uint16_t* cdf);
+template int EntropyDecoder::ReadSymbol<5>(uint16_t* cdf);
+template int EntropyDecoder::ReadSymbol<6>(uint16_t* cdf);
+template int EntropyDecoder::ReadSymbol<7>(uint16_t* cdf);
+template int EntropyDecoder::ReadSymbol<8>(uint16_t* cdf);
+template int EntropyDecoder::ReadSymbol<9>(uint16_t* cdf);
+template int EntropyDecoder::ReadSymbol<10>(uint16_t* cdf);
+template int EntropyDecoder::ReadSymbol<11>(uint16_t* cdf);
+template int EntropyDecoder::ReadSymbol<12>(uint16_t* cdf);
+template int EntropyDecoder::ReadSymbol<13>(uint16_t* cdf);
+template int EntropyDecoder::ReadSymbol<14>(uint16_t* cdf);
+template int EntropyDecoder::ReadSymbol<16>(uint16_t* cdf);
+
+}  // namespace libgav1
diff --git a/src/utils/entropy_decoder.h b/src/utils/entropy_decoder.h
new file mode 100644
index 0000000..8eeaef4
--- /dev/null
+++ b/src/utils/entropy_decoder.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_ENTROPY_DECODER_H_
+#define LIBGAV1_SRC_UTILS_ENTROPY_DECODER_H_
+
+#include <cstddef>
+#include <cstdint>
+
+#include "src/utils/bit_reader.h"
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+
+class EntropyDecoder final : public BitReader {
+ public:
+  // WindowSize must be an unsigned integer type with at least 32 bits. Use
+  // the largest type with fast arithmetic. size_t should meet these
+  // requirements.
+  using WindowSize = size_t;
+
+  EntropyDecoder(const uint8_t* data, size_t size, bool allow_update_cdf);
+  ~EntropyDecoder() override = default;
+
+  // Move only.
+  EntropyDecoder(EntropyDecoder&& rhs) noexcept;
+  EntropyDecoder& operator=(EntropyDecoder&& rhs) noexcept;
+
+  int ReadBit() override;
+  int64_t ReadLiteral(int num_bits) override;
+  // ReadSymbol() calls for which the |symbol_count| is only known at runtime
+  // will use this variant.
+  int ReadSymbol(uint16_t* cdf, int symbol_count);
+  // ReadSymbol() calls for which the |symbol_count| is equal to 2 (boolean
+  // symbols) will use this variant.
+  bool ReadSymbol(uint16_t* cdf);
+  bool ReadSymbolWithoutCdfUpdate(uint16_t cdf);
+  // Use either linear search or binary search for decoding the symbol
+  // depending on |symbol_count|. ReadSymbol calls for which the
+  // |symbol_count| is known at compile time will use this variant.
+  template <int symbol_count>
+  int ReadSymbol(uint16_t* cdf);
+
+ private:
+  static constexpr int kWindowSize = static_cast<int>(sizeof(WindowSize)) * 8;
+  static_assert(kWindowSize >= 32, "");
+
+  // Reads a symbol using the |cdf| table which contains the probabilities of
+  // each symbol. On a high level, this function does the following:
+  //   1) Scale the |cdf| values.
+  //   2) Find the index in the |cdf| array where the scaled CDF value crosses
+  //      the modified |window_diff_| threshold.
+  //   3) That index is the symbol that has been decoded.
+  //   4) Update |window_diff_| and |values_in_range_| based on the symbol
+  //      that has been decoded.
+  inline int ReadSymbolImpl(const uint16_t* cdf, int symbol_count);
+  // Similar to ReadSymbolImpl but it uses binary search to perform step 2 in
+  // the comment above. As of now, this function is called when |symbol_count|
+  // is greater than or equal to 14.
+  inline int ReadSymbolImplBinarySearch(const uint16_t* cdf, int symbol_count);
+  // Specialized implementation of ReadSymbolImpl based on the fact that
+  // symbol_count == 2.
+  inline int ReadSymbolImpl(uint16_t cdf);
+  // ReadSymbolN is a specialization of ReadSymbol for symbol_count == N.
+  LIBGAV1_ALWAYS_INLINE int ReadSymbol3Or4(uint16_t* cdf, int symbol_count);
+  // ReadSymbolImplN is a specialization of ReadSymbolImpl for
+  // symbol_count == N.
+  LIBGAV1_ALWAYS_INLINE int ReadSymbolImpl8(const uint16_t* cdf);
+  inline void PopulateBits();
+  // Normalizes the range so that 32768 <= |values_in_range_| < 65536. Also
+  // calls PopulateBits() if necessary.
+  inline void NormalizeRange();
+
+  const uint8_t* data_;
+  const uint8_t* const data_end_;
+  // If |data_| < |data_memcpy_end_|, then we can read sizeof(WindowSize)
+  // bytes from |data_|. Note with sizeof(WindowSize) == 4 this is only used
+  // in the constructor, not PopulateBits().
+  const uint8_t* const data_memcpy_end_;
+  const bool allow_update_cdf_;
+  // Number of cached bits of data in the current value.
+  int bits_;
+  // Number of values in the current range. Declared as uint32_t for better
+  // performance but only the lower 16 bits are used.
+  uint32_t values_in_range_;
+  // The difference between the high end of the current range and the coded
+  // value minus 1. The 16 bits above |bits_| of this variable are used to
+  // decode the next symbol. It is filled in whenever |bits_| is less than 0.
+  // Note this implementation differs from the spec as it trades the need to
+  // shift in 1s in NormalizeRange() with an extra shift in PopulateBits(),
+  // which occurs less frequently.
+  WindowSize window_diff_;
+};
+
+extern template int EntropyDecoder::ReadSymbol<3>(uint16_t* cdf);
+extern template int EntropyDecoder::ReadSymbol<4>(uint16_t* cdf);
+extern template int EntropyDecoder::ReadSymbol<5>(uint16_t* cdf);
+extern template int EntropyDecoder::ReadSymbol<6>(uint16_t* cdf);
+extern template int EntropyDecoder::ReadSymbol<7>(uint16_t* cdf);
+extern template int EntropyDecoder::ReadSymbol<8>(uint16_t* cdf);
+extern template int EntropyDecoder::ReadSymbol<9>(uint16_t* cdf);
+extern template int EntropyDecoder::ReadSymbol<10>(uint16_t* cdf);
+extern template int EntropyDecoder::ReadSymbol<11>(uint16_t* cdf);
+extern template int EntropyDecoder::ReadSymbol<12>(uint16_t* cdf);
+extern template int EntropyDecoder::ReadSymbol<13>(uint16_t* cdf);
+extern template int EntropyDecoder::ReadSymbol<14>(uint16_t* cdf);
+extern template int EntropyDecoder::ReadSymbol<16>(uint16_t* cdf);
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_ENTROPY_DECODER_H_
diff --git a/src/utils/entropy_decoder_test.cc b/src/utils/entropy_decoder_test.cc
new file mode 100644
index 0000000..9d23088
--- /dev/null
+++ b/src/utils/entropy_decoder_test.cc
@@ -0,0 +1,1259 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/entropy_decoder.h"
+
+#include <cstdint>
+#include <cstdio>
+
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+
+namespace libgav1 {
+namespace {
+
+#include "src/utils/entropy_decoder_test_data.inc"
+
+class EntropyDecoderTest : public testing::Test {
+ protected:
+  // If compile_time is true, tests
+  //   bool EntropyDecoder::ReadSymbol(uint16_t* cdf).
+  // Otherwise, tests
+  //   int EntropyDecoder::ReadSymbol(uint16_t* cdf, int symbol_count)
+  // with symbol_count=2.
+  template <bool compile_time>
+  void TestReadSymbolBoolean(int num_runs);
+
+  // For N = 3..16 (except 15):
+  //   template <bool compile_time>
+  //   void TestReadSymbolN(int num_runs);
+  //
+  // If compile_time is true, tests
+  //   int EntropyDecoder::ReadSymbol<N>(uint16_t* const cdf).
+  // Otherwise, tests
+  //   int EntropyDecoder::ReadSymbol(uint16_t* cdf, int symbol_count)
+  // with symbol_count=N.
+  //
+  // NOTE: symbol_count=15 is not tested because AV1 does not use it.
+  template <bool compile_time>
+  void TestReadSymbol3(int num_runs);
+
+  template <bool compile_time>
+  void TestReadSymbol4(int num_runs);
+
+  template <bool compile_time>
+  void TestReadSymbol5(int num_runs);
+
+  template <bool compile_time>
+  void TestReadSymbol6(int num_runs);
+
+  template <bool compile_time>
+  void TestReadSymbol7(int num_runs);
+
+  template <bool compile_time>
+  void TestReadSymbol8(int num_runs);
+
+  template <bool compile_time>
+  void TestReadSymbol9(int num_runs);
+
+  template <bool compile_time>
+  void TestReadSymbol10(int num_runs);
+
+  template <bool compile_time>
+  void TestReadSymbol11(int num_runs);
+
+  template <bool compile_time>
+  void TestReadSymbol12(int num_runs);
+
+  template <bool compile_time>
+  void TestReadSymbol13(int num_runs);
+
+  template <bool compile_time>
+  void TestReadSymbol14(int num_runs);
+
+  template <bool compile_time>
+  void TestReadSymbol16(int num_runs);
+};
+
+template <bool compile_time>
+void EntropyDecoderTest::TestReadSymbolBoolean(int num_runs) {
+  static constexpr int kSymbols[4][4] = {{0, 0, 1, 1},  //
+                                         {0, 1, 1, 0},  //
+                                         {1, 0, 1, 0},  //
+                                         {1, 0, 0, 1}};
+  absl::Duration elapsed_time;
+  bool symbols[1024 * 4 * 4];
+  for (int run = 0; run < num_runs; ++run) {
+    EntropyDecoder reader(kBytesTestReadSymbolBoolean,
+                          kNumBytesTestReadSymbolBoolean,
+                          /*allow_update_cdf=*/true);
+    uint16_t cdf[4][3] = {
+        {16384, 0, 0},
+        {32768 - 8386, 0, 0},
+        {32768 - 24312, 0, 0},
+        {16384, 0, 0},
+    };
+    const absl::Time start = absl::Now();
+    int index = 0;
+    for (int i = 0; i < 1024; ++i) {
+      for (int j = 0; j < 4; ++j) {
+        for (int k = 0; k < 4; ++k) {  // NOLINT(modernize-loop-convert)
+          if (compile_time) {
+            symbols[index++] = reader.ReadSymbol(cdf[k]);
+          } else {
+            symbols[index++] = reader.ReadSymbol(cdf[k], 2) != 0;
+          }
+        }
+      }
+    }
+    elapsed_time += absl::Now() - start;
+  }
+  if (compile_time) {
+    printf("TestReadSymbolBooleanCompileTime(%d): %5d us\n", num_runs,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+  } else {
+    printf("TestReadSymbolBoolean(%d): %5d us\n", num_runs,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+  }
+
+  int index = 0;
+  for (int i = 0; i < 1024; ++i) {
+    for (int j = 0; j < 4; ++j) {  // NOLINT(modernize-loop-convert)
+      for (int k = 0; k < 4; ++k) {
+        ASSERT_EQ(symbols[index++], kSymbols[j][k]);
+      }
+    }
+  }
+}
+
+template <bool compile_time>
+void EntropyDecoderTest::TestReadSymbol3(int num_runs) {
+  static constexpr int kSymbols[6][4] = {{0, 2, 1, 2},  //
+                                         {1, 1, 2, 1},  //
+                                         {2, 0, 0, 0},  //
+                                         {0, 2, 0, 2},  //
+                                         {1, 2, 1, 0},  //
+                                         {2, 1, 1, 0}};
+  absl::Duration elapsed_time;
+  int symbols[1024 * 6 * 4];
+  for (int run = 0; run < num_runs; ++run) {
+    EntropyDecoder reader(kBytesTestReadSymbol3, kNumBytesTestReadSymbol3,
+                          /*allow_update_cdf=*/true);
+    uint16_t cdf[4][4] = {
+        // pdf: 1/3, 1/3, 1/3
+        {32768 - 10923, 32768 - 21845, 0, 0},
+        // pdf: 1/6, 2/6, 3/6
+        {32768 - 5461, 32768 - 16384, 0, 0},
+        // pdf: 2/6, 3/6, 1/6
+        {32768 - 10923, 32768 - 27307, 0, 0},
+        // pdf: 3/6, 1/6, 2/6
+        {32768 - 16384, 32768 - 21845, 0, 0},
+    };
+    const absl::Time start = absl::Now();
+    int index = 0;
+    for (int i = 0; i < 1024; ++i) {
+      for (int j = 0; j < 6; ++j) {
+        for (int k = 0; k < 4; ++k) {  // NOLINT(modernize-loop-convert)
+          if (compile_time) {
+            symbols[index++] = reader.ReadSymbol<3>(cdf[k]);
+          } else {
+            symbols[index++] = reader.ReadSymbol(cdf[k], 3);
+          }
+        }
+      }
+    }
+    elapsed_time += absl::Now() - start;
+  }
+  if (compile_time) {
+    printf("TestReadSymbol3CompileTime(%d): %5d us\n", num_runs,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+  } else {
+    printf("TestReadSymbol3(%d): %5d us\n", num_runs,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+  }
+
+  int index = 0;
+  for (int i = 0; i < 1024; ++i) {
+    for (int j = 0; j < 6; ++j) {  //
NOLINT(modernize-loop-convert) + for (int k = 0; k < 4; ++k) { + ASSERT_EQ(symbols[index++], kSymbols[j][k]); + } + } + } +} + +template +void EntropyDecoderTest::TestReadSymbol4(int num_runs) { + static constexpr int kSymbols[8][4] = {{0, 0, 3, 3}, // + {0, 0, 2, 2}, // + {1, 1, 0, 0}, // + {1, 2, 1, 1}, // + {2, 2, 3, 2}, // + {2, 3, 2, 1}, // + {3, 3, 0, 0}, // + {3, 3, 1, 1}}; + absl::Duration elapsed_time; + int symbols[1024 * 8 * 4]; + for (int run = 0; run < num_runs; ++run) { + EntropyDecoder reader(kBytesTestReadSymbol4, kNumBytesTestReadSymbol4, + /*allow_update_cdf=*/true); + uint16_t cdf[4][5] = { + // pdf: 1/4, 1/4, 1/4, 1/4 + {32768 - 8192, 32768 - 16384, 32768 - 24576, 0, 0}, + // pdf: 2/8, 1/8, 2/8, 3/8 + {32768 - 8192, 32768 - 12288, 32768 - 20480, 0, 0}, + // pdf: 1/4, 1/4, 1/4, 1/4 + {32768 - 8192, 32768 - 16384, 32768 - 24576, 0, 0}, + // pdf: 2/8, 3/8, 2/8, 1/8 + {32768 - 8192, 32768 - 20480, 32768 - 28672, 0, 0}, + }; + const absl::Time start = absl::Now(); + int index = 0; + for (int i = 0; i < 1024; ++i) { + for (int j = 0; j < 8; ++j) { + for (int k = 0; k < 4; ++k) { // NOLINT(modernize-loop-convert) + if (compile_time) { + symbols[index++] = reader.ReadSymbol<4>(cdf[k]); + } else { + symbols[index++] = reader.ReadSymbol(cdf[k], 4); + } + } + } + } + elapsed_time += absl::Now() - start; + } + if (compile_time) { + printf("TestReadSymbol4CompileTime(%d): %5d us\n", num_runs, + static_cast(absl::ToInt64Microseconds(elapsed_time))); + } else { + printf("TestReadSymbol4(%d): %5d us\n", num_runs, + static_cast(absl::ToInt64Microseconds(elapsed_time))); + } + + int index = 0; + for (int i = 0; i < 1024; ++i) { + for (int j = 0; j < 8; ++j) { // NOLINT(modernize-loop-convert) + for (int k = 0; k < 4; ++k) { + ASSERT_EQ(symbols[index++], kSymbols[j][k]); + } + } + } +} + +template +void EntropyDecoderTest::TestReadSymbol5(int num_runs) { + static constexpr int kSymbols[10][4] = {{0, 0, 4, 4}, // + {0, 1, 3, 3}, // + {1, 2, 2, 2}, // + {1, 3, 1, 1}, // + {2, 4, 0, 0}, // + {2, 0, 4, 3}, // + {3, 1, 3, 2}, // + {3, 2, 2, 1}, // + {4, 3, 1, 2}, // + {4, 0, 4, 2}}; + absl::Duration elapsed_time; + int symbols[320 * 10 * 4]; + for (int run = 0; run < num_runs; ++run) { + EntropyDecoder reader(kBytesTestReadSymbol5, kNumBytesTestReadSymbol5, + /*allow_update_cdf=*/true); + uint16_t cdf[4][6] = { + // pdf: 1/5, 1/5, 1/5, 1/5, 1/5 + {32768 - 6554, 32768 - 13107, 32768 - 19661, 32768 - 26214, 0, 0}, + // pdf: 3/10, 2/10, 2/10, 2/10, 1/10 + {32768 - 9830, 32768 - 16384, 32768 - 22938, 32768 - 29491, 0, 0}, + // pdf: 1/10, 2/10, 2/10, 2/10, 3/10 + {32768 - 3277, 32768 - 9830, 32768 - 16384, 32768 - 22938, 0, 0}, + // pdf: 1/10, 2/10, 4/10, 2/10, 1/10 + {32768 - 3277, 32768 - 9830, 32768 - 22938, 32768 - 29491, 0, 0}, + }; + const absl::Time start = absl::Now(); + int index = 0; + for (int i = 0; i < 320; ++i) { + for (int j = 0; j < 10; ++j) { + for (int k = 0; k < 4; ++k) { // NOLINT(modernize-loop-convert) + if (compile_time) { + symbols[index++] = reader.ReadSymbol<5>(cdf[k]); + } else { + symbols[index++] = reader.ReadSymbol(cdf[k], 5); + } + } + } + } + elapsed_time += absl::Now() - start; + } + if (compile_time) { + printf("TestReadSymbol5CompileTime(%d): %5d us\n", num_runs, + static_cast(absl::ToInt64Microseconds(elapsed_time))); + } else { + printf("TestReadSymbol5(%d): %5d us\n", num_runs, + static_cast(absl::ToInt64Microseconds(elapsed_time))); + } + + int index = 0; + for (int i = 0; i < 320; ++i) { + for (int j = 0; j < 10; ++j) { // NOLINT(modernize-loop-convert) + for 
(int k = 0; k < 4; ++k) { + ASSERT_EQ(symbols[index++], kSymbols[j][k]); + } + } + } +} + +template +void EntropyDecoderTest::TestReadSymbol6(int num_runs) { + static constexpr int kSymbols[12][4] = {{0, 0, 5, 5}, // + {0, 1, 4, 4}, // + {1, 2, 3, 3}, // + {1, 3, 2, 2}, // + {2, 4, 1, 1}, // + {2, 5, 0, 0}, // + {3, 0, 5, 4}, // + {3, 1, 4, 3}, // + {4, 2, 3, 2}, // + {4, 3, 2, 1}, // + {5, 4, 1, 3}, // + {5, 0, 5, 2}}; + absl::Duration elapsed_time; + int symbols[256 * 12 * 4]; + for (int run = 0; run < num_runs; ++run) { + EntropyDecoder reader(kBytesTestReadSymbol6, kNumBytesTestReadSymbol6, + /*allow_update_cdf=*/true); + uint16_t cdf[4][7] = { + // pmf: 1/6, 1/6, 1/6, 1/6, 1/6, 1/6 + {32768 - 5461, 32768 - 10923, 32768 - 16384, 32768 - 21845, + 32768 - 27307, 0, 0}, + // pmf: 3/12, 2/12, 2/12, 2/12, 2/12, 1/12 + {32768 - 8192, 32768 - 13653, 32768 - 19115, 32768 - 24576, + 32768 - 30037, 0, 0}, + // pmf: 1/12, 2/12, 2/12, 2/12, 2/12, 3/12 + {32768 - 2731, 32768 - 8192, 32768 - 13653, 32768 - 19115, + 32768 - 24576, 0, 0}, + // pmf: 1/12, 2/12, 3/12, 3/12, 2/12, 1/12 + {32768 - 2731, 32768 - 8192, 32768 - 16384, 32768 - 24576, + 32768 - 30037, 0, 0}, + }; + const absl::Time start = absl::Now(); + int index = 0; + for (int i = 0; i < 256; ++i) { + for (int j = 0; j < 12; ++j) { + for (int k = 0; k < 4; ++k) { // NOLINT(modernize-loop-convert) + if (compile_time) { + symbols[index++] = reader.ReadSymbol<6>(cdf[k]); + } else { + symbols[index++] = reader.ReadSymbol(cdf[k], 6); + } + } + } + } + elapsed_time += absl::Now() - start; + } + if (compile_time) { + printf("TestReadSymbol6CompileTime(%d): %5d us\n", num_runs, + static_cast(absl::ToInt64Microseconds(elapsed_time))); + } else { + printf("TestReadSymbol6(%d): %5d us\n", num_runs, + static_cast(absl::ToInt64Microseconds(elapsed_time))); + } + + int index = 0; + for (int i = 0; i < 256; ++i) { + for (int j = 0; j < 12; ++j) { // NOLINT(modernize-loop-convert) + for (int k = 0; k < 4; ++k) { + ASSERT_EQ(symbols[index++], kSymbols[j][k]); + } + } + } +} + +template +void EntropyDecoderTest::TestReadSymbol7(int num_runs) { + static constexpr int kSymbols[14][4] = {{0, 4, 6, 3}, // + {1, 5, 5, 2}, // + {2, 6, 4, 1}, // + {3, 0, 3, 0}, // + {4, 1, 2, 6}, // + {5, 2, 1, 5}, // + {6, 3, 0, 4}, // + {0, 0, 6, 5}, // + {2, 1, 4, 3}, // + {4, 3, 6, 1}, // + {6, 5, 2, 4}, // + {1, 0, 5, 2}, // + {3, 2, 3, 2}, // + {5, 4, 5, 3}}; + absl::Duration elapsed_time; + int symbols[1024 * 14 * 4]; + for (int run = 0; run < num_runs; ++run) { + EntropyDecoder reader(kBytesTestReadSymbol7, kNumBytesTestReadSymbol7, + /*allow_update_cdf=*/true); + uint16_t cdf[4][8] = { + // pdf: 1/7, 1/7, 1/7, 1/7, 1/7, 1/7, 1/7 + {32768 - 4681, 32768 - 9362, 32768 - 14043, 32768 - 18725, + 32768 - 23406, 32768 - 28087, 0, 0}, + // pdf: 3/14, 2/14, 2/14, 2/14, 2/14, 2/14, 1/14 + {32768 - 7022, 32768 - 11703, 32768 - 16384, 32768 - 21065, + 32768 - 25746, 32768 - 30427, 0, 0}, + // pdf: 1/14, 1/14, 2/14, 2/14, 2/14, 3/14, 3/14 + {32768 - 2341, 32768 - 4681, 32768 - 9362, 32768 - 14043, 32768 - 18725, + 32768 - 25746, 0, 0}, + // pdf: 1/14, 2/14, 3/14, 3/14, 2/14, 2/14, 1/14 + {32768 - 2341, 32768 - 7022, 32768 - 14043, 32768 - 21065, + 32768 - 25746, 32768 - 30427, 0, 0}, + }; + const absl::Time start = absl::Now(); + int index = 0; + for (int i = 0; i < 1024; ++i) { + for (int j = 0; j < 14; ++j) { + for (int k = 0; k < 4; ++k) { // NOLINT(modernize-loop-convert) + if (compile_time) { + symbols[index++] = reader.ReadSymbol<7>(cdf[k]); + } else { + symbols[index++] = 
reader.ReadSymbol(cdf[k], 7); + } + } + } + } + elapsed_time += absl::Now() - start; + } + if (compile_time) { + printf("TestReadSymbol7CompileTime(%d): %5d us\n", num_runs, + static_cast(absl::ToInt64Microseconds(elapsed_time))); + } else { + printf("TestReadSymbol7(%d): %5d us\n", num_runs, + static_cast(absl::ToInt64Microseconds(elapsed_time))); + } + + int index = 0; + for (int i = 0; i < 1024; ++i) { + for (int j = 0; j < 14; ++j) { // NOLINT(modernize-loop-convert) + for (int k = 0; k < 4; ++k) { + ASSERT_EQ(symbols[index++], kSymbols[j][k]); + } + } + } +} + +template +void EntropyDecoderTest::TestReadSymbol8(int num_runs) { + static constexpr int kSymbols[16][4] = {{0, 4, 7, 3}, // + {1, 5, 6, 2}, // + {2, 6, 5, 1}, // + {3, 7, 4, 0}, // + {4, 0, 3, 7}, // + {5, 1, 2, 6}, // + {6, 2, 1, 5}, // + {7, 3, 0, 4}, // + {0, 0, 6, 5}, // + {2, 1, 4, 3}, // + {4, 3, 6, 4}, // + {6, 5, 2, 2}, // + {1, 0, 7, 3}, // + {3, 2, 5, 5}, // + {5, 4, 7, 2}, // + {7, 6, 3, 4}}; + absl::Duration elapsed_time; + int symbols[1024 * 16 * 4]; + for (int run = 0; run < num_runs; ++run) { + EntropyDecoder reader(kBytesTestReadSymbol8, kNumBytesTestReadSymbol8, + /*allow_update_cdf=*/true); + uint16_t cdf[4][9] = { + // pdf: 1/8, 1/8, 1/8, 1/8, 1/8, 1/8, 1/8, 1/8 + {32768 - 4096, 32768 - 8192, 32768 - 12288, 32768 - 16384, + 32768 - 20480, 32768 - 24576, 32768 - 28672, 0, 0}, + // pdf: 3/16, 2/16, 2/16, 2/16, 2/16, 2/16, 2/16, 1/16 + {32768 - 6144, 32768 - 10240, 32768 - 14336, 32768 - 18432, + 32768 - 22528, 32768 - 26624, 32768 - 30720, 0, 0}, + // pdf: 1/16, 1/16, 2/16, 2/16, 2/16, 2/16, 3/16, 3/16 + {32768 - 2048, 32768 - 4096, 32768 - 8192, 32768 - 12288, 32768 - 16384, + 32768 - 20480, 32768 - 26624, 0, 0}, + // pdf: 1/16, 1/16, 3/16, 3/16, 3/16, 3/16, 1/16, 1/16 + {32768 - 2048, 32768 - 4096, 32768 - 10240, 32768 - 16384, + 32768 - 22528, 32768 - 28672, 32768 - 30720, 0, 0}, + }; + const absl::Time start = absl::Now(); + int index = 0; + for (int i = 0; i < 1024; ++i) { + for (int j = 0; j < 16; ++j) { + for (int k = 0; k < 4; ++k) { // NOLINT(modernize-loop-convert) + if (compile_time) { + symbols[index++] = reader.ReadSymbol<8>(cdf[k]); + } else { + symbols[index++] = reader.ReadSymbol(cdf[k], 8); + } + } + } + } + elapsed_time += absl::Now() - start; + } + if (compile_time) { + printf("TestReadSymbol8CompileTime(%d): %5d us\n", num_runs, + static_cast(absl::ToInt64Microseconds(elapsed_time))); + } else { + printf("TestReadSymbol8(%d): %5d us\n", num_runs, + static_cast(absl::ToInt64Microseconds(elapsed_time))); + } + + int index = 0; + for (int i = 0; i < 1024; ++i) { + for (int j = 0; j < 16; ++j) { // NOLINT(modernize-loop-convert) + for (int k = 0; k < 4; ++k) { + ASSERT_EQ(symbols[index++], kSymbols[j][k]); + } + } + } +} + +template +void EntropyDecoderTest::TestReadSymbol9(int num_runs) { + static constexpr int kSymbols[18][4] = {{0, 4, 8, 3}, // + {1, 5, 7, 2}, // + {2, 6, 6, 1}, // + {3, 7, 5, 0}, // + {4, 8, 4, 8}, // + {5, 0, 3, 7}, // + {6, 1, 2, 6}, // + {7, 2, 1, 5}, // + {8, 3, 0, 4}, // + {0, 0, 8, 7}, // + {2, 1, 6, 5}, // + {4, 3, 4, 3}, // + {6, 5, 2, 1}, // + {8, 7, 7, 6}, // + {1, 0, 5, 4}, // + {3, 2, 3, 2}, // + {5, 4, 1, 4}, // + {7, 6, 8, 4}}; + absl::Duration elapsed_time; + int symbols[128 * 18 * 4]; + for (int run = 0; run < num_runs; ++run) { + EntropyDecoder reader(kBytesTestReadSymbol9, kNumBytesTestReadSymbol9, + /*allow_update_cdf=*/true); + uint16_t cdf[4][10] = { + // pmf: 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9 + {32768 - 3641, 32768 - 7282, 32768 - 10923, 
32768 - 14564, + 32768 - 18204, 32768 - 21845, 32768 - 25486, 32768 - 29127, 0, 0}, + // pmf: 3/18, 2/18, 2/18, 2/18, 2/18, 2/18, 2/18, 2/18, 1/18 + {32768 - 5461, 32768 - 9102, 32768 - 12743, 32768 - 16384, + 32768 - 20025, 32768 - 23666, 32768 - 27307, 32768 - 30948, 0, 0}, + // pmf: 1/18, 2/18, 2/18, 2/18, 2/18, 2/18, 2/18, 2/18, 3/18 + {32768 - 1820, 32768 - 5461, 32768 - 9102, 32768 - 12743, 32768 - 16384, + 32768 - 20025, 32768 - 23666, 32768 - 27307, 0, 0}, + // pmf: 1/18, 2/18, 2/18, 2/18, 4/18, 2/18, 2/18, 2/18, 1/18 + {32768 - 1820, 32768 - 5461, 32768 - 9102, 32768 - 12743, 32768 - 20025, + 32768 - 23666, 32768 - 27307, 32768 - 30948, 0, 0}, + }; + const absl::Time start = absl::Now(); + int index = 0; + for (int i = 0; i < 128; ++i) { + for (int j = 0; j < 18; ++j) { + for (int k = 0; k < 4; ++k) { // NOLINT(modernize-loop-convert) + if (compile_time) { + symbols[index++] = reader.ReadSymbol<9>(cdf[k]); + } else { + symbols[index++] = reader.ReadSymbol(cdf[k], 9); + } + } + } + } + elapsed_time += absl::Now() - start; + } + if (compile_time) { + printf("TestReadSymbol9CompileTime(%d): %5d us\n", num_runs, + static_cast(absl::ToInt64Microseconds(elapsed_time))); + } else { + printf("TestReadSymbol9(%d): %5d us\n", num_runs, + static_cast(absl::ToInt64Microseconds(elapsed_time))); + } + + int index = 0; + for (int i = 0; i < 128; ++i) { + for (int j = 0; j < 18; ++j) { // NOLINT(modernize-loop-convert) + for (int k = 0; k < 4; ++k) { + ASSERT_EQ(symbols[index++], kSymbols[j][k]); + } + } + } +} + +template +void EntropyDecoderTest::TestReadSymbol10(int num_runs) { + static constexpr int kSymbols[20][4] = {{0, 5, 9, 4}, // + {1, 6, 8, 3}, // + {2, 7, 7, 2}, // + {3, 8, 6, 1}, // + {4, 9, 5, 0}, // + {5, 0, 4, 9}, // + {6, 1, 3, 8}, // + {7, 2, 2, 7}, // + {8, 3, 1, 6}, // + {9, 4, 0, 5}, // + {0, 0, 9, 7}, // + {2, 1, 8, 5}, // + {4, 3, 6, 3}, // + {6, 5, 4, 1}, // + {8, 7, 2, 8}, // + {1, 0, 9, 6}, // + {3, 2, 7, 4}, // + {5, 4, 5, 2}, // + {7, 6, 3, 5}, // + {9, 8, 1, 4}}; + absl::Duration elapsed_time; + int symbols[96 * 20 * 4]; + for (int run = 0; run < num_runs; ++run) { + EntropyDecoder reader(kBytesTestReadSymbol10, kNumBytesTestReadSymbol10, + /*allow_update_cdf=*/true); + uint16_t cdf[4][11] = { + // pmf: 1/10, 1/10, 1/10, 1/10, 1/10, 1/10, 1/10, 1/10, 1/10, 1/10 + {32768 - 3277, 32768 - 6554, 32768 - 9830, 32768 - 13107, 32768 - 16384, + 32768 - 19661, 32768 - 22938, 32768 - 26214, 32768 - 29491, 0, 0}, + // pmf: 3/20, 2/20, 2/20, 2/20, 2/20, 2/20, 2/20, 2/20, 2/20, 1/20 + {32768 - 4915, 32768 - 8192, 32768 - 11469, 32768 - 14746, + 32768 - 18022, 32768 - 21299, 32768 - 24576, 32768 - 27853, + 32768 - 31130, 0, 0}, + // pmf: 1/20, 2/20, 2/20, 2/20, 2/20, 2/20, 2/20, 2/20, 2/20, 3/20 + {32768 - 1638, 32768 - 4915, 32768 - 8192, 32768 - 11469, 32768 - 14746, + 32768 - 18022, 32768 - 21299, 32768 - 24576, 32768 - 27853, 0, 0}, + // pmf: 1/20, 2/20, 2/20, 2/20, 3/20, 3/20, 2/20, 2/20, 2/20, 1/20 + {32768 - 1638, 32768 - 4915, 32768 - 8192, 32768 - 11469, 32768 - 16384, + 32768 - 21299, 32768 - 24576, 32768 - 27853, 32768 - 31130, 0, 0}, + }; + const absl::Time start = absl::Now(); + int index = 0; + for (int i = 0; i < 96; ++i) { + for (int j = 0; j < 20; ++j) { + for (int k = 0; k < 4; ++k) { // NOLINT(modernize-loop-convert) + if (compile_time) { + symbols[index++] = reader.ReadSymbol<10>(cdf[k]); + } else { + symbols[index++] = reader.ReadSymbol(cdf[k], 10); + } + } + } + } + elapsed_time += absl::Now() - start; + } + if (compile_time) { + 
printf("TestReadSymbol10CompileTime(%d): %5d us\n", num_runs, + static_cast(absl::ToInt64Microseconds(elapsed_time))); + } else { + printf("TestReadSymbol10(%d): %5d us\n", num_runs, + static_cast(absl::ToInt64Microseconds(elapsed_time))); + } + + int index = 0; + for (int i = 0; i < 96; ++i) { + for (int j = 0; j < 20; ++j) { // NOLINT(modernize-loop-convert) + for (int k = 0; k < 4; ++k) { + ASSERT_EQ(symbols[index++], kSymbols[j][k]); + } + } + } +} + +template +void EntropyDecoderTest::TestReadSymbol11(int num_runs) { + static constexpr int kSymbols[22][4] = {{0, 6, 10, 5}, // + {1, 7, 9, 4}, // + {2, 8, 8, 3}, // + {3, 9, 7, 2}, // + {4, 10, 6, 1}, // + {5, 0, 5, 0}, // + {6, 1, 4, 10}, // + {7, 2, 3, 9}, // + {8, 3, 2, 8}, // + {9, 4, 1, 7}, // + {10, 5, 0, 6}, // + {0, 0, 10, 9}, // + {2, 1, 8, 7}, // + {4, 3, 6, 5}, // + {6, 5, 4, 3}, // + {8, 7, 2, 1}, // + {10, 9, 10, 8}, // + {1, 0, 9, 6}, // + {3, 2, 7, 4}, // + {5, 4, 5, 2}, // + {7, 6, 3, 5}, // + {9, 8, 1, 5}}; + absl::Duration elapsed_time; + int symbols[96 * 22 * 4]; + for (int run = 0; run < num_runs; ++run) { + EntropyDecoder reader(kBytesTestReadSymbol11, kNumBytesTestReadSymbol11, + /*allow_update_cdf=*/true); + uint16_t cdf[4][12] = { + // pmf: 1/11, 1/11, 1/11, 1/11, 1/11, 1/11, 1/11, 1/11, 1/11, 1/11, 1/11 + {32768 - 2979, 32768 - 5958, 32768 - 8937, 32768 - 11916, 32768 - 14895, + 32768 - 17873, 32768 - 20852, 32768 - 23831, 32768 - 26810, + 32768 - 29789, 0, 0}, + // pmf: 3/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 1/22 + {32768 - 4468, 32768 - 7447, 32768 - 10426, 32768 - 13405, + 32768 - 16384, 32768 - 19363, 32768 - 22342, 32768 - 25321, + 32768 - 28300, 32768 - 31279, 0, 0}, + // pmf: 1/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 3/22 + {32768 - 1489, 32768 - 4468, 32768 - 7447, 32768 - 10426, 32768 - 13405, + 32768 - 16384, 32768 - 19363, 32768 - 22342, 32768 - 25321, + 32768 - 28300, 0, 0}, + // pmf: 1/22, 2/22, 2/22, 2/22, 2/22, 4/22, 2/22, 2/22, 2/22, 2/22, 1/22 + {32768 - 1489, 32768 - 4468, 32768 - 7447, 32768 - 10426, 32768 - 13405, + 32768 - 19363, 32768 - 22342, 32768 - 25321, 32768 - 28300, + 32768 - 31279, 0, 0}, + }; + const absl::Time start = absl::Now(); + int index = 0; + for (int i = 0; i < 96; ++i) { + for (int j = 0; j < 22; ++j) { + for (int k = 0; k < 4; ++k) { // NOLINT(modernize-loop-convert) + if (compile_time) { + symbols[index++] = reader.ReadSymbol<11>(cdf[k]); + } else { + symbols[index++] = reader.ReadSymbol(cdf[k], 11); + } + } + } + } + elapsed_time += absl::Now() - start; + } + if (compile_time) { + printf("TestReadSymbol11CompileTime(%d): %5d us\n", num_runs, + static_cast(absl::ToInt64Microseconds(elapsed_time))); + } else { + printf("TestReadSymbol11(%d): %5d us\n", num_runs, + static_cast(absl::ToInt64Microseconds(elapsed_time))); + } + + int index = 0; + for (int i = 0; i < 96; ++i) { + for (int j = 0; j < 22; ++j) { // NOLINT(modernize-loop-convert) + for (int k = 0; k < 4; ++k) { + ASSERT_EQ(symbols[index++], kSymbols[j][k]); + } + } + } +} + +template +void EntropyDecoderTest::TestReadSymbol12(int num_runs) { + static constexpr int kSymbols[24][4] = {{0, 6, 11, 5}, // + {1, 7, 10, 4}, // + {2, 8, 9, 3}, // + {3, 9, 8, 2}, // + {4, 10, 7, 1}, // + {5, 11, 6, 0}, // + {6, 0, 5, 11}, // + {7, 1, 4, 10}, // + {8, 2, 3, 9}, // + {9, 3, 2, 8}, // + {10, 4, 1, 7}, // + {11, 5, 0, 6}, // + {0, 0, 11, 9}, // + {2, 1, 10, 7}, // + {4, 3, 8, 5}, // + {6, 5, 6, 3}, // + {8, 7, 4, 1}, // + {10, 9, 2, 10}, // + {1, 0, 11, 8}, // + {3, 2, 9, 6}, // + {5, 4, 
7, 4}, // + {7, 6, 5, 2}, // + {9, 8, 3, 6}, // + {11, 10, 1, 5}}; + absl::Duration elapsed_time; + int symbols[80 * 24 * 4]; + for (int run = 0; run < num_runs; ++run) { + EntropyDecoder reader(kBytesTestReadSymbol12, kNumBytesTestReadSymbol12, + /*allow_update_cdf=*/true); + uint16_t cdf[4][13] = { + // pmf: 1/12, 1/12, 1/12, 1/12, 1/12, 1/12, 1/12, 1/12, 1/12, 1/12, + // 1/12, + // 1/12 + {32768 - 2731, 32768 - 5461, 32768 - 8192, 32768 - 10923, 32768 - 13653, + 32768 - 16384, 32768 - 19115, 32768 - 21845, 32768 - 24576, + 32768 - 27307, 32768 - 30037, 0, 0}, + // pmf: 3/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, + // 2/24, + // 1/24 + {32768 - 4096, 32768 - 6827, 32768 - 9557, 32768 - 12288, 32768 - 15019, + 32768 - 17749, 32768 - 20480, 32768 - 23211, 32768 - 25941, + 32768 - 28672, 32768 - 31403, 0, 0}, + // pmf: 1/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, + // 2/24, + // 3/24 + {32768 - 1365, 32768 - 4096, 32768 - 6827, 32768 - 9557, 32768 - 12288, + 32768 - 15019, 32768 - 17749, 32768 - 20480, 32768 - 23211, + 32768 - 25941, 32768 - 28672, 0, 0}, + // pmf: 1/24, 2/24, 2/24, 2/24, 2/24, 3/24, 3/24, 2/24, 2/24, 2/24, + // 2/24, + // 1/24 + {32768 - 1365, 32768 - 4096, 32768 - 6827, 32768 - 9557, 32768 - 12288, + 32768 - 16384, 32768 - 20480, 32768 - 23211, 32768 - 25941, + 32768 - 28672, 32768 - 31403, 0, 0}, + }; + const absl::Time start = absl::Now(); + int index = 0; + for (int i = 0; i < 80; ++i) { + for (int j = 0; j < 24; ++j) { + for (int k = 0; k < 4; ++k) { // NOLINT(modernize-loop-convert) + if (compile_time) { + symbols[index++] = reader.ReadSymbol<12>(cdf[k]); + } else { + symbols[index++] = reader.ReadSymbol(cdf[k], 12); + } + } + } + } + elapsed_time += absl::Now() - start; + } + if (compile_time) { + printf("TestReadSymbol12CompileTime(%d): %5d us\n", num_runs, + static_cast(absl::ToInt64Microseconds(elapsed_time))); + } else { + printf("TestReadSymbol12(%d): %5d us\n", num_runs, + static_cast(absl::ToInt64Microseconds(elapsed_time))); + } + + int index = 0; + for (int i = 0; i < 80; ++i) { + for (int j = 0; j < 24; ++j) { // NOLINT(modernize-loop-convert) + for (int k = 0; k < 4; ++k) { + ASSERT_EQ(symbols[index++], kSymbols[j][k]); + } + } + } +} + +template +void EntropyDecoderTest::TestReadSymbol13(int num_runs) { + static constexpr int kSymbols[26][4] = {{0, 6, 12, 5}, // + {1, 7, 11, 4}, // + {2, 8, 10, 3}, // + {3, 9, 9, 2}, // + {4, 10, 8, 1}, // + {5, 11, 7, 0}, // + {6, 12, 6, 12}, // + {7, 0, 5, 11}, // + {8, 1, 4, 10}, // + {9, 2, 3, 9}, // + {10, 3, 2, 8}, // + {11, 4, 1, 7}, // + {12, 5, 0, 6}, // + {0, 0, 12, 11}, // + {2, 1, 10, 9}, // + {4, 3, 8, 7}, // + {6, 5, 6, 5}, // + {8, 7, 4, 3}, // + {10, 9, 2, 1}, // + {12, 11, 12, 10}, // + {1, 0, 11, 8}, // + {3, 2, 9, 6}, // + {5, 4, 7, 4}, // + {7, 6, 5, 2}, // + {9, 8, 3, 6}, // + {11, 10, 1, 6}}; + absl::Duration elapsed_time; + int symbols[64 * 26 * 4]; + for (int run = 0; run < num_runs; ++run) { + EntropyDecoder reader(kBytesTestReadSymbol13, kNumBytesTestReadSymbol13, + /*allow_update_cdf=*/true); + uint16_t cdf[4][14] = { + // pmf: 1/13, 1/13, 1/13, 1/13, 1/13, 1/13, 1/13, 1/13, 1/13, 1/13, + // 1/13, 1/13, 1/13 + {32768 - 2521, 32768 - 5041, 32768 - 7562, 32768 - 10082, 32768 - 12603, + 32768 - 15124, 32768 - 17644, 32768 - 20165, 32768 - 22686, + 32768 - 25206, 32768 - 27727, 32768 - 30247, 0, 0}, + // pmf: 3/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, + // 2/26, 2/26, 1/26 + {32768 - 3781, 32768 - 6302, 32768 - 8822, 32768 - 11343, 32768 - 13863, + 
32768 - 16384, 32768 - 18905, 32768 - 21425, 32768 - 23946, + 32768 - 26466, 32768 - 28987, 32768 - 31508, 0, 0}, + // pmf: 1/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, + // 2/26, 2/26, 3/26 + {32768 - 1260, 32768 - 3781, 32768 - 6302, 32768 - 8822, 32768 - 11343, + 32768 - 13863, 32768 - 16384, 32768 - 18905, 32768 - 21425, + 32768 - 23946, 32768 - 26466, 32768 - 28987, 0, 0}, + // pmf: 1/26, 2/26, 2/26, 2/26, 2/26, 2/26, 4/26, 2/26, 2/26, 2/26, + // 2/26, 2/26, 1/26 + {32768 - 1260, 32768 - 3781, 32768 - 6302, 32768 - 8822, 32768 - 11343, + 32768 - 13863, 32768 - 18905, 32768 - 21425, 32768 - 23946, + 32768 - 26466, 32768 - 28987, 32768 - 31508, 0, 0}, + }; + const absl::Time start = absl::Now(); + int index = 0; + for (int i = 0; i < 64; ++i) { + for (int j = 0; j < 26; ++j) { + for (int k = 0; k < 4; ++k) { // NOLINT(modernize-loop-convert) + if (compile_time) { + symbols[index++] = reader.ReadSymbol<13>(cdf[k]); + } else { + symbols[index++] = reader.ReadSymbol(cdf[k], 13); + } + } + } + } + elapsed_time += absl::Now() - start; + } + if (compile_time) { + printf("TestReadSymbol13CompileTime(%d): %5d us\n", num_runs, + static_cast(absl::ToInt64Microseconds(elapsed_time))); + } else { + printf("TestReadSymbol13(%d): %5d us\n", num_runs, + static_cast(absl::ToInt64Microseconds(elapsed_time))); + } + + int index = 0; + for (int i = 0; i < 64; ++i) { + for (int j = 0; j < 26; ++j) { // NOLINT(modernize-loop-convert) + for (int k = 0; k < 4; ++k) { + ASSERT_EQ(symbols[index++], kSymbols[j][k]); + } + } + } +} + +template +void EntropyDecoderTest::TestReadSymbol14(int num_runs) { + static constexpr int kSymbols[28][4] = {{0, 7, 13, 6}, // + {1, 8, 12, 5}, // + {2, 9, 11, 4}, // + {3, 10, 10, 3}, // + {4, 11, 9, 2}, // + {5, 12, 8, 1}, // + {6, 13, 7, 0}, // + {7, 0, 6, 13}, // + {8, 1, 5, 12}, // + {9, 2, 4, 11}, // + {10, 3, 3, 10}, // + {11, 4, 2, 9}, // + {12, 5, 1, 8}, // + {13, 6, 0, 7}, // + {0, 0, 13, 11}, // + {2, 1, 12, 9}, // + {4, 3, 10, 7}, // + {6, 5, 8, 5}, // + {8, 7, 6, 3}, // + {10, 9, 4, 1}, // + {12, 11, 2, 12}, // + {1, 0, 13, 10}, // + {3, 2, 11, 8}, // + {5, 4, 9, 6}, // + {7, 6, 7, 4}, // + {9, 8, 5, 2}, // + {11, 10, 3, 7}, // + {13, 12, 1, 6}}; + absl::Duration elapsed_time; + int symbols[64 * 28 * 4]; + for (int run = 0; run < num_runs; ++run) { + EntropyDecoder reader(kBytesTestReadSymbol14, kNumBytesTestReadSymbol14, + /*allow_update_cdf=*/true); + uint16_t cdf[4][15] = { + // pmf: 1/14, 1/14, 1/14, 1/14, 1/14, 1/14, 1/14, 1/14, 1/14, 1/14, + // 1/14, 1/14, 1/14, 1/14 + {32768 - 2341, 32768 - 4681, 32768 - 7022, 32768 - 9362, 32768 - 11703, + 32768 - 14043, 32768 - 16384, 32768 - 18725, 32768 - 21065, + 32768 - 23406, 32768 - 25746, 32768 - 28087, 32768 - 30427, 0, 0}, + // pmf: 3/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, + // 2/28, 2/28, 2/28, 1/28 + {32768 - 3511, 32768 - 5851, 32768 - 8192, 32768 - 10533, 32768 - 12873, + 32768 - 15214, 32768 - 17554, 32768 - 19895, 32768 - 22235, + 32768 - 24576, 32768 - 26917, 32768 - 29257, 32768 - 31598, 0, 0}, + // pmf: 1/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, + // 2/28, 2/28, 2/28, 3/28 + {32768 - 1170, 32768 - 3511, 32768 - 5851, 32768 - 8192, 32768 - 10533, + 32768 - 12873, 32768 - 15214, 32768 - 17554, 32768 - 19895, + 32768 - 22235, 32768 - 24576, 32768 - 26917, 32768 - 29257, 0, 0}, + // pmf: 1/28, 2/28, 2/28, 2/28, 2/28, 2/28, 3/28, 3/28, 2/28, 2/28, + // 2/28, 2/28, 2/28, 1/28 + {32768 - 1170, 32768 - 3511, 32768 - 5851, 32768 - 8192, 32768 - 10533, + 32768 - 12873, 
32768 - 16384, 32768 - 19895, 32768 - 22235, + 32768 - 24576, 32768 - 26917, 32768 - 29257, 32768 - 31598, 0, 0}, + }; + const absl::Time start = absl::Now(); + int index = 0; + for (int i = 0; i < 64; ++i) { + for (int j = 0; j < 28; ++j) { + for (int k = 0; k < 4; ++k) { // NOLINT(modernize-loop-convert) + if (compile_time) { + symbols[index++] = reader.ReadSymbol<14>(cdf[k]); + } else { + symbols[index++] = reader.ReadSymbol(cdf[k], 14); + } + } + } + } + elapsed_time += absl::Now() - start; + } + if (compile_time) { + printf("TestReadSymbol14CompileTime(%d): %5d us\n", num_runs, + static_cast(absl::ToInt64Microseconds(elapsed_time))); + } else { + printf("TestReadSymbol14(%d): %5d us\n", num_runs, + static_cast(absl::ToInt64Microseconds(elapsed_time))); + } + + int index = 0; + for (int i = 0; i < 64; ++i) { + for (int j = 0; j < 28; ++j) { // NOLINT(modernize-loop-convert) + for (int k = 0; k < 4; ++k) { + ASSERT_EQ(symbols[index++], kSymbols[j][k]); + } + } + } +} + +template +void EntropyDecoderTest::TestReadSymbol16(int num_runs) { + static constexpr int kSymbols[32][4] = {{0, 8, 15, 7}, // + {1, 9, 14, 6}, // + {2, 10, 13, 5}, // + {3, 11, 12, 4}, // + {4, 12, 11, 3}, // + {5, 13, 10, 2}, // + {6, 14, 9, 1}, // + {7, 15, 8, 0}, // + {8, 0, 7, 15}, // + {9, 1, 6, 14}, // + {10, 2, 5, 13}, // + {11, 3, 4, 12}, // + {12, 4, 3, 11}, // + {13, 5, 2, 10}, // + {14, 6, 1, 9}, // + {15, 7, 0, 8}, // + {0, 0, 15, 13}, // + {2, 1, 14, 11}, // + {4, 3, 12, 9}, // + {6, 5, 10, 7}, // + {8, 7, 8, 5}, // + {10, 9, 6, 3}, // + {12, 11, 4, 1}, // + {14, 13, 2, 14}, // + {1, 0, 15, 12}, // + {3, 2, 13, 10}, // + {5, 4, 11, 8}, // + {7, 6, 9, 6}, // + {9, 8, 7, 4}, // + {11, 10, 5, 2}, // + {13, 12, 3, 8}, // + {15, 14, 1, 7}}; + absl::Duration elapsed_time; + int symbols[48 * 32 * 4]; + for (int run = 0; run < num_runs; ++run) { + EntropyDecoder reader(kBytesTestReadSymbol16, kNumBytesTestReadSymbol16, + /*allow_update_cdf=*/true); + uint16_t cdf[4][17] = { + // pmf: 1/16, 1/16, 1/16, 1/16, 1/16, 1/16, 1/16, 1/16, 1/16, 1/16, + // 1/16, 1/16, 1/16, 1/16, 1/16, 1/16 + {32768 - 2048, 32768 - 4096, 32768 - 6144, 32768 - 8192, 32768 - 10240, + 32768 - 12288, 32768 - 14336, 32768 - 16384, 32768 - 18432, + 32768 - 20480, 32768 - 22528, 32768 - 24576, 32768 - 26624, + 32768 - 28672, 32768 - 30720, 0, 0}, + // pmf: 3/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, + // 2/32, 2/32, 2/32, 2/32, 2/32, 1/32 + {32768 - 3072, 32768 - 5120, 32768 - 7168, 32768 - 9216, 32768 - 11264, + 32768 - 13312, 32768 - 15360, 32768 - 17408, 32768 - 19456, + 32768 - 21504, 32768 - 23552, 32768 - 25600, 32768 - 27648, + 32768 - 29696, 32768 - 31744, 0, 0}, + // pmf: 1/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, + // 2/32, 2/32, 2/32, 2/32, 2/32, 3/32 + {32768 - 1024, 32768 - 3072, 32768 - 5120, 32768 - 7168, 32768 - 9216, + 32768 - 11264, 32768 - 13312, 32768 - 15360, 32768 - 17408, + 32768 - 19456, 32768 - 21504, 32768 - 23552, 32768 - 25600, + 32768 - 27648, 32768 - 29696, 0, 0}, + // pmf: 1/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 3/32, 3/32, 2/32, + // 2/32, 2/32, 2/32, 2/32, 2/32, 1/32 + {32768 - 1024, 32768 - 3072, 32768 - 5120, 32768 - 7168, 32768 - 9216, + 32768 - 11264, 32768 - 13312, 32768 - 16384, 32768 - 19456, + 32768 - 21504, 32768 - 23552, 32768 - 25600, 32768 - 27648, + 32768 - 29696, 32768 - 31744, 0, 0}, + }; + const absl::Time start = absl::Now(); + int index = 0; + for (int i = 0; i < 48; ++i) { + for (int j = 0; j < 32; ++j) { + for (int k = 0; k < 4; ++k) { // 
NOLINT(modernize-loop-convert)
+          if (compile_time) {
+            symbols[index++] = reader.ReadSymbol<16>(cdf[k]);
+          } else {
+            symbols[index++] = reader.ReadSymbol(cdf[k], 16);
+          }
+        }
+      }
+    }
+    elapsed_time += absl::Now() - start;
+  }
+  if (compile_time) {
+    printf("TestReadSymbol16CompileTime(%d): %5d us\n", num_runs,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+  } else {
+    printf("TestReadSymbol16(%d): %5d us\n", num_runs,
+           static_cast<int>(absl::ToInt64Microseconds(elapsed_time)));
+  }
+
+  int index = 0;
+  for (int i = 0; i < 48; ++i) {
+    for (int j = 0; j < 32; ++j) {  // NOLINT(modernize-loop-convert)
+      for (int k = 0; k < 4; ++k) {
+        ASSERT_EQ(symbols[index++], kSymbols[j][k]);
+      }
+    }
+  }
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbolBoolean) {
+  TestReadSymbolBoolean<false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbolBooleanCompileTime) {
+  TestReadSymbolBoolean<true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol3) {
+  TestReadSymbol3<false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol3CompileTime) {
+  TestReadSymbol3<true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol4) {
+  TestReadSymbol4<false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol4CompileTime) {
+  TestReadSymbol4<true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol5) {
+  TestReadSymbol5<false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol5CompileTime) {
+  TestReadSymbol5<true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol6) {
+  TestReadSymbol6<false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol6CompileTime) {
+  TestReadSymbol6<true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol7) {
+  TestReadSymbol7<false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol7CompileTime) {
+  TestReadSymbol7<true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol8) {
+  TestReadSymbol8<false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol8CompileTime) {
+  TestReadSymbol8<true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol9) {
+  TestReadSymbol9<false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol9CompileTime) {
+  TestReadSymbol9<true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol10) {
+  TestReadSymbol10<false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol10CompileTime) {
+  TestReadSymbol10<true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol11) {
+  TestReadSymbol11<false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol11CompileTime) {
+  TestReadSymbol11<true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol12) {
+  TestReadSymbol12<false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol12CompileTime) {
+  TestReadSymbol12<true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol13) {
+  TestReadSymbol13<false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol13CompileTime) {
+  TestReadSymbol13<true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol14) {
+  TestReadSymbol14<false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol14CompileTime) {
+  TestReadSymbol14<true>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol16) {
+  TestReadSymbol16<false>(1);
+}
+
+TEST_F(EntropyDecoderTest, ReadSymbol16CompileTime) {
+  TestReadSymbol16<true>(1);
+}
+
+TEST_F(EntropyDecoderTest, DISABLED_Speed) {
+  // compile_time=true is only tested for those symbol_count values that have
+  // an instantiation of the EntropyDecoder::ReadSymbol<symbol_count> template
+  // method.
+  TestReadSymbolBoolean<false>(10000);
+  TestReadSymbolBoolean<true>(10000);
+  TestReadSymbol3<false>(5000);
+  TestReadSymbol3<true>(5000);
+  TestReadSymbol4<false>(2000);
+  TestReadSymbol4<true>(2000);
+  TestReadSymbol5<false>(5000);
+  TestReadSymbol5<true>(5000);
+  TestReadSymbol6<false>(5000);
+  TestReadSymbol6<true>(5000);
+  TestReadSymbol7<false>(1000);
+  TestReadSymbol7<true>(1000);
+  TestReadSymbol8<false>(1000);
+  TestReadSymbol8<true>(1000);
+  TestReadSymbol9<false>(5000);
+  TestReadSymbol9<true>(5000);
+  TestReadSymbol10<false>(5000);
+  TestReadSymbol10<true>(5000);
+  TestReadSymbol11<false>(5000);
+  TestReadSymbol11<true>(5000);
+  TestReadSymbol12<false>(5000);
+  TestReadSymbol12<true>(5000);
+  TestReadSymbol13<false>(5000);
+  TestReadSymbol13<true>(5000);
+  TestReadSymbol14<false>(5000);
+  TestReadSymbol14<true>(5000);
+  TestReadSymbol16<false>(5000);
+  TestReadSymbol16<true>(5000);
+}
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/utils/entropy_decoder_test_data.inc b/src/utils/entropy_decoder_test_data.inc
new file mode 100644
index 0000000..9050d5e
--- /dev/null
+++ b/src/utils/entropy_decoder_test_data.inc
@@ -0,0 +1,8443 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// The kBytesTestReadSymbolBoolean[] array was encoded by using the following
+// libaom code:
+//
+//   aom_cdf_prob cdf[4][3] = {
+//     { 16384, 0, 0 },
+//     { 32768 - 8386, 0, 0 },
+//     { 32768 - 24312, 0, 0 },
+//     { 16384, 0, 0 },
+//   };
+//   constexpr int kSymbols[4][4] = { { 0, 0, 1, 1 },  //
+//                                    { 0, 1, 1, 0 },  //
+//                                    { 1, 0, 1, 0 },  //
+//                                    { 1, 0, 0, 1 } };
+//   const unsigned int kBufferSize = 65536;
+//   uint8_t bw_buffer[kBufferSize];
+//   aom_writer bw;
+//   bw.allow_update_cdf = 1;
+//   aom_start_encode(&bw, bw_buffer);
+//   for (int i = 0; i < 1024; ++i) {
+//     for (int j = 0; j < 4; ++j) {
+//       for (int k = 0; k < 4; ++k) {
+//         aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 2);
+//       }
+//     }
+//   }
+//   aom_stop_encode(&bw);
+//   printf(" constexpr size_t kNumBytesTestReadSymbolBoolean = %u;\n",
+//          bw.pos);
+//   printf(" constexpr uint8_t kBytesTestReadSymbolBoolean[] = {");
+//   int count = 0;
+//   for (unsigned int i = 0; i < bw.pos; ++i) {
+//     if (count++ % 12 == 0) {
+//       printf("\n  ");
+//     } else {
+//       printf(" ");
+//     }
+//     printf("0x%02x,", bw_buffer[i]);
+//   }
+//   printf("\n };\n");
+
+constexpr size_t kNumBytesTestReadSymbolBoolean = 1880;
+constexpr uint8_t kBytesTestReadSymbolBoolean[] = {
+    0x1e, 0xfe, 0x7c, 0xa2, 0x1e, 0xfc, 0xa1, 0x17, 0xee, 0xbf, 0x07, 0x76,
+    0x2d, 0x11, 0x3a, 0xa5, 0x49, 0x65, 0xbb, 0x83, 0x89, 0x4b, 0xaa, 0x23,
+    0x29, 0x0d, 0x81, 0x9f, 0x6a, 0xf2, 0x9f, 0x7e, 0x14, 0x9a, 0x86, 0x78,
+    0x7f, 0xd5, 0x31, 0x14, 0x45, 0x8e, 0xf5, 0xc3, 0x36, 0x63, 0xcb, 0x4f,
+    0xeb, 0x81, 0x19, 0x75, 0x3c, 0xda, 0x21, 0x71, 0x1d, 0x05, 0x34, 0x7e,
+    0x43, 0xd4, 0x5b, 0xeb, 0x0a, 0x6d, 0xbe, 0xd2, 0x8f, 0xa5, 0x8f, 0xac,
+    0x3b, 0x43, 0xb6, 0x8a, 0xf9, 0x86, 0xf7, 0x1a, 0x3c, 0x4b, 0x2b, 0x4c,
+    0x4c, 0x4a, 0xff, 0xb9, 0x6f, 0x3c, 0xeb, 0xf6, 0x4c, 0xc8, 0x3c, 0x01,
+    0x5f, 0x12, 0x76, 0x4f, 0x88, 0xa0, 0xa5, 0xe7, 0x1d, 0xb3, 0x97, 0xd8,
+    0x31, 0x90, 0x8f, 0xd1, 0x46, 0xfd, 0xf7, 0xb1, 0x02, 0x0d, 0xf3, 0x9e,
+    0xbe,
0xa2, 0xfb, 0xc2, 0x7e, 0xe8, 0x77, 0xff, 0xa8, 0x13, 0x59, 0xcd, + 0xba, 0xe7, 0xc2, 0x7e, 0xe8, 0x77, 0xff, 0xa8, 0x0e, 0xc3, 0x7b, 0x63, + 0x80, 0xfe, 0x33, 0xe8, 0x30, 0x37, 0xeb, 0xd3, 0x3e, 0x83, 0x03, 0x7e, + 0xbd, 0x33, 0xe8, 0x30, 0x37, 0xeb, 0xd3, 0x3e, 0x83, 0x03, 0x7e, 0xbd, + 0x33, 0xe8, 0x30, 0x37, 0xeb, 0xd3, 0x3e, 0x83, 0x03, 0x7e, 0xbd, 0x33, + 0xe8, 0x30, 0x37, 0xeb, 0xd3, 0x3e, 0x83, 0x03, 0x7e, 0xbd, 0x33, 0xe8, + 0x30, 0x37, 0xeb, 0xd3, 0x3e, 0x83, 0x03, 0x7e, 0xbd, 0x33, 0xe8, 0x30, + 0x37, 0xeb, 0xd3, 0x3e, 0x83, 0x03, 0x7e, 0xbd, 0x33, 0xe8, 0x30, 0x37, + 0xeb, 0xd3, 0x3e, 0x83, 0x03, 0x7e, 0xbd, 0x33, 0xe8, 0x30, 0x37, 0xeb, + 0xd3, 0x3e, 0x83, 0x03, 0x7e, 0xbd, 0x33, 0xe8, 0x30, 0x37, 0xeb, 0xd3, + 0x3e, 0x83, 0x03, 0x7e, 0xbd, 0x33, 0xe8, 0x30, 0x37, 0xeb, 0xd3, 0x3e, + 0x85, 0x13, 0x83, 0xe9, 0x58, 0xaf, 0xe8, 0xff, 0x03, 0xb8, 0xf5, 0x08, + 0x63, 0x03, 0xea, 0xe9, 0x3a, 0x39, 0x6d, 0xb6, 0x32, 0xc5, 0xff, 0xf7, + 0x19, 0x19, 0x9c, 0x29, 0x3a, 0xc5, 0x87, 0x27, 0x2d, 0xfa, 0x18, 0x96, + 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, + 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, + 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, + 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, + 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, + 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, + 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, + 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, + 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, + 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, + 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, + 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, + 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, + 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, + 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, + 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, + 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, + 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, + 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, + 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, + 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, + 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, + 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, + 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, + 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, + 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, + 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, + 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, + 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, + 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, + 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, + 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, + 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, + 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, + 0x5a, 
0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, + 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, + 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, + 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, + 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, + 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, + 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, + 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, + 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, + 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, + 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, + 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, + 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, + 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, + 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, + 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, + 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, + 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, + 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, + 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, + 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, + 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, + 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, + 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, + 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, + 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, + 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, + 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, + 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, + 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, + 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, + 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, + 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, + 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, + 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, + 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, + 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, + 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, + 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, + 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, + 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, + 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, + 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, + 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, + 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, + 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, + 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, + 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, + 0x61, 
0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, + 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, + 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, + 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, + 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, + 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, + 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, + 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, + 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, + 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, + 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, + 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, + 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, + 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, + 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, + 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, + 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, + 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, + 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, + 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, + 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, + 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, + 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, + 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, + 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, + 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, + 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, + 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, + 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, + 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, + 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, + 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, + 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, + 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, + 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, + 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, + 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, + 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, + 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, + 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, + 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, + 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, + 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, + 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, + 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, + 0x35, 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, + 0xad, 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, + 0xfa, 0x18, 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, + 0x18, 
0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, + 0x96, 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xad, 0xfa, 0x18, 0x96, + 0x13, 0x5a, 0xdf, 0xa1, 0x89, 0x61, 0x35, 0xac, +}; +static_assert(sizeof(kBytesTestReadSymbolBoolean) == + kNumBytesTestReadSymbolBoolean, + ""); + +// The kBytesTestReadSymbol3[] array was encoded by using the following libaom +// code: +// +// aom_cdf_prob cdf[4][4] = { +// // pdf: 1/3, 1/3, 1/3 +// { 32768 - 10923, 32768 - 21845, 0, 0 }, +// // pdf: 1/6, 2/6, 3/6 +// { 32768 - 5461, 32768 - 16384, 0, 0 }, +// // pdf: 2/6, 3/6, 1/6 +// { 32768 - 10923, 32768 - 27307, 0, 0 }, +// // pdf: 3/6, 1/6, 2/6 +// { 32768 - 16384, 32768 - 21845, 0, 0 }, +// }; +// constexpr int kSymbols[6][4] = { { 0, 2, 1, 2 }, // +// { 1, 1, 2, 1 }, // +// { 2, 0, 0, 0 }, // +// { 0, 2, 0, 2 }, // +// { 1, 2, 1, 0 }, // +// { 2, 1, 1, 0 } }; +// const unsigned int kBufferSize = 65536; +// uint8_t bw_buffer[kBufferSize]; +// aom_writer bw; +// bw.allow_update_cdf = 1; +// aom_start_encode(&bw, bw_buffer); +// for (int i = 0; i < 1024; ++i) { +// for (int j = 0; j < 6; ++j) { +// for (int k = 0; k < 4; ++k) { +// aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 3); +// } +// } +// } +// aom_stop_encode(&bw); +// printf(" constexpr size_t kNumBytesTestReadSymbol3 = %u;\n", bw.pos); +// printf(" constexpr uint8_t kBytesTestReadSymbol3[] = {"); +// int count = 0; +// for (unsigned int i = 0; i < bw.pos; ++i) { +// if (count++ % 12 == 0) { +// printf("\n "); +// } else { +// printf(" "); +// } +// printf("0x%02x,", bw_buffer[i]); +// } +// printf("\n };\n"); + +constexpr size_t kNumBytesTestReadSymbol3 = 4646; +constexpr uint8_t kBytesTestReadSymbol3[] = { + 0x4a, 0xf9, 0x1a, 0x00, 0xef, 0x80, 0xd4, 0xcd, 0xc2, 0x55, 0x62, 0x76, + 0x3a, 0x60, 0x4e, 0xc9, 0x17, 0x91, 0x86, 0xb0, 0xa0, 0xcb, 0xf7, 0x7e, + 0x82, 0x1e, 0x92, 0xd9, 0xe5, 0xff, 0xaa, 0x0b, 0xa4, 0xc1, 0xfa, 0x0d, + 0xbe, 0x4f, 0x17, 0x4a, 0xfd, 0xee, 0xb6, 0x9b, 0x57, 0x3e, 0xdb, 0x60, + 0x19, 0xd2, 0xee, 0x35, 0x39, 0x73, 0xc9, 0x7b, 0x80, 0xc0, 0x9c, 0x9a, + 0xe8, 0x0f, 0x8b, 0xb8, 0x99, 0x02, 0xde, 0x68, 0x97, 0xab, 0xee, 0x2c, + 0xa0, 0xb1, 0x7b, 0x8e, 0x8a, 0x69, 0xd5, 0xcd, 0x40, 0x43, 0xa9, 0x4c, + 0xd5, 0xac, 0x33, 0x70, 0x64, 0x35, 0xa1, 0x18, 0xde, 0x31, 0x21, 0x2b, + 0xa1, 0xd2, 0x87, 0x63, 0x41, 0x4d, 0xd9, 0x0e, 0x17, 0xd8, 0x74, 0x19, + 0xbc, 0x33, 0xee, 0xd9, 0x21, 0x22, 0x16, 0xbb, 0x1e, 0x14, 0x46, 0xcf, + 0xfa, 0xee, 0xa2, 0xa0, 0xc0, 0x6b, 0xc5, 0xf0, 0xd8, 0x23, 0x6d, 0x20, + 0xda, 0x75, 0xff, 0x72, 0x3d, 0x41, 0x51, 0x21, 0x23, 0xa0, 0xce, 0xa0, + 0x46, 0xb0, 0x1d, 0x3d, 0xaf, 0x64, 0xf8, 0x57, 0xee, 0x81, 0x55, 0x3a, + 0xea, 0xd3, 0x3f, 0x96, 0x52, 0x31, 0xe5, 0xb5, 0x70, 0x01, 0x5a, 0xaf, + 0xbc, 0x69, 0x7e, 0x43, 0xdd, 0x2f, 0xe2, 0x40, 0xc7, 0x2d, 0x62, 0x8e, + 0xf0, 0x2a, 0xc0, 0x06, 0xe7, 0xe0, 0x63, 0x6e, 0x09, 0xa0, 0x57, 0x83, + 0x43, 0x5a, 0xe8, 0xb5, 0xc7, 0x1b, 0xf5, 0xe6, 0x3d, 0x19, 0xeb, 0xfa, + 0xda, 0x3d, 0x06, 0x3e, 0xa8, 0x96, 0x09, 0xad, 0x1d, 0xac, 0xf6, 0xef, + 0xc7, 0x32, 0x2f, 0x45, 0xe0, 0x4f, 0xa6, 0x9c, 0x2f, 0x66, 0x6b, 0xe3, + 0x36, 0xcf, 0x36, 0x41, 0xcb, 0xd9, 0xb8, 0xc3, 0x48, 0xf4, 0x18, 0xfa, + 0xa2, 0x58, 0x26, 0xb4, 0x76, 0xb3, 0xdb, 0xbf, 0x1c, 0xc8, 0xbd, 0x19, + 0xc1, 0x3e, 0x9a, 0x71, 0x85, 0x52, 0x94, 0x82, 0x48, 0x9c, 0x90, 0xcf, + 0x2f, 0xa0, 0xd1, 0x4b, 0x73, 0xcf, 0x73, 0xea, 0x89, 0x60, 0x93, 0xd1, + 0xda, 0xcf, 0x74, 0x5b, 0xd3, 0x22, 0xf4, 0x67, 0x04, 0xfa, 0x69, 0xc6, + 0x15, 0x4a, 0x52, 0x09, 0x22, 0x72, 0x43, 0x3c, 0xbe, 0x83, 0x45, 0x2d, + 0xcf, 
0x3d, 0xcf, 0xaa, 0x25, 0x82, 0x4f, 0x47, 0x6b, 0x3d, 0xd1, 0x6f, + 0x4c, 0x8b, 0xd1, 0x9c, 0x13, 0xe9, 0xa7, 0x18, 0x55, 0x29, 0x48, 0x24, + 0x89, 0xc9, 0x0c, 0xf2, 0xfa, 0x0d, 0x14, 0xb7, 0x3c, 0xf7, 0x3e, 0xa8, + 0x96, 0x09, 0x3d, 0x1d, 0xac, 0xf7, 0x45, 0xbd, 0x32, 0x2f, 0x46, 0x70, + 0x4f, 0xa6, 0x9c, 0x61, 0x54, 0xa5, 0x20, 0x92, 0x27, 0x24, 0x33, 0xcb, + 0xe8, 0x34, 0x52, 0xdc, 0xf3, 0xdc, 0xfa, 0xa2, 0x58, 0x24, 0xf4, 0x76, + 0xb3, 0xdd, 0x16, 0xf4, 0xc8, 0xbd, 0x19, 0xc1, 0x3e, 0x9a, 0x71, 0x85, + 0x52, 0x94, 0x82, 0x48, 0x9c, 0x90, 0xcf, 0x2f, 0xa0, 0xd1, 0x4b, 0x73, + 0xcf, 0x73, 0xea, 0x89, 0x60, 0x93, 0xd1, 0xda, 0xcf, 0x74, 0x5b, 0xd3, + 0x22, 0xf4, 0x67, 0x04, 0xfa, 0x69, 0xc6, 0x15, 0x4a, 0x52, 0x09, 0x22, + 0x72, 0x43, 0x3c, 0xbe, 0x83, 0x45, 0x2d, 0xcf, 0x3d, 0xcf, 0xaa, 0x25, + 0x84, 0xaa, 0xde, 0xde, 0xba, 0x7e, 0x90, 0x92, 0xa0, 0xdc, 0xb3, 0x6c, + 0xaf, 0xe6, 0x2f, 0xeb, 0xc5, 0x33, 0xe7, 0x77, 0xcf, 0xda, 0xe7, 0x31, + 0x57, 0xb2, 0x8f, 0xde, 0x8f, 0x1d, 0xf4, 0xd3, 0x8c, 0xda, 0x94, 0xa4, + 0x12, 0xcd, 0xc9, 0x32, 0x6d, 0xf7, 0x2d, 0x0c, 0x2c, 0xf9, 0xd8, 0x0b, + 0x48, 0xf3, 0xb3, 0x2e, 0x80, 0xd7, 0x0a, 0xc4, 0x4f, 0x09, 0xfe, 0x84, + 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, + 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, + 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, + 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, + 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, + 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, + 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, + 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, + 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, + 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, + 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, + 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, + 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, + 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, + 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, + 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, + 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, + 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, + 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, + 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, + 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, + 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, + 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, + 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, + 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, + 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, + 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, + 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, + 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, + 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, + 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, + 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, + 0x06, 
0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, + 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, + 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, + 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, + 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, + 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, + 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, + 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, + 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, + 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, + 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, + 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, + 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, + 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, + 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, + 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, + 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, + 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, + 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, + 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, + 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, + 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, + 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, + 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, + 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, + 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, + 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, + 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, + 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, + 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, + 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, + 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, + 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, + 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, + 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, + 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, + 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, + 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, + 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, + 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, + 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, + 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, + 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, + 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, + 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, + 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, + 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, + 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, + 0x6c, 
0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, + 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, + 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, + 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, + 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, + 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, + 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, + 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, + 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, + 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, + 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, + 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, + 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, + 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, + 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, + 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, + 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, + 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, + 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, + 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, + 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, + 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, + 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, + 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, + 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, + 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, + 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, + 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, + 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, + 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, + 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, + 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, + 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, + 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, + 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, + 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, + 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, + 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, + 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, + 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, + 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, + 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, + 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, + 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, + 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, + 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, + 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, + 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, + 0xd5, 
0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, + 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, + 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, + 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, + 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, + 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, + 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, + 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, + 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, + 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, + 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, + 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, + 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, + 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, + 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, + 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, + 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, + 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, + 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, + 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, + 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, + 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, + 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, + 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, + 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, + 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, + 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, + 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, + 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, + 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, + 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, + 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, + 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, + 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, + 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, + 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, + 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, + 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, + 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, + 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, + 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, + 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, + 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, + 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, + 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, + 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, + 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, + 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, + 0x8e, 
0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, + 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, + 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, + 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, + 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, + 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, + 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, + 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, + 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, + 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, + 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, + 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, + 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, + 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, + 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, + 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, + 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, + 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, + 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, + 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, + 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, + 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, + 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, + 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, + 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, + 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, + 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, + 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, + 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, + 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, + 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, + 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, + 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, + 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, + 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, + 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, + 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, + 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, + 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, + 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, + 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, + 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, + 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, + 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, + 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, + 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, + 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, + 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, + 0x0d, 
0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, + 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, + 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, + 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, + 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, + 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, + 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, + 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, + 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, + 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, + 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, + 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, + 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, + 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, + 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, + 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, + 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, + 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, + 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, + 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, + 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, + 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, + 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, + 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, + 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, + 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, + 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, + 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, + 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, + 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, + 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, + 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, + 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, + 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, + 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, + 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, + 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, + 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, + 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, + 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, + 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, + 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, + 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, + 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, + 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, + 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, + 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, + 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, + 0x0c, 
0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, + 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, + 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, + 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, + 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, + 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, + 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, + 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, + 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, + 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, + 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, + 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, + 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, + 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, + 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, + 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, + 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, + 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, + 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, + 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, + 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, + 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, + 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, + 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, + 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, + 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, + 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, + 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, + 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, + 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, + 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, + 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, + 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, + 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, + 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, + 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, + 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, + 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, + 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, + 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, + 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, + 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, + 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, + 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, + 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, + 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, + 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, + 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, + 0xfa, 
0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, + 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, + 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, + 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, + 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, + 0xac, 0x25, 0x42, 0x7f, 0xa1, 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, + 0x95, 0xb3, 0xd6, 0x07, 0x6d, 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, + 0x8f, 0xbf, 0x03, 0x65, 0xf6, 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, + 0x84, 0x07, 0xa0, 0xaf, 0x06, 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, + 0xb4, 0xd5, 0x2b, 0x78, 0x19, 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, + 0xd8, 0x04, 0x10, 0xca, 0xc2, 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, + 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, + 0x67, 0xe3, 0x97, 0x58, 0xfb, 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, + 0x09, 0x50, 0x9f, 0xe8, 0x40, 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, + 0x6c, 0xf5, 0x81, 0xdb, 0x4d, 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, + 0xef, 0xc0, 0xd9, 0x7d, 0x80, 0x41, 0x0c, 0xac, 0x25, 0x42, 0x7f, 0xa1, + 0x01, 0xe8, 0x2b, 0xc1, 0xaa, 0x1b, 0xde, 0x95, 0xb3, 0xd6, 0x07, 0x6d, + 0x35, 0x4a, 0xde, 0x06, 0x7e, 0x39, 0x75, 0x8f, 0xbf, 0x03, 0x65, 0xf6, + 0x01, 0x04, 0x32, 0xb0, 0x95, 0x09, 0xfe, 0x84, 0x07, 0xa0, 0xaf, 0x06, + 0xa8, 0x6f, 0x7a, 0x56, 0xcf, 0x58, 0x1d, 0xb4, 0xd5, 0x2b, 0x78, 0x19, + 0xf8, 0xe5, 0xd6, 0x3e, 0xfc, 0x0d, 0x97, 0xd8, 0x04, 0x10, 0xca, 0xc2, + 0x54, 0x27, 0xfa, 0x10, 0x1e, 0x82, 0xbc, 0x1a, 0xa1, 0xbd, 0xe9, 0x5b, + 0x3d, 0x60, 0x76, 0xd3, 0x54, 0xad, 0xe0, 0x67, 0xe3, 0x97, 0x58, 0xfb, + 0xf0, 0x36, 0x5f, 0x60, 0x10, 0x43, 0x2b, 0x09, 0x50, 0x9f, 0xe8, 0x40, + 0x7a, 0x0a, 0xf0, 0x6a, 0x86, 0xf7, 0xa5, 0x6c, 0xf5, 0x81, 0xdb, 0x4d, + 0x52, 0xb7, 0x81, 0x9f, 0x8e, 0x5d, 0x63, 0xef, 0xc0, 0xd9, 0x7d, 0x80, + 0x41, 0x08, +}; +static_assert(sizeof(kBytesTestReadSymbol3) == kNumBytesTestReadSymbol3, ""); + +// The kBytesTestReadSymbol4[] array was encoded by using the following libaom +// code: +// +// aom_cdf_prob cdf[4][5] = { +// // pdf: 1/4, 1/4, 1/4, 1/4 +// { 32768 - 8192, 32768 - 16384, 32768 - 24576, 0, 0 }, +// // pdf: 2/8, 1/8, 2/8, 3/8 +// { 32768 - 8192, 32768 - 12288, 32768 - 20480, 0, 0 }, +// // pdf: 1/4, 1/4, 1/4, 1/4 +// { 32768 - 8192, 32768 - 16384, 32768 - 24576, 0, 0 }, +// // pdf: 2/8, 3/8, 2/8, 1/8 +// { 32768 - 8192, 32768 - 20480, 32768 - 28672, 0, 0 }, +// }; +// constexpr int kSymbols[8][4] = { { 0, 0, 3, 3 }, // +// { 0, 0, 2, 2 }, // +// { 1, 1, 0, 0 }, // +// { 1, 2, 1, 1 }, // +// { 2, 2, 3, 2 }, // +// { 2, 3, 2, 1 }, // +// { 3, 3, 0, 0 }, // +// { 3, 3, 1, 1 } }; +// const unsigned int kBufferSize = 65536; +// uint8_t bw_buffer[kBufferSize]; +// aom_writer bw; +// bw.allow_update_cdf = 1; +// aom_start_encode(&bw, bw_buffer); +// for (int i = 0; i < 1024; ++i) { +// for (int j = 0; j < 8; ++j) { +// for (int k = 0; k < 4; ++k) { +// aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 4); +// } +// } +// } +// aom_stop_encode(&bw); +// printf(" constexpr size_t kNumBytesTestReadSymbol4 = %u;\n", bw.pos); +// printf(" constexpr uint8_t kBytesTestReadSymbol4[] = {"); +// int count = 0; +// for (unsigned int i = 0; i < bw.pos; ++i) { +// if (count++ % 12 == 0) { +// printf("\n "); +// } else { +// printf(" "); +// } +// printf("0x%02x,", bw_buffer[i]); +// } +// printf("\n };\n"); + +constexpr size_t kNumBytesTestReadSymbol4 = 8055; +constexpr uint8_t kBytesTestReadSymbol4[] = { + 0x0f, 0x9b, 
0x2a, 0xf6, 0x38, 0x26, 0xa1, 0xd1, 0x82, 0x5f, 0x34, 0xb5, + 0xc7, 0xda, 0x9c, 0xd8, 0x8d, 0x4b, 0xbc, 0x5c, 0x0b, 0x8a, 0x7f, 0x6c, + 0x46, 0x3f, 0xa2, 0x03, 0xee, 0x1f, 0xea, 0x25, 0xc7, 0xb7, 0xe2, 0xc9, + 0x51, 0x0f, 0x7c, 0x0c, 0xe3, 0x7d, 0x7b, 0xe4, 0xbe, 0xde, 0x41, 0x5c, + 0x5a, 0xcf, 0xe6, 0x12, 0x50, 0x7b, 0xcc, 0x83, 0x76, 0x61, 0x03, 0x3a, + 0x1e, 0x1b, 0xf8, 0x9d, 0x08, 0x96, 0x98, 0x0f, 0x16, 0xac, 0x7c, 0x25, + 0x6c, 0xd1, 0xe8, 0xd8, 0xd6, 0x1c, 0xbd, 0x48, 0xa5, 0x3f, 0xd3, 0x21, + 0x4c, 0x4e, 0x94, 0xe3, 0xe3, 0xed, 0x30, 0x70, 0xdb, 0x2e, 0x95, 0xd5, + 0x7f, 0xfe, 0xed, 0x0e, 0x73, 0xe3, 0x29, 0x09, 0x5f, 0xe3, 0x0e, 0xa6, + 0xe7, 0xc6, 0x52, 0x12, 0xba, 0xdb, 0xb5, 0x63, 0xd9, 0xd8, 0xa4, 0x25, + 0x75, 0xb7, 0x6a, 0xc7, 0xb3, 0xad, 0x88, 0x46, 0x64, 0x3a, 0x36, 0xb1, + 0x2f, 0xb1, 0x03, 0xdb, 0x88, 0x74, 0x6d, 0x62, 0x5f, 0x62, 0x07, 0xb7, + 0x10, 0xe8, 0xda, 0xc6, 0x1d, 0x6e, 0x8e, 0x12, 0x58, 0x6e, 0x98, 0x4c, + 0xa1, 0x23, 0xc0, 0x9b, 0xb0, 0xdd, 0x31, 0xef, 0x64, 0xf0, 0x91, 0x37, + 0x61, 0xba, 0x63, 0xde, 0xc9, 0xe1, 0x22, 0x6e, 0xc3, 0x74, 0xc7, 0xea, + 0xcb, 0x70, 0xf6, 0xe2, 0x1d, 0x1b, 0x6c, 0xd5, 0x4f, 0x91, 0xc2, 0x4b, + 0x0a, 0xeb, 0xb3, 0x0d, 0x59, 0x39, 0x13, 0x76, 0x15, 0xd7, 0x66, 0x1a, + 0xf2, 0x72, 0x26, 0xec, 0x05, 0x3e, 0xcc, 0x31, 0x3e, 0x60, 0x4d, 0xd8, + 0x0a, 0x7d, 0x98, 0x62, 0x7c, 0xc0, 0xcc, 0x5a, 0x24, 0xc8, 0xa6, 0xda, + 0xe3, 0x09, 0x35, 0x70, 0x9c, 0x4c, 0x85, 0xac, 0x6f, 0x8b, 0x76, 0x30, + 0xcc, 0x6f, 0xcb, 0x3e, 0x36, 0xd6, 0xec, 0x61, 0x98, 0xdf, 0x99, 0xa5, + 0x7e, 0x2d, 0xd8, 0xc3, 0x31, 0xbf, 0x33, 0x4a, 0xfc, 0x5b, 0xb1, 0x86, + 0x63, 0x7e, 0x66, 0x95, 0xf8, 0xb7, 0x63, 0x0c, 0xc6, 0xfc, 0xcd, 0x2b, + 0xf1, 0x6e, 0xc6, 0x19, 0x8d, 0xf9, 0x9a, 0x57, 0xe2, 0xdd, 0x8c, 0x33, + 0x1b, 0xf3, 0x34, 0xaf, 0xc5, 0xbb, 0x18, 0x66, 0x37, 0xe6, 0x69, 0x5f, + 0x8b, 0x76, 0x30, 0xcc, 0x6f, 0xcc, 0xd2, 0xbf, 0x16, 0xec, 0x61, 0x98, + 0xdf, 0x99, 0xa5, 0x7e, 0x2d, 0xd1, 0x27, 0xb1, 0xbf, 0x30, 0x0b, 0xfc, + 0x5b, 0xa2, 0x4f, 0x63, 0xa0, 0x9b, 0x7a, 0xb6, 0xb7, 0x44, 0x9e, 0xc7, + 0x41, 0x36, 0xf5, 0x6d, 0x6e, 0x89, 0x3d, 0x8e, 0x82, 0x6d, 0xea, 0xda, + 0xdd, 0x12, 0x7b, 0x1d, 0x04, 0xdb, 0xd5, 0xb5, 0xba, 0x24, 0xf6, 0x3a, + 0x09, 0xb7, 0xab, 0x6b, 0x74, 0x49, 0xec, 0x74, 0x13, 0x6f, 0x56, 0xd6, + 0xe8, 0x93, 0xd8, 0xe8, 0x26, 0xde, 0xad, 0xad, 0xd1, 0x27, 0xb1, 0xd0, + 0x4d, 0xbd, 0x5b, 0x5b, 0xa2, 0x4f, 0x63, 0xa0, 0x9b, 0x7a, 0xb6, 0xb7, + 0x44, 0x9e, 0xc7, 0x41, 0x36, 0xf5, 0x6d, 0x6e, 0x89, 0x3d, 0x8e, 0x82, + 0x6d, 0xea, 0xda, 0xdd, 0x12, 0x7b, 0x1d, 0x04, 0xdb, 0xd5, 0xb5, 0xba, + 0x24, 0xf6, 0x3a, 0x09, 0xb7, 0xab, 0x6b, 0x74, 0x49, 0xec, 0x74, 0x13, + 0x6f, 0x56, 0xd6, 0xdf, 0x45, 0xaa, 0x16, 0xb7, 0xb7, 0x14, 0x09, 0xdb, + 0x9f, 0x17, 0x97, 0xae, 0xa1, 0xbe, 0x34, 0x9d, 0x0e, 0x01, 0x9f, 0xdb, + 0x16, 0xa9, 0x6a, 0x63, 0xf2, 0x9f, 0x5b, 0x3b, 0x0b, 0xae, 0x17, 0xd6, + 0x4d, 0x75, 0x8f, 0xe3, 0xf0, 0xe2, 0x90, 0xdc, 0x27, 0x2a, 0x94, 0x2c, + 0x94, 0x8f, 0x59, 0x61, 0xcd, 0xa3, 0xfb, 0x0d, 0xdf, 0xf8, 0xc9, 0x96, + 0x6f, 0x4f, 0xf6, 0x33, 0x8e, 0x8c, 0x0c, 0x3e, 0x0a, 0xaf, 0x56, 0x8f, + 0x24, 0xa8, 0xcd, 0x3d, 0x44, 0x97, 0xac, 0x07, 0x4c, 0x2a, 0xba, 0x80, + 0xc7, 0x1d, 0x6c, 0xec, 0xb3, 0xb7, 0xe0, 0x00, 0xe2, 0x37, 0xf8, 0x6f, + 0xfd, 0xba, 0x4e, 0x1a, 0xa5, 0x56, 0x54, 0x03, 0x9c, 0x25, 0xeb, 0x4d, + 0x7a, 0x15, 0x6d, 0x45, 0x02, 0x76, 0xe6, 0x66, 0xc2, 0x24, 0x4a, 0x7a, + 0x7f, 0x3d, 0xe3, 0x6b, 0x55, 0x4b, 0x4c, 0xdc, 0x7d, 0x85, 0x3f, 0x53, + 0x8e, 0x9a, 0x82, 0x1c, 0x52, 0x1b, 0x84, 0xe5, 0x52, 0x85, 0x92, 0x91, + 0xeb, 0x2c, 
0x39, 0xb4, 0x7f, 0x61, 0xbb, 0xff, 0x19, 0x32, 0xcd, 0xe9, + 0xfe, 0xc6, 0x71, 0xd1, 0x81, 0x87, 0xc1, 0x55, 0xea, 0xd1, 0xe4, 0x95, + 0x19, 0xa7, 0xa8, 0x92, 0xf5, 0x80, 0xe9, 0x85, 0x57, 0x50, 0x18, 0xe3, + 0xad, 0x9d, 0x96, 0x76, 0xfc, 0x00, 0x1c, 0x46, 0xff, 0x0d, 0xff, 0xb7, + 0x49, 0xc3, 0x54, 0xaa, 0xca, 0x80, 0x73, 0x84, 0xbd, 0x69, 0xaf, 0x42, + 0xad, 0xa8, 0xa0, 0x4e, 0xdc, 0xcc, 0xd8, 0x44, 0x89, 0x4f, 0x4f, 0xe7, + 0xbc, 0x6d, 0x6a, 0xa9, 0x69, 0x9b, 0x8f, 0xb0, 0xa7, 0xea, 0x71, 0xd3, + 0x50, 0x43, 0x8a, 0x43, 0x70, 0x9c, 0xaa, 0x50, 0xb2, 0x52, 0x3d, 0x65, + 0x87, 0x36, 0x8f, 0xec, 0x37, 0x7f, 0xe3, 0x26, 0x59, 0xbd, 0x3f, 0xd8, + 0xce, 0x3a, 0x30, 0x30, 0xf8, 0x2a, 0xbd, 0x5a, 0x3c, 0x92, 0xa3, 0x34, + 0xf5, 0x12, 0x5e, 0xb0, 0x1d, 0x30, 0xaa, 0xea, 0x03, 0x1c, 0x75, 0xb3, + 0xb2, 0xce, 0xdf, 0x80, 0x03, 0x88, 0xdf, 0xe1, 0xbf, 0xf6, 0xe9, 0x38, + 0x6a, 0x95, 0x59, 0x50, 0x0e, 0x70, 0x97, 0xad, 0x35, 0xe8, 0x55, 0xb5, + 0x14, 0x09, 0xdb, 0x99, 0x9b, 0x08, 0x91, 0x29, 0xe9, 0xfc, 0xf7, 0x8d, + 0xad, 0x55, 0x2d, 0x33, 0x71, 0xf6, 0x14, 0xfd, 0x4e, 0x3a, 0x6a, 0x08, + 0x71, 0x48, 0x6e, 0x13, 0x95, 0x4a, 0x16, 0x4a, 0x47, 0xac, 0xb0, 0xe6, + 0xd1, 0xfd, 0x86, 0xef, 0xfc, 0x64, 0xcb, 0x37, 0xa7, 0xfb, 0x19, 0xc7, + 0x46, 0x06, 0x1f, 0x05, 0x57, 0xab, 0x47, 0x92, 0x54, 0x66, 0x9e, 0xa2, + 0x4b, 0xd6, 0x03, 0xa6, 0x15, 0x5d, 0x40, 0x63, 0x8e, 0xb6, 0x76, 0x59, + 0xdb, 0xf0, 0x00, 0x71, 0x1b, 0xfc, 0x37, 0xfe, 0xdd, 0x27, 0x0d, 0x52, + 0xab, 0x2a, 0x01, 0xce, 0x12, 0xf5, 0xa6, 0xbd, 0x0a, 0xb6, 0xa2, 0x81, + 0x3b, 0x73, 0x33, 0x61, 0x12, 0x25, 0x3d, 0x3f, 0x9e, 0xf1, 0xb5, 0xaa, + 0xa5, 0xa6, 0x6e, 0x3e, 0xc2, 0x9f, 0xa9, 0xc7, 0x4d, 0x41, 0x0e, 0x29, + 0x0d, 0xc2, 0x72, 0xa9, 0x42, 0xc9, 0x48, 0xf5, 0x96, 0x1c, 0xda, 0x3f, + 0xb0, 0xdd, 0xff, 0x8c, 0x99, 0x66, 0xf4, 0xff, 0x63, 0x38, 0xe8, 0xc0, + 0xc3, 0xe0, 0xaa, 0xf5, 0x68, 0xf2, 0x4a, 0x8c, 0xd3, 0xd4, 0x49, 0x7a, + 0xc0, 0x74, 0xc2, 0xab, 0xa8, 0x0c, 0x71, 0xd6, 0xce, 0xcb, 0x3b, 0x7e, + 0x00, 0x0e, 0x23, 0x7f, 0x86, 0xff, 0xdb, 0xa4, 0xe1, 0xaa, 0x55, 0x65, + 0x40, 0x39, 0xc2, 0x5e, 0xb4, 0xd7, 0xa1, 0x56, 0xd4, 0x50, 0x27, 0x6e, + 0x66, 0x6c, 0x22, 0x44, 0xa7, 0xa7, 0xf3, 0xde, 0x36, 0xb5, 0x54, 0xb4, + 0xcd, 0xc7, 0xd8, 0x53, 0xf5, 0x38, 0xe9, 0xa8, 0x21, 0xc5, 0x21, 0xb8, + 0x4e, 0x55, 0x28, 0x59, 0x29, 0x1e, 0xb2, 0xc3, 0x9b, 0x47, 0xf6, 0x1b, + 0xbf, 0xf1, 0x93, 0x2c, 0xde, 0x9f, 0xec, 0x67, 0x1d, 0x18, 0x18, 0x7c, + 0x15, 0x5e, 0xad, 0x1e, 0x49, 0x51, 0x9a, 0x7a, 0x89, 0x2f, 0x58, 0x0e, + 0x98, 0x55, 0x75, 0x01, 0x8e, 0x3a, 0xd9, 0xd9, 0x67, 0x6f, 0xc0, 0x01, + 0xc4, 0x6f, 0xf0, 0xdf, 0xfb, 0x74, 0x9c, 0x35, 0x4a, 0xac, 0xa8, 0x07, + 0x38, 0x4b, 0xd6, 0x9a, 0xf4, 0x2a, 0xda, 0x8a, 0x04, 0xed, 0xcc, 0xcd, + 0x84, 0x48, 0x94, 0xf4, 0xfe, 0x7b, 0xc6, 0xd6, 0xaa, 0x96, 0x99, 0xb8, + 0xfb, 0x0a, 0x7e, 0xa7, 0x1d, 0x35, 0x04, 0x38, 0xa4, 0x37, 0x09, 0xca, + 0xa5, 0x0b, 0x25, 0x23, 0xd6, 0x58, 0x73, 0x68, 0xfe, 0xc3, 0x77, 0xfe, + 0x32, 0x65, 0x9b, 0xd3, 0xfd, 0x8c, 0xe3, 0xa3, 0x03, 0x0f, 0x82, 0xab, + 0xd5, 0xa3, 0xc9, 0x2a, 0x33, 0x4f, 0x51, 0x25, 0xeb, 0x01, 0xd3, 0x0a, + 0xae, 0xa0, 0x31, 0xc7, 0x5b, 0x3b, 0x2c, 0xed, 0xf8, 0x00, 0x38, 0x8d, + 0xfe, 0x1b, 0xff, 0x6e, 0x93, 0x86, 0xa9, 0x55, 0x95, 0x00, 0xe7, 0x09, + 0x7a, 0xd3, 0x5e, 0x85, 0x5b, 0x51, 0x40, 0x9d, 0xb9, 0x99, 0xb0, 0x89, + 0x12, 0x9e, 0x9f, 0xcf, 0x78, 0xda, 0xd5, 0x52, 0xd3, 0x37, 0x1f, 0x61, + 0x4f, 0xd4, 0xe3, 0xa6, 0xa0, 0x87, 0x14, 0x86, 0xe1, 0x39, 0x54, 0xa1, + 0x64, 0xa4, 0x7a, 0xcb, 0x0e, 0x6d, 0x1f, 0xd8, 0x6e, 0xff, 0xc6, 0x4c, + 0xb3, 0x7a, 
0x7f, 0xb1, 0x9c, 0x74, 0x60, 0x61, 0xf0, 0x55, 0x7a, 0xb4, + 0x79, 0x25, 0x46, 0x69, 0xea, 0x24, 0xbd, 0x60, 0x3a, 0x61, 0x55, 0xd4, + 0x06, 0x38, 0xeb, 0x67, 0x65, 0x9d, 0xbf, 0x00, 0x07, 0x11, 0xbf, 0xc3, + 0x7f, 0xed, 0xd2, 0x70, 0xd5, 0x2a, 0xb2, 0xa0, 0x1c, 0xe1, 0x2f, 0x5a, + 0x6b, 0xd0, 0xab, 0x6a, 0x28, 0x13, 0xb7, 0x33, 0x36, 0x11, 0x22, 0x53, + 0xd3, 0xf9, 0xef, 0x1b, 0x5a, 0xaa, 0x5a, 0x66, 0xe3, 0xec, 0x29, 0xfa, + 0x9c, 0x74, 0xd4, 0x10, 0xe2, 0x90, 0xdc, 0x27, 0x2a, 0x94, 0x2c, 0x94, + 0x8f, 0x59, 0x61, 0xcd, 0xa3, 0xfb, 0x0d, 0xdf, 0xf8, 0xc9, 0x96, 0x6f, + 0x4f, 0xf6, 0x33, 0x8e, 0x8c, 0x0c, 0x3e, 0x0a, 0xaf, 0x56, 0x8f, 0x24, + 0xa8, 0xcd, 0x3d, 0x44, 0x97, 0xac, 0x07, 0x4c, 0x2a, 0xba, 0x80, 0xc7, + 0x1d, 0x6c, 0xec, 0xb3, 0xb7, 0xe0, 0x00, 0xe2, 0x37, 0xf8, 0x6f, 0xfd, + 0xba, 0x4e, 0x1a, 0xa5, 0x56, 0x54, 0x03, 0x9c, 0x25, 0xeb, 0x4d, 0x7a, + 0x15, 0x6d, 0x45, 0x02, 0x76, 0xe6, 0x66, 0xc2, 0x24, 0x4a, 0x7a, 0x7f, + 0x3d, 0xe3, 0x6b, 0x55, 0x4b, 0x4c, 0xdc, 0x7d, 0x85, 0x3f, 0x53, 0x8e, + 0x9a, 0x82, 0x1c, 0x52, 0x1b, 0x84, 0xe5, 0x52, 0x85, 0x92, 0x91, 0xeb, + 0x2c, 0x39, 0xb4, 0x7f, 0x61, 0xbb, 0xff, 0x19, 0x32, 0xcd, 0xe9, 0xfe, + 0xc6, 0x71, 0xd1, 0x81, 0x87, 0xc1, 0x55, 0xea, 0xd1, 0xe4, 0x95, 0x19, + 0xa7, 0xa8, 0x92, 0xf5, 0x80, 0xe9, 0x85, 0x57, 0x50, 0x18, 0xe3, 0xad, + 0x9d, 0x96, 0x76, 0xfc, 0x00, 0x1c, 0x46, 0xff, 0x0d, 0xff, 0xb7, 0x49, + 0xc3, 0x54, 0xaa, 0xca, 0x80, 0x73, 0x84, 0xbd, 0x69, 0xaf, 0x42, 0xad, + 0xa8, 0xa0, 0x4e, 0xdc, 0xcc, 0xd8, 0x44, 0x89, 0x4f, 0x4f, 0xe7, 0xbc, + 0x6d, 0x6a, 0xa9, 0x69, 0x9b, 0x8f, 0xb0, 0xa7, 0xea, 0x71, 0xd3, 0x50, + 0x43, 0x8a, 0x43, 0x70, 0x9c, 0xaa, 0x50, 0xb2, 0x52, 0x3d, 0x65, 0x87, + 0x36, 0x8f, 0xec, 0x37, 0x7f, 0xe3, 0x26, 0x59, 0xbd, 0x3f, 0xd8, 0xce, + 0x3a, 0x30, 0x30, 0xf8, 0x2a, 0xbd, 0x5a, 0x3c, 0x92, 0xa3, 0x34, 0xf5, + 0x12, 0x5e, 0xb0, 0x1d, 0x30, 0xaa, 0xea, 0x03, 0x1c, 0x75, 0xb3, 0xb2, + 0xce, 0xdf, 0x80, 0x03, 0x88, 0xdf, 0xe1, 0xbf, 0xf6, 0xe9, 0x38, 0x6a, + 0x95, 0x59, 0x50, 0x0e, 0x70, 0x97, 0xad, 0x35, 0xe8, 0x55, 0xb5, 0x14, + 0x09, 0xdb, 0x99, 0x9b, 0x08, 0x91, 0x29, 0xe9, 0xfc, 0xf7, 0x8d, 0xad, + 0x55, 0x2d, 0x33, 0x71, 0xf6, 0x14, 0xfd, 0x4e, 0x3a, 0x6a, 0x08, 0x71, + 0x48, 0x6e, 0x13, 0x95, 0x4a, 0x16, 0x4a, 0x47, 0xac, 0xb0, 0xe6, 0xd1, + 0xfd, 0x86, 0xef, 0xfc, 0x64, 0xcb, 0x37, 0xa7, 0xfb, 0x19, 0xc7, 0x46, + 0x06, 0x1f, 0x05, 0x57, 0xab, 0x47, 0x92, 0x54, 0x66, 0x9e, 0xa2, 0x4b, + 0xd6, 0x03, 0xa6, 0x15, 0x5d, 0x40, 0x63, 0x8e, 0xb6, 0x76, 0x59, 0xdb, + 0xf0, 0x00, 0x71, 0x1b, 0xfc, 0x37, 0xfe, 0xdd, 0x27, 0x0d, 0x52, 0xab, + 0x2a, 0x01, 0xce, 0x12, 0xf5, 0xa6, 0xbd, 0x0a, 0xb6, 0xa2, 0x81, 0x3b, + 0x73, 0x33, 0x61, 0x12, 0x25, 0x3d, 0x3f, 0x9e, 0xf1, 0xb5, 0xaa, 0xa5, + 0xa6, 0x6e, 0x3e, 0xc2, 0x9f, 0xa9, 0xc7, 0x4d, 0x41, 0x0e, 0x29, 0x0d, + 0xc2, 0x72, 0xa9, 0x42, 0xc9, 0x48, 0xf5, 0x96, 0x1c, 0xda, 0x3f, 0xb0, + 0xdd, 0xff, 0x8c, 0x99, 0x66, 0xf4, 0xff, 0x63, 0x38, 0xe8, 0xc0, 0xc3, + 0xe0, 0xaa, 0xf5, 0x68, 0xf2, 0x4a, 0x8c, 0xd3, 0xd4, 0x49, 0x7a, 0xc0, + 0x74, 0xc2, 0xab, 0xa8, 0x0c, 0x71, 0xd6, 0xce, 0xcb, 0x3b, 0x7e, 0x00, + 0x0e, 0x23, 0x7f, 0x86, 0xff, 0xdb, 0xa4, 0xe1, 0xaa, 0x55, 0x65, 0x40, + 0x39, 0xc2, 0x5e, 0xb4, 0xd7, 0xa1, 0x56, 0xd4, 0x50, 0x27, 0x6e, 0x66, + 0x6c, 0x22, 0x44, 0xa7, 0xa7, 0xf3, 0xde, 0x36, 0xb5, 0x54, 0xb4, 0xcd, + 0xc7, 0xd8, 0x53, 0xf5, 0x38, 0xe9, 0xa8, 0x21, 0xc5, 0x21, 0xb8, 0x4e, + 0x55, 0x28, 0x59, 0x29, 0x1e, 0xb2, 0xc3, 0x9b, 0x47, 0xf6, 0x1b, 0xbf, + 0xf1, 0x93, 0x2c, 0xde, 0x9f, 0xec, 0x67, 0x1d, 0x18, 0x18, 0x7c, 0x15, + 0x5e, 0xad, 
0x1e, 0x49, 0x51, 0x9a, 0x7a, 0x89, 0x2f, 0x58, 0x0e, 0x98, + 0x55, 0x75, 0x01, 0x8e, 0x3a, 0xd9, 0xd9, 0x67, 0x6f, 0xc0, 0x01, 0xc4, + 0x6f, 0xf0, 0xdf, 0xfb, 0x74, 0x9c, 0x35, 0x4a, 0xac, 0xa8, 0x07, 0x38, + 0x4b, 0xd6, 0x9a, 0xf4, 0x2a, 0xda, 0x8a, 0x04, 0xed, 0xcc, 0xcd, 0x84, + 0x48, 0x94, 0xf4, 0xfe, 0x7b, 0xc6, 0xd6, 0xaa, 0x96, 0x99, 0xb8, 0xfb, + 0x0a, 0x7e, 0xa7, 0x1d, 0x35, 0x04, 0x38, 0xa4, 0x37, 0x09, 0xca, 0xa5, + 0x0b, 0x25, 0x23, 0xd6, 0x58, 0x73, 0x68, 0xfe, 0xc3, 0x77, 0xfe, 0x32, + 0x65, 0x9b, 0xd3, 0xfd, 0x8c, 0xe3, 0xa3, 0x03, 0x0f, 0x82, 0xab, 0xd5, + 0xa3, 0xc9, 0x2a, 0x33, 0x4f, 0x51, 0x25, 0xeb, 0x01, 0xd3, 0x0a, 0xae, + 0xa0, 0x31, 0xc7, 0x5b, 0x3b, 0x2c, 0xed, 0xf8, 0x00, 0x38, 0x8d, 0xfe, + 0x1b, 0xff, 0x6e, 0x93, 0x86, 0xa9, 0x55, 0x95, 0x00, 0xe7, 0x09, 0x7a, + 0xd3, 0x5e, 0x85, 0x5b, 0x51, 0x40, 0x9d, 0xb9, 0x99, 0xb0, 0x89, 0x12, + 0x9e, 0x9f, 0xcf, 0x78, 0xda, 0xd5, 0x52, 0xd3, 0x37, 0x1f, 0x61, 0x4f, + 0xd4, 0xe3, 0xa6, 0xa0, 0x87, 0x14, 0x86, 0xe1, 0x39, 0x54, 0xa1, 0x64, + 0xa4, 0x7a, 0xcb, 0x0e, 0x6d, 0x1f, 0xd8, 0x6e, 0xff, 0xc6, 0x4c, 0xb3, + 0x7a, 0x7f, 0xb1, 0x9c, 0x74, 0x60, 0x61, 0xf0, 0x55, 0x7a, 0xb4, 0x79, + 0x25, 0x46, 0x69, 0xea, 0x24, 0xbd, 0x60, 0x3a, 0x61, 0x55, 0xd4, 0x06, + 0x38, 0xeb, 0x67, 0x65, 0x9d, 0xbf, 0x00, 0x07, 0x11, 0xbf, 0xc3, 0x7f, + 0xed, 0xd2, 0x70, 0xd5, 0x2a, 0xb2, 0xa0, 0x1c, 0xe1, 0x2f, 0x5a, 0x6b, + 0xd0, 0xab, 0x6a, 0x28, 0x13, 0xb7, 0x33, 0x36, 0x11, 0x22, 0x53, 0xd3, + 0xf9, 0xef, 0x1b, 0x5a, 0xaa, 0x5a, 0x66, 0xe3, 0xec, 0x29, 0xfa, 0x9c, + 0x74, 0xd4, 0x10, 0xe2, 0x90, 0xdc, 0x27, 0x2a, 0x94, 0x2c, 0x94, 0x8f, + 0x59, 0x61, 0xcd, 0xa3, 0xfb, 0x0d, 0xdf, 0xf8, 0xc9, 0x96, 0x6f, 0x4f, + 0xf6, 0x33, 0x8e, 0x8c, 0x0c, 0x3e, 0x0a, 0xaf, 0x56, 0x8f, 0x24, 0xa8, + 0xcd, 0x3d, 0x44, 0x97, 0xac, 0x07, 0x4c, 0x2a, 0xba, 0x80, 0xc7, 0x1d, + 0x6c, 0xec, 0xb3, 0xb7, 0xe0, 0x00, 0xe2, 0x37, 0xf8, 0x6f, 0xfd, 0xba, + 0x4e, 0x1a, 0xa5, 0x56, 0x54, 0x03, 0x9c, 0x25, 0xeb, 0x4d, 0x7a, 0x15, + 0x6d, 0x45, 0x02, 0x76, 0xe6, 0x66, 0xc2, 0x24, 0x4a, 0x7a, 0x7f, 0x3d, + 0xe3, 0x6b, 0x55, 0x4b, 0x4c, 0xdc, 0x7d, 0x85, 0x3f, 0x53, 0x8e, 0x9a, + 0x82, 0x1c, 0x52, 0x1b, 0x84, 0xe5, 0x52, 0x85, 0x92, 0x91, 0xeb, 0x2c, + 0x39, 0xb4, 0x7f, 0x61, 0xbb, 0xff, 0x19, 0x32, 0xcd, 0xe9, 0xfe, 0xc6, + 0x71, 0xd1, 0x81, 0x87, 0xc1, 0x55, 0xea, 0xd1, 0xe4, 0x95, 0x19, 0xa7, + 0xa8, 0x92, 0xf5, 0x80, 0xe9, 0x85, 0x57, 0x50, 0x18, 0xe3, 0xad, 0x9d, + 0x96, 0x76, 0xfc, 0x00, 0x1c, 0x46, 0xff, 0x0d, 0xff, 0xb7, 0x49, 0xc3, + 0x54, 0xaa, 0xca, 0x80, 0x73, 0x84, 0xbd, 0x69, 0xaf, 0x42, 0xad, 0xa8, + 0xa0, 0x4e, 0xdc, 0xcc, 0xd8, 0x44, 0x89, 0x4f, 0x4f, 0xe7, 0xbc, 0x6d, + 0x6a, 0xa9, 0x69, 0x9b, 0x8f, 0xb0, 0xa7, 0xea, 0x71, 0xd3, 0x50, 0x43, + 0x8a, 0x43, 0x70, 0x9c, 0xaa, 0x50, 0xb2, 0x52, 0x3d, 0x65, 0x87, 0x36, + 0x8f, 0xec, 0x37, 0x7f, 0xe3, 0x26, 0x59, 0xbd, 0x3f, 0xd8, 0xce, 0x3a, + 0x30, 0x30, 0xf8, 0x2a, 0xbd, 0x5a, 0x3c, 0x92, 0xa3, 0x34, 0xf5, 0x12, + 0x5e, 0xb0, 0x1d, 0x30, 0xaa, 0xea, 0x03, 0x1c, 0x75, 0xb3, 0xb2, 0xce, + 0xdf, 0x80, 0x03, 0x88, 0xdf, 0xe1, 0xbf, 0xf6, 0xe9, 0x38, 0x6a, 0x95, + 0x59, 0x50, 0x0e, 0x70, 0x97, 0xad, 0x35, 0xe8, 0x55, 0xb5, 0x14, 0x09, + 0xdb, 0x99, 0x9b, 0x08, 0x91, 0x29, 0xe9, 0xfc, 0xf7, 0x8d, 0xad, 0x55, + 0x2d, 0x33, 0x71, 0xf6, 0x14, 0xfd, 0x4e, 0x3a, 0x6a, 0x08, 0x71, 0x48, + 0x6e, 0x13, 0x95, 0x4a, 0x16, 0x4a, 0x47, 0xac, 0xb0, 0xe6, 0xd1, 0xfd, + 0x86, 0xef, 0xfc, 0x64, 0xcb, 0x37, 0xa7, 0xfb, 0x19, 0xc7, 0x46, 0x06, + 0x1f, 0x05, 0x57, 0xab, 0x47, 0x92, 0x54, 0x66, 0x9e, 0xa2, 0x4b, 0xd6, + 0x03, 0xa6, 
0x15, 0x5d, 0x40, 0x63, 0x8e, 0xb6, 0x76, 0x59, 0xdb, 0xf0, + 0x00, 0x71, 0x1b, 0xfc, 0x37, 0xfe, 0xdd, 0x27, 0x0d, 0x52, 0xab, 0x2a, + 0x01, 0xce, 0x12, 0xf5, 0xa6, 0xbd, 0x0a, 0xb6, 0xa2, 0x81, 0x3b, 0x73, + 0x33, 0x61, 0x12, 0x25, 0x3d, 0x3f, 0x9e, 0xf1, 0xb5, 0xaa, 0xa5, 0xa6, + 0x6e, 0x3e, 0xc2, 0x9f, 0xa9, 0xc7, 0x4d, 0x41, 0x0e, 0x29, 0x0d, 0xc2, + 0x72, 0xa9, 0x42, 0xc9, 0x48, 0xf5, 0x96, 0x1c, 0xda, 0x3f, 0xb0, 0xdd, + 0xff, 0x8c, 0x99, 0x66, 0xf4, 0xff, 0x63, 0x38, 0xe8, 0xc0, 0xc3, 0xe0, + 0xaa, 0xf5, 0x68, 0xf2, 0x4a, 0x8c, 0xd3, 0xd4, 0x49, 0x7a, 0xc0, 0x74, + 0xc2, 0xab, 0xa8, 0x0c, 0x71, 0xd6, 0xce, 0xcb, 0x3b, 0x7e, 0x00, 0x0e, + 0x23, 0x7f, 0x86, 0xff, 0xdb, 0xa4, 0xe1, 0xaa, 0x55, 0x65, 0x40, 0x39, + 0xc2, 0x5e, 0xb4, 0xd7, 0xa1, 0x56, 0xd4, 0x50, 0x27, 0x6e, 0x66, 0x6c, + 0x22, 0x44, 0xa7, 0xa7, 0xf3, 0xde, 0x36, 0xb5, 0x54, 0xb4, 0xcd, 0xc7, + 0xd8, 0x53, 0xf5, 0x38, 0xe9, 0xa8, 0x21, 0xc5, 0x21, 0xb8, 0x4e, 0x55, + 0x28, 0x59, 0x29, 0x1e, 0xb2, 0xc3, 0x9b, 0x47, 0xf6, 0x1b, 0xbf, 0xf1, + 0x93, 0x2c, 0xde, 0x9f, 0xec, 0x67, 0x1d, 0x18, 0x18, 0x7c, 0x15, 0x5e, + 0xad, 0x1e, 0x49, 0x51, 0x9a, 0x7a, 0x89, 0x2f, 0x58, 0x0e, 0x98, 0x55, + 0x75, 0x01, 0x8e, 0x3a, 0xd9, 0xd9, 0x67, 0x6f, 0xc0, 0x01, 0xc4, 0x6f, + 0xf0, 0xdf, 0xfb, 0x74, 0x9c, 0x35, 0x4a, 0xac, 0xa8, 0x07, 0x38, 0x4b, + 0xd6, 0x9a, 0xf4, 0x2a, 0xda, 0x8a, 0x04, 0xed, 0xcc, 0xcd, 0x84, 0x48, + 0x94, 0xf4, 0xfe, 0x7b, 0xc6, 0xd6, 0xaa, 0x96, 0x99, 0xb8, 0xfb, 0x0a, + 0x7e, 0xa7, 0x1d, 0x35, 0x04, 0x38, 0xa4, 0x37, 0x09, 0xca, 0xa5, 0x0b, + 0x25, 0x23, 0xd6, 0x58, 0x73, 0x68, 0xfe, 0xc3, 0x77, 0xfe, 0x32, 0x65, + 0x9b, 0xd3, 0xfd, 0x8c, 0xe3, 0xa3, 0x03, 0x0f, 0x82, 0xab, 0xd5, 0xa3, + 0xc9, 0x2a, 0x33, 0x4f, 0x51, 0x25, 0xeb, 0x01, 0xd3, 0x0a, 0xae, 0xa0, + 0x31, 0xc7, 0x5b, 0x3b, 0x2c, 0xed, 0xf8, 0x00, 0x38, 0x8d, 0xfe, 0x1b, + 0xff, 0x6e, 0x93, 0x86, 0xa9, 0x55, 0x95, 0x00, 0xe7, 0x09, 0x7a, 0xd3, + 0x5e, 0x85, 0x5b, 0x51, 0x40, 0x9d, 0xb9, 0x99, 0xb0, 0x89, 0x12, 0x9e, + 0x9f, 0xcf, 0x78, 0xda, 0xd5, 0x52, 0xd3, 0x37, 0x1f, 0x61, 0x4f, 0xd4, + 0xe3, 0xa6, 0xa0, 0x87, 0x14, 0x86, 0xe1, 0x39, 0x54, 0xa1, 0x64, 0xa4, + 0x7a, 0xcb, 0x0e, 0x6d, 0x1f, 0xd8, 0x6e, 0xff, 0xc6, 0x4c, 0xb3, 0x7a, + 0x7f, 0xb1, 0x9c, 0x74, 0x60, 0x61, 0xf0, 0x55, 0x7a, 0xb4, 0x79, 0x25, + 0x46, 0x69, 0xea, 0x24, 0xbd, 0x60, 0x3a, 0x61, 0x55, 0xd4, 0x06, 0x38, + 0xeb, 0x67, 0x65, 0x9d, 0xbf, 0x00, 0x07, 0x11, 0xbf, 0xc3, 0x7f, 0xed, + 0xd2, 0x70, 0xd5, 0x2a, 0xb2, 0xa0, 0x1c, 0xe1, 0x2f, 0x5a, 0x6b, 0xd0, + 0xab, 0x6a, 0x28, 0x13, 0xb7, 0x33, 0x36, 0x11, 0x22, 0x53, 0xd3, 0xf9, + 0xef, 0x1b, 0x5a, 0xaa, 0x5a, 0x66, 0xe3, 0xec, 0x29, 0xfa, 0x9c, 0x74, + 0xd4, 0x10, 0xe2, 0x90, 0xdc, 0x27, 0x2a, 0x94, 0x2c, 0x94, 0x8f, 0x59, + 0x61, 0xcd, 0xa3, 0xfb, 0x0d, 0xdf, 0xf8, 0xc9, 0x96, 0x6f, 0x4f, 0xf6, + 0x33, 0x8e, 0x8c, 0x0c, 0x3e, 0x0a, 0xaf, 0x56, 0x8f, 0x24, 0xa8, 0xcd, + 0x3d, 0x44, 0x97, 0xac, 0x07, 0x4c, 0x2a, 0xba, 0x80, 0xc7, 0x1d, 0x6c, + 0xec, 0xb3, 0xb7, 0xe0, 0x00, 0xe2, 0x37, 0xf8, 0x6f, 0xfd, 0xba, 0x4e, + 0x1a, 0xa5, 0x56, 0x54, 0x03, 0x9c, 0x25, 0xeb, 0x4d, 0x7a, 0x15, 0x6d, + 0x45, 0x02, 0x76, 0xe6, 0x66, 0xc2, 0x24, 0x4a, 0x7a, 0x7f, 0x3d, 0xe3, + 0x6b, 0x55, 0x4b, 0x4c, 0xdc, 0x7d, 0x85, 0x3f, 0x53, 0x8e, 0x9a, 0x82, + 0x1c, 0x52, 0x1b, 0x84, 0xe5, 0x52, 0x85, 0x92, 0x91, 0xeb, 0x2c, 0x39, + 0xb4, 0x7f, 0x61, 0xbb, 0xff, 0x19, 0x32, 0xcd, 0xe9, 0xfe, 0xc6, 0x71, + 0xd1, 0x81, 0x87, 0xc1, 0x55, 0xea, 0xd1, 0xe4, 0x95, 0x19, 0xa7, 0xa8, + 0x92, 0xf5, 0x80, 0xe9, 0x85, 0x57, 0x50, 0x18, 0xe3, 0xad, 0x9d, 0x96, + 0x76, 0xfc, 
0x00, 0x1c, 0x46, 0xff, 0x0d, 0xff, 0xb7, 0x49, 0xc3, 0x54, + 0xaa, 0xca, 0x80, 0x73, 0x84, 0xbd, 0x69, 0xaf, 0x42, 0xad, 0xa8, 0xa0, + 0x4e, 0xdc, 0xcc, 0xd8, 0x44, 0x89, 0x4f, 0x4f, 0xe7, 0xbc, 0x6d, 0x6a, + 0xa9, 0x69, 0x9b, 0x8f, 0xb0, 0xa7, 0xea, 0x71, 0xd3, 0x50, 0x43, 0x8a, + 0x43, 0x70, 0x9c, 0xaa, 0x50, 0xb2, 0x52, 0x3d, 0x65, 0x87, 0x36, 0x8f, + 0xec, 0x37, 0x7f, 0xe3, 0x26, 0x59, 0xbd, 0x3f, 0xd8, 0xce, 0x3a, 0x30, + 0x30, 0xf8, 0x2a, 0xbd, 0x5a, 0x3c, 0x92, 0xa3, 0x34, 0xf5, 0x12, 0x5e, + 0xb0, 0x1d, 0x30, 0xaa, 0xea, 0x03, 0x1c, 0x75, 0xb3, 0xb2, 0xce, 0xdf, + 0x80, 0x03, 0x88, 0xdf, 0xe1, 0xbf, 0xf6, 0xe9, 0x38, 0x6a, 0x95, 0x59, + 0x50, 0x0e, 0x70, 0x97, 0xad, 0x35, 0xe8, 0x55, 0xb5, 0x14, 0x09, 0xdb, + 0x99, 0x9b, 0x08, 0x91, 0x29, 0xe9, 0xfc, 0xf7, 0x8d, 0xad, 0x55, 0x2d, + 0x33, 0x71, 0xf6, 0x14, 0xfd, 0x4e, 0x3a, 0x6a, 0x08, 0x71, 0x48, 0x6e, + 0x13, 0x95, 0x4a, 0x16, 0x4a, 0x47, 0xac, 0xb0, 0xe6, 0xd1, 0xfd, 0x86, + 0xef, 0xfc, 0x64, 0xcb, 0x37, 0xa7, 0xfb, 0x19, 0xc7, 0x46, 0x06, 0x1f, + 0x05, 0x57, 0xab, 0x47, 0x92, 0x54, 0x66, 0x9e, 0xa2, 0x4b, 0xd6, 0x03, + 0xa6, 0x15, 0x5d, 0x40, 0x63, 0x8e, 0xb6, 0x76, 0x59, 0xdb, 0xf0, 0x00, + 0x71, 0x1b, 0xfc, 0x37, 0xfe, 0xdd, 0x27, 0x0d, 0x52, 0xab, 0x2a, 0x01, + 0xce, 0x12, 0xf5, 0xa6, 0xbd, 0x0a, 0xb6, 0xa2, 0x81, 0x3b, 0x73, 0x33, + 0x61, 0x12, 0x25, 0x3d, 0x3f, 0x9e, 0xf1, 0xb5, 0xaa, 0xa5, 0xa6, 0x6e, + 0x3e, 0xc2, 0x9f, 0xa9, 0xc7, 0x4d, 0x41, 0x0e, 0x29, 0x0d, 0xc2, 0x72, + 0xa9, 0x42, 0xc9, 0x48, 0xf5, 0x96, 0x1c, 0xda, 0x3f, 0xb0, 0xdd, 0xff, + 0x8c, 0x99, 0x66, 0xf4, 0xff, 0x63, 0x38, 0xe8, 0xc0, 0xc3, 0xe0, 0xaa, + 0xf5, 0x68, 0xf2, 0x4a, 0x8c, 0xd3, 0xd4, 0x49, 0x7a, 0xc0, 0x74, 0xc2, + 0xab, 0xa8, 0x0c, 0x71, 0xd6, 0xce, 0xcb, 0x3b, 0x7e, 0x00, 0x0e, 0x23, + 0x7f, 0x86, 0xff, 0xdb, 0xa4, 0xe1, 0xaa, 0x55, 0x65, 0x40, 0x39, 0xc2, + 0x5e, 0xb4, 0xd7, 0xa1, 0x56, 0xd4, 0x50, 0x27, 0x6e, 0x66, 0x6c, 0x22, + 0x44, 0xa7, 0xa7, 0xf3, 0xde, 0x36, 0xb5, 0x54, 0xb4, 0xcd, 0xc7, 0xd8, + 0x53, 0xf5, 0x38, 0xe9, 0xa8, 0x21, 0xc5, 0x21, 0xb8, 0x4e, 0x55, 0x28, + 0x59, 0x29, 0x1e, 0xb2, 0xc3, 0x9b, 0x47, 0xf6, 0x1b, 0xbf, 0xf1, 0x93, + 0x2c, 0xde, 0x9f, 0xec, 0x67, 0x1d, 0x18, 0x18, 0x7c, 0x15, 0x5e, 0xad, + 0x1e, 0x49, 0x51, 0x9a, 0x7a, 0x89, 0x2f, 0x58, 0x0e, 0x98, 0x55, 0x75, + 0x01, 0x8e, 0x3a, 0xd9, 0xd9, 0x67, 0x6f, 0xc0, 0x01, 0xc4, 0x6f, 0xf0, + 0xdf, 0xfb, 0x74, 0x9c, 0x35, 0x4a, 0xac, 0xa8, 0x07, 0x38, 0x4b, 0xd6, + 0x9a, 0xf4, 0x2a, 0xda, 0x8a, 0x04, 0xed, 0xcc, 0xcd, 0x84, 0x48, 0x94, + 0xf4, 0xfe, 0x7b, 0xc6, 0xd6, 0xaa, 0x96, 0x99, 0xb8, 0xfb, 0x0a, 0x7e, + 0xa7, 0x1d, 0x35, 0x04, 0x38, 0xa4, 0x37, 0x09, 0xca, 0xa5, 0x0b, 0x25, + 0x23, 0xd6, 0x58, 0x73, 0x68, 0xfe, 0xc3, 0x77, 0xfe, 0x32, 0x65, 0x9b, + 0xd3, 0xfd, 0x8c, 0xe3, 0xa3, 0x03, 0x0f, 0x82, 0xab, 0xd5, 0xa3, 0xc9, + 0x2a, 0x33, 0x4f, 0x51, 0x25, 0xeb, 0x01, 0xd3, 0x0a, 0xae, 0xa0, 0x31, + 0xc7, 0x5b, 0x3b, 0x2c, 0xed, 0xf8, 0x00, 0x38, 0x8d, 0xfe, 0x1b, 0xff, + 0x6e, 0x93, 0x86, 0xa9, 0x55, 0x95, 0x00, 0xe7, 0x09, 0x7a, 0xd3, 0x5e, + 0x85, 0x5b, 0x51, 0x40, 0x9d, 0xb9, 0x99, 0xb0, 0x89, 0x12, 0x9e, 0x9f, + 0xcf, 0x78, 0xda, 0xd5, 0x52, 0xd3, 0x37, 0x1f, 0x61, 0x4f, 0xd4, 0xe3, + 0xa6, 0xa0, 0x87, 0x14, 0x86, 0xe1, 0x39, 0x54, 0xa1, 0x64, 0xa4, 0x7a, + 0xcb, 0x0e, 0x6d, 0x1f, 0xd8, 0x6e, 0xff, 0xc6, 0x4c, 0xb3, 0x7a, 0x7f, + 0xb1, 0x9c, 0x74, 0x60, 0x61, 0xf0, 0x55, 0x7a, 0xb4, 0x79, 0x25, 0x46, + 0x69, 0xea, 0x24, 0xbd, 0x60, 0x3a, 0x61, 0x55, 0xd4, 0x06, 0x38, 0xeb, + 0x67, 0x65, 0x9d, 0xbf, 0x00, 0x07, 0x11, 0xbf, 0xc3, 0x7f, 0xed, 0xd2, + 0x70, 0xd5, 
0x2a, 0xb2, 0xa0, 0x1c, 0xe1, 0x2f, 0x5a, 0x6b, 0xd0, 0xab, + 0x6a, 0x28, 0x13, 0xb7, 0x33, 0x36, 0x11, 0x22, 0x53, 0xd3, 0xf9, 0xef, + 0x1b, 0x5a, 0xaa, 0x5a, 0x66, 0xe3, 0xec, 0x29, 0xfa, 0x9c, 0x74, 0xd4, + 0x10, 0xe2, 0x90, 0xdc, 0x27, 0x2a, 0x94, 0x2c, 0x94, 0x8f, 0x59, 0x61, + 0xcd, 0xa3, 0xfb, 0x0d, 0xdf, 0xf8, 0xc9, 0x96, 0x6f, 0x4f, 0xf6, 0x33, + 0x8e, 0x8c, 0x0c, 0x3e, 0x0a, 0xaf, 0x56, 0x8f, 0x24, 0xa8, 0xcd, 0x3d, + 0x44, 0x97, 0xac, 0x07, 0x4c, 0x2a, 0xba, 0x80, 0xc7, 0x1d, 0x6c, 0xec, + 0xb3, 0xb7, 0xe0, 0x00, 0xe2, 0x37, 0xf8, 0x6f, 0xfd, 0xba, 0x4e, 0x1a, + 0xa5, 0x56, 0x54, 0x03, 0x9c, 0x25, 0xeb, 0x4d, 0x7a, 0x15, 0x6d, 0x45, + 0x02, 0x76, 0xe6, 0x66, 0xc2, 0x24, 0x4a, 0x7a, 0x7f, 0x3d, 0xe3, 0x6b, + 0x55, 0x4b, 0x4c, 0xdc, 0x7d, 0x85, 0x3f, 0x53, 0x8e, 0x9a, 0x82, 0x1c, + 0x52, 0x1b, 0x84, 0xe5, 0x52, 0x85, 0x92, 0x91, 0xeb, 0x2c, 0x39, 0xb4, + 0x7f, 0x61, 0xbb, 0xff, 0x19, 0x32, 0xcd, 0xe9, 0xfe, 0xc6, 0x71, 0xd1, + 0x81, 0x87, 0xc1, 0x55, 0xea, 0xd1, 0xe4, 0x95, 0x19, 0xa7, 0xa8, 0x92, + 0xf5, 0x80, 0xe9, 0x85, 0x57, 0x50, 0x18, 0xe3, 0xad, 0x9d, 0x96, 0x76, + 0xfc, 0x00, 0x1c, 0x46, 0xff, 0x0d, 0xff, 0xb7, 0x49, 0xc3, 0x54, 0xaa, + 0xca, 0x80, 0x73, 0x84, 0xbd, 0x69, 0xaf, 0x42, 0xad, 0xa8, 0xa0, 0x4e, + 0xdc, 0xcc, 0xd8, 0x44, 0x89, 0x4f, 0x4f, 0xe7, 0xbc, 0x6d, 0x6a, 0xa9, + 0x69, 0x9b, 0x8f, 0xb0, 0xa7, 0xea, 0x71, 0xd3, 0x50, 0x43, 0x8a, 0x43, + 0x70, 0x9c, 0xaa, 0x50, 0xb2, 0x52, 0x3d, 0x65, 0x87, 0x36, 0x8f, 0xec, + 0x37, 0x7f, 0xe3, 0x26, 0x59, 0xbd, 0x3f, 0xd8, 0xce, 0x3a, 0x30, 0x30, + 0xf8, 0x2a, 0xbd, 0x5a, 0x3c, 0x92, 0xa3, 0x34, 0xf5, 0x12, 0x5e, 0xb0, + 0x1d, 0x30, 0xaa, 0xea, 0x03, 0x1c, 0x75, 0xb3, 0xb2, 0xce, 0xdf, 0x80, + 0x03, 0x88, 0xdf, 0xe1, 0xbf, 0xf6, 0xe9, 0x38, 0x6a, 0x95, 0x59, 0x50, + 0x0e, 0x70, 0x97, 0xad, 0x35, 0xe8, 0x55, 0xb5, 0x14, 0x09, 0xdb, 0x99, + 0x9b, 0x08, 0x91, 0x29, 0xe9, 0xfc, 0xf7, 0x8d, 0xad, 0x55, 0x2d, 0x33, + 0x71, 0xf6, 0x14, 0xfd, 0x4e, 0x3a, 0x6a, 0x08, 0x71, 0x48, 0x6e, 0x13, + 0x95, 0x4a, 0x16, 0x4a, 0x47, 0xac, 0xb0, 0xe6, 0xd1, 0xfd, 0x86, 0xef, + 0xfc, 0x64, 0xcb, 0x37, 0xa7, 0xfb, 0x19, 0xc7, 0x46, 0x06, 0x1f, 0x05, + 0x57, 0xab, 0x47, 0x92, 0x54, 0x66, 0x9e, 0xa2, 0x4b, 0xd6, 0x03, 0xa6, + 0x15, 0x5d, 0x40, 0x63, 0x8e, 0xb6, 0x76, 0x59, 0xdb, 0xf0, 0x00, 0x71, + 0x1b, 0xfc, 0x37, 0xfe, 0xdd, 0x27, 0x0d, 0x52, 0xab, 0x2a, 0x01, 0xce, + 0x12, 0xf5, 0xa6, 0xbd, 0x0a, 0xb6, 0xa2, 0x81, 0x3b, 0x73, 0x33, 0x61, + 0x12, 0x25, 0x3d, 0x3f, 0x9e, 0xf1, 0xb5, 0xaa, 0xa5, 0xa6, 0x6e, 0x3e, + 0xc2, 0x9f, 0xa9, 0xc7, 0x4d, 0x41, 0x0e, 0x29, 0x0d, 0xc2, 0x72, 0xa9, + 0x42, 0xc9, 0x48, 0xf5, 0x96, 0x1c, 0xda, 0x3f, 0xb0, 0xdd, 0xff, 0x8c, + 0x99, 0x66, 0xf4, 0xff, 0x63, 0x38, 0xe8, 0xc0, 0xc3, 0xe0, 0xaa, 0xf5, + 0x68, 0xf2, 0x4a, 0x8c, 0xd3, 0xd4, 0x49, 0x7a, 0xc0, 0x74, 0xc2, 0xab, + 0xa8, 0x0c, 0x71, 0xd6, 0xce, 0xcb, 0x3b, 0x7e, 0x00, 0x0e, 0x23, 0x7f, + 0x86, 0xff, 0xdb, 0xa4, 0xe1, 0xaa, 0x55, 0x65, 0x40, 0x39, 0xc2, 0x5e, + 0xb4, 0xd7, 0xa1, 0x56, 0xd4, 0x50, 0x27, 0x6e, 0x66, 0x6c, 0x22, 0x44, + 0xa7, 0xa7, 0xf3, 0xde, 0x36, 0xb5, 0x54, 0xb4, 0xcd, 0xc7, 0xd8, 0x53, + 0xf5, 0x38, 0xe9, 0xa8, 0x21, 0xc5, 0x21, 0xb8, 0x4e, 0x55, 0x28, 0x59, + 0x29, 0x1e, 0xb2, 0xc3, 0x9b, 0x47, 0xf6, 0x1b, 0xbf, 0xf1, 0x93, 0x2c, + 0xde, 0x9f, 0xec, 0x67, 0x1d, 0x18, 0x18, 0x7c, 0x15, 0x5e, 0xad, 0x1e, + 0x49, 0x51, 0x9a, 0x7a, 0x89, 0x2f, 0x58, 0x0e, 0x98, 0x55, 0x75, 0x01, + 0x8e, 0x3a, 0xd9, 0xd9, 0x67, 0x6f, 0xc0, 0x01, 0xc4, 0x6f, 0xf0, 0xdf, + 0xfb, 0x74, 0x9c, 0x35, 0x4a, 0xac, 0xa8, 0x07, 0x38, 0x4b, 0xd6, 0x9a, + 0xf4, 0x2a, 
0xda, 0x8a, 0x04, 0xed, 0xcc, 0xcd, 0x84, 0x48, 0x94, 0xf4, + 0xfe, 0x7b, 0xc6, 0xd6, 0xaa, 0x96, 0x99, 0xb8, 0xfb, 0x0a, 0x7e, 0xa7, + 0x1d, 0x35, 0x04, 0x38, 0xa4, 0x37, 0x09, 0xca, 0xa5, 0x0b, 0x25, 0x23, + 0xd6, 0x58, 0x73, 0x68, 0xfe, 0xc3, 0x77, 0xfe, 0x32, 0x65, 0x9b, 0xd3, + 0xfd, 0x8c, 0xe3, 0xa3, 0x03, 0x0f, 0x82, 0xab, 0xd5, 0xa3, 0xc9, 0x2a, + 0x33, 0x4f, 0x51, 0x25, 0xeb, 0x01, 0xd3, 0x0a, 0xae, 0xa0, 0x31, 0xc7, + 0x5b, 0x3b, 0x2c, 0xed, 0xf8, 0x00, 0x38, 0x8d, 0xfe, 0x1b, 0xff, 0x6e, + 0x93, 0x86, 0xa9, 0x55, 0x95, 0x00, 0xe7, 0x09, 0x7a, 0xd3, 0x5e, 0x85, + 0x5b, 0x51, 0x40, 0x9d, 0xb9, 0x99, 0xb0, 0x89, 0x12, 0x9e, 0x9f, 0xcf, + 0x78, 0xda, 0xd5, 0x52, 0xd3, 0x37, 0x1f, 0x61, 0x4f, 0xd4, 0xe3, 0xa6, + 0xa0, 0x87, 0x14, 0x86, 0xe1, 0x39, 0x54, 0xa1, 0x64, 0xa4, 0x7a, 0xcb, + 0x0e, 0x6d, 0x1f, 0xd8, 0x6e, 0xff, 0xc6, 0x4c, 0xb3, 0x7a, 0x7f, 0xb1, + 0x9c, 0x74, 0x60, 0x61, 0xf0, 0x55, 0x7a, 0xb4, 0x79, 0x25, 0x46, 0x69, + 0xea, 0x24, 0xbd, 0x60, 0x3a, 0x61, 0x55, 0xd4, 0x06, 0x38, 0xeb, 0x67, + 0x65, 0x9d, 0xbf, 0x00, 0x07, 0x11, 0xbf, 0xc3, 0x7f, 0xed, 0xd2, 0x70, + 0xd5, 0x2a, 0xb2, 0xa0, 0x1c, 0xe1, 0x2f, 0x5a, 0x6b, 0xd0, 0xab, 0x6a, + 0x28, 0x13, 0xb7, 0x33, 0x36, 0x11, 0x22, 0x53, 0xd3, 0xf9, 0xef, 0x1b, + 0x5a, 0xaa, 0x5a, 0x66, 0xe3, 0xec, 0x29, 0xfa, 0x9c, 0x74, 0xd4, 0x10, + 0xe2, 0x90, 0xdc, 0x27, 0x2a, 0x94, 0x2c, 0x94, 0x8f, 0x59, 0x61, 0xcd, + 0xa3, 0xfb, 0x0d, 0xdf, 0xf8, 0xc9, 0x96, 0x6f, 0x4f, 0xf6, 0x33, 0x8e, + 0x8c, 0x0c, 0x3e, 0x0a, 0xaf, 0x56, 0x8f, 0x24, 0xa8, 0xcd, 0x3d, 0x44, + 0x97, 0xac, 0x07, 0x4c, 0x2a, 0xba, 0x80, 0xc7, 0x1d, 0x6c, 0xec, 0xb3, + 0xb7, 0xe0, 0x00, 0xe2, 0x37, 0xf8, 0x6f, 0xfd, 0xba, 0x4e, 0x1a, 0xa5, + 0x56, 0x54, 0x03, 0x9c, 0x25, 0xeb, 0x4d, 0x7a, 0x15, 0x6d, 0x45, 0x02, + 0x76, 0xe6, 0x66, 0xc2, 0x24, 0x4a, 0x7a, 0x7f, 0x3d, 0xe3, 0x6b, 0x55, + 0x4b, 0x4c, 0xdc, 0x7d, 0x85, 0x3f, 0x53, 0x8e, 0x9a, 0x82, 0x1c, 0x52, + 0x1b, 0x84, 0xe5, 0x52, 0x85, 0x92, 0x91, 0xeb, 0x2c, 0x39, 0xb4, 0x7f, + 0x61, 0xbb, 0xff, 0x19, 0x32, 0xcd, 0xe9, 0xfe, 0xc6, 0x71, 0xd1, 0x81, + 0x87, 0xc1, 0x55, 0xea, 0xd1, 0xe4, 0x95, 0x19, 0xa7, 0xa8, 0x92, 0xf5, + 0x80, 0xe9, 0x85, 0x57, 0x50, 0x18, 0xe3, 0xad, 0x9d, 0x96, 0x76, 0xfc, + 0x00, 0x1c, 0x46, 0xff, 0x0d, 0xff, 0xb7, 0x49, 0xc3, 0x54, 0xaa, 0xca, + 0x80, 0x73, 0x84, 0xbd, 0x69, 0xaf, 0x42, 0xad, 0xa8, 0xa0, 0x4e, 0xdc, + 0xcc, 0xd8, 0x44, 0x89, 0x4f, 0x4f, 0xe7, 0xbc, 0x6d, 0x6a, 0xa9, 0x69, + 0x9b, 0x8f, 0xb0, 0xa7, 0xea, 0x71, 0xd3, 0x50, 0x43, 0x8a, 0x43, 0x70, + 0x9c, 0xaa, 0x50, 0xb2, 0x52, 0x3d, 0x65, 0x87, 0x36, 0x8f, 0xec, 0x37, + 0x7f, 0xe3, 0x26, 0x59, 0xbd, 0x3f, 0xd8, 0xce, 0x3a, 0x30, 0x30, 0xf8, + 0x2a, 0xbd, 0x5a, 0x3c, 0x92, 0xa3, 0x34, 0xf5, 0x12, 0x5e, 0xb0, 0x1d, + 0x30, 0xaa, 0xea, 0x03, 0x1c, 0x75, 0xb3, 0xb2, 0xce, 0xdf, 0x80, 0x03, + 0x88, 0xdf, 0xe1, 0xbf, 0xf6, 0xe9, 0x38, 0x6a, 0x95, 0x59, 0x50, 0x0e, + 0x70, 0x97, 0xad, 0x35, 0xe8, 0x55, 0xb5, 0x14, 0x09, 0xdb, 0x99, 0x9b, + 0x08, 0x91, 0x29, 0xe9, 0xfc, 0xf7, 0x8d, 0xad, 0x55, 0x2d, 0x33, 0x71, + 0xf6, 0x14, 0xfd, 0x4e, 0x3a, 0x6a, 0x08, 0x71, 0x48, 0x6e, 0x13, 0x95, + 0x4a, 0x16, 0x4a, 0x47, 0xac, 0xb0, 0xe6, 0xd1, 0xfd, 0x86, 0xef, 0xfc, + 0x64, 0xcb, 0x37, 0xa7, 0xfb, 0x19, 0xc7, 0x46, 0x06, 0x1f, 0x05, 0x57, + 0xab, 0x47, 0x92, 0x54, 0x66, 0x9e, 0xa2, 0x4b, 0xd6, 0x03, 0xa6, 0x15, + 0x5d, 0x40, 0x63, 0x8e, 0xb6, 0x76, 0x59, 0xdb, 0xf0, 0x00, 0x71, 0x1b, + 0xfc, 0x37, 0xfe, 0xdd, 0x27, 0x0d, 0x52, 0xab, 0x2a, 0x01, 0xce, 0x12, + 0xf5, 0xa6, 0xbd, 0x0a, 0xb6, 0xa2, 0x81, 0x3b, 0x73, 0x33, 0x61, 0x12, + 0x25, 0x3d, 
0x3f, 0x9e, 0xf1, 0xb5, 0xaa, 0xa5, 0xa6, 0x6e, 0x3e, 0xc2, + 0x9f, 0xa9, 0xc7, 0x4d, 0x41, 0x0e, 0x29, 0x0d, 0xc2, 0x72, 0xa9, 0x42, + 0xc9, 0x48, 0xf5, 0x96, 0x1c, 0xda, 0x3f, 0xb0, 0xdd, 0xff, 0x8c, 0x99, + 0x66, 0xf4, 0xff, 0x63, 0x38, 0xe8, 0xc0, 0xc3, 0xe0, 0xaa, 0xf5, 0x68, + 0xf2, 0x4a, 0x8c, 0xd3, 0xd4, 0x49, 0x7a, 0xc0, 0x74, 0xc2, 0xab, 0xa8, + 0x0c, 0x71, 0xd6, 0xce, 0xcb, 0x3b, 0x7e, 0x00, 0x0e, 0x23, 0x7f, 0x86, + 0xff, 0xdb, 0xa4, 0xe1, 0xaa, 0x55, 0x65, 0x40, 0x39, 0xc2, 0x5e, 0xb4, + 0xd7, 0xa1, 0x56, 0xd4, 0x50, 0x27, 0x6e, 0x66, 0x6c, 0x22, 0x44, 0xa7, + 0xa7, 0xf3, 0xde, 0x36, 0xb5, 0x54, 0xb4, 0xcd, 0xc7, 0xd8, 0x53, 0xf5, + 0x38, 0xe9, 0xa8, 0x21, 0xc5, 0x21, 0xb8, 0x4e, 0x55, 0x28, 0x59, 0x29, + 0x1e, 0xb2, 0xc3, 0x9b, 0x47, 0xf6, 0x1b, 0xbf, 0xf1, 0x93, 0x2c, 0xde, + 0x9f, 0xec, 0x67, 0x1d, 0x18, 0x18, 0x7c, 0x15, 0x5e, 0xad, 0x1e, 0x49, + 0x51, 0x9a, 0x7a, 0x89, 0x2f, 0x58, 0x0e, 0x98, 0x55, 0x75, 0x01, 0x8e, + 0x3a, 0xd9, 0xd9, 0x67, 0x6f, 0xc0, 0x01, 0xc4, 0x6f, 0xf0, 0xdf, 0xfb, + 0x74, 0x9c, 0x35, 0x4a, 0xac, 0xa8, 0x07, 0x38, 0x4b, 0xd6, 0x9a, 0xf4, + 0x2a, 0xda, 0x8a, 0x04, 0xed, 0xcc, 0xcd, 0x84, 0x48, 0x94, 0xf4, 0xfe, + 0x7b, 0xc6, 0xd6, 0xaa, 0x96, 0x99, 0xb8, 0xfb, 0x0a, 0x7e, 0xa7, 0x1d, + 0x35, 0x04, 0x38, 0xa4, 0x37, 0x09, 0xca, 0xa5, 0x0b, 0x25, 0x23, 0xd6, + 0x58, 0x73, 0x68, 0xfe, 0xc3, 0x77, 0xfe, 0x32, 0x65, 0x9b, 0xd3, 0xfd, + 0x8c, 0xe3, 0xa3, 0x03, 0x0f, 0x82, 0xab, 0xd5, 0xa3, 0xc9, 0x2a, 0x33, + 0x4f, 0x51, 0x25, 0xeb, 0x01, 0xd3, 0x0a, 0xae, 0xa0, 0x31, 0xc7, 0x5b, + 0x3b, 0x2c, 0xed, 0xf8, 0x00, 0x38, 0x8d, 0xfe, 0x1b, 0xff, 0x6e, 0x93, + 0x86, 0xa9, 0x55, 0x95, 0x00, 0xe7, 0x09, 0x7a, 0xd3, 0x5e, 0x85, 0x5b, + 0x51, 0x40, 0x9d, 0xb9, 0x99, 0xb0, 0x89, 0x12, 0x9e, 0x9f, 0xcf, 0x78, + 0xda, 0xd5, 0x52, 0xd3, 0x37, 0x1f, 0x61, 0x4f, 0xd4, 0xe3, 0xa6, 0xa0, + 0x87, 0x14, 0x86, 0xe1, 0x39, 0x54, 0xa1, 0x64, 0xa4, 0x7a, 0xcb, 0x0e, + 0x6d, 0x1f, 0xd8, 0x6e, 0xff, 0xc6, 0x4c, 0xb3, 0x7a, 0x7f, 0xb1, 0x9c, + 0x74, 0x60, 0x61, 0xf0, 0x55, 0x7a, 0xb4, 0x79, 0x25, 0x46, 0x69, 0xea, + 0x24, 0xbd, 0x60, 0x3a, 0x61, 0x55, 0xd4, 0x06, 0x38, 0xeb, 0x67, 0x65, + 0x9d, 0xbf, 0x00, 0x07, 0x11, 0xbf, 0xc3, 0x7f, 0xed, 0xd2, 0x70, 0xd5, + 0x2a, 0xb2, 0xa0, 0x1c, 0xe1, 0x2f, 0x5a, 0x6b, 0xd0, 0xab, 0x6a, 0x28, + 0x13, 0xb7, 0x33, 0x36, 0x11, 0x22, 0x53, 0xd3, 0xf9, 0xef, 0x1b, 0x5a, + 0xaa, 0x5a, 0x66, 0xe3, 0xec, 0x29, 0xfa, 0x9c, 0x74, 0xd4, 0x10, 0xe2, + 0x90, 0xdc, 0x27, 0x2a, 0x94, 0x2c, 0x94, 0x8f, 0x59, 0x61, 0xcd, 0xa3, + 0xfb, 0x0d, 0xdf, 0xf8, 0xc9, 0x96, 0x6f, 0x4f, 0xf6, 0x33, 0x8e, 0x8c, + 0x0c, 0x3e, 0x0a, 0xaf, 0x56, 0x8f, 0x24, 0xa8, 0xcd, 0x3d, 0x44, 0x97, + 0xac, 0x07, 0x4c, 0x2a, 0xba, 0x80, 0xc7, 0x1d, 0x6c, 0xec, 0xb3, 0xb7, + 0xe0, 0x00, 0xe2, 0x37, 0xf8, 0x6f, 0xfd, 0xba, 0x4e, 0x1a, 0xa5, 0x56, + 0x54, 0x03, 0x9c, 0x25, 0xeb, 0x4d, 0x7a, 0x15, 0x6d, 0x45, 0x02, 0x76, + 0xe6, 0x66, 0xc2, 0x24, 0x4a, 0x7a, 0x7f, 0x3d, 0xe3, 0x6b, 0x55, 0x4b, + 0x4c, 0xdc, 0x7d, 0x85, 0x3f, 0x53, 0x8e, 0x9a, 0x82, 0x1c, 0x52, 0x1b, + 0x84, 0xe5, 0x52, 0x85, 0x92, 0x91, 0xeb, 0x2c, 0x39, 0xb4, 0x7f, 0x61, + 0xbb, 0xff, 0x19, 0x32, 0xcd, 0xe9, 0xfe, 0xc6, 0x71, 0xd1, 0x81, 0x87, + 0xc1, 0x55, 0xea, 0xd1, 0xe4, 0x95, 0x19, 0xa7, 0xa8, 0x92, 0xf5, 0x80, + 0xe9, 0x85, 0x57, 0x50, 0x18, 0xe3, 0xad, 0x9d, 0x96, 0x76, 0xfc, 0x00, + 0x1c, 0x46, 0xff, 0x0d, 0xff, 0xb7, 0x49, 0xc3, 0x54, 0xaa, 0xca, 0x80, + 0x73, 0x84, 0xbd, 0x69, 0xaf, 0x42, 0xad, 0xa8, 0xa0, 0x4e, 0xdc, 0xcc, + 0xd8, 0x44, 0x89, 0x4f, 0x4f, 0xe7, 0xbc, 0x6d, 0x6a, 0xa9, 0x69, 0x9b, + 0x8f, 0xb0, 
0xa7, 0xea, 0x71, 0xd3, 0x50, 0x43, 0x8a, 0x43, 0x70, 0x9c, + 0xaa, 0x50, 0xb2, 0x52, 0x3d, 0x65, 0x87, 0x36, 0x8f, 0xec, 0x37, 0x7f, + 0xe3, 0x26, 0x59, 0xbd, 0x3f, 0xd8, 0xce, 0x3a, 0x30, 0x30, 0xf8, 0x2a, + 0xbd, 0x5a, 0x3c, 0x92, 0xa3, 0x34, 0xf5, 0x12, 0x5e, 0xb0, 0x1d, 0x30, + 0xaa, 0xea, 0x03, 0x1c, 0x75, 0xb3, 0xb2, 0xce, 0xdf, 0x80, 0x03, 0x88, + 0xdf, 0xe1, 0xbf, 0xf6, 0xe9, 0x38, 0x6a, 0x95, 0x59, 0x50, 0x0e, 0x70, + 0x97, 0xad, 0x35, 0xe8, 0x55, 0xb5, 0x14, 0x09, 0xdb, 0x99, 0x9b, 0x08, + 0x91, 0x29, 0xe9, 0xfc, 0xf7, 0x8d, 0xad, 0x55, 0x2d, 0x33, 0x71, 0xf6, + 0x14, 0xfd, 0x4e, 0x3a, 0x6a, 0x08, 0x71, 0x48, 0x6e, 0x13, 0x95, 0x4a, + 0x16, 0x4a, 0x47, 0xac, 0xb0, 0xe6, 0xd1, 0xfd, 0x86, 0xef, 0xfc, 0x64, + 0xcb, 0x37, 0xa7, 0xfb, 0x19, 0xc7, 0x46, 0x06, 0x1f, 0x05, 0x57, 0xab, + 0x47, 0x92, 0x54, 0x66, 0x9e, 0xa2, 0x4b, 0xd6, 0x03, 0xa6, 0x15, 0x5d, + 0x40, 0x63, 0x8e, 0xb6, 0x76, 0x59, 0xdb, 0xf0, 0x00, 0x71, 0x1b, 0xfc, + 0x37, 0xfe, 0xdd, 0x27, 0x0d, 0x52, 0xab, 0x2a, 0x01, 0xce, 0x12, 0xf5, + 0xa6, 0xbd, 0x0a, 0xb6, 0xa2, 0x81, 0x3b, 0x73, 0x33, 0x61, 0x12, 0x25, + 0x3d, 0x3f, 0x9e, 0xf1, 0xb5, 0xaa, 0xa5, 0xa6, 0x6e, 0x3e, 0xc2, 0x9f, + 0xa9, 0xc7, 0x4d, 0x41, 0x0e, 0x29, 0x0d, 0xc2, 0x72, 0xa9, 0x42, 0xc9, + 0x48, 0xf5, 0x96, 0x1c, 0xda, 0x3f, 0xb0, 0xdd, 0xff, 0x8c, 0x99, 0x66, + 0xf4, 0xff, 0x63, 0x38, 0xe8, 0xc0, 0xc3, 0xe0, 0xaa, 0xf5, 0x68, 0xf2, + 0x4a, 0x8c, 0xd3, 0xd4, 0x49, 0x7a, 0xc0, 0x74, 0xc2, 0xab, 0xa8, 0x0c, + 0x71, 0xd6, 0xce, 0xcb, 0x3b, 0x7e, 0x00, 0x0e, 0x23, 0x7f, 0x86, 0xff, + 0xdb, 0xa4, 0xe1, 0xaa, 0x55, 0x65, 0x40, 0x39, 0xc2, 0x5e, 0xb4, 0xd7, + 0xa1, 0x56, 0xd4, 0x50, 0x27, 0x6e, 0x66, 0x6c, 0x22, 0x44, 0xa7, 0xa7, + 0xf3, 0xde, 0x36, 0xb5, 0x54, 0xb4, 0xcd, 0xc7, 0xd8, 0x53, 0xf5, 0x38, + 0xe9, 0xa8, 0x21, 0xc5, 0x21, 0xb8, 0x4e, 0x55, 0x28, 0x59, 0x29, 0x1e, + 0xb2, 0xc3, 0x9b, 0x47, 0xf6, 0x1b, 0xbf, 0xf1, 0x93, 0x2c, 0xde, 0x9f, + 0xec, 0x67, 0x1d, 0x18, 0x18, 0x7c, 0x15, 0x5e, 0xad, 0x1e, 0x49, 0x51, + 0x9a, 0x7a, 0x89, 0x2f, 0x58, 0x0e, 0x98, 0x55, 0x75, 0x01, 0x8e, 0x3a, + 0xd9, 0xd9, 0x67, 0x6f, 0xc0, 0x01, 0xc4, 0x6f, 0xf0, 0xdf, 0xfb, 0x74, + 0x9c, 0x35, 0x4a, 0xac, 0xa8, 0x07, 0x38, 0x4b, 0xd6, 0x9a, 0xf4, 0x2a, + 0xda, 0x8a, 0x04, 0xed, 0xcc, 0xcd, 0x84, 0x48, 0x94, 0xf4, 0xfe, 0x7b, + 0xc6, 0xd6, 0xaa, 0x96, 0x99, 0xb8, 0xfb, 0x0a, 0x7e, 0xa7, 0x1d, 0x35, + 0x04, 0x38, 0xa4, 0x37, 0x09, 0xca, 0xa5, 0x0b, 0x25, 0x23, 0xd6, 0x58, + 0x73, 0x68, 0xfe, 0xc3, 0x77, 0xfe, 0x32, 0x65, 0x9b, 0xd3, 0xfd, 0x8c, + 0xe3, 0xa3, 0x03, 0x0f, 0x82, 0xab, 0xd5, 0xa3, 0xc9, 0x2a, 0x33, 0x4f, + 0x51, 0x25, 0xeb, 0x01, 0xd3, 0x0a, 0xae, 0xa0, 0x31, 0xc7, 0x5b, 0x3b, + 0x2c, 0xed, 0xf8, 0x00, 0x38, 0x8d, 0xfe, 0x1b, 0xff, 0x6e, 0x93, 0x86, + 0xa9, 0x55, 0x95, 0x00, 0xe7, 0x09, 0x7a, 0xd3, 0x5e, 0x85, 0x5b, 0x51, + 0x40, 0x9d, 0xb9, 0x99, 0xb0, 0x89, 0x12, 0x9e, 0x9f, 0xcf, 0x78, 0xda, + 0xd5, 0x52, 0xd3, 0x37, 0x1f, 0x61, 0x4f, 0xd4, 0xe3, 0xa6, 0xa0, 0x87, + 0x14, 0x86, 0xe1, 0x39, 0x54, 0xa1, 0x64, 0xa4, 0x7a, 0xcb, 0x0e, 0x6d, + 0x1f, 0xd8, 0x6e, 0xff, 0xc6, 0x4c, 0xb3, 0x7a, 0x7f, 0xb1, 0x9c, 0x74, + 0x60, 0x61, 0xf0, 0x55, 0x7a, 0xb4, 0x79, 0x25, 0x46, 0x69, 0xea, 0x24, + 0xbd, 0x60, 0x3a, 0x61, 0x55, 0xd4, 0x06, 0x38, 0xeb, 0x67, 0x65, 0x9d, + 0xbf, 0x00, 0x07, 0x11, 0xbf, 0xc3, 0x7f, 0xed, 0xd2, 0x70, 0xd5, 0x2a, + 0xb2, 0xa0, 0x1c, 0xe1, 0x2f, 0x5a, 0x6b, 0xd0, 0xab, 0x6a, 0x28, 0x13, + 0xb7, 0x33, 0x36, 0x11, 0x22, 0x53, 0xd3, 0xf9, 0xef, 0x1b, 0x5a, 0xaa, + 0x5a, 0x66, 0xe3, 0xec, 0x29, 0xfa, 0x9c, 0x74, 0xd4, 0x10, 0xe2, 0x90, + 0xdc, 0x27, 
0x2a, 0x94, 0x2c, 0x94, 0x8f, 0x59, 0x61, 0xcd, 0xa3, 0xfb, + 0x0d, 0xdf, 0xf8, 0xc9, 0x96, 0x6f, 0x4f, 0xf6, 0x33, 0x8e, 0x8c, 0x0c, + 0x3e, 0x0a, 0xaf, 0x56, 0x8f, 0x24, 0xa8, 0xcd, 0x3d, 0x44, 0x97, 0xac, + 0x07, 0x4c, 0x2a, 0xba, 0x80, 0xc7, 0x1d, 0x6c, 0xec, 0xb3, 0xb7, 0xe0, + 0x00, 0xe2, 0x37, 0xf8, 0x6f, 0xfd, 0xba, 0x4e, 0x1a, 0xa5, 0x56, 0x54, + 0x03, 0x9c, 0x25, 0xeb, 0x4d, 0x7a, 0x15, 0x6d, 0x45, 0x02, 0x76, 0xe6, + 0x66, 0xc2, 0x24, 0x4a, 0x7a, 0x7f, 0x3d, 0xe3, 0x6b, 0x55, 0x4b, 0x4c, + 0xdc, 0x7d, 0x85, 0x3f, 0x53, 0x8e, 0x9a, 0x82, 0x1c, 0x52, 0x1b, 0x84, + 0xe5, 0x52, 0x85, 0x92, 0x91, 0xeb, 0x2c, 0x39, 0xb4, 0x7f, 0x61, 0xbb, + 0xff, 0x19, 0x32, 0xcd, 0xe9, 0xfe, 0xc6, 0x71, 0xd1, 0x81, 0x87, 0xc1, + 0x55, 0xea, 0xd1, 0xe4, 0x95, 0x19, 0xa7, 0xa8, 0x92, 0xf5, 0x80, 0xe9, + 0x85, 0x57, 0x50, 0x18, 0xe3, 0xad, 0x9d, 0x96, 0x76, 0xfc, 0x00, 0x1c, + 0x46, 0xff, 0x0d, 0xff, 0xb7, 0x49, 0xc3, 0x54, 0xaa, 0xca, 0x80, 0x73, + 0x84, 0xbd, 0x69, 0xaf, 0x42, 0xad, 0xa8, 0xa0, 0x4e, 0xdc, 0xcc, 0xd8, + 0x44, 0x89, 0x4f, 0x4f, 0xe7, 0xbc, 0x6d, 0x6a, 0xa9, 0x69, 0x9b, 0x8f, + 0xb0, 0xa7, 0xea, 0x71, 0xd3, 0x50, 0x43, 0x8a, 0x43, 0x70, 0x9c, 0xaa, + 0x50, 0xb2, 0x52, 0x3d, 0x65, 0x87, 0x36, 0x8f, 0xec, 0x37, 0x7f, 0xe3, + 0x26, 0x59, 0xbd, 0x3f, 0xd8, 0xce, 0x3a, 0x30, 0x30, 0xf8, 0x2a, 0xbd, + 0x5a, 0x3c, 0x92, 0xa3, 0x34, 0xf5, 0x12, 0x5e, 0xb0, 0x1d, 0x30, 0xaa, + 0xea, 0x03, 0x1c, 0x75, 0xb3, 0xb2, 0xce, 0xdf, 0x80, 0x03, 0x88, 0xdf, + 0xe1, 0xbf, 0xf6, 0xe9, 0x38, 0x6a, 0x95, 0x59, 0x50, 0x0e, 0x70, 0x97, + 0xad, 0x35, 0xe8, 0x55, 0xb5, 0x14, 0x09, 0xdb, 0x99, 0x9b, 0x08, 0x91, + 0x29, 0xe9, 0xfc, 0xf7, 0x8d, 0xad, 0x55, 0x2d, 0x33, 0x71, 0xf6, 0x14, + 0xfd, 0x4e, 0x3a, 0x6a, 0x08, 0x71, 0x48, 0x6e, 0x13, 0x95, 0x4a, 0x16, + 0x4a, 0x47, 0xac, 0xb0, 0xe6, 0xd1, 0xfd, 0x86, 0xef, 0xfc, 0x64, 0xcb, + 0x37, 0xa7, 0xfb, 0x19, 0xc7, 0x46, 0x06, 0x1f, 0x05, 0x57, 0xab, 0x47, + 0x92, 0x54, 0x66, 0x9e, 0xa2, 0x4b, 0xd6, 0x03, 0xa6, 0x15, 0x5d, 0x40, + 0x63, 0x8e, 0xb6, 0x76, 0x59, 0xdb, 0xf0, 0x00, 0x71, 0x1b, 0xfc, 0x37, + 0xfe, 0xdd, 0x27, 0x0d, 0x52, 0xab, 0x2a, 0x01, 0xce, 0x12, 0xf5, 0xa6, + 0xbd, 0x0a, 0xb6, 0xa2, 0x81, 0x3b, 0x73, 0x33, 0x61, 0x12, 0x25, 0x3d, + 0x3f, 0x9e, 0xf1, 0xb5, 0xaa, 0xa5, 0xa6, 0x6e, 0x3e, 0xc2, 0x9f, 0xa9, + 0xc7, 0x4d, 0x41, 0x0e, 0x29, 0x0d, 0xc2, 0x72, 0xa9, 0x42, 0xc9, 0x48, + 0xf5, 0x96, 0x1c, 0xda, 0x3f, 0xb0, 0xdd, 0xff, 0x8c, 0x99, 0x66, 0xf4, + 0xff, 0x63, 0x38, 0xe8, 0xc0, 0xc3, 0xe0, 0xaa, 0xf5, 0x68, 0xf2, 0x4a, + 0x8c, 0xd3, 0xd4, 0x49, 0x7a, 0xc0, 0x74, 0xc2, 0xab, 0xa8, 0x0c, 0x71, + 0xd6, 0xce, 0xcb, 0x3b, 0x7e, 0x00, 0x0e, 0x23, 0x7f, 0x86, 0xff, 0xdb, + 0xa4, 0xe1, 0xaa, 0x55, 0x65, 0x40, 0x39, 0xc2, 0x5e, 0xb4, 0xd7, 0xa1, + 0x56, 0xd4, 0x50, 0x27, 0x6e, 0x66, 0x6c, 0x22, 0x44, 0xa7, 0xa7, 0xf3, + 0xde, 0x36, 0xb5, 0x54, 0xb4, 0xcd, 0xc7, 0xd8, 0x53, 0xf5, 0x38, 0xe9, + 0xa8, 0x21, 0xc5, 0x21, 0xb8, 0x4e, 0x55, 0x28, 0x59, 0x29, 0x1e, 0xb2, + 0xc3, 0x9b, 0x47, 0xf6, 0x1b, 0xbf, 0xf1, 0x93, 0x2c, 0xde, 0x9f, 0xec, + 0x67, 0x1d, 0x18, 0x18, 0x7c, 0x15, 0x5e, 0xad, 0x1e, 0x49, 0x51, 0x9a, + 0x7a, 0x89, 0x2f, 0x58, 0x0e, 0x98, 0x55, 0x75, 0x01, 0x8e, 0x3a, 0xd9, + 0xd9, 0x67, 0x6f, 0xc0, 0x01, 0xc4, 0x6f, 0xf0, 0xdf, 0xfb, 0x74, 0x9c, + 0x35, 0x4a, 0xac, 0xa8, 0x07, 0x38, 0x4b, 0xd6, 0x9a, 0xf4, 0x2a, 0xda, + 0x8a, 0x04, 0xed, 0xcc, 0xcd, 0x84, 0x48, 0x94, 0xf4, 0xfe, 0x7b, 0xc6, + 0xd6, 0xaa, 0x96, 0x99, 0xb8, 0xfb, 0x0a, 0x7e, 0xa7, 0x1d, 0x35, 0x04, + 0x38, 0xa4, 0x37, 0x09, 0xca, 0xa5, 0x0b, 0x25, 0x23, 0xd6, 0x58, 0x73, + 0x68, 0xfe, 
0xc3, 0x77, 0xfe, 0x32, 0x65, 0x9b, 0xd3, 0xfd, 0x8c, 0xe3, + 0xa3, 0x03, 0x0f, 0x82, 0xab, 0xd5, 0xa3, 0xc9, 0x2a, 0x33, 0x4f, 0x51, + 0x25, 0xeb, 0x01, 0xd3, 0x0a, 0xae, 0xa0, 0x31, 0xc7, 0x5b, 0x3b, 0x2c, + 0xed, 0xf8, 0x00, 0x38, 0x8d, 0xfe, 0x1b, 0xff, 0x6e, 0x93, 0x86, 0xa9, + 0x55, 0x95, 0x00, 0xe7, 0x09, 0x7a, 0xd3, 0x5e, 0x85, 0x5b, 0x51, 0x40, + 0x9d, 0xb9, 0x99, 0xb0, 0x89, 0x12, 0x9e, 0x9f, 0xcf, 0x78, 0xda, 0xd5, + 0x52, 0xd3, 0x37, 0x1f, 0x61, 0x4f, 0xd4, 0xe3, 0xa6, 0xa0, 0x87, 0x14, + 0x86, 0xe1, 0x39, 0x54, 0xa1, 0x64, 0xa4, 0x7a, 0xcb, 0x0e, 0x6d, 0x1f, + 0xd8, 0x6e, 0xff, 0xc6, 0x4c, 0xb3, 0x7a, 0x7f, 0xb1, 0x9c, 0x74, 0x60, + 0x61, 0xf0, 0x55, 0x7a, 0xb4, 0x79, 0x25, 0x46, 0x69, 0xea, 0x24, 0xbd, + 0x60, 0x3a, 0x61, 0x55, 0xd4, 0x06, 0x38, 0xeb, 0x67, 0x65, 0x9d, 0xbf, + 0x00, 0x07, 0x11, 0xbf, 0xc3, 0x7f, 0xed, 0xd2, 0x70, 0xd5, 0x2a, 0xb2, + 0xa0, 0x1c, 0xe1, 0x2f, 0x5a, 0x6b, 0xd0, 0xab, 0x6a, 0x28, 0x13, 0xb7, + 0x33, 0x36, 0x11, 0x22, 0x53, 0xd3, 0xf9, 0xef, 0x1b, 0x5a, 0xaa, 0x5a, + 0x66, 0xe3, 0xec, 0x29, 0xfa, 0x9c, 0x74, 0xd4, 0x10, 0xe2, 0x90, 0xdc, + 0x27, 0x2a, 0x94, 0x2c, 0x94, 0x8f, 0x59, 0x61, 0xcd, 0xa3, 0xfb, 0x0d, + 0xdf, 0xf8, 0xc9, 0x96, 0x6f, 0x4f, 0xf6, 0x33, 0x8e, 0x8c, 0x0c, 0x3e, + 0x0a, 0xaf, 0x56, 0x8f, 0x24, 0xa8, 0xcd, 0x3d, 0x44, 0x97, 0xac, 0x07, + 0x4c, 0x2a, 0xba, 0x80, 0xc7, 0x1d, 0x6c, 0xec, 0xb3, 0xb7, 0xe0, 0x00, + 0xe2, 0x37, 0xf8, 0x6f, 0xfd, 0xba, 0x4e, 0x1a, 0xa5, 0x56, 0x54, 0x03, + 0x9c, 0x25, 0xeb, 0x4d, 0x7a, 0x15, 0x6d, 0x45, 0x02, 0x76, 0xe6, 0x66, + 0xc2, 0x24, 0x4a, 0x7a, 0x7f, 0x3d, 0xe3, 0x6b, 0x55, 0x4b, 0x4c, 0xdc, + 0x7d, 0x85, 0x3f, 0x53, 0x8e, 0x9a, 0x82, 0x1c, 0x52, 0x1b, 0x84, 0xe5, + 0x52, 0x85, 0x92, 0x91, 0xeb, 0x2c, 0x39, 0xb4, 0x7f, 0x61, 0xbb, 0xff, + 0x19, 0x32, 0xcd, 0xe9, 0xfe, 0xc6, 0x71, 0xd1, 0x81, 0x87, 0xc1, 0x55, + 0xea, 0xd1, 0xe4, 0x95, 0x19, 0xa7, 0xa8, 0x92, 0xf5, 0x80, 0xe9, 0x85, + 0x57, 0x50, 0x18, 0xe3, 0xad, 0x9d, 0x96, 0x76, 0xfc, 0x00, 0x1c, 0x46, + 0xff, 0x0d, 0xff, 0xb7, 0x49, 0xc3, 0x54, 0xaa, 0xca, 0x80, 0x73, 0x84, + 0xbd, 0x69, 0xaf, 0x42, 0xad, 0xa8, 0xa0, 0x4e, 0xdc, 0xcc, 0xd8, 0x44, + 0x89, 0x4f, 0x4f, 0xe7, 0xbc, 0x6d, 0x6a, 0xa9, 0x69, 0x9b, 0x8f, 0xb0, + 0xa7, 0xea, 0x71, 0xd3, 0x50, 0x43, 0x8a, 0x43, 0x70, 0x9c, 0xaa, 0x50, + 0xb2, 0x52, 0x3d, 0x65, 0x87, 0x36, 0x8f, 0xec, 0x37, 0x7f, 0xe3, 0x26, + 0x59, 0xbd, 0x3f, 0xd8, 0xce, 0x3a, 0x30, 0x30, 0xf8, 0x2a, 0xbd, 0x5a, + 0x3c, 0x92, 0xa3, 0x34, 0xf5, 0x12, 0x5e, 0xb0, 0x1d, 0x30, 0xaa, 0xea, + 0x03, 0x1c, 0x75, 0xb3, 0xb2, 0xce, 0xdf, 0x80, 0x03, 0x88, 0xdf, 0xe1, + 0xbf, 0xf6, 0xe9, 0x38, 0x6a, 0x95, 0x59, 0x50, 0x0e, 0x70, 0x97, 0xad, + 0x35, 0xe8, 0x55, 0xb5, 0x14, 0x09, 0xdb, 0x99, 0x9b, 0x08, 0x91, 0x29, + 0xe9, 0xfc, 0xf7, 0x8d, 0xad, 0x55, 0x2d, 0x33, 0x71, 0xf6, 0x14, 0xfd, + 0x4e, 0x3a, 0x6a, 0x08, 0x71, 0x48, 0x6e, 0x13, 0x95, 0x4a, 0x16, 0x4a, + 0x47, 0xac, 0xb0, 0xe6, 0xd1, 0xfd, 0x86, 0xef, 0xfc, 0x64, 0xcb, 0x37, + 0xa7, 0xfb, 0x19, 0xc7, 0x46, 0x06, 0x1f, 0x05, 0x57, 0xab, 0x47, 0x92, + 0x54, 0x66, 0x9e, 0xa2, 0x4b, 0xd6, 0x03, 0xa6, 0x15, 0x5d, 0x40, 0x63, + 0x8e, 0xb6, 0x76, 0x59, 0xdb, 0xf0, 0x00, 0x71, 0x1b, 0xfc, 0x37, 0xfe, + 0xdd, 0x27, 0x0d, 0x52, 0xab, 0x2a, 0x01, 0xce, 0x12, 0xf5, 0xa6, 0xbd, + 0x0a, 0xb6, 0xa2, 0x81, 0x3b, 0x73, 0x33, 0x61, 0x12, 0x25, 0x3d, 0x3f, + 0x9e, 0xf1, 0xb5, 0xaa, 0xa5, 0xa6, 0x6e, 0x3e, 0xc2, 0x9f, 0xa9, 0xc7, + 0x4d, 0x41, 0x0e, 0x29, 0x0d, 0xc2, 0x72, 0xa9, 0x42, 0xc9, 0x48, 0xf5, + 0x96, 0x1c, 0xda, 0x3f, 0xb0, 0xdd, 0xff, 0x8c, 0x99, 0x66, 0xf4, 0xff, + 0x63, 0x38, 
0xe8, 0xc0, 0xc3, 0xe0, 0xaa, 0xf5, 0x68, 0xf2, 0x4a, 0x8c, + 0xd3, 0xd4, 0x49, 0x7a, 0xc0, 0x74, 0xc2, 0xab, 0xa8, 0x0c, 0x71, 0xd6, + 0xce, 0xcb, 0x3b, 0x7e, 0x00, 0x0e, 0x23, 0x7f, 0x86, 0xff, 0xdb, 0xa4, + 0xe1, 0xaa, 0x55, 0x65, 0x40, 0x39, 0xc2, 0x5e, 0xb4, 0xd7, 0xa1, 0x56, + 0xd4, 0x50, 0x27, 0x6e, 0x66, 0x6c, 0x22, 0x44, 0xa7, 0xa7, 0xf3, 0xde, + 0x36, 0xb5, 0x54, 0xb4, 0xcd, 0xc7, 0xd8, 0x53, 0xf5, 0x38, 0xe9, 0xa8, + 0x21, 0xc5, 0x21, 0xb8, 0x4e, 0x55, 0x28, 0x59, 0x29, 0x1e, 0xb2, 0xc3, + 0x9b, 0x47, 0xf6, 0x1b, 0xbf, 0xf1, 0x93, 0x2c, 0xde, 0x9f, 0xec, 0x67, + 0x1d, 0x18, 0x18, 0x7c, 0x15, 0x5e, 0xad, 0x1e, 0x49, 0x51, 0x9a, 0x7a, + 0x89, 0x2f, 0x58, 0x0e, 0x98, 0x55, 0x75, 0x01, 0x8e, 0x3a, 0xd9, 0xd9, + 0x67, 0x6f, 0xc0, 0x01, 0xc4, 0x6f, 0xf0, 0xdf, 0xfb, 0x74, 0x9c, 0x35, + 0x4a, 0xac, 0xa8, 0x07, 0x38, 0x4b, 0xd6, 0x9a, 0xf4, 0x2a, 0xda, 0x8a, + 0x04, 0xed, 0xcc, 0xcd, 0x84, 0x48, 0x94, 0xf4, 0xfe, 0x7b, 0xc6, 0xd6, + 0xaa, 0x96, 0x99, 0xb8, 0xfb, 0x0a, 0x7e, 0xa7, 0x1d, 0x35, 0x04, 0x38, + 0xa4, 0x37, 0x09, 0xca, 0xa5, 0x0b, 0x25, 0x23, 0xd6, 0x58, 0x73, 0x68, + 0xfe, 0xc3, 0x77, 0xfe, 0x32, 0x65, 0x9b, 0xd3, 0xfd, 0x8c, 0xe3, 0xa3, + 0x03, 0x0f, 0x82, 0xab, 0xd5, 0xa3, 0xc9, 0x2a, 0x33, 0x4f, 0x51, 0x25, + 0xeb, 0x01, 0xd3, 0x0a, 0xae, 0xa0, 0x31, 0xc7, 0x5b, 0x3b, 0x2c, 0xed, + 0xf8, 0x00, 0x38, 0x8d, 0xfe, 0x1b, 0xff, 0x6e, 0x93, 0x86, 0xa9, 0x55, + 0x95, 0x00, 0xe7, 0x09, 0x7a, 0xd3, 0x5e, 0x85, 0x5b, 0x51, 0x40, 0x9d, + 0xb9, 0x99, 0xb0, 0x89, 0x12, 0x9e, 0x9f, 0xcf, 0x78, 0xda, 0xd5, 0x52, + 0xd3, 0x37, 0x1f, 0x61, 0x4f, 0xd4, 0xe3, 0xa6, 0xa0, 0x87, 0x14, 0x86, + 0xe1, 0x39, 0x54, 0xa1, 0x64, 0xa4, 0x7a, 0xcb, 0x0e, 0x6d, 0x1f, 0xd8, + 0x6e, 0xff, 0xc6, 0x4c, 0xb3, 0x7a, 0x7f, 0xb1, 0x9c, 0x74, 0x60, 0x61, + 0xf0, 0x55, 0x7a, 0xb4, 0x79, 0x25, 0x46, 0x69, 0xea, 0x24, 0xbd, 0x60, + 0x3a, 0x61, 0x55, 0xd4, 0x06, 0x38, 0xeb, 0x67, 0x65, 0x9d, 0xbf, 0x00, + 0x07, 0x11, 0xbf, 0xc3, 0x7f, 0xed, 0xd2, 0x70, 0xd5, 0x2a, 0xb2, 0xa0, + 0x1c, 0xe1, 0x2f, 0x5a, 0x6b, 0xd0, 0xab, 0x6a, 0x28, 0x13, 0xb7, 0x33, + 0x36, 0x11, 0x22, 0x53, 0xd3, 0xf9, 0xef, 0x1b, 0x5a, 0xaa, 0x5a, 0x66, + 0xe3, 0xec, 0x29, 0xfa, 0x9c, 0x74, 0xd4, 0x10, 0xe2, 0x90, 0xdc, 0x27, + 0x2a, 0x94, 0x2c, 0x94, 0x8f, 0x59, 0x61, 0xcd, 0xa3, 0xfb, 0x0d, 0xdf, + 0xf8, 0xc9, 0x96, 0x6f, 0x4f, 0xf6, 0x33, 0x8e, 0x8c, 0x0c, 0x3e, 0x0a, + 0xaf, 0x56, 0x8f, 0x24, 0xa8, 0xcd, 0x3d, 0x44, 0x97, 0xac, 0x07, 0x4c, + 0x2a, 0xba, 0x80, 0xc7, 0x1d, 0x6c, 0xec, 0xb3, 0xb7, 0xe0, 0x00, 0xe2, + 0x37, 0xf8, 0x6f, 0xfd, 0xba, 0x4e, 0x1a, 0xa5, 0x56, 0x54, 0x03, 0x9c, + 0x25, 0xeb, 0x4d, 0x7a, 0x15, 0x6d, 0x45, 0x02, 0x76, 0xe6, 0x66, 0xc2, + 0x24, 0x4a, 0x7a, 0x7f, 0x3d, 0xe3, 0x6b, 0x55, 0x4b, 0x4c, 0xdc, 0x7d, + 0x85, 0x3f, 0x53, 0x8e, 0x9a, 0x82, 0x1c, 0x52, 0x1b, 0x84, 0xe5, 0x52, + 0x85, 0x92, 0x91, 0xeb, 0x2c, 0x39, 0xb4, 0x7f, 0x61, 0xbb, 0xff, 0x19, + 0x32, 0xcd, 0xe9, 0xfe, 0xc6, 0x71, 0xd1, 0x81, 0x87, 0xc1, 0x55, 0xea, + 0xd1, 0xe4, 0x95, 0x19, 0xa7, 0xa8, 0x92, 0xf5, 0x80, 0xe9, 0x85, 0x57, + 0x50, 0x18, 0xe3, 0xad, 0x9d, 0x96, 0x76, 0xfc, 0x00, 0x1c, 0x46, 0xff, + 0x0d, 0xff, 0xb7, 0x49, 0xc3, 0x54, 0xaa, 0xca, 0x80, 0x73, 0x84, 0xbd, + 0x69, 0xaf, 0x42, 0xad, 0xa8, 0xa0, 0x4e, 0xdc, 0xcc, 0xd8, 0x44, 0x89, + 0x4f, 0x4f, 0xe7, 0xbc, 0x6d, 0x6a, 0xa9, 0x69, 0x9b, 0x8f, 0xb0, 0xa7, + 0xea, 0x71, 0xd3, 0x50, 0x43, 0x8a, 0x43, 0x70, 0x9c, 0xaa, 0x50, 0xb2, + 0x52, 0x3d, 0x65, 0x87, 0x36, 0x8f, 0xec, 0x37, 0x7f, 0xe3, 0x26, 0x59, + 0xbd, 0x3f, 0xd8, 0xce, 0x3a, 0x30, 0x30, 0xf8, 0x2a, 0xbd, 0x5a, 0x3c, + 0x92, 0xa3, 
0x34, 0xf5, 0x12, 0x5e, 0xb0, 0x1d, 0x30, 0xaa, 0xea, 0x03, + 0x1c, 0x75, 0xb3, 0xb2, 0xce, 0xdf, 0x80, 0x03, 0x88, 0xdf, 0xe1, 0xbf, + 0xf6, 0xe9, 0x38, 0x6a, 0x95, 0x59, 0x50, 0x0e, 0x70, 0x97, 0xad, 0x35, + 0xe8, 0x55, 0xb5, 0x14, 0x09, 0xdb, 0x99, 0x9b, 0x08, 0x91, 0x29, 0xe9, + 0xfc, 0xf7, 0x8d, 0xad, 0x55, 0x2d, 0x33, 0x71, 0xf6, 0x14, 0xfd, 0x4e, + 0x3a, 0x6a, 0x08, 0x71, 0x48, 0x6e, 0x13, 0x95, 0x4a, 0x16, 0x4a, 0x47, + 0xac, 0xb0, 0xe6, 0xd1, 0xfd, 0x86, 0xef, 0xfc, 0x64, 0xcb, 0x37, 0xa7, + 0xfb, 0x19, 0xc7, 0x46, 0x06, 0x1f, 0x05, 0x57, 0xab, 0x47, 0x92, 0x54, + 0x66, 0x9e, 0xa2, 0x4b, 0xd6, 0x03, 0xa6, 0x15, 0x5d, 0x40, 0x63, 0x8e, + 0xb6, 0x76, 0x59, 0xdb, 0xf0, 0x00, 0x71, 0x1b, 0xfc, 0x37, 0xfe, 0xdd, + 0x27, 0x0d, 0x52, 0xab, 0x2a, 0x01, 0xce, 0x12, 0xf5, 0xa6, 0xbd, 0x0a, + 0xb6, 0xa2, 0x81, 0x3b, 0x73, 0x33, 0x61, 0x12, 0x25, 0x3d, 0x3f, 0x9e, + 0xf1, 0xb5, 0xaa, 0xa5, 0xa6, 0x6e, 0x3e, 0xc2, 0x9f, 0xa9, 0xc7, 0x4d, + 0x41, 0x0e, 0x29, 0x0d, 0xc2, 0x72, 0xa9, 0x42, 0xc9, 0x48, 0xf5, 0x96, + 0x1c, 0xda, 0x3f, 0xb0, 0xdd, 0xff, 0x8c, 0x99, 0x66, 0xf4, 0xff, 0x63, + 0x38, 0xe8, 0xc0, 0xc3, 0xe0, 0xaa, 0xf5, 0x68, 0xf2, 0x4a, 0x8c, 0xd3, + 0xd4, 0x49, 0x7a, 0xc0, 0x74, 0xc2, 0xab, 0xa8, 0x0c, 0x71, 0xd6, 0xce, + 0xcb, 0x3b, 0x7e, 0x00, 0x0e, 0x23, 0x7f, 0x86, 0xff, 0xdb, 0xa4, 0xe1, + 0xaa, 0x55, 0x65, 0x40, 0x39, 0xc2, 0x5e, 0xb4, 0xd7, 0xa1, 0x56, 0xd4, + 0x50, 0x27, 0x6e, 0x66, 0x6c, 0x22, 0x44, 0xa7, 0xa7, 0xf3, 0xde, 0x36, + 0xb5, 0x54, 0xb4, 0xcd, 0xc7, 0xd8, 0x53, 0xf5, 0x38, 0xe9, 0xa8, 0x21, + 0xc5, 0x21, 0xb8, 0x4e, 0x55, 0x28, 0x59, 0x29, 0x1e, 0xb2, 0xc3, 0x9b, + 0x47, 0xf6, 0x1b, 0xbf, 0xf1, 0x93, 0x2c, 0xde, 0x9f, 0xec, 0x67, 0x1d, + 0x18, 0x18, 0x7c, 0x15, 0x5e, 0xad, 0x1e, 0x49, 0x51, 0x9a, 0x7a, 0x89, + 0x2f, 0x58, 0x0e, 0x98, 0x55, 0x75, 0x01, 0x8e, 0x3a, 0xd9, 0xd9, 0x67, + 0x6f, 0xc0, 0x01, 0xc4, 0x6f, 0xf0, 0xdf, 0xfb, 0x74, 0x9c, 0x35, 0x4a, + 0xac, 0xa8, 0x07, 0x38, 0x4b, 0xd6, 0x9a, 0xf4, 0x2a, 0xda, 0x8a, 0x04, + 0xed, 0xcc, 0xcd, 0x84, 0x48, 0x94, 0xf4, 0xfe, 0x7b, 0xc6, 0xd6, 0xaa, + 0x96, 0x99, 0xb8, 0xfb, 0x0a, 0x7e, 0xa7, 0x1d, 0x35, 0x04, 0x38, 0xa4, + 0x37, 0x09, 0xca, 0xa5, 0x0b, 0x25, 0x23, 0xd6, 0x58, 0x73, 0x68, 0xfe, + 0xc3, 0x77, 0xfe, 0x32, 0x65, 0x9b, 0xd3, 0xfd, 0x8c, 0xe3, 0xa3, 0x03, + 0x0f, 0x82, 0xab, 0xd5, 0xa3, 0xc9, 0x2a, 0x33, 0x4f, 0x51, 0x25, 0xeb, + 0x01, 0xd3, 0x0a, 0xae, 0xa0, 0x31, 0xc7, 0x5b, 0x3b, 0x2c, 0xed, 0xf8, + 0x00, 0x38, 0x8d, 0xfe, 0x1b, 0xff, 0x6e, 0x93, 0x86, 0xa9, 0x55, 0x95, + 0x00, 0xe7, 0x09, 0x7a, 0xd3, 0x5e, 0x85, 0x5b, 0x51, 0x40, 0x9d, 0xb9, + 0x99, 0xb0, 0x89, 0x12, 0x9e, 0x9f, 0xcf, 0x78, 0xda, 0xd5, 0x52, 0xd3, + 0x37, 0x1f, 0x61, 0x4f, 0xd4, 0xe3, 0xa6, 0xa0, 0x87, 0x14, 0x86, 0xe1, + 0x39, 0x54, 0xa1, 0x64, 0xa4, 0x7a, 0xcb, 0x0e, 0x6d, 0x1f, 0xd8, 0x6e, + 0xff, 0xc6, 0x4c, 0xb3, 0x7a, 0x7f, 0xb1, 0x9c, 0x74, 0x60, 0x61, 0xf0, + 0x55, 0x7a, 0xb4, 0x79, 0x25, 0x46, 0x69, 0xea, 0x24, 0xbd, 0x60, 0x3a, + 0x61, 0x55, 0xd4, 0x06, 0x38, 0xeb, 0x67, 0x65, 0x9d, 0xbf, 0x00, 0x07, + 0x11, 0xbf, 0xc3, 0x7f, 0xed, 0xd2, 0x70, 0xd5, 0x2a, 0xb2, 0xa0, 0x1c, + 0xe1, 0x2f, 0x5a, 0x6b, 0xd0, 0xab, 0x6a, 0x28, 0x13, 0xb7, 0x33, 0x36, + 0x11, 0x22, 0x53, 0xd3, 0xf9, 0xef, 0x1b, 0x5a, 0xaa, 0x5a, 0x66, 0xe3, + 0xec, 0x29, 0xfa, 0x9c, 0x74, 0xd4, 0x10, 0xe2, 0x90, 0xdc, 0x27, 0x2a, + 0x94, 0x2c, 0x94, 0x8f, 0x59, 0x61, 0xcd, 0xa3, 0xfb, 0x0d, 0xdf, 0xf8, + 0xc9, 0x96, 0x6f, 0x4f, 0xf6, 0x33, 0x8e, 0x8c, 0x0c, 0x3e, 0x0a, 0xaf, + 0x56, 0x8f, 0x24, +}; +static_assert(sizeof(kBytesTestReadSymbol4) == 
kNumBytesTestReadSymbol4, ""); + +// The kBytesTestReadSymbol5[] array was encoded by using the following libaom +// code: +// +// aom_cdf_prob cdf[4][6] = { +// // pdf: 1/5, 1/5, 1/5, 1/5, 1/5 +// { 32768 - 6554, 32768 - 13107, 32768 - 19661, 32768 - 26214, 0, 0 }, +// // pdf: 3/10, 2/10, 2/10, 2/10, 1/10 +// { 32768 - 9830, 32768 - 16384, 32768 - 22938, 32768 - 29491, 0, 0 }, +// // pdf: 1/10, 2/10, 2/10, 2/10, 3/10 +// { 32768 - 3277, 32768 - 9830, 32768 - 16384, 32768 - 22938, 0, 0 }, +// // pdf: 1/10, 2/10, 4/10, 2/10, 1/10 +// { 32768 - 3277, 32768 - 9830, 32768 - 22938, 32768 - 29491, 0, 0 }, +// }; +// constexpr int kSymbols[10][4] = { { 0, 0, 4, 4 }, // +// { 0, 1, 3, 3 }, // +// { 1, 2, 2, 2 }, // +// { 1, 3, 1, 1 }, // +// { 2, 4, 0, 0 }, // +// { 2, 0, 4, 3 }, // +// { 3, 1, 3, 2 }, // +// { 3, 2, 2, 1 }, // +// { 4, 3, 1, 2 }, // +// { 4, 0, 4, 2 } }; +// const unsigned int kBufferSize = 65536; +// uint8_t bw_buffer[kBufferSize]; +// aom_writer bw; +// bw.allow_update_cdf = 1; +// aom_start_encode(&bw, bw_buffer); +// for (int i = 0; i < 320; ++i) { +// for (int j = 0; j < 10; ++j) { +// for (int k = 0; k < 4; ++k) { +// aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 5); +// } +// } +// } +// aom_stop_encode(&bw); +// printf("constexpr size_t kNumBytes = %u;\n", bw.pos); +// printf("constexpr uint8_t kBytes[] = {"); +// int count = 0; +// for (unsigned int i = 0; i < bw.pos; ++i) { +// if (count++ % 12 == 0) { +// printf("\n "); +// } else { +// printf(" "); +// } +// printf("0x%02x,", bw_buffer[i]); +// } +// printf("\n};\n"); + +constexpr size_t kNumBytesTestReadSymbol5 = 3612; +constexpr uint8_t kBytesTestReadSymbol5[] = { + 0x0f, 0x1c, 0x16, 0x78, 0x6f, 0x83, 0xfe, 0x29, 0x95, 0x9a, 0x42, 0xcc, + 0x70, 0x9a, 0x0d, 0x72, 0xe0, 0x7d, 0x63, 0x9e, 0x05, 0x3c, 0x88, 0x22, + 0x40, 0x57, 0x83, 0xa8, 0x69, 0x6f, 0xc3, 0xb2, 0x58, 0x6c, 0xa9, 0x41, + 0x3c, 0x2f, 0x3f, 0xa3, 0xe6, 0x4e, 0x5e, 0xaf, 0x42, 0x56, 0x9d, 0x3f, + 0x70, 0xeb, 0x00, 0x02, 0x86, 0x23, 0x5f, 0x8e, 0x1b, 0x35, 0x71, 0x7d, + 0x50, 0xbe, 0xb1, 0x1e, 0xe9, 0x2f, 0x08, 0x5a, 0x04, 0xc0, 0x7b, 0x98, + 0x20, 0xbd, 0xc5, 0x39, 0xf7, 0x93, 0x5c, 0x6c, 0x4a, 0x0f, 0x50, 0x24, + 0xe1, 0xf3, 0x2a, 0x8d, 0x53, 0x55, 0x9a, 0xd6, 0x3a, 0xd3, 0xd6, 0x9c, + 0x41, 0xa2, 0x2c, 0x05, 0x1c, 0x5a, 0x28, 0x8d, 0xc0, 0x4f, 0x8d, 0xc1, + 0x40, 0xaa, 0x19, 0xbf, 0xa7, 0x93, 0x48, 0xdf, 0x54, 0xcf, 0xb4, 0x47, + 0xc4, 0x39, 0x90, 0xbb, 0xff, 0xb4, 0x47, 0x65, 0x33, 0x34, 0x45, 0x23, + 0x5e, 0x79, 0xc5, 0xbd, 0x24, 0x30, 0x58, 0x8a, 0x19, 0x68, 0xbb, 0x08, + 0xaa, 0xff, 0xce, 0x68, 0x37, 0xb4, 0x62, 0x44, 0x31, 0xe8, 0x3e, 0x4d, + 0x05, 0x1d, 0xe2, 0x48, 0x56, 0xd5, 0x53, 0x19, 0xcc, 0xfd, 0x82, 0xa7, + 0x06, 0xc4, 0x66, 0x95, 0x6c, 0x43, 0x3d, 0x43, 0x86, 0xe3, 0x62, 0x51, + 0x26, 0x1c, 0x57, 0xed, 0x9a, 0x1a, 0x14, 0x4f, 0x41, 0x96, 0xc0, 0x72, + 0x38, 0x59, 0xff, 0x69, 0xae, 0x2b, 0x59, 0x65, 0x30, 0xfd, 0xa5, 0x6f, + 0x1b, 0xab, 0x01, 0x72, 0xb4, 0xcd, 0xba, 0x44, 0x73, 0x12, 0x31, 0xee, + 0x83, 0x08, 0x5c, 0x35, 0x41, 0x17, 0xf1, 0x80, 0x55, 0xdd, 0x67, 0xb2, + 0xd3, 0xe1, 0x04, 0x51, 0x69, 0x9b, 0x4b, 0x98, 0xcf, 0x17, 0x0a, 0xd4, + 0xdc, 0x61, 0xf2, 0xb9, 0x4b, 0x23, 0xb6, 0xe8, 0x0c, 0x0d, 0xda, 0x68, + 0xac, 0xd9, 0xf4, 0x11, 0x63, 0x4a, 0x7f, 0x17, 0x69, 0xdb, 0x91, 0x1b, + 0x1d, 0xfb, 0x74, 0x58, 0x69, 0xcc, 0xf5, 0xce, 0x0d, 0x1e, 0xdd, 0x6d, + 0x2e, 0x87, 0xf2, 0x36, 0x39, 0x22, 0x59, 0x78, 0x01, 0x2c, 0xf0, 0xe6, + 0x8c, 0xd1, 0xdb, 0xa4, 0xf4, 0xc4, 0x09, 0x0e, 0xfe, 0x93, 0x88, 0x90, + 0x3e, 0x55, 0x60, 0x51, 0x6a, 0xe9, 
0x26, 0x41, 0x1f, 0x18, 0xab, 0xc1, + 0xa4, 0x66, 0x57, 0xdd, 0xe6, 0x88, 0xbd, 0x74, 0xa0, 0xd3, 0x65, 0x0d, + 0x04, 0xe3, 0x97, 0x1e, 0x9b, 0x59, 0xfc, 0xe2, 0x45, 0x9b, 0x90, 0xe1, + 0x80, 0x20, 0x85, 0x03, 0x06, 0x1f, 0x46, 0xb1, 0x69, 0xb4, 0xf3, 0x06, + 0xa8, 0xb5, 0x78, 0x2c, 0x21, 0xd1, 0x67, 0x8d, 0x91, 0xef, 0x6f, 0xec, + 0xed, 0x2c, 0xd7, 0x40, 0x32, 0x09, 0xed, 0x4e, 0x92, 0xbb, 0x28, 0x67, + 0xac, 0x09, 0x50, 0x7f, 0x30, 0xed, 0xde, 0x56, 0xeb, 0xc9, 0x23, 0x2f, + 0x13, 0x07, 0xef, 0x80, 0x9e, 0x83, 0x6a, 0x24, 0xd4, 0xd1, 0x84, 0xbe, + 0xf8, 0x1f, 0xb0, 0xaa, 0x6a, 0xf0, 0xda, 0x02, 0x0c, 0x94, 0xc9, 0xbc, + 0x0f, 0xe8, 0x76, 0x95, 0x79, 0x0e, 0x24, 0x1e, 0x4c, 0xdb, 0xe5, 0xd5, + 0x20, 0xee, 0x13, 0xff, 0xba, 0x1f, 0x7f, 0x67, 0x89, 0x4b, 0x6b, 0x28, + 0x33, 0x61, 0xfb, 0x53, 0xed, 0xf7, 0x13, 0x3f, 0x64, 0xc9, 0x26, 0x19, + 0xde, 0xe6, 0xec, 0x74, 0xe0, 0x0e, 0x7b, 0x07, 0xeb, 0xd9, 0xac, 0x7e, + 0x1d, 0xac, 0xba, 0xa0, 0x50, 0xc4, 0x12, 0xee, 0x58, 0xe5, 0xe9, 0x7c, + 0xa3, 0x40, 0xbd, 0x92, 0x6d, 0xa8, 0x08, 0x3c, 0x9e, 0xdb, 0xd3, 0x08, + 0x3d, 0xb3, 0x1c, 0x25, 0x09, 0x51, 0x55, 0xbb, 0x51, 0xc8, 0xe6, 0xd6, + 0x30, 0x86, 0x25, 0xa9, 0x01, 0xed, 0x55, 0x11, 0xa4, 0x5e, 0x3f, 0x57, + 0xb7, 0x9b, 0x64, 0xec, 0x3d, 0x93, 0x28, 0x34, 0xea, 0xe9, 0x53, 0xec, + 0x71, 0x7c, 0x1c, 0xee, 0x03, 0x26, 0x1a, 0x15, 0x9f, 0x6c, 0x74, 0xa5, + 0xe1, 0x04, 0x76, 0xcb, 0x0b, 0xf9, 0x96, 0x4f, 0x4e, 0xb6, 0x7e, 0xad, + 0xc5, 0x4b, 0x37, 0x44, 0x91, 0xfd, 0x1d, 0x69, 0x11, 0x17, 0x82, 0xc4, + 0x17, 0x39, 0x29, 0x99, 0x8f, 0xe1, 0x35, 0x4d, 0x9e, 0x4f, 0xc9, 0x98, + 0x71, 0x6b, 0xa9, 0x0d, 0x0a, 0xf8, 0xb6, 0x3a, 0x52, 0xf0, 0x82, 0x3b, + 0x65, 0x79, 0x60, 0x16, 0xa5, 0xa4, 0xf8, 0x0e, 0xc2, 0x3e, 0xf3, 0x23, + 0x82, 0x4d, 0x1f, 0x9d, 0x7b, 0xe1, 0xb8, 0xd3, 0x79, 0xc4, 0x04, 0x1d, + 0xfc, 0xbc, 0xdb, 0x37, 0x73, 0x27, 0xe3, 0x8d, 0x65, 0xcb, 0x72, 0xd2, + 0xaf, 0xe4, 0x7a, 0x9b, 0xc5, 0xd7, 0x13, 0x0d, 0x80, 0xf6, 0xaa, 0x90, + 0xd2, 0x30, 0x87, 0x1b, 0xdb, 0xcd, 0xb9, 0xea, 0x28, 0xfa, 0x10, 0xd5, + 0xf0, 0xf3, 0xb3, 0x26, 0xb5, 0x2b, 0x4b, 0x6d, 0x42, 0x15, 0xc2, 0xf4, + 0xc5, 0x27, 0xb8, 0xc1, 0xa3, 0x95, 0xe5, 0xb9, 0x69, 0x57, 0xf2, 0x3d, + 0x4d, 0xe2, 0xeb, 0x80, 0xb0, 0xd2, 0xcc, 0x90, 0x8e, 0xc9, 0x18, 0x43, + 0x8d, 0xed, 0xe4, 0x94, 0xe6, 0xe9, 0x55, 0x08, 0x6a, 0xf8, 0x79, 0xd9, + 0x93, 0x5a, 0x95, 0xa5, 0xb6, 0xa1, 0x0a, 0xe1, 0x7a, 0x62, 0x93, 0xdc, + 0x60, 0xd1, 0xca, 0xf2, 0xdc, 0xb4, 0xab, 0xf9, 0x1e, 0xa6, 0xf1, 0x75, + 0xc0, 0x58, 0x69, 0x66, 0x48, 0x47, 0x64, 0x8c, 0x21, 0xc6, 0xf6, 0xf2, + 0x4a, 0x73, 0x74, 0xaa, 0x84, 0x35, 0x7c, 0x3c, 0xec, 0xc9, 0xad, 0x4a, + 0xd2, 0xdb, 0x50, 0x85, 0x70, 0xbd, 0x31, 0x49, 0xee, 0x30, 0x68, 0xe5, + 0x79, 0x6e, 0x5a, 0x55, 0xfc, 0x8f, 0x53, 0x78, 0xba, 0xe0, 0x2c, 0x34, + 0xb3, 0x24, 0x23, 0xb2, 0x46, 0x10, 0xe3, 0x7b, 0x79, 0x25, 0x39, 0xba, + 0x55, 0x42, 0x1a, 0xbe, 0x1e, 0x76, 0x64, 0xd6, 0xa5, 0x69, 0x6d, 0xa8, + 0x42, 0xb8, 0x5e, 0x98, 0xa4, 0xf7, 0x18, 0x34, 0x72, 0xbc, 0xb7, 0x2d, + 0x2a, 0xfe, 0x47, 0xa9, 0xbc, 0x5d, 0x70, 0x16, 0x1a, 0x59, 0x92, 0x11, + 0xd9, 0x23, 0x08, 0x71, 0xbd, 0xbc, 0x92, 0x9c, 0xdd, 0x2a, 0xa1, 0x0d, + 0x5f, 0x0f, 0x3b, 0x32, 0x6b, 0x52, 0xb4, 0xb6, 0xd4, 0x21, 0x5c, 0x2f, + 0x4c, 0x52, 0x7b, 0x8c, 0x1a, 0x39, 0x5e, 0x5b, 0x96, 0x95, 0x7f, 0x23, + 0xd4, 0xde, 0x2e, 0xb8, 0x0b, 0x0d, 0x2c, 0xc9, 0x08, 0xec, 0x91, 0x84, + 0x38, 0xde, 0xde, 0x49, 0x4e, 0x6e, 0x95, 0x50, 0x86, 0xaf, 0x87, 0x9d, + 0x99, 0x35, 0xa9, 0x5a, 0x5b, 0x6a, 0x10, 0xae, 0x17, 0xa6, 0x29, 0x3d, + 0xc6, 0x0d, 0x1c, 0xaf, 0x2d, 0xcb, 
0x4a, 0xbf, 0x91, 0xea, 0x6f, 0x17, + 0x5c, 0x05, 0x86, 0x96, 0x64, 0x84, 0x76, 0x48, 0xc2, 0x1c, 0x6f, 0x6f, + 0x24, 0xa7, 0x37, 0x4a, 0xa8, 0x43, 0x57, 0xc3, 0xce, 0xcc, 0x9a, 0xd4, + 0xad, 0x2d, 0xb5, 0x08, 0x57, 0x0b, 0xd3, 0x14, 0x9e, 0xe3, 0x06, 0x8e, + 0x57, 0x96, 0xe5, 0xa5, 0x5f, 0xc8, 0xf5, 0x37, 0x8b, 0xae, 0x02, 0xc3, + 0x4b, 0x32, 0x42, 0x3b, 0x24, 0x61, 0x0e, 0x37, 0xb7, 0x92, 0x53, 0x9b, + 0xa5, 0x54, 0x21, 0xab, 0xe1, 0xe7, 0x66, 0x4d, 0x6a, 0x56, 0x96, 0xda, + 0x84, 0x2b, 0x85, 0xe9, 0x8a, 0x4f, 0x71, 0x83, 0x47, 0x2b, 0xcb, 0x72, + 0xd2, 0xaf, 0xe4, 0x7a, 0x9b, 0xc5, 0xd7, 0x01, 0x61, 0xa5, 0x99, 0x21, + 0x1d, 0x92, 0x30, 0x87, 0x1b, 0xdb, 0xc9, 0x29, 0xcd, 0xd2, 0xaa, 0x10, + 0xd5, 0xf0, 0xf3, 0xb3, 0x26, 0xb5, 0x2b, 0x4b, 0x6d, 0x42, 0x15, 0xc2, + 0xf4, 0xc5, 0x27, 0xb8, 0xc1, 0xa3, 0x95, 0xe5, 0xb9, 0x69, 0x57, 0xf2, + 0x3d, 0x4d, 0xe2, 0xeb, 0x80, 0xb0, 0xd2, 0xcc, 0x90, 0x8e, 0xc9, 0x18, + 0x43, 0x8d, 0xed, 0xe4, 0x94, 0xe6, 0xe9, 0x55, 0x08, 0x6a, 0xf8, 0x79, + 0xd9, 0x93, 0x5a, 0x95, 0xa5, 0xb6, 0xa1, 0x0a, 0xe1, 0x7a, 0x62, 0x93, + 0xdc, 0x60, 0xd1, 0xca, 0xf2, 0xdc, 0xb4, 0xab, 0xf9, 0x1e, 0xa6, 0xf1, + 0x75, 0xc0, 0x58, 0x69, 0x66, 0x48, 0x47, 0x64, 0x8c, 0x21, 0xc6, 0xf6, + 0xf2, 0x4a, 0x73, 0x74, 0xaa, 0x84, 0x35, 0x7c, 0x3c, 0xec, 0xc9, 0xad, + 0x4a, 0xd2, 0xdb, 0x50, 0x85, 0x70, 0xbd, 0x31, 0x49, 0xee, 0x30, 0x68, + 0xe5, 0x79, 0x6e, 0x5a, 0x55, 0xfc, 0x8f, 0x53, 0x78, 0xba, 0xe0, 0x2c, + 0x34, 0xb3, 0x24, 0x23, 0xb2, 0x46, 0x10, 0xe3, 0x7b, 0x79, 0x25, 0x39, + 0xba, 0x55, 0x42, 0x1a, 0xbe, 0x1e, 0x76, 0x64, 0xd6, 0xa5, 0x69, 0x6d, + 0xa8, 0x42, 0xb8, 0x5e, 0x98, 0xa4, 0xf7, 0x18, 0x34, 0x72, 0xbc, 0xb7, + 0x2d, 0x2a, 0xfe, 0x47, 0xa9, 0xbc, 0x5d, 0x70, 0x16, 0x1a, 0x59, 0x92, + 0x11, 0xd9, 0x23, 0x08, 0x71, 0xbd, 0xbc, 0x92, 0x9c, 0xdd, 0x2a, 0xa1, + 0x0d, 0x5f, 0x0f, 0x3b, 0x32, 0x6b, 0x52, 0xb4, 0xb6, 0xd4, 0x21, 0x5c, + 0x2f, 0x4c, 0x52, 0x7b, 0x8c, 0x1a, 0x39, 0x5e, 0x5b, 0x96, 0x95, 0x7f, + 0x23, 0xd4, 0xde, 0x2e, 0xb8, 0x0b, 0x0d, 0x2c, 0xc9, 0x08, 0xec, 0x91, + 0x84, 0x38, 0xde, 0xde, 0x49, 0x4e, 0x6e, 0x95, 0x50, 0x86, 0xaf, 0x87, + 0x9d, 0x99, 0x35, 0xa9, 0x5a, 0x5b, 0x6a, 0x10, 0xae, 0x17, 0xa6, 0x29, + 0x3d, 0xc6, 0x0d, 0x1c, 0xaf, 0x2d, 0xcb, 0x4a, 0xbf, 0x91, 0xea, 0x6f, + 0x17, 0x5c, 0x05, 0x86, 0x96, 0x64, 0x84, 0x76, 0x48, 0xc2, 0x1c, 0x6f, + 0x6f, 0x24, 0xa7, 0x37, 0x4a, 0xa8, 0x43, 0x57, 0xc3, 0xce, 0xcc, 0x9a, + 0xd4, 0xad, 0x2d, 0xb5, 0x08, 0x57, 0x0b, 0xd3, 0x14, 0x9e, 0xe3, 0x06, + 0x8e, 0x57, 0x96, 0xe5, 0xa5, 0x5f, 0xc8, 0xf5, 0x37, 0x8b, 0xae, 0x02, + 0xc3, 0x4b, 0x32, 0x42, 0x3b, 0x24, 0x61, 0x0e, 0x37, 0xb7, 0x92, 0x53, + 0x9b, 0xa5, 0x54, 0x21, 0xab, 0xe1, 0xe7, 0x66, 0x4d, 0x6a, 0x56, 0x96, + 0xda, 0x84, 0x2b, 0x85, 0xe9, 0x8a, 0x4f, 0x71, 0x83, 0x47, 0x2b, 0xcb, + 0x72, 0xd2, 0xaf, 0xe4, 0x7a, 0x9b, 0xc5, 0xd7, 0x01, 0x61, 0xa5, 0x99, + 0x21, 0x1d, 0x92, 0x30, 0x87, 0x1b, 0xdb, 0xc9, 0x29, 0xcd, 0xd2, 0xaa, + 0x10, 0xd5, 0xf0, 0xf3, 0xb3, 0x26, 0xb5, 0x2b, 0x4b, 0x6d, 0x42, 0x15, + 0xc2, 0xf4, 0xc5, 0x27, 0xb8, 0xc1, 0xa3, 0x95, 0xe5, 0xb9, 0x69, 0x57, + 0xf2, 0x3d, 0x4d, 0xe2, 0xeb, 0x80, 0xb0, 0xd2, 0xcc, 0x90, 0x8e, 0xc9, + 0x18, 0x43, 0x8d, 0xed, 0xe4, 0x94, 0xe6, 0xe9, 0x55, 0x08, 0x6a, 0xf8, + 0x79, 0xd9, 0x93, 0x5a, 0x95, 0xa5, 0xb6, 0xa1, 0x0a, 0xe1, 0x7a, 0x62, + 0x93, 0xdc, 0x60, 0xd1, 0xca, 0xf2, 0xdc, 0xb4, 0xab, 0xf9, 0x1e, 0xa6, + 0xf1, 0x75, 0xc0, 0x58, 0x69, 0x66, 0x48, 0x47, 0x64, 0x8c, 0x21, 0xc6, + 0xf6, 0xf2, 0x4a, 0x73, 0x74, 0xaa, 0x84, 0x35, 0x7c, 0x3c, 0xec, 0xc9, + 0xad, 0x4a, 0xd2, 0xdb, 0x50, 0x85, 
0x70, 0xbd, 0x31, 0x49, 0xee, 0x30, + 0x68, 0xe5, 0x79, 0x6e, 0x5a, 0x55, 0xfc, 0x8f, 0x53, 0x78, 0xba, 0xe0, + 0x2c, 0x34, 0xb3, 0x24, 0x23, 0xb2, 0x46, 0x10, 0xe3, 0x7b, 0x79, 0x25, + 0x39, 0xba, 0x55, 0x42, 0x1a, 0xbe, 0x1e, 0x76, 0x64, 0xd6, 0xa5, 0x69, + 0x6d, 0xa8, 0x42, 0xb8, 0x5e, 0x98, 0xa4, 0xf7, 0x18, 0x34, 0x72, 0xbc, + 0xb7, 0x2d, 0x2a, 0xfe, 0x47, 0xa9, 0xbc, 0x5d, 0x70, 0x16, 0x1a, 0x59, + 0x92, 0x11, 0xd9, 0x23, 0x08, 0x71, 0xbd, 0xbc, 0x92, 0x9c, 0xdd, 0x2a, + 0xa1, 0x0d, 0x5f, 0x0f, 0x3b, 0x32, 0x6b, 0x52, 0xb4, 0xb6, 0xd4, 0x21, + 0x5c, 0x2f, 0x4c, 0x52, 0x7b, 0x8c, 0x1a, 0x39, 0x5e, 0x5b, 0x96, 0x95, + 0x7f, 0x23, 0xd4, 0xde, 0x2e, 0xb8, 0x0b, 0x0d, 0x2c, 0xc9, 0x08, 0xec, + 0x91, 0x84, 0x38, 0xde, 0xde, 0x49, 0x4e, 0x6e, 0x95, 0x50, 0x86, 0xaf, + 0x87, 0x9d, 0x99, 0x35, 0xa9, 0x5a, 0x5b, 0x6a, 0x10, 0xae, 0x17, 0xa6, + 0x29, 0x3d, 0xc6, 0x0d, 0x1c, 0xaf, 0x2d, 0xcb, 0x4a, 0xbf, 0x91, 0xea, + 0x6f, 0x17, 0x5c, 0x05, 0x86, 0x96, 0x64, 0x84, 0x76, 0x48, 0xc2, 0x1c, + 0x6f, 0x6f, 0x24, 0xa7, 0x37, 0x4a, 0xa8, 0x43, 0x57, 0xc3, 0xce, 0xcc, + 0x9a, 0xd4, 0xad, 0x2d, 0xb5, 0x08, 0x57, 0x0b, 0xd3, 0x14, 0x9e, 0xe3, + 0x06, 0x8e, 0x57, 0x96, 0xe5, 0xa5, 0x5f, 0xc8, 0xf5, 0x37, 0x8b, 0xae, + 0x02, 0xc3, 0x4b, 0x32, 0x42, 0x3b, 0x24, 0x61, 0x0e, 0x37, 0xb7, 0x92, + 0x53, 0x9b, 0xa5, 0x54, 0x21, 0xab, 0xe1, 0xe7, 0x66, 0x4d, 0x6a, 0x56, + 0x96, 0xda, 0x84, 0x2b, 0x85, 0xe9, 0x8a, 0x4f, 0x71, 0x83, 0x47, 0x2b, + 0xcb, 0x72, 0xd2, 0xaf, 0xe4, 0x7a, 0x9b, 0xc5, 0xd7, 0x01, 0x61, 0xa5, + 0x99, 0x21, 0x1d, 0x92, 0x30, 0x87, 0x1b, 0xdb, 0xc9, 0x29, 0xcd, 0xd2, + 0xaa, 0x10, 0xd5, 0xf0, 0xf3, 0xb3, 0x26, 0xb5, 0x2b, 0x4b, 0x6d, 0x42, + 0x15, 0xc2, 0xf4, 0xc5, 0x27, 0xb8, 0xc1, 0xa3, 0x95, 0xe5, 0xb9, 0x69, + 0x57, 0xf2, 0x3d, 0x4d, 0xe2, 0xeb, 0x80, 0xb0, 0xd2, 0xcc, 0x90, 0x8e, + 0xc9, 0x18, 0x43, 0x8d, 0xed, 0xe4, 0x94, 0xe6, 0xe9, 0x55, 0x08, 0x6a, + 0xf8, 0x79, 0xd9, 0x93, 0x5a, 0x95, 0xa5, 0xb6, 0xa1, 0x0a, 0xe1, 0x7a, + 0x62, 0x93, 0xdc, 0x60, 0xd1, 0xca, 0xf2, 0xdc, 0xb4, 0xab, 0xf9, 0x1e, + 0xa6, 0xf1, 0x75, 0xc0, 0x58, 0x69, 0x66, 0x48, 0x47, 0x64, 0x8c, 0x21, + 0xc6, 0xf6, 0xf2, 0x4a, 0x73, 0x74, 0xaa, 0x84, 0x35, 0x7c, 0x3c, 0xec, + 0xc9, 0xad, 0x4a, 0xd2, 0xdb, 0x50, 0x85, 0x70, 0xbd, 0x31, 0x49, 0xee, + 0x30, 0x68, 0xe5, 0x79, 0x6e, 0x5a, 0x55, 0xfc, 0x8f, 0x53, 0x78, 0xba, + 0xe0, 0x2c, 0x34, 0xb3, 0x24, 0x23, 0xb2, 0x46, 0x10, 0xe3, 0x7b, 0x79, + 0x25, 0x39, 0xba, 0x55, 0x42, 0x1a, 0xbe, 0x1e, 0x76, 0x64, 0xd6, 0xa5, + 0x69, 0x6d, 0xa8, 0x42, 0xb8, 0x5e, 0x98, 0xa4, 0xf7, 0x18, 0x34, 0x72, + 0xbc, 0xb7, 0x2d, 0x2a, 0xfe, 0x47, 0xa9, 0xbc, 0x5d, 0x70, 0x16, 0x1a, + 0x59, 0x92, 0x11, 0xd9, 0x23, 0x08, 0x71, 0xbd, 0xbc, 0x92, 0x9c, 0xdd, + 0x2a, 0xa1, 0x0d, 0x5f, 0x0f, 0x3b, 0x32, 0x6b, 0x52, 0xb4, 0xb6, 0xd4, + 0x21, 0x5c, 0x2f, 0x4c, 0x52, 0x7b, 0x8c, 0x1a, 0x39, 0x5e, 0x5b, 0x96, + 0x95, 0x7f, 0x23, 0xd4, 0xde, 0x2e, 0xb8, 0x0b, 0x0d, 0x2c, 0xc9, 0x08, + 0xec, 0x91, 0x84, 0x38, 0xde, 0xde, 0x49, 0x4e, 0x6e, 0x95, 0x50, 0x86, + 0xaf, 0x87, 0x9d, 0x99, 0x35, 0xa9, 0x5a, 0x5b, 0x6a, 0x10, 0xae, 0x17, + 0xa6, 0x29, 0x3d, 0xc6, 0x0d, 0x1c, 0xaf, 0x2d, 0xcb, 0x4a, 0xbf, 0x91, + 0xea, 0x6f, 0x17, 0x5c, 0x05, 0x86, 0x96, 0x64, 0x84, 0x76, 0x48, 0xc2, + 0x1c, 0x6f, 0x6f, 0x24, 0xa7, 0x37, 0x4a, 0xa8, 0x43, 0x57, 0xc3, 0xce, + 0xcc, 0x9a, 0xd4, 0xad, 0x2d, 0xb5, 0x08, 0x57, 0x0b, 0xd3, 0x14, 0x9e, + 0xe3, 0x06, 0x8e, 0x57, 0x96, 0xe5, 0xa5, 0x5f, 0xc8, 0xf5, 0x37, 0x8b, + 0xae, 0x02, 0xc3, 0x4b, 0x32, 0x42, 0x3b, 0x24, 0x61, 0x0e, 0x37, 0xb7, + 0x92, 0x53, 0x9b, 0xa5, 0x54, 0x21, 
0xab, 0xe1, 0xe7, 0x66, 0x4d, 0x6a, + 0x56, 0x96, 0xda, 0x84, 0x2b, 0x85, 0xe9, 0x8a, 0x4f, 0x71, 0x83, 0x47, + 0x2b, 0xcb, 0x72, 0xd2, 0xaf, 0xe4, 0x7a, 0x9b, 0xc5, 0xd7, 0x01, 0x61, + 0xa5, 0x99, 0x21, 0x1d, 0x92, 0x30, 0x87, 0x1b, 0xdb, 0xc9, 0x29, 0xcd, + 0xd2, 0xaa, 0x10, 0xd5, 0xf0, 0xf3, 0xb3, 0x26, 0xb5, 0x2b, 0x4b, 0x6d, + 0x42, 0x15, 0xc2, 0xf4, 0xc5, 0x27, 0xb8, 0xc1, 0xa3, 0x95, 0xe5, 0xb9, + 0x69, 0x57, 0xf2, 0x3d, 0x4d, 0xe2, 0xeb, 0x80, 0xb0, 0xd2, 0xcc, 0x90, + 0x8e, 0xc9, 0x18, 0x43, 0x8d, 0xed, 0xe4, 0x94, 0xe6, 0xe9, 0x55, 0x08, + 0x6a, 0xf8, 0x79, 0xd9, 0x93, 0x5a, 0x95, 0xa5, 0xb6, 0xa1, 0x0a, 0xe1, + 0x7a, 0x62, 0x93, 0xdc, 0x60, 0xd1, 0xca, 0xf2, 0xdc, 0xb4, 0xab, 0xf9, + 0x1e, 0xa6, 0xf1, 0x75, 0xc0, 0x58, 0x69, 0x66, 0x48, 0x47, 0x64, 0x8c, + 0x21, 0xc6, 0xf6, 0xf2, 0x4a, 0x73, 0x74, 0xaa, 0x84, 0x35, 0x7c, 0x3c, + 0xec, 0xc9, 0xad, 0x4a, 0xd2, 0xdb, 0x50, 0x85, 0x70, 0xbd, 0x31, 0x49, + 0xee, 0x30, 0x68, 0xe5, 0x79, 0x6e, 0x5a, 0x55, 0xfc, 0x8f, 0x53, 0x78, + 0xba, 0xe0, 0x2c, 0x34, 0xb3, 0x24, 0x23, 0xb2, 0x46, 0x10, 0xe3, 0x7b, + 0x79, 0x25, 0x39, 0xba, 0x55, 0x42, 0x1a, 0xbe, 0x1e, 0x76, 0x64, 0xd6, + 0xa5, 0x69, 0x6d, 0xa8, 0x42, 0xb8, 0x5e, 0x98, 0xa4, 0xf7, 0x18, 0x34, + 0x72, 0xbc, 0xb7, 0x2d, 0x2a, 0xfe, 0x47, 0xa9, 0xbc, 0x5d, 0x70, 0x16, + 0x1a, 0x59, 0x92, 0x11, 0xd9, 0x23, 0x08, 0x71, 0xbd, 0xbc, 0x92, 0x9c, + 0xdd, 0x2a, 0xa1, 0x0d, 0x5f, 0x0f, 0x3b, 0x32, 0x6b, 0x52, 0xb4, 0xb6, + 0xd4, 0x21, 0x5c, 0x2f, 0x4c, 0x52, 0x7b, 0x8c, 0x1a, 0x39, 0x5e, 0x5b, + 0x96, 0x95, 0x7f, 0x23, 0xd4, 0xde, 0x2e, 0xb8, 0x0b, 0x0d, 0x2c, 0xc9, + 0x08, 0xec, 0x91, 0x84, 0x38, 0xde, 0xde, 0x49, 0x4e, 0x6e, 0x95, 0x50, + 0x86, 0xaf, 0x87, 0x9d, 0x99, 0x35, 0xa9, 0x5a, 0x5b, 0x6a, 0x10, 0xae, + 0x17, 0xa6, 0x29, 0x3d, 0xc6, 0x0d, 0x1c, 0xaf, 0x2d, 0xcb, 0x4a, 0xbf, + 0x91, 0xea, 0x6f, 0x17, 0x5c, 0x05, 0x86, 0x96, 0x64, 0x84, 0x76, 0x48, + 0xc2, 0x1c, 0x6f, 0x6f, 0x24, 0xa7, 0x37, 0x4a, 0xa8, 0x43, 0x57, 0xc3, + 0xce, 0xcc, 0x9a, 0xd4, 0xad, 0x2d, 0xb5, 0x08, 0x57, 0x0b, 0xd3, 0x14, + 0x9e, 0xe3, 0x06, 0x8e, 0x57, 0x96, 0xe5, 0xa5, 0x5f, 0xc8, 0xf5, 0x37, + 0x8b, 0xae, 0x02, 0xc3, 0x4b, 0x32, 0x42, 0x3b, 0x24, 0x61, 0x0e, 0x37, + 0xb7, 0x92, 0x53, 0x9b, 0xa5, 0x54, 0x21, 0xab, 0xe1, 0xe7, 0x66, 0x4d, + 0x6a, 0x56, 0x96, 0xda, 0x84, 0x2b, 0x85, 0xe9, 0x8a, 0x4f, 0x71, 0x83, + 0x47, 0x2b, 0xcb, 0x72, 0xd2, 0xaf, 0xe4, 0x7a, 0x9b, 0xc5, 0xd7, 0x01, + 0x61, 0xa5, 0x99, 0x21, 0x1d, 0x92, 0x30, 0x87, 0x1b, 0xdb, 0xc9, 0x29, + 0xcd, 0xd2, 0xaa, 0x10, 0xd5, 0xf0, 0xf3, 0xb3, 0x26, 0xb5, 0x2b, 0x4b, + 0x6d, 0x42, 0x15, 0xc2, 0xf4, 0xc5, 0x27, 0xb8, 0xc1, 0xa3, 0x95, 0xe5, + 0xb9, 0x69, 0x57, 0xf2, 0x3d, 0x4d, 0xe2, 0xeb, 0x80, 0xb0, 0xd2, 0xcc, + 0x90, 0x8e, 0xc9, 0x18, 0x43, 0x8d, 0xed, 0xe4, 0x94, 0xe6, 0xe9, 0x55, + 0x08, 0x6a, 0xf8, 0x79, 0xd9, 0x93, 0x5a, 0x95, 0xa5, 0xb6, 0xa1, 0x0a, + 0xe1, 0x7a, 0x62, 0x93, 0xdc, 0x60, 0xd1, 0xca, 0xf2, 0xdc, 0xb4, 0xab, + 0xf9, 0x1e, 0xa6, 0xf1, 0x75, 0xc0, 0x58, 0x69, 0x66, 0x48, 0x47, 0x64, + 0x8c, 0x21, 0xc6, 0xf6, 0xf2, 0x4a, 0x73, 0x74, 0xaa, 0x84, 0x35, 0x7c, + 0x3c, 0xec, 0xc9, 0xad, 0x4a, 0xd2, 0xdb, 0x50, 0x85, 0x70, 0xbd, 0x31, + 0x49, 0xee, 0x30, 0x68, 0xe5, 0x79, 0x6e, 0x5a, 0x55, 0xfc, 0x8f, 0x53, + 0x78, 0xba, 0xe0, 0x2c, 0x34, 0xb3, 0x24, 0x23, 0xb2, 0x46, 0x10, 0xe3, + 0x7b, 0x79, 0x25, 0x39, 0xba, 0x55, 0x42, 0x1a, 0xbe, 0x1e, 0x76, 0x64, + 0xd6, 0xa5, 0x69, 0x6d, 0xa8, 0x42, 0xb8, 0x5e, 0x98, 0xa4, 0xf7, 0x18, + 0x34, 0x72, 0xbc, 0xb7, 0x2d, 0x2a, 0xfe, 0x47, 0xa9, 0xbc, 0x5d, 0x70, + 0x16, 0x1a, 0x59, 0x92, 0x11, 0xd9, 
0x23, 0x08, 0x71, 0xbd, 0xbc, 0x92, + 0x9c, 0xdd, 0x2a, 0xa1, 0x0d, 0x5f, 0x0f, 0x3b, 0x32, 0x6b, 0x52, 0xb4, + 0xb6, 0xd4, 0x21, 0x5c, 0x2f, 0x4c, 0x52, 0x7b, 0x8c, 0x1a, 0x39, 0x5e, + 0x5b, 0x96, 0x95, 0x7f, 0x23, 0xd4, 0xde, 0x2e, 0xb8, 0x0b, 0x0d, 0x2c, + 0xc9, 0x08, 0xec, 0x91, 0x84, 0x38, 0xde, 0xde, 0x49, 0x4e, 0x6e, 0x95, + 0x50, 0x86, 0xaf, 0x87, 0x9d, 0x99, 0x35, 0xa9, 0x5a, 0x5b, 0x6a, 0x10, + 0xae, 0x17, 0xa6, 0x29, 0x3d, 0xc6, 0x0d, 0x1c, 0xaf, 0x2d, 0xcb, 0x4a, + 0xbf, 0x91, 0xea, 0x6f, 0x17, 0x5c, 0x05, 0x86, 0x96, 0x64, 0x84, 0x76, + 0x48, 0xc2, 0x1c, 0x6f, 0x6f, 0x24, 0xa7, 0x37, 0x4a, 0xa8, 0x43, 0x57, + 0xc3, 0xce, 0xcc, 0x9a, 0xd4, 0xad, 0x2d, 0xb5, 0x08, 0x57, 0x0b, 0xd3, + 0x14, 0x9e, 0xe3, 0x06, 0x8e, 0x57, 0x96, 0xe5, 0xa5, 0x5f, 0xc8, 0xf5, + 0x37, 0x8b, 0xae, 0x02, 0xc3, 0x4b, 0x32, 0x42, 0x3b, 0x24, 0x61, 0x0e, + 0x37, 0xb7, 0x92, 0x53, 0x9b, 0xa5, 0x54, 0x21, 0xab, 0xe1, 0xe7, 0x66, + 0x4d, 0x6a, 0x56, 0x96, 0xda, 0x84, 0x2b, 0x85, 0xe9, 0x8a, 0x4f, 0x71, + 0x83, 0x47, 0x2b, 0xcb, 0x72, 0xd2, 0xaf, 0xe4, 0x7a, 0x9b, 0xc5, 0xd7, + 0x01, 0x61, 0xa5, 0x99, 0x21, 0x1d, 0x92, 0x30, 0x87, 0x1b, 0xdb, 0xc9, + 0x29, 0xcd, 0xd2, 0xaa, 0x10, 0xd5, 0xf0, 0xf3, 0xb3, 0x26, 0xb5, 0x2b, + 0x4b, 0x6d, 0x42, 0x15, 0xc2, 0xf4, 0xc5, 0x27, 0xb8, 0xc1, 0xa3, 0x95, + 0xe5, 0xb9, 0x69, 0x57, 0xf2, 0x3d, 0x4d, 0xe2, 0xeb, 0x80, 0xb0, 0xd2, + 0xcc, 0x90, 0x8e, 0xc9, 0x18, 0x43, 0x8d, 0xed, 0xe4, 0x94, 0xe6, 0xe9, + 0x55, 0x08, 0x6a, 0xf8, 0x79, 0xd9, 0x93, 0x5a, 0x95, 0xa5, 0xb6, 0xa1, + 0x0a, 0xe1, 0x7a, 0x62, 0x93, 0xdc, 0x60, 0xd1, 0xca, 0xf2, 0xdc, 0xb4, + 0xab, 0xf9, 0x1e, 0xa6, 0xf1, 0x75, 0xc0, 0x58, 0x69, 0x66, 0x48, 0x47, + 0x64, 0x8c, 0x21, 0xc6, 0xf6, 0xf2, 0x4a, 0x73, 0x74, 0xaa, 0x84, 0x35, + 0x7c, 0x3c, 0xec, 0xc9, 0xad, 0x4a, 0xd2, 0xdb, 0x50, 0x85, 0x70, 0xbd, + 0x31, 0x49, 0xee, 0x30, 0x68, 0xe5, 0x79, 0x6e, 0x5a, 0x55, 0xfc, 0x8f, + 0x53, 0x78, 0xba, 0xe0, 0x2c, 0x34, 0xb3, 0x24, 0x23, 0xb2, 0x46, 0x10, + 0xe3, 0x7b, 0x79, 0x25, 0x39, 0xba, 0x55, 0x42, 0x1a, 0xbe, 0x1e, 0x76, + 0x64, 0xd6, 0xa5, 0x69, 0x6d, 0xa8, 0x42, 0xb8, 0x5e, 0x98, 0xa4, 0xf7, + 0x18, 0x34, 0x72, 0xbc, 0xb7, 0x2d, 0x2a, 0xfe, 0x47, 0xa9, 0xbc, 0x5d, + 0x70, 0x16, 0x1a, 0x59, 0x92, 0x11, 0xd9, 0x23, 0x08, 0x71, 0xbd, 0xbc, + 0x92, 0x9c, 0xdd, 0x2a, 0xa1, 0x0d, 0x5f, 0x0f, 0x3b, 0x32, 0x6b, 0x52, + 0xb4, 0xb6, 0xd4, 0x21, 0x5c, 0x2f, 0x4c, 0x52, 0x7b, 0x8c, 0x1a, 0x39, + 0x5e, 0x5b, 0x96, 0x95, 0x7f, 0x23, 0xd4, 0xde, 0x2e, 0xb8, 0x0b, 0x0d, + 0x2c, 0xc9, 0x08, 0xec, 0x91, 0x84, 0x38, 0xde, 0xde, 0x49, 0x4e, 0x6e, + 0x95, 0x50, 0x86, 0xaf, 0x87, 0x9d, 0x99, 0x35, 0xa9, 0x5a, 0x5b, 0x6a, + 0x10, 0xae, 0x17, 0xa6, 0x29, 0x3d, 0xc6, 0x0d, 0x1c, 0xaf, 0x2d, 0xcb, + 0x4a, 0xbf, 0x91, 0xea, 0x6f, 0x17, 0x5c, 0x05, 0x86, 0x96, 0x64, 0x84, + 0x76, 0x48, 0xc2, 0x1c, 0x6f, 0x6f, 0x24, 0xa7, 0x37, 0x4a, 0xa8, 0x43, + 0x57, 0xc3, 0xce, 0xcc, 0x9a, 0xd4, 0xad, 0x2d, 0xb5, 0x08, 0x57, 0x0b, + 0xd3, 0x14, 0x9e, 0xe3, 0x06, 0x8e, 0x57, 0x96, 0xe5, 0xa5, 0x5f, 0xc8, + 0xf5, 0x37, 0x8b, 0xae, 0x02, 0xc3, 0x4b, 0x32, 0x42, 0x3b, 0x24, 0x61, + 0x0e, 0x37, 0xb7, 0x92, 0x53, 0x9b, 0xa5, 0x54, 0x21, 0xab, 0xe1, 0xe7, + 0x66, 0x4d, 0x6a, 0x56, 0x96, 0xda, 0x84, 0x2b, 0x85, 0xe9, 0x8a, 0x4f, + 0x71, 0x83, 0x47, 0x2b, 0xcb, 0x72, 0xd2, 0xaf, 0xe4, 0x7a, 0x9b, 0xc5, + 0xd7, 0x01, 0x61, 0xa5, 0x99, 0x21, 0x1d, 0x92, 0x30, 0x87, 0x1b, 0xdb, + 0xc9, 0x29, 0xcd, 0xd2, 0xaa, 0x10, 0xd5, 0xf0, 0xf3, 0xb3, 0x26, 0xb5, + 0x2b, 0x4b, 0x6d, 0x42, 0x15, 0xc2, 0xf4, 0xc5, 0x27, 0xb8, 0xc1, 0xa3, + 0x95, 0xe5, 0xb9, 0x69, 0x57, 0xf2, 
0x3d, 0x4d, 0xe2, 0xeb, 0x80, 0xb0, + 0xd2, 0xcc, 0x90, 0x8e, 0xc9, 0x18, 0x43, 0x8d, 0xed, 0xe4, 0x94, 0xe6, + 0xe9, 0x55, 0x08, 0x6a, 0xf8, 0x79, 0xd9, 0x93, 0x5a, 0x95, 0xa5, 0xb6, + 0xa1, 0x0a, 0xe1, 0x7a, 0x62, 0x93, 0xdc, 0x60, 0xd1, 0xca, 0xf2, 0xdc, + 0xb4, 0xab, 0xf9, 0x1e, 0xa6, 0xf1, 0x75, 0xc0, 0x58, 0x69, 0x66, 0x48, + 0x47, 0x64, 0x8c, 0x21, 0xc6, 0xf6, 0xf2, 0x4a, 0x73, 0x74, 0xaa, 0x84, + 0x35, 0x7c, 0x3c, 0xec, 0xc9, 0xad, 0x4a, 0xd2, 0xdb, 0x50, 0x85, 0x70, + 0xbd, 0x31, 0x49, 0xee, 0x30, 0x68, 0xe5, 0x79, 0x6e, 0x5a, 0x55, 0xfc, + 0x8f, 0x53, 0x78, 0xba, 0xe0, 0x2c, 0x34, 0xb3, 0x24, 0x23, 0xb2, 0x46, + 0x10, 0xe3, 0x7b, 0x79, 0x25, 0x39, 0xba, 0x55, 0x42, 0x1a, 0xbe, 0x1e, + 0x76, 0x64, 0xd6, 0xa5, 0x69, 0x6d, 0xa8, 0x42, 0xb8, 0x5e, 0x98, 0xa4, + 0xf7, 0x18, 0x34, 0x72, 0xbc, 0xb7, 0x2d, 0x2a, 0xfe, 0x47, 0xa9, 0xbc, + 0x5d, 0x70, 0x16, 0x1a, 0x59, 0x92, 0x11, 0xd9, 0x23, 0x08, 0x71, 0xbd, + 0xbc, 0x92, 0x9c, 0xdd, 0x2a, 0xa1, 0x0d, 0x5f, 0x0f, 0x3b, 0x32, 0x6b, + 0x52, 0xb4, 0xb6, 0xd4, 0x21, 0x5c, 0x2f, 0x4c, 0x52, 0x7b, 0x8c, 0x1a, + 0x39, 0x5e, 0x5b, 0x96, 0x95, 0x7f, 0x23, 0xd4, 0xde, 0x2e, 0xb8, 0x0b, + 0x0d, 0x2c, 0xc9, 0x08, 0xec, 0x91, 0x84, 0x38, 0xde, 0xde, 0x49, 0x4e, + 0x6e, 0x95, 0x50, 0x86, 0xaf, 0x87, 0x9d, 0x99, 0x35, 0xa9, 0x5a, 0x5b, + 0x6a, 0x10, 0xae, 0x17, 0xa6, 0x29, 0x3d, 0xc6, 0x0d, 0x1c, 0xaf, 0x2d, + 0xcb, 0x4a, 0xbf, 0x91, 0xea, 0x6f, 0x17, 0x5c, 0x05, 0x86, 0x96, 0x64, + 0x84, 0x76, 0x48, 0xc2, 0x1c, 0x6f, 0x6f, 0x24, 0xa7, 0x37, 0x4a, 0xa8, + 0x43, 0x57, 0xc3, 0xce, 0xcc, 0x9a, 0xd4, 0xad, 0x2d, 0xb5, 0x08, 0x57, + 0x0b, 0xd3, 0x14, 0x9e, 0xe3, 0x06, 0x8e, 0x57, 0x96, 0xe5, 0xa5, 0x5f, + 0xc8, 0xf5, 0x37, 0x8b, 0xae, 0x02, 0xc3, 0x4b, 0x32, 0x42, 0x3b, 0x24, + 0x61, 0x0e, 0x37, 0xb7, 0x92, 0x53, 0x9b, 0xa5, 0x54, 0x21, 0xab, 0xe1, + 0xe7, 0x66, 0x4d, 0x6a, 0x56, 0x96, 0xda, 0x84, 0x2b, 0x85, 0xe9, 0x8a, + 0x4f, 0x71, 0x83, 0x47, 0x2b, 0xcb, 0x72, 0xd2, 0xaf, 0xe4, 0x7a, 0x9b, + 0xc5, 0xd7, 0x01, 0x61, 0xa5, 0x99, 0x21, 0x1d, 0x92, 0x30, 0x87, 0x1b, + 0xdb, 0xc9, 0x29, 0xcd, 0xd2, 0xaa, 0x10, 0xd5, 0xf0, 0xf3, 0xb3, 0x26, + 0xb5, 0x2b, 0x4b, 0x6d, 0x42, 0x15, 0xc2, 0xf4, 0xc5, 0x27, 0xb8, 0xc1, + 0xa3, 0x95, 0xe5, 0xb9, 0x69, 0x57, 0xf2, 0x3d, 0x4d, 0xe2, 0xeb, 0x80, + 0xb0, 0xd2, 0xcc, 0x90, 0x8e, 0xc9, 0x18, 0x43, 0x8d, 0xed, 0xe4, 0x94, + 0xe6, 0xe9, 0x55, 0x08, 0x6a, 0xf8, 0x79, 0xd9, 0x93, 0x5a, 0x95, 0xa5, + 0xb6, 0xa1, 0x0a, 0xe1, 0x7a, 0x62, 0x93, 0xdc, 0x60, 0xd1, 0xca, 0xf2, + 0xdc, 0xb4, 0xab, 0xf9, 0x1e, 0xa6, 0xf1, 0x75, 0xc0, 0x58, 0x69, 0x66, + 0x48, 0x47, 0x64, 0x8c, 0x21, 0xc6, 0xf6, 0xf2, 0x4a, 0x73, 0x75, 0x80, +}; +static_assert(sizeof(kBytesTestReadSymbol5) == kNumBytesTestReadSymbol5, ""); + +// The kBytesTestReadSymbol6[] array was encoded by using the following libaom +// code: +// +// aom_cdf_prob cdf[4][7] = { +// // pmf: 1/6, 1/6, 1/6, 1/6, 1/6, 1/6 +// { 32768 - 5461, 32768 - 10923, 32768 - 16384, 32768 - 21845, 32768 - 27307, +// 0, 0 }, +// // pmf: 3/12, 2/12, 2/12, 2/12, 2/12, 1/12 +// { 32768 - 8192, 32768 - 13653, 32768 - 19115, 32768 - 24576, 32768 - 30037, +// 0, 0 }, +// // pmf: 1/12, 2/12, 2/12, 2/12, 2/12, 3/12 +// { 32768 - 2731, 32768 - 8192, 32768 - 13653, 32768 - 19115, 32768 - 24576, +// 0, 0 }, +// // pmf: 1/12, 2/12, 3/12, 3/12, 2/12, 1/12 +// { 32768 - 2731, 32768 - 8192, 32768 - 16384, 32768 - 24576, 32768 - 30037, +// 0, 0 }, +// }; +// constexpr int kSymbols[12][4] = { { 0, 0, 5, 5 }, // +// { 0, 1, 4, 4 }, // +// { 1, 2, 3, 3 }, // +// { 1, 3, 2, 2 }, // +// { 2, 4, 1, 1 }, // +// { 2, 5, 0, 
0 }, // +// { 3, 0, 5, 4 }, // +// { 3, 1, 4, 3 }, // +// { 4, 2, 3, 2 }, // +// { 4, 3, 2, 1 }, // +// { 5, 4, 1, 3 }, // +// { 5, 0, 5, 2 } }; +// const unsigned int kBufferSize = 65536; +// uint8_t bw_buffer[kBufferSize]; +// aom_writer bw; +// bw.allow_update_cdf = 1; +// aom_start_encode(&bw, bw_buffer); +// for (int i = 0; i < 256; ++i) { +// for (int j = 0; j < 12; ++j) { +// for (int k = 0; k < 4; ++k) { +// aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 6); +// } +// } +// } +// aom_stop_encode(&bw); +// printf("constexpr size_t kNumBytes = %u;\n", bw.pos); +// printf("constexpr uint8_t kBytes[] = {"); +// int count = 0; +// for (unsigned int i = 0; i < bw.pos; ++i) { +// if (count++ % 12 == 0) { +// printf("\n "); +// } else { +// printf(" "); +// } +// printf("0x%02x,", bw_buffer[i]); +// } +// printf("\n};\n"); + +constexpr size_t kNumBytesTestReadSymbol6 = 3917; +constexpr uint8_t kBytesTestReadSymbol6[] = { + 0x0a, 0x8e, 0xb8, 0x15, 0xd5, 0x69, 0x63, 0x06, 0x48, 0x75, 0xf4, 0x4c, + 0xfa, 0x13, 0xba, 0x68, 0x61, 0xa6, 0x9f, 0x39, 0x63, 0xba, 0x63, 0x26, + 0xa8, 0xaa, 0xd0, 0x10, 0x4a, 0x05, 0xaf, 0x5f, 0x65, 0x57, 0x2f, 0x68, + 0x48, 0x2c, 0x64, 0xdf, 0x0a, 0x93, 0xcc, 0x84, 0x43, 0x97, 0x34, 0x79, + 0x10, 0x05, 0x4d, 0x58, 0xe9, 0xc3, 0xb4, 0x4a, 0x70, 0xd4, 0x81, 0x71, + 0x9f, 0x6b, 0x18, 0xb3, 0x72, 0xdf, 0x37, 0x87, 0x3e, 0x40, 0xd0, 0xff, + 0x10, 0x32, 0x22, 0xe4, 0x36, 0xef, 0xa2, 0x5e, 0x39, 0x5d, 0x42, 0x59, + 0x8c, 0x3f, 0x1b, 0x41, 0xdb, 0xc2, 0x8c, 0x64, 0xaf, 0xd2, 0x49, 0x45, + 0xd8, 0xad, 0x85, 0x3b, 0x70, 0x13, 0x83, 0x63, 0x49, 0x86, 0x35, 0xfe, + 0x93, 0x6b, 0x51, 0x0e, 0x32, 0x3d, 0xf0, 0x30, 0xe0, 0xf5, 0x42, 0x59, + 0x33, 0x8e, 0x63, 0x62, 0x46, 0x00, 0x69, 0x06, 0x52, 0x83, 0x37, 0x0b, + 0x37, 0x12, 0x38, 0x3b, 0x9c, 0xc3, 0x00, 0xed, 0x0a, 0xd4, 0xed, 0x69, + 0x01, 0xc5, 0x3a, 0x14, 0x29, 0xaf, 0x3e, 0x9c, 0x0a, 0xaf, 0x56, 0x50, + 0x56, 0xcd, 0xa1, 0xb0, 0x88, 0xef, 0xa7, 0x57, 0xe6, 0xe8, 0x2c, 0x42, + 0x60, 0x55, 0x22, 0x1f, 0xcc, 0x50, 0xa9, 0xda, 0xc2, 0x73, 0x19, 0x2e, + 0xfb, 0x74, 0x88, 0x42, 0x0d, 0x49, 0x12, 0x5e, 0x36, 0x43, 0xe7, 0x33, + 0x00, 0x7d, 0xd5, 0x35, 0xa3, 0xaf, 0x1e, 0x93, 0x5e, 0xe6, 0xae, 0x23, + 0x41, 0x55, 0x05, 0x19, 0xde, 0xa7, 0xf1, 0x07, 0xbd, 0x58, 0xc1, 0x10, + 0x0a, 0x4b, 0x5c, 0xee, 0xe3, 0xfb, 0xe5, 0xf5, 0xfc, 0x1a, 0x4e, 0x51, + 0xda, 0x3e, 0xc5, 0x36, 0xda, 0x3e, 0x83, 0xfd, 0x6b, 0x6f, 0x54, 0xdb, + 0x68, 0x5a, 0x9c, 0x46, 0xbf, 0x86, 0x23, 0xf1, 0xbd, 0xe1, 0x79, 0x5e, + 0xf7, 0x1c, 0xe0, 0xf7, 0xa6, 0xd5, 0x9f, 0x0b, 0x74, 0xd8, 0xf2, 0x0a, + 0x97, 0x71, 0xa2, 0xd2, 0x37, 0x05, 0x7e, 0x3e, 0xa4, 0xec, 0x16, 0x92, + 0x37, 0xdd, 0x45, 0x0c, 0x17, 0x42, 0xf0, 0x34, 0xf7, 0x38, 0x04, 0xdf, + 0xb8, 0xb4, 0xd6, 0xa0, 0x2c, 0x56, 0x96, 0x10, 0x30, 0x34, 0x10, 0x39, + 0x9e, 0x95, 0x3b, 0x13, 0xf3, 0x60, 0xa1, 0x48, 0xca, 0x9f, 0x91, 0xfe, + 0x42, 0xfb, 0xdf, 0x37, 0xf8, 0x5d, 0x49, 0x82, 0x42, 0x4f, 0x90, 0xdf, + 0xae, 0x32, 0x20, 0x9e, 0xb6, 0xcc, 0xa0, 0x30, 0x07, 0x15, 0x64, 0xb8, + 0x56, 0x84, 0x1e, 0x16, 0xa3, 0x35, 0xad, 0x14, 0x9d, 0x62, 0x65, 0x0c, + 0x77, 0x82, 0x74, 0x41, 0x9c, 0x68, 0x95, 0x03, 0x4f, 0xfc, 0x1c, 0xc7, + 0xd6, 0xe6, 0xe7, 0xb3, 0x54, 0x66, 0x87, 0xb6, 0x41, 0x03, 0xe2, 0x20, + 0xf7, 0xdb, 0x2a, 0x0a, 0x25, 0x20, 0x60, 0xdf, 0xfd, 0x9f, 0x5f, 0x2c, + 0x72, 0x5f, 0x2b, 0xf4, 0x07, 0x9f, 0xf3, 0x8a, 0xde, 0xf0, 0x4f, 0x8a, + 0xa7, 0x75, 0xe3, 0xe8, 0xc9, 0xa1, 0xa0, 0x01, 0xa1, 0x20, 0xc8, 0xfb, + 0xf9, 0x91, 0xd2, 0x23, 0x4f, 0x6c, 0x53, 0x3b, 0x12, 0x01, 0xac, 0x1f, + 0x89, 0x84, 0x98, 0xcd, 
0x3c, 0x74, 0x51, 0x92, 0xbe, 0x87, 0x06, 0x62, + 0x49, 0xd2, 0x1b, 0x27, 0xfa, 0x28, 0xf8, 0xbd, 0xbb, 0x7a, 0x7d, 0xde, + 0xa2, 0x9c, 0x1b, 0x7c, 0x80, 0xe8, 0xe0, 0x43, 0x64, 0xdd, 0x22, 0x7e, + 0x2c, 0xe4, 0x79, 0x2e, 0xbd, 0x98, 0x1a, 0x59, 0x7e, 0xbe, 0xfd, 0x9e, + 0x0c, 0x31, 0x50, 0x10, 0xdd, 0x62, 0x3c, 0x47, 0x9a, 0x11, 0x1b, 0x48, + 0xf3, 0xd1, 0x2c, 0x1b, 0xc2, 0xb5, 0x57, 0x7c, 0xe5, 0x97, 0x6d, 0x78, + 0xe7, 0xa2, 0xd6, 0x57, 0x61, 0x95, 0xed, 0x8d, 0xda, 0xc6, 0xdf, 0x2c, + 0x1d, 0x48, 0xee, 0x53, 0xd8, 0x1e, 0x80, 0x41, 0xce, 0x58, 0x08, 0x96, + 0x6f, 0x82, 0x6e, 0x28, 0x6a, 0x5a, 0x2b, 0x4f, 0x02, 0x4d, 0x99, 0x32, + 0xea, 0x60, 0xce, 0x75, 0x57, 0x0c, 0x63, 0xf0, 0xda, 0x51, 0x1d, 0xcc, + 0xb8, 0x21, 0x35, 0x10, 0x56, 0xaf, 0x80, 0xb3, 0x0f, 0x17, 0x29, 0x0c, + 0x16, 0x07, 0x66, 0xe9, 0xcb, 0x52, 0xcd, 0xec, 0xb1, 0x79, 0xf8, 0xb9, + 0x05, 0x08, 0xa1, 0xd7, 0x03, 0x6f, 0x8e, 0x9a, 0x6e, 0xfb, 0x38, 0x3a, + 0xff, 0xa7, 0xa1, 0xd8, 0xb1, 0x56, 0x06, 0xde, 0xb1, 0xe7, 0x47, 0xc2, + 0xc2, 0xab, 0xa9, 0x5f, 0x01, 0x65, 0x5d, 0x4c, 0xac, 0xd8, 0x1c, 0xfd, + 0x2d, 0x55, 0x74, 0x8a, 0x2b, 0x41, 0x2d, 0x50, 0x0c, 0x9c, 0x64, 0xb2, + 0xed, 0xaf, 0x2a, 0xb4, 0x58, 0x93, 0xd8, 0xc2, 0xab, 0x04, 0x45, 0xfc, + 0xd7, 0x02, 0x1e, 0x14, 0xd4, 0x38, 0xba, 0x24, 0x07, 0x9a, 0x25, 0x52, + 0x13, 0xe1, 0xe4, 0x26, 0x66, 0x12, 0xba, 0x13, 0x11, 0x25, 0xea, 0x29, + 0xc5, 0xff, 0x34, 0xca, 0x18, 0x34, 0x97, 0x4a, 0x92, 0x00, 0xe8, 0x61, + 0x18, 0x85, 0x0b, 0x56, 0x83, 0x48, 0xf9, 0xdb, 0x26, 0x7b, 0x54, 0xc8, + 0xd2, 0x63, 0x1e, 0x7b, 0x25, 0x3c, 0x4a, 0xa6, 0xda, 0x10, 0x92, 0xca, + 0x8a, 0x2c, 0x89, 0x60, 0x8e, 0xda, 0xf2, 0xab, 0x45, 0x89, 0x3d, 0x8c, + 0x2d, 0x35, 0xda, 0xc1, 0x7c, 0x3d, 0x05, 0x8e, 0xad, 0x5b, 0xff, 0x7d, + 0x46, 0x7b, 0x74, 0x71, 0xec, 0x05, 0x9a, 0x85, 0xa4, 0x4f, 0xc3, 0x54, + 0x64, 0x90, 0xe5, 0x97, 0x89, 0x1a, 0xb0, 0x56, 0x30, 0x13, 0xda, 0x44, + 0x2c, 0xb0, 0x50, 0x0c, 0x64, 0x43, 0x4a, 0xd2, 0x2a, 0xb4, 0x8f, 0x9d, + 0xa6, 0xe5, 0x3c, 0x0c, 0x7a, 0x44, 0xb3, 0xeb, 0xa7, 0x92, 0xe5, 0x59, + 0xa6, 0x43, 0xe9, 0x2b, 0x1f, 0x69, 0x4a, 0xc4, 0x89, 0xe7, 0xe0, 0x04, + 0x9f, 0x1d, 0x33, 0x61, 0xe8, 0xab, 0x75, 0x8d, 0x30, 0xd6, 0x7c, 0xca, + 0x02, 0xbe, 0xf9, 0x1d, 0x02, 0x4e, 0x0f, 0x88, 0xc9, 0x3f, 0x54, 0x9d, + 0x93, 0x0d, 0x44, 0xf8, 0xf6, 0xa7, 0x1a, 0xb6, 0x8b, 0xf5, 0x14, 0xca, + 0xbd, 0x6c, 0x2d, 0x9e, 0xfa, 0x80, 0x36, 0x53, 0x06, 0xac, 0x39, 0x0f, + 0x6b, 0xdb, 0x2e, 0xe0, 0x4f, 0xf0, 0xa4, 0x44, 0x5a, 0xbb, 0xaa, 0x72, + 0x59, 0x3f, 0x58, 0x38, 0xe5, 0x5c, 0x76, 0x31, 0xe6, 0xfe, 0x08, 0x20, + 0xbe, 0x3f, 0xea, 0x00, 0x0d, 0x34, 0xd9, 0x4d, 0x06, 0x0a, 0xb5, 0x04, + 0x7b, 0x48, 0x22, 0xa9, 0x94, 0x47, 0x44, 0xfd, 0x65, 0x81, 0x45, 0x56, + 0x91, 0xf3, 0xb4, 0xdc, 0xa7, 0x6e, 0xb1, 0xa4, 0xc5, 0xd6, 0x81, 0x6a, + 0x78, 0x94, 0x8a, 0xa4, 0x21, 0x25, 0x63, 0xed, 0x25, 0x51, 0x86, 0x5d, + 0xa7, 0xa7, 0xf2, 0x17, 0x92, 0x06, 0x46, 0x5b, 0xaa, 0xc8, 0x74, 0x12, + 0x7f, 0x99, 0x40, 0x57, 0xdf, 0x23, 0xa0, 0x49, 0xc1, 0xf1, 0x19, 0x27, + 0xea, 0x93, 0xb2, 0x61, 0xa8, 0x9f, 0x1e, 0xd4, 0xe3, 0x56, 0xd1, 0x7e, + 0xa2, 0x99, 0x57, 0xad, 0x85, 0xb3, 0xdf, 0x50, 0x06, 0xca, 0x60, 0xd5, + 0x87, 0x21, 0xed, 0x7b, 0x65, 0xdc, 0x09, 0xfe, 0x14, 0x88, 0x8b, 0x57, + 0x75, 0x4e, 0x4b, 0x27, 0xeb, 0x07, 0x1c, 0xab, 0x8e, 0xc6, 0x3c, 0xdf, + 0xc1, 0x04, 0x17, 0xc7, 0xfd, 0x40, 0x01, 0xa6, 0x9b, 0x29, 0xa0, 0xc1, + 0x56, 0xa0, 0x8f, 0x69, 0x04, 0x55, 0x32, 0x88, 0xe8, 0x9f, 0x8d, 0x2b, + 0x48, 0xaa, 0xd2, 0x3e, 0x76, 0x9b, 0x94, 0xed, 0xd6, 0x34, 0x98, 0xba, + 0x16, 0x3c, 0x29, 0xce, 
0x3d, 0x14, 0x84, 0x24, 0xac, 0x7d, 0xa4, 0xaa, + 0x30, 0xcb, 0xb4, 0xdd, 0xe3, 0x7a, 0x0e, 0x78, 0xc8, 0xcb, 0x75, 0x59, + 0x0e, 0x82, 0x4f, 0xf3, 0x28, 0x0a, 0xf6, 0x18, 0x41, 0xa0, 0x7c, 0xe5, + 0xff, 0xf2, 0xf9, 0x07, 0xe7, 0x99, 0x4c, 0xa6, 0x10, 0xa7, 0x08, 0x46, + 0x84, 0xa5, 0x22, 0xa9, 0x08, 0x49, 0x58, 0xfb, 0x49, 0x54, 0x61, 0x97, + 0x69, 0xbb, 0xc6, 0xf4, 0x1c, 0xf1, 0x91, 0x96, 0xea, 0xb2, 0x1d, 0x04, + 0x9f, 0xe6, 0x50, 0x15, 0xec, 0x30, 0x83, 0x40, 0xf9, 0xcb, 0xff, 0xe5, + 0xf2, 0x0f, 0xcf, 0x32, 0x99, 0x4c, 0x21, 0x4e, 0x10, 0x8d, 0x09, 0x4a, + 0x45, 0x52, 0x10, 0x92, 0xb1, 0xf6, 0x92, 0xa8, 0xc3, 0x2e, 0xd3, 0x77, + 0x8d, 0xe8, 0x39, 0xe3, 0x23, 0x2d, 0xd5, 0x64, 0x3a, 0x09, 0x3f, 0xcc, + 0xa0, 0x2b, 0xd8, 0x61, 0x06, 0x81, 0xf3, 0x97, 0xff, 0xcb, 0xe4, 0x1f, + 0x9e, 0x65, 0x32, 0x98, 0x42, 0x9c, 0x21, 0x1a, 0x12, 0x94, 0x8a, 0xa4, + 0x21, 0x25, 0x63, 0xed, 0x25, 0x51, 0x86, 0x5d, 0xa6, 0xef, 0x1b, 0xd0, + 0x73, 0xc6, 0x46, 0x5b, 0xaa, 0xc8, 0x74, 0x12, 0x7f, 0x99, 0x40, 0x57, + 0xb0, 0xc2, 0x0d, 0x03, 0xe7, 0x2f, 0xff, 0x97, 0xc8, 0x3f, 0x3c, 0xca, + 0x65, 0x30, 0x85, 0x38, 0x42, 0x34, 0x25, 0x29, 0x15, 0x48, 0x42, 0x4a, + 0xc7, 0xda, 0x4a, 0xa3, 0x0c, 0xbb, 0x4d, 0xde, 0x37, 0xa0, 0xe7, 0x8c, + 0x8c, 0xb7, 0x55, 0x90, 0xe8, 0x24, 0xff, 0x32, 0x80, 0xaf, 0x61, 0x84, + 0x1a, 0x07, 0xce, 0x5f, 0xff, 0x2f, 0x90, 0x7e, 0x79, 0x94, 0xca, 0x61, + 0x0a, 0x70, 0x84, 0x68, 0x4a, 0x52, 0x2a, 0x90, 0x84, 0x95, 0x8f, 0xb4, + 0x95, 0x46, 0x19, 0x76, 0x9b, 0xbc, 0x6f, 0x41, 0xcf, 0x19, 0x19, 0x6e, + 0xab, 0x21, 0xd0, 0x49, 0xfe, 0x65, 0x01, 0x5e, 0xc3, 0x08, 0x34, 0x0f, + 0x9c, 0xbf, 0xfe, 0x5f, 0x20, 0xfc, 0xf3, 0x29, 0x94, 0xc2, 0x14, 0xe1, + 0x08, 0xd0, 0x94, 0xa4, 0x55, 0x21, 0x09, 0x2b, 0x1f, 0x69, 0x2a, 0x8c, + 0x32, 0xed, 0x37, 0x78, 0xde, 0x83, 0x9e, 0x32, 0x32, 0xdd, 0x56, 0x43, + 0xa0, 0x93, 0xfc, 0xca, 0x02, 0xbd, 0x86, 0x10, 0x68, 0x1f, 0x39, 0x7f, + 0xfc, 0xbe, 0x41, 0xf9, 0xe6, 0x53, 0x29, 0x84, 0x29, 0xc2, 0x11, 0xa1, + 0x29, 0x48, 0xaa, 0x42, 0x12, 0x56, 0x3e, 0xd2, 0x55, 0x18, 0x65, 0xda, + 0x6e, 0xf1, 0xbd, 0x07, 0x3c, 0x64, 0x65, 0xba, 0xac, 0x87, 0x41, 0x27, + 0xf9, 0x94, 0x05, 0x7b, 0x0c, 0x20, 0xd0, 0x3e, 0x72, 0xff, 0xf9, 0x7c, + 0x83, 0xf3, 0xcc, 0xa6, 0x53, 0x08, 0x53, 0x84, 0x23, 0x42, 0x52, 0x91, + 0x54, 0x84, 0x24, 0xac, 0x7d, 0xa4, 0xaa, 0x30, 0xcb, 0xb4, 0xdd, 0xe3, + 0x7a, 0x0e, 0x78, 0xc8, 0xcb, 0x75, 0x59, 0x0e, 0x82, 0x4f, 0xf3, 0x28, + 0x0a, 0xf6, 0x18, 0x41, 0xa0, 0x7c, 0xe5, 0xff, 0xf2, 0xf9, 0x07, 0xe7, + 0x99, 0x4c, 0xa6, 0x10, 0xa7, 0x08, 0x46, 0x84, 0xa5, 0x22, 0xa9, 0x08, + 0x49, 0x58, 0xfb, 0x49, 0x54, 0x61, 0x97, 0x69, 0xbb, 0xc6, 0xf4, 0x1c, + 0xf1, 0x91, 0x96, 0xea, 0xb2, 0x1d, 0x04, 0x9f, 0xe6, 0x50, 0x15, 0xec, + 0x30, 0x83, 0x40, 0xf9, 0xcb, 0xff, 0xe5, 0xf2, 0x0f, 0xcf, 0x32, 0x99, + 0x4c, 0x21, 0x4e, 0x10, 0x8d, 0x09, 0x4a, 0x45, 0x52, 0x10, 0x92, 0xb1, + 0xf6, 0x92, 0xa8, 0xc3, 0x2e, 0xd3, 0x77, 0x8d, 0xe8, 0x39, 0xe3, 0x23, + 0x2d, 0xd5, 0x64, 0x3a, 0x09, 0x3f, 0xcc, 0xa0, 0x2b, 0xd8, 0x61, 0x06, + 0x81, 0xf3, 0x97, 0xff, 0xcb, 0xe4, 0x1f, 0x9e, 0x65, 0x32, 0x98, 0x42, + 0x9c, 0x21, 0x1a, 0x12, 0x94, 0x8a, 0xa4, 0x21, 0x25, 0x63, 0xed, 0x25, + 0x51, 0x86, 0x5d, 0xa6, 0xef, 0x1b, 0xd0, 0x73, 0xc6, 0x46, 0x5b, 0xaa, + 0xc8, 0x74, 0x12, 0x7f, 0x99, 0x40, 0x57, 0xb0, 0xc2, 0x0d, 0x03, 0xe7, + 0x2f, 0xff, 0x97, 0xc8, 0x3f, 0x3c, 0xca, 0x65, 0x30, 0x85, 0x38, 0x42, + 0x34, 0x25, 0x29, 0x15, 0x48, 0x42, 0x4a, 0xc7, 0xda, 0x4a, 0xa3, 0x0c, + 0xbb, 0x4d, 0xde, 0x37, 0xa0, 0xe7, 0x8c, 0x8c, 0xb7, 0x55, 0x90, 0xe8, + 0x24, 0xff, 0x32, 0x80, 
0xaf, 0x61, 0x84, 0x1a, 0x07, 0xce, 0x5f, 0xff, + 0x2f, 0x90, 0x7e, 0x79, 0x94, 0xca, 0x61, 0x0a, 0x70, 0x84, 0x68, 0x4a, + 0x52, 0x2a, 0x90, 0x84, 0x95, 0x8f, 0xb4, 0x95, 0x46, 0x19, 0x76, 0x9b, + 0xbc, 0x6f, 0x41, 0xcf, 0x19, 0x19, 0x6e, 0xab, 0x21, 0xd0, 0x49, 0xfe, + 0x65, 0x01, 0x5e, 0xc3, 0x08, 0x34, 0x0f, 0x9c, 0xbf, 0xfe, 0x5f, 0x20, + 0xfc, 0xf3, 0x29, 0x94, 0xc2, 0x14, 0xe1, 0x08, 0xd0, 0x94, 0xa4, 0x55, + 0x21, 0x09, 0x2b, 0x1f, 0x69, 0x2a, 0x8c, 0x32, 0xed, 0x37, 0x78, 0xde, + 0x83, 0x9e, 0x32, 0x32, 0xdd, 0x56, 0x43, 0xa0, 0x93, 0xfc, 0xca, 0x02, + 0xbd, 0x86, 0x10, 0x68, 0x1f, 0x39, 0x7f, 0xfc, 0xbe, 0x41, 0xf9, 0xe6, + 0x53, 0x29, 0x84, 0x29, 0xc2, 0x11, 0xa1, 0x29, 0x48, 0xaa, 0x42, 0x12, + 0x56, 0x3e, 0xd2, 0x55, 0x18, 0x65, 0xda, 0x6e, 0xf1, 0xbd, 0x07, 0x3c, + 0x64, 0x65, 0xba, 0xac, 0x87, 0x41, 0x27, 0xf9, 0x94, 0x05, 0x7b, 0x0c, + 0x20, 0xd0, 0x3e, 0x72, 0xff, 0xf9, 0x7c, 0x83, 0xf3, 0xcc, 0xa6, 0x53, + 0x08, 0x53, 0x84, 0x23, 0x42, 0x52, 0x91, 0x54, 0x84, 0x24, 0xac, 0x7d, + 0xa4, 0xaa, 0x30, 0xcb, 0xb4, 0xdd, 0xe3, 0x7a, 0x0e, 0x78, 0xc8, 0xcb, + 0x75, 0x59, 0x0e, 0x82, 0x4f, 0xf3, 0x28, 0x0a, 0xf6, 0x18, 0x41, 0xa0, + 0x7c, 0xe5, 0xff, 0xf2, 0xf9, 0x07, 0xe7, 0x99, 0x4c, 0xa6, 0x10, 0xa7, + 0x08, 0x46, 0x84, 0xa5, 0x22, 0xa9, 0x08, 0x49, 0x58, 0xfb, 0x49, 0x54, + 0x61, 0x97, 0x69, 0xbb, 0xc6, 0xf4, 0x1c, 0xf1, 0x91, 0x96, 0xea, 0xb2, + 0x1d, 0x04, 0x9f, 0xe6, 0x50, 0x15, 0xec, 0x30, 0x83, 0x40, 0xf9, 0xcb, + 0xff, 0xe5, 0xf2, 0x0f, 0xcf, 0x32, 0x99, 0x4c, 0x21, 0x4e, 0x10, 0x8d, + 0x09, 0x4a, 0x45, 0x52, 0x10, 0x92, 0xb1, 0xf6, 0x92, 0xa8, 0xc3, 0x2e, + 0xd3, 0x77, 0x8d, 0xe8, 0x39, 0xe3, 0x23, 0x2d, 0xd5, 0x64, 0x3a, 0x09, + 0x3f, 0xcc, 0xa0, 0x2b, 0xd8, 0x61, 0x06, 0x81, 0xf3, 0x97, 0xff, 0xcb, + 0xe4, 0x1f, 0x9e, 0x65, 0x32, 0x98, 0x42, 0x9c, 0x21, 0x1a, 0x12, 0x94, + 0x8a, 0xa4, 0x21, 0x25, 0x63, 0xed, 0x25, 0x51, 0x86, 0x5d, 0xa6, 0xef, + 0x1b, 0xd0, 0x73, 0xc6, 0x46, 0x5b, 0xaa, 0xc8, 0x74, 0x12, 0x7f, 0x99, + 0x40, 0x57, 0xb0, 0xc2, 0x0d, 0x03, 0xe7, 0x2f, 0xff, 0x97, 0xc8, 0x3f, + 0x3c, 0xca, 0x65, 0x30, 0x85, 0x38, 0x42, 0x34, 0x25, 0x29, 0x15, 0x48, + 0x42, 0x4a, 0xc7, 0xda, 0x4a, 0xa3, 0x0c, 0xbb, 0x4d, 0xde, 0x37, 0xa0, + 0xe7, 0x8c, 0x8c, 0xb7, 0x55, 0x90, 0xe8, 0x24, 0xff, 0x32, 0x80, 0xaf, + 0x61, 0x84, 0x1a, 0x07, 0xce, 0x5f, 0xff, 0x2f, 0x90, 0x7e, 0x79, 0x94, + 0xca, 0x61, 0x0a, 0x70, 0x84, 0x68, 0x4a, 0x52, 0x2a, 0x90, 0x84, 0x95, + 0x8f, 0xb4, 0x95, 0x46, 0x19, 0x76, 0x9b, 0xbc, 0x6f, 0x41, 0xcf, 0x19, + 0x19, 0x6e, 0xab, 0x21, 0xd0, 0x49, 0xfe, 0x65, 0x01, 0x5e, 0xc3, 0x08, + 0x34, 0x0f, 0x9c, 0xbf, 0xfe, 0x5f, 0x20, 0xfc, 0xf3, 0x29, 0x94, 0xc2, + 0x14, 0xe1, 0x08, 0xd0, 0x94, 0xa4, 0x55, 0x21, 0x09, 0x2b, 0x1f, 0x69, + 0x2a, 0x8c, 0x32, 0xed, 0x37, 0x78, 0xde, 0x83, 0x9e, 0x32, 0x32, 0xdd, + 0x56, 0x43, 0xa0, 0x93, 0xfc, 0xca, 0x02, 0xbd, 0x86, 0x10, 0x68, 0x1f, + 0x39, 0x7f, 0xfc, 0xbe, 0x41, 0xf9, 0xe6, 0x53, 0x29, 0x84, 0x29, 0xc2, + 0x11, 0xa1, 0x29, 0x48, 0xaa, 0x42, 0x12, 0x56, 0x3e, 0xd2, 0x55, 0x18, + 0x65, 0xda, 0x6e, 0xf1, 0xbd, 0x07, 0x3c, 0x64, 0x65, 0xba, 0xac, 0x87, + 0x41, 0x27, 0xf9, 0x94, 0x05, 0x7b, 0x0c, 0x20, 0xd0, 0x3e, 0x72, 0xff, + 0xf9, 0x7c, 0x83, 0xf3, 0xcc, 0xa6, 0x53, 0x08, 0x53, 0x84, 0x23, 0x42, + 0x52, 0x91, 0x54, 0x84, 0x24, 0xac, 0x7d, 0xa4, 0xaa, 0x30, 0xcb, 0xb4, + 0xdd, 0xe3, 0x7a, 0x0e, 0x78, 0xc8, 0xcb, 0x75, 0x59, 0x0e, 0x82, 0x4f, + 0xf3, 0x28, 0x0a, 0xf6, 0x18, 0x41, 0xa0, 0x7c, 0xe5, 0xff, 0xf2, 0xf9, + 0x07, 0xe7, 0x99, 0x4c, 0xa6, 0x10, 0xa7, 0x08, 0x46, 0x84, 0xa5, 0x22, + 0xa9, 0x08, 0x49, 0x58, 
0xfb, 0x49, 0x54, 0x61, 0x97, 0x69, 0xbb, 0xc6, + 0xf4, 0x1c, 0xf1, 0x91, 0x96, 0xea, 0xb2, 0x1d, 0x04, 0x9f, 0xe6, 0x50, + 0x15, 0xec, 0x30, 0x83, 0x40, 0xf9, 0xcb, 0xff, 0xe5, 0xf2, 0x0f, 0xcf, + 0x32, 0x99, 0x4c, 0x21, 0x4e, 0x10, 0x8d, 0x09, 0x4a, 0x45, 0x52, 0x10, + 0x92, 0xb1, 0xf6, 0x92, 0xa8, 0xc3, 0x2e, 0xd3, 0x77, 0x8d, 0xe8, 0x39, + 0xe3, 0x23, 0x2d, 0xd5, 0x64, 0x3a, 0x09, 0x3f, 0xcc, 0xa0, 0x2b, 0xd8, + 0x61, 0x06, 0x81, 0xf3, 0x97, 0xff, 0xcb, 0xe4, 0x1f, 0x9e, 0x65, 0x32, + 0x98, 0x42, 0x9c, 0x21, 0x1a, 0x12, 0x94, 0x8a, 0xa4, 0x21, 0x25, 0x63, + 0xed, 0x25, 0x51, 0x86, 0x5d, 0xa6, 0xef, 0x1b, 0xd0, 0x73, 0xc6, 0x46, + 0x5b, 0xaa, 0xc8, 0x74, 0x12, 0x7f, 0x99, 0x40, 0x57, 0xb0, 0xc2, 0x0d, + 0x03, 0xe7, 0x2f, 0xff, 0x97, 0xc8, 0x3f, 0x3c, 0xca, 0x65, 0x30, 0x85, + 0x38, 0x42, 0x34, 0x25, 0x29, 0x15, 0x48, 0x42, 0x4a, 0xc7, 0xda, 0x4a, + 0xa3, 0x0c, 0xbb, 0x4d, 0xde, 0x37, 0xa0, 0xe7, 0x8c, 0x8c, 0xb7, 0x55, + 0x90, 0xe8, 0x24, 0xff, 0x32, 0x80, 0xaf, 0x61, 0x84, 0x1a, 0x07, 0xce, + 0x5f, 0xff, 0x2f, 0x90, 0x7e, 0x79, 0x94, 0xca, 0x61, 0x0a, 0x70, 0x84, + 0x68, 0x4a, 0x52, 0x2a, 0x90, 0x84, 0x95, 0x8f, 0xb4, 0x95, 0x46, 0x19, + 0x76, 0x9b, 0xbc, 0x6f, 0x41, 0xcf, 0x19, 0x19, 0x6e, 0xab, 0x21, 0xd0, + 0x49, 0xfe, 0x65, 0x01, 0x5e, 0xc3, 0x08, 0x34, 0x0f, 0x9c, 0xbf, 0xfe, + 0x5f, 0x20, 0xfc, 0xf3, 0x29, 0x94, 0xc2, 0x14, 0xe1, 0x08, 0xd0, 0x94, + 0xa4, 0x55, 0x21, 0x09, 0x2b, 0x1f, 0x69, 0x2a, 0x8c, 0x32, 0xed, 0x37, + 0x78, 0xde, 0x83, 0x9e, 0x32, 0x32, 0xdd, 0x56, 0x43, 0xa0, 0x93, 0xfc, + 0xca, 0x02, 0xbd, 0x86, 0x10, 0x68, 0x1f, 0x39, 0x7f, 0xfc, 0xbe, 0x41, + 0xf9, 0xe6, 0x53, 0x29, 0x84, 0x29, 0xc2, 0x11, 0xa1, 0x29, 0x48, 0xaa, + 0x42, 0x12, 0x56, 0x3e, 0xd2, 0x55, 0x18, 0x65, 0xda, 0x6e, 0xf1, 0xbd, + 0x07, 0x3c, 0x64, 0x65, 0xba, 0xac, 0x87, 0x41, 0x27, 0xf9, 0x94, 0x05, + 0x7b, 0x0c, 0x20, 0xd0, 0x3e, 0x72, 0xff, 0xf9, 0x7c, 0x83, 0xf3, 0xcc, + 0xa6, 0x53, 0x08, 0x53, 0x84, 0x23, 0x42, 0x52, 0x91, 0x54, 0x84, 0x24, + 0xac, 0x7d, 0xa4, 0xaa, 0x30, 0xcb, 0xb4, 0xdd, 0xe3, 0x7a, 0x0e, 0x78, + 0xc8, 0xcb, 0x75, 0x59, 0x0e, 0x82, 0x4f, 0xf3, 0x28, 0x0a, 0xf6, 0x18, + 0x41, 0xa0, 0x7c, 0xe5, 0xff, 0xf2, 0xf9, 0x07, 0xe7, 0x99, 0x4c, 0xa6, + 0x10, 0xa7, 0x08, 0x46, 0x84, 0xa5, 0x22, 0xa9, 0x08, 0x49, 0x58, 0xfb, + 0x49, 0x54, 0x61, 0x97, 0x69, 0xbb, 0xc6, 0xf4, 0x1c, 0xf1, 0x91, 0x96, + 0xea, 0xb2, 0x1d, 0x04, 0x9f, 0xe6, 0x50, 0x15, 0xec, 0x30, 0x83, 0x40, + 0xf9, 0xcb, 0xff, 0xe5, 0xf2, 0x0f, 0xcf, 0x32, 0x99, 0x4c, 0x21, 0x4e, + 0x10, 0x8d, 0x09, 0x4a, 0x45, 0x52, 0x10, 0x92, 0xb1, 0xf6, 0x92, 0xa8, + 0xc3, 0x2e, 0xd3, 0x77, 0x8d, 0xe8, 0x39, 0xe3, 0x23, 0x2d, 0xd5, 0x64, + 0x3a, 0x09, 0x3f, 0xcc, 0xa0, 0x2b, 0xd8, 0x61, 0x06, 0x81, 0xf3, 0x97, + 0xff, 0xcb, 0xe4, 0x1f, 0x9e, 0x65, 0x32, 0x98, 0x42, 0x9c, 0x21, 0x1a, + 0x12, 0x94, 0x8a, 0xa4, 0x21, 0x25, 0x63, 0xed, 0x25, 0x51, 0x86, 0x5d, + 0xa6, 0xef, 0x1b, 0xd0, 0x73, 0xc6, 0x46, 0x5b, 0xaa, 0xc8, 0x74, 0x12, + 0x7f, 0x99, 0x40, 0x57, 0xb0, 0xc2, 0x0d, 0x03, 0xe7, 0x2f, 0xff, 0x97, + 0xc8, 0x3f, 0x3c, 0xca, 0x65, 0x30, 0x85, 0x38, 0x42, 0x34, 0x25, 0x29, + 0x15, 0x48, 0x42, 0x4a, 0xc7, 0xda, 0x4a, 0xa3, 0x0c, 0xbb, 0x4d, 0xde, + 0x37, 0xa0, 0xe7, 0x8c, 0x8c, 0xb7, 0x55, 0x90, 0xe8, 0x24, 0xff, 0x32, + 0x80, 0xaf, 0x61, 0x84, 0x1a, 0x07, 0xce, 0x5f, 0xff, 0x2f, 0x90, 0x7e, + 0x79, 0x94, 0xca, 0x61, 0x0a, 0x70, 0x84, 0x68, 0x4a, 0x52, 0x2a, 0x90, + 0x84, 0x95, 0x8f, 0xb4, 0x95, 0x46, 0x19, 0x76, 0x9b, 0xbc, 0x6f, 0x41, + 0xcf, 0x19, 0x19, 0x6e, 0xab, 0x21, 0xd0, 0x49, 0xfe, 0x65, 0x01, 0x5e, + 0xc3, 0x08, 0x34, 0x0f, 
0x9c, 0xbf, 0xfe, 0x5f, 0x20, 0xfc, 0xf3, 0x29, + 0x94, 0xc2, 0x14, 0xe1, 0x08, 0xd0, 0x94, 0xa4, 0x55, 0x21, 0x09, 0x2b, + 0x1f, 0x69, 0x2a, 0x8c, 0x32, 0xed, 0x37, 0x78, 0xde, 0x83, 0x9e, 0x32, + 0x32, 0xdd, 0x56, 0x43, 0xa0, 0x93, 0xfc, 0xca, 0x02, 0xbd, 0x86, 0x10, + 0x68, 0x1f, 0x39, 0x7f, 0xfc, 0xbe, 0x41, 0xf9, 0xe6, 0x53, 0x29, 0x84, + 0x29, 0xc2, 0x11, 0xa1, 0x29, 0x48, 0xaa, 0x42, 0x12, 0x56, 0x3e, 0xd2, + 0x55, 0x18, 0x65, 0xda, 0x6e, 0xf1, 0xbd, 0x07, 0x3c, 0x64, 0x65, 0xba, + 0xac, 0x87, 0x41, 0x27, 0xf9, 0x94, 0x05, 0x7b, 0x0c, 0x20, 0xd0, 0x3e, + 0x72, 0xff, 0xf9, 0x7c, 0x83, 0xf3, 0xcc, 0xa6, 0x53, 0x08, 0x53, 0x84, + 0x23, 0x42, 0x52, 0x91, 0x54, 0x84, 0x24, 0xac, 0x7d, 0xa4, 0xaa, 0x30, + 0xcb, 0xb4, 0xdd, 0xe3, 0x7a, 0x0e, 0x78, 0xc8, 0xcb, 0x75, 0x59, 0x0e, + 0x82, 0x4f, 0xf3, 0x28, 0x0a, 0xf6, 0x18, 0x41, 0xa0, 0x7c, 0xe5, 0xff, + 0xf2, 0xf9, 0x07, 0xe7, 0x99, 0x4c, 0xa6, 0x10, 0xa7, 0x08, 0x46, 0x84, + 0xa5, 0x22, 0xa9, 0x08, 0x49, 0x58, 0xfb, 0x49, 0x54, 0x61, 0x97, 0x69, + 0xbb, 0xc6, 0xf4, 0x1c, 0xf1, 0x91, 0x96, 0xea, 0xb2, 0x1d, 0x04, 0x9f, + 0xe6, 0x50, 0x15, 0xec, 0x30, 0x83, 0x40, 0xf9, 0xcb, 0xff, 0xe5, 0xf2, + 0x0f, 0xcf, 0x32, 0x99, 0x4c, 0x21, 0x4e, 0x10, 0x8d, 0x09, 0x4a, 0x45, + 0x52, 0x10, 0x92, 0xb1, 0xf6, 0x92, 0xa8, 0xc3, 0x2e, 0xd3, 0x77, 0x8d, + 0xe8, 0x39, 0xe3, 0x23, 0x2d, 0xd5, 0x64, 0x3a, 0x09, 0x3f, 0xcc, 0xa0, + 0x2b, 0xd8, 0x61, 0x06, 0x81, 0xf3, 0x97, 0xff, 0xcb, 0xe4, 0x1f, 0x9e, + 0x65, 0x32, 0x98, 0x42, 0x9c, 0x21, 0x1a, 0x12, 0x94, 0x8a, 0xa4, 0x21, + 0x25, 0x63, 0xed, 0x25, 0x51, 0x86, 0x5d, 0xa6, 0xef, 0x1b, 0xd0, 0x73, + 0xc6, 0x46, 0x5b, 0xaa, 0xc8, 0x74, 0x12, 0x7f, 0x99, 0x40, 0x57, 0xb0, + 0xc2, 0x0d, 0x03, 0xe7, 0x2f, 0xff, 0x97, 0xc8, 0x3f, 0x3c, 0xca, 0x65, + 0x30, 0x85, 0x38, 0x42, 0x34, 0x25, 0x29, 0x15, 0x48, 0x42, 0x4a, 0xc7, + 0xda, 0x4a, 0xa3, 0x0c, 0xbb, 0x4d, 0xde, 0x37, 0xa0, 0xe7, 0x8c, 0x8c, + 0xb7, 0x55, 0x90, 0xe8, 0x24, 0xff, 0x32, 0x80, 0xaf, 0x61, 0x84, 0x1a, + 0x07, 0xce, 0x5f, 0xff, 0x2f, 0x90, 0x7e, 0x79, 0x94, 0xca, 0x61, 0x0a, + 0x70, 0x84, 0x68, 0x4a, 0x52, 0x2a, 0x90, 0x84, 0x95, 0x8f, 0xb4, 0x95, + 0x46, 0x19, 0x76, 0x9b, 0xbc, 0x6f, 0x41, 0xcf, 0x19, 0x19, 0x6e, 0xab, + 0x21, 0xd0, 0x49, 0xfe, 0x65, 0x01, 0x5e, 0xc3, 0x08, 0x34, 0x0f, 0x9c, + 0xbf, 0xfe, 0x5f, 0x20, 0xfc, 0xf3, 0x29, 0x94, 0xc2, 0x14, 0xe1, 0x08, + 0xd0, 0x94, 0xa4, 0x55, 0x21, 0x09, 0x2b, 0x1f, 0x69, 0x2a, 0x8c, 0x32, + 0xed, 0x37, 0x78, 0xde, 0x83, 0x9e, 0x32, 0x32, 0xdd, 0x56, 0x43, 0xa0, + 0x93, 0xfc, 0xca, 0x02, 0xbd, 0x86, 0x10, 0x68, 0x1f, 0x39, 0x7f, 0xfc, + 0xbe, 0x41, 0xf9, 0xe6, 0x53, 0x29, 0x84, 0x29, 0xc2, 0x11, 0xa1, 0x29, + 0x48, 0xaa, 0x42, 0x12, 0x56, 0x3e, 0xd2, 0x55, 0x18, 0x65, 0xda, 0x6e, + 0xf1, 0xbd, 0x07, 0x3c, 0x64, 0x65, 0xba, 0xac, 0x87, 0x41, 0x27, 0xf9, + 0x94, 0x05, 0x7b, 0x0c, 0x20, 0xd0, 0x3e, 0x72, 0xff, 0xf9, 0x7c, 0x83, + 0xf3, 0xcc, 0xa6, 0x53, 0x08, 0x53, 0x84, 0x23, 0x42, 0x52, 0x91, 0x54, + 0x84, 0x24, 0xac, 0x7d, 0xa4, 0xaa, 0x30, 0xcb, 0xb4, 0xdd, 0xe3, 0x7a, + 0x0e, 0x78, 0xc8, 0xcb, 0x75, 0x59, 0x0e, 0x82, 0x4f, 0xf3, 0x28, 0x0a, + 0xf6, 0x18, 0x41, 0xa0, 0x7c, 0xe5, 0xff, 0xf2, 0xf9, 0x07, 0xe7, 0x99, + 0x4c, 0xa6, 0x10, 0xa7, 0x08, 0x46, 0x84, 0xa5, 0x22, 0xa9, 0x08, 0x49, + 0x58, 0xfb, 0x49, 0x54, 0x61, 0x97, 0x69, 0xbb, 0xc6, 0xf4, 0x1c, 0xf1, + 0x91, 0x96, 0xea, 0xb2, 0x1d, 0x04, 0x9f, 0xe6, 0x50, 0x15, 0xec, 0x30, + 0x83, 0x40, 0xf9, 0xcb, 0xff, 0xe5, 0xf2, 0x0f, 0xcf, 0x32, 0x99, 0x4c, + 0x21, 0x4e, 0x10, 0x8d, 0x09, 0x4a, 0x45, 0x52, 0x10, 0x92, 0xb1, 0xf6, + 0x92, 0xa8, 0xc3, 0x2e, 
0xd3, 0x77, 0x8d, 0xe8, 0x39, 0xe3, 0x23, 0x2d, + 0xd5, 0x64, 0x3a, 0x09, 0x3f, 0xcc, 0xa0, 0x2b, 0xd8, 0x61, 0x06, 0x81, + 0xf3, 0x97, 0xff, 0xcb, 0xe4, 0x1f, 0x9e, 0x65, 0x32, 0x98, 0x42, 0x9c, + 0x21, 0x1a, 0x12, 0x94, 0x8a, 0xa4, 0x21, 0x25, 0x63, 0xed, 0x25, 0x51, + 0x86, 0x5d, 0xa6, 0xef, 0x1b, 0xd0, 0x73, 0xc6, 0x46, 0x5b, 0xaa, 0xc8, + 0x74, 0x12, 0x7f, 0x99, 0x40, 0x57, 0xb0, 0xc2, 0x0d, 0x03, 0xe7, 0x2f, + 0xff, 0x97, 0xc8, 0x3f, 0x3c, 0xca, 0x65, 0x30, 0x85, 0x38, 0x42, 0x34, + 0x25, 0x29, 0x15, 0x48, 0x42, 0x4a, 0xc7, 0xda, 0x4a, 0xa3, 0x0c, 0xbb, + 0x4d, 0xde, 0x37, 0xa0, 0xe7, 0x8c, 0x8c, 0xb7, 0x55, 0x90, 0xe8, 0x24, + 0xff, 0x32, 0x80, 0xaf, 0x61, 0x84, 0x1a, 0x07, 0xce, 0x5f, 0xff, 0x2f, + 0x90, 0x7e, 0x79, 0x94, 0xca, 0x61, 0x0a, 0x70, 0x84, 0x68, 0x4a, 0x52, + 0x2a, 0x90, 0x84, 0x95, 0x8f, 0xb4, 0x95, 0x46, 0x19, 0x76, 0x9b, 0xbc, + 0x6f, 0x41, 0xcf, 0x19, 0x19, 0x6e, 0xab, 0x21, 0xd0, 0x49, 0xfe, 0x65, + 0x01, 0x5e, 0xc3, 0x08, 0x34, 0x0f, 0x9c, 0xbf, 0xfe, 0x5f, 0x20, 0xfc, + 0xf3, 0x29, 0x94, 0xc2, 0x14, 0xe1, 0x08, 0xd0, 0x94, 0xa4, 0x55, 0x21, + 0x09, 0x2b, 0x1f, 0x69, 0x2a, 0x8c, 0x32, 0xed, 0x37, 0x78, 0xde, 0x83, + 0x9e, 0x32, 0x32, 0xdd, 0x56, 0x43, 0xa0, 0x93, 0xfc, 0xca, 0x02, 0xbd, + 0x86, 0x10, 0x68, 0x1f, 0x39, 0x7f, 0xfc, 0xbe, 0x41, 0xf9, 0xe6, 0x53, + 0x29, 0x84, 0x29, 0xc2, 0x11, 0xa1, 0x29, 0x48, 0xaa, 0x42, 0x12, 0x56, + 0x3e, 0xd2, 0x55, 0x18, 0x65, 0xda, 0x6e, 0xf1, 0xbd, 0x07, 0x3c, 0x64, + 0x65, 0xba, 0xac, 0x87, 0x41, 0x27, 0xf9, 0x94, 0x05, 0x7b, 0x0c, 0x20, + 0xd0, 0x3e, 0x72, 0xff, 0xf9, 0x7c, 0x83, 0xf3, 0xcc, 0xa6, 0x53, 0x08, + 0x53, 0x84, 0x23, 0x42, 0x52, 0x91, 0x54, 0x84, 0x24, 0xac, 0x7d, 0xa4, + 0xaa, 0x30, 0xcb, 0xb4, 0xdd, 0xe3, 0x7a, 0x0e, 0x78, 0xc8, 0xcb, 0x75, + 0x59, 0x0e, 0x82, 0x4f, 0xf3, 0x28, 0x0a, 0xf6, 0x18, 0x41, 0xa0, 0x7c, + 0xe5, 0xff, 0xf2, 0xf9, 0x07, 0xe7, 0x99, 0x4c, 0xa6, 0x10, 0xa7, 0x08, + 0x46, 0x84, 0xa5, 0x22, 0xa9, 0x08, 0x49, 0x58, 0xfb, 0x49, 0x54, 0x61, + 0x97, 0x69, 0xbb, 0xc6, 0xf4, 0x1c, 0xf1, 0x91, 0x96, 0xea, 0xb2, 0x1d, + 0x04, 0x9f, 0xe6, 0x50, 0x15, 0xec, 0x30, 0x83, 0x40, 0xf9, 0xcb, 0xff, + 0xe5, 0xf2, 0x0f, 0xcf, 0x32, 0x99, 0x4c, 0x21, 0x4e, 0x10, 0x8d, 0x09, + 0x4a, 0x45, 0x52, 0x10, 0x92, 0xb1, 0xf6, 0x92, 0xa8, 0xc3, 0x2e, 0xd3, + 0x77, 0x8d, 0xe8, 0x39, 0xe3, 0x23, 0x2d, 0xd5, 0x64, 0x3a, 0x09, 0x3f, + 0xcc, 0xa0, 0x2b, 0xd8, 0x61, 0x06, 0x81, 0xf3, 0x97, 0xff, 0xcb, 0xe4, + 0x1f, 0x9e, 0x65, 0x32, 0x98, 0x42, 0x9c, 0x21, 0x1a, 0x12, 0x94, 0x8a, + 0xa4, 0x21, 0x25, 0x63, 0xed, 0x25, 0x51, 0x86, 0x5d, 0xa6, 0xef, 0x1b, + 0xd0, 0x73, 0xc6, 0x46, 0x5b, 0xaa, 0xc8, 0x74, 0x12, 0x7f, 0x99, 0x40, + 0x57, 0xb0, 0xc2, 0x0d, 0x03, 0xe7, 0x2f, 0xff, 0x97, 0xc8, 0x3f, 0x3c, + 0xca, 0x65, 0x30, 0x85, 0x38, 0x42, 0x34, 0x25, 0x29, 0x15, 0x48, 0x42, + 0x4a, 0xc7, 0xda, 0x4a, 0xa3, 0x0c, 0xbb, 0x4d, 0xde, 0x37, 0xa0, 0xe7, + 0x8c, 0x8c, 0xb7, 0x55, 0x90, 0xe8, 0x24, 0xff, 0x32, 0x80, 0xaf, 0x61, + 0x84, 0x1a, 0x07, 0xce, 0x5f, 0xff, 0x2f, 0x90, 0x7e, 0x79, 0x94, 0xca, + 0x61, 0x0a, 0x70, 0x84, 0x68, 0x4a, 0x52, 0x2a, 0x90, 0x84, 0x95, 0x8f, + 0xb4, 0x95, 0x46, 0x19, 0x76, 0x9b, 0xbc, 0x6f, 0x41, 0xcf, 0x19, 0x19, + 0x6e, 0xab, 0x21, 0xd0, 0x49, 0xfe, 0x65, 0x01, 0x5e, 0xc3, 0x08, 0x34, + 0x0f, 0x9c, 0xbf, 0xfe, 0x5f, 0x20, 0xfc, 0xf3, 0x29, 0x94, 0xc2, 0x14, + 0xe1, 0x08, 0xd0, 0x94, 0xa4, 0x55, 0x21, 0x09, 0x2b, 0x1f, 0x69, 0x2a, + 0x8c, 0x32, 0xed, 0x37, 0x78, 0xde, 0x83, 0x9e, 0x32, 0x32, 0xdd, 0x56, + 0x43, 0xa0, 0x93, 0xfc, 0xca, 0x02, 0xbd, 0x86, 0x10, 0x68, 0x1f, 0x39, + 0x7f, 0xfc, 0xbe, 0x41, 
0xf9, 0xe6, 0x53, 0x29, 0x84, 0x29, 0xc2, 0x11, + 0xa1, 0x29, 0x48, 0xaa, 0x42, 0x12, 0x56, 0x3e, 0xd2, 0x55, 0x18, 0x65, + 0xda, 0x6e, 0xf1, 0xbd, 0x07, 0x3c, 0x64, 0x65, 0xba, 0xac, 0x87, 0x41, + 0x27, 0xf9, 0x94, 0x05, 0xa0, +}; +static_assert(sizeof(kBytesTestReadSymbol6) == kNumBytesTestReadSymbol6, ""); + +// The kBytesTestReadSymbol7[] array was encoded by using the following libaom +// code: +// +// aom_cdf_prob cdf[4][8] = { +// // pdf: 1/7, 1/7, 1/7, 1/7, 1/7, 1/7, 1/7 +// { 32768 - 4681, 32768 - 9362, 32768 - 14043, 32768 - 18725, +// 32768 - 23406, 32768 - 28087, 0, 0 }, +// // pdf: 3/14, 2/14, 2/14, 2/14, 2/14, 2/14, 1/14 +// { 32768 - 7022, 32768 - 11703, 32768 - 16384, 32768 - 21065, +// 32768 - 25746, 32768 - 30427, 0, 0 }, +// // pdf: 1/14, 1/14, 2/14, 2/14, 2/14, 3/14, 3/14 +// { 32768 - 2341, 32768 - 4681, 32768 - 9362, 32768 - 14043, +// 32768 - 18725, 32768 - 25746, 0, 0 }, +// // pdf: 1/14, 2/14, 3/14, 3/14, 2/14, 2/14, 1/14 +// { 32768 - 2341, 32768 - 7022, 32768 - 14043, 32768 - 21065, +// 32768 - 25746, 32768 - 30427, 0, 0 }, +// }; +// constexpr int kSymbols[14][4] = { { 0, 4, 6, 3 }, // +// { 1, 5, 5, 2 }, // +// { 2, 6, 4, 1 }, // +// { 3, 0, 3, 0 }, // +// { 4, 1, 2, 6 }, // +// { 5, 2, 1, 5 }, // +// { 6, 3, 0, 4 }, // +// { 0, 0, 6, 5 }, // +// { 2, 1, 4, 3 }, // +// { 4, 3, 6, 1 }, // +// { 6, 5, 2, 4 }, // +// { 1, 0, 5, 2 }, // +// { 3, 2, 3, 2 }, // +// { 5, 4, 5, 3 } }; +// const unsigned int kBufferSize = 65536; +// uint8_t bw_buffer[kBufferSize]; +// aom_writer bw; +// bw.allow_update_cdf = 1; +// aom_start_encode(&bw, bw_buffer); +// for (int i = 0; i < 1024; ++i) { +// for (int j = 0; j < 14; ++j) { +// for (int k = 0; k < 4; ++k) { +// aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 7); +// } +// } +// } +// aom_stop_encode(&bw); +// printf(" constexpr size_t kNumBytesTestReadSymbol7 = %u;\n", bw.pos); +// printf(" constexpr uint8_t kBytesTestReadSymbol7[] = {"); +// int count = 0; +// for (unsigned int i = 0; i < bw.pos; ++i) { +// if (count++ % 12 == 0) { +// printf("\n "); +// } else { +// printf(" "); +// } +// printf("0x%02x,", bw_buffer[i]); +// } +// printf("\n };\n"); + +constexpr size_t kNumBytesTestReadSymbol7 = 19874; +constexpr uint8_t kBytesTestReadSymbol7[] = { + 0x1c, 0x6a, 0xfc, 0x4b, 0xd1, 0xb5, 0x8c, 0x20, 0x72, 0x45, 0x48, 0x21, + 0x9e, 0x71, 0xe8, 0xc4, 0x91, 0x51, 0xab, 0xfd, 0x9c, 0x61, 0xf7, 0x98, + 0xd4, 0x87, 0x71, 0xe6, 0x23, 0x37, 0x7e, 0xa3, 0xe0, 0x83, 0x48, 0x2e, + 0xfe, 0xc3, 0xcb, 0x4f, 0x26, 0x9a, 0xd7, 0xe4, 0xca, 0xf4, 0x94, 0xb7, + 0xbc, 0x03, 0xc9, 0xc3, 0x5e, 0x7f, 0xef, 0x9b, 0x37, 0xff, 0x8f, 0x62, + 0xec, 0xb6, 0x09, 0x50, 0xa9, 0xc1, 0x4a, 0x97, 0xf4, 0xe7, 0x08, 0x57, + 0x87, 0x2d, 0x10, 0xca, 0xbc, 0x93, 0x85, 0xfb, 0xc8, 0xc7, 0x8f, 0xc1, + 0x4e, 0x1f, 0x50, 0xad, 0xba, 0x09, 0x9c, 0xf8, 0x94, 0x75, 0xdd, 0x2c, + 0x78, 0x5d, 0xa0, 0x4a, 0xf3, 0x7b, 0xc0, 0xa7, 0x71, 0xa5, 0x20, 0xe6, + 0xb0, 0xca, 0x09, 0xf2, 0x38, 0xfc, 0x61, 0x49, 0xdc, 0x83, 0x35, 0x1e, + 0xdd, 0x08, 0xd7, 0xaa, 0x50, 0x0e, 0xc5, 0x57, 0x05, 0x44, 0xd7, 0xdb, + 0x56, 0x2b, 0x1e, 0xe5, 0x33, 0x08, 0x7c, 0x3d, 0x25, 0x29, 0x05, 0x14, + 0x3a, 0x93, 0xff, 0xe7, 0x40, 0x25, 0x30, 0x17, 0xc3, 0x50, 0xad, 0xec, + 0xb3, 0x64, 0x87, 0x35, 0xb2, 0x5a, 0x1e, 0xa9, 0x48, 0xc8, 0x53, 0x30, + 0xf1, 0x43, 0x6f, 0xe1, 0x2a, 0x8b, 0x81, 0x49, 0xbc, 0xa8, 0x8a, 0x8b, + 0x2d, 0x1a, 0xc5, 0xcb, 0x47, 0xc1, 0xbc, 0xe0, 0x54, 0x98, 0xcc, 0x82, + 0xe9, 0xa6, 0x3f, 0x70, 0x55, 0xe3, 0xe0, 0x7d, 0x5f, 0xa9, 0xc4, 0xc1, + 0x62, 0x04, 0x2d, 0x15, 0xce, 0xab, 0x7c, 
0xd9, 0x88, 0xc1, 0x67, 0x88, + 0x3d, 0x6e, 0x96, 0x03, 0x6f, 0xa7, 0x6a, 0xc2, 0x6f, 0x20, 0x8c, 0xf4, + 0xfb, 0x96, 0x0c, 0xb7, 0x14, 0xef, 0xa6, 0x83, 0xbd, 0x2b, 0x07, 0x8a, + 0x2a, 0x66, 0xb8, 0x0d, 0xa8, 0x72, 0x2a, 0x78, 0x90, 0x2a, 0xe4, 0x46, + 0x71, 0x8c, 0xcb, 0xcb, 0xbd, 0xfb, 0xc7, 0xa8, 0x9e, 0x9b, 0x6e, 0x6d, + 0x2b, 0xc2, 0x1c, 0xea, 0x16, 0x3a, 0x06, 0xc0, 0xbc, 0xd7, 0x30, 0x8d, + 0x87, 0x03, 0x04, 0x0d, 0x58, 0x58, 0x7b, 0x40, 0xf5, 0xe5, 0x7a, 0x51, + 0x80, 0x7a, 0x16, 0xc2, 0xaf, 0x83, 0x43, 0x16, 0xb3, 0x3a, 0x1b, 0x24, + 0x29, 0x80, 0x60, 0xee, 0x00, 0x91, 0x15, 0xdb, 0x28, 0x0d, 0xc2, 0xfb, + 0x74, 0x48, 0xd9, 0x54, 0x97, 0x66, 0xa4, 0xba, 0xc8, 0x19, 0xff, 0x25, + 0xca, 0xdf, 0x09, 0x66, 0xe4, 0xfe, 0xbb, 0x2b, 0x3f, 0x4a, 0x81, 0x5a, + 0xa6, 0x54, 0x5c, 0xf0, 0xe4, 0x49, 0x38, 0x13, 0xfb, 0xa2, 0xee, 0xf9, + 0x7d, 0x72, 0xa9, 0x37, 0x12, 0xf4, 0x04, 0x4e, 0x50, 0x19, 0x6f, 0x29, + 0x9d, 0x0d, 0xe7, 0xc3, 0x6d, 0x65, 0x0b, 0x04, 0x53, 0x57, 0x0c, 0xb5, + 0x71, 0xb4, 0xd6, 0xb0, 0xaa, 0xed, 0x38, 0x9e, 0x58, 0x55, 0x0d, 0xe4, + 0xe6, 0x43, 0x16, 0x93, 0x46, 0x73, 0x39, 0x87, 0xaa, 0x69, 0x07, 0x9f, + 0xd7, 0xb6, 0x77, 0x7d, 0xef, 0xc7, 0x19, 0x5d, 0x4f, 0x60, 0x20, 0x7e, + 0xf0, 0x34, 0xbe, 0xe4, 0x31, 0xf3, 0x72, 0xe0, 0x89, 0xfb, 0xc8, 0x0a, + 0xa9, 0xe6, 0x2c, 0x6b, 0xa5, 0xaa, 0xd5, 0x42, 0x69, 0xc0, 0x27, 0x3b, + 0x17, 0x98, 0x73, 0xa3, 0x66, 0x10, 0xd7, 0xac, 0xf9, 0x7f, 0xb2, 0xf3, + 0x38, 0x45, 0x23, 0xe2, 0xd4, 0xd2, 0x63, 0x1c, 0x84, 0xde, 0x25, 0xd4, + 0x3c, 0x76, 0x58, 0x1a, 0xb6, 0x07, 0x22, 0x74, 0xc2, 0xf7, 0x2c, 0xe1, + 0xc0, 0x51, 0x8c, 0xfa, 0xde, 0x6b, 0x35, 0x8c, 0x0f, 0x45, 0xf8, 0x5e, + 0x61, 0x2d, 0x4e, 0x90, 0x2d, 0xb7, 0x6c, 0xaf, 0x71, 0x72, 0xdf, 0x68, + 0xa9, 0xa2, 0x36, 0x79, 0xbd, 0xee, 0x88, 0xb0, 0xc8, 0xc9, 0xa6, 0x7e, + 0x8e, 0xe8, 0x16, 0xbc, 0xd6, 0x82, 0x54, 0xac, 0x81, 0x42, 0x0f, 0xc9, + 0x38, 0xd2, 0xe1, 0x17, 0x17, 0x4f, 0xc9, 0x0c, 0x39, 0xc0, 0x70, 0xd8, + 0xd8, 0x17, 0x37, 0x4a, 0x93, 0x40, 0x83, 0xe3, 0x3f, 0x05, 0x25, 0xab, + 0x6e, 0x58, 0xc1, 0x30, 0x62, 0x4d, 0xad, 0xcd, 0x1b, 0x7a, 0x4b, 0x08, + 0xf8, 0x69, 0x85, 0xf1, 0x10, 0x84, 0x22, 0x54, 0x3a, 0x0c, 0x2d, 0x1b, + 0xcd, 0x2d, 0xed, 0x95, 0x63, 0x1a, 0x9e, 0xbc, 0xb8, 0x76, 0x48, 0x65, + 0xd1, 0xa6, 0x22, 0x98, 0x3e, 0xda, 0x00, 0x56, 0xf4, 0xd3, 0xc5, 0xb0, + 0xb3, 0xb0, 0xfa, 0x0c, 0x84, 0x43, 0xfb, 0xa1, 0x1a, 0xba, 0x23, 0xc6, + 0x72, 0xea, 0x83, 0x96, 0xff, 0xfd, 0x0d, 0xba, 0x40, 0x32, 0x3e, 0x1a, + 0x61, 0x7b, 0xd5, 0x50, 0xfe, 0x41, 0xc8, 0x67, 0x71, 0xb4, 0xff, 0x24, + 0xf8, 0x7b, 0xa2, 0x6d, 0x97, 0x84, 0x8e, 0x36, 0x30, 0x05, 0xc3, 0x60, + 0x3b, 0x1c, 0xee, 0x34, 0x57, 0x05, 0x0f, 0x9e, 0xc2, 0xfd, 0xc8, 0x03, + 0xab, 0x8a, 0x54, 0xde, 0x6a, 0x22, 0xa5, 0xb7, 0x38, 0xf5, 0x91, 0x08, + 0xd4, 0xce, 0xe3, 0xa7, 0xb4, 0xcb, 0x58, 0x79, 0xe2, 0x34, 0x79, 0xfa, + 0xc2, 0x85, 0x01, 0xeb, 0x53, 0xf1, 0xca, 0x5c, 0xa1, 0xfc, 0x35, 0xa2, + 0x7b, 0x8f, 0x29, 0x1c, 0x67, 0xb0, 0x01, 0x1b, 0x5a, 0xa1, 0xc9, 0x3b, + 0x2c, 0xc6, 0x35, 0xbb, 0x29, 0x46, 0x13, 0xfa, 0xd9, 0x40, 0x63, 0x3e, + 0x6c, 0xa2, 0x36, 0x70, 0xe7, 0xc8, 0x76, 0x55, 0x70, 0xd2, 0x3f, 0xd1, + 0xae, 0x83, 0x9d, 0xb9, 0x60, 0x47, 0x3e, 0x38, 0x0d, 0x08, 0x3f, 0xe0, + 0x6b, 0x16, 0x7f, 0x7d, 0x7d, 0x40, 0x98, 0x99, 0xc1, 0x27, 0xf2, 0xb5, + 0xfe, 0x33, 0xce, 0x83, 0x8c, 0x7d, 0xa7, 0xe6, 0xeb, 0x06, 0xdb, 0x4f, + 0xca, 0x10, 0x82, 0x7b, 0x5e, 0xe8, 0xa9, 0x2e, 0xe0, 0x7a, 0xc2, 0x03, + 0x75, 0x6e, 0x4e, 0x2b, 0xb6, 0xc3, 0x99, 0xf5, 0x41, 0xe9, 0x75, 0xe5, + 0xc5, 0xae, 0x4f, 0xa8, 0x57, 0xf5, 0xf5, 
0x89, 0x60, 0xae, 0x41, 0x13, + 0x91, 0x77, 0x84, 0xb6, 0x79, 0xea, 0xcb, 0xeb, 0x8d, 0x05, 0xe2, 0x18, + 0xfd, 0x36, 0x1f, 0x68, 0x34, 0xd1, 0x3c, 0xc3, 0xe1, 0x87, 0xd3, 0x2a, + 0xb1, 0xc5, 0xac, 0xe2, 0xc3, 0xaf, 0xd1, 0x53, 0x61, 0x5e, 0xba, 0xcb, + 0x32, 0xde, 0x97, 0xee, 0x4e, 0x58, 0xda, 0xda, 0x9d, 0x12, 0xe2, 0x75, + 0x20, 0xd5, 0xb4, 0x64, 0x82, 0x75, 0x3e, 0xee, 0xb9, 0x13, 0x54, 0x54, + 0x95, 0x36, 0x36, 0xa9, 0x85, 0x34, 0xa2, 0x37, 0xa0, 0x55, 0xe7, 0x1e, + 0x9e, 0xb8, 0xbf, 0x36, 0x96, 0x1b, 0x1c, 0xa9, 0x16, 0xa9, 0x66, 0xb6, + 0x30, 0x91, 0xc6, 0xfb, 0x51, 0x30, 0xc8, 0x19, 0x91, 0xca, 0x9e, 0x99, + 0x88, 0x5a, 0x29, 0xbc, 0x10, 0x8e, 0x21, 0x93, 0x4b, 0xd1, 0x10, 0x10, + 0x10, 0xca, 0x1a, 0x4d, 0x95, 0xd5, 0x0a, 0x08, 0xe4, 0xbc, 0xbc, 0xd4, + 0xc4, 0x48, 0xaa, 0xb7, 0x55, 0x88, 0x55, 0x59, 0xfa, 0x05, 0x17, 0xae, + 0x2f, 0xcd, 0xa5, 0x86, 0xc7, 0x2a, 0x45, 0xaa, 0x59, 0xad, 0x8c, 0x24, + 0x71, 0xbe, 0xd4, 0x4c, 0x32, 0x06, 0x64, 0x72, 0xa7, 0xa6, 0x62, 0x16, + 0x8a, 0x6f, 0x04, 0x23, 0x88, 0x64, 0xd2, 0xf4, 0x44, 0x04, 0x04, 0x32, + 0x86, 0x93, 0x65, 0x75, 0x42, 0x82, 0x39, 0x2f, 0x2f, 0x35, 0x31, 0x12, + 0x2a, 0xad, 0xd5, 0x62, 0x15, 0x56, 0x7e, 0x81, 0x48, 0x8e, 0xd3, 0x5e, + 0x73, 0x9d, 0xa3, 0xec, 0xca, 0xdd, 0xbe, 0x89, 0xd7, 0xb8, 0xa3, 0x59, + 0xeb, 0x97, 0xb3, 0xf2, 0xf1, 0xa6, 0x4b, 0x8e, 0x89, 0xe6, 0xe9, 0x0a, + 0x84, 0x9b, 0xbf, 0xd3, 0x6b, 0xd5, 0xbf, 0x1e, 0x7f, 0x87, 0x55, 0x76, + 0x5e, 0xa7, 0xe6, 0x3e, 0xcf, 0x6c, 0x16, 0x5f, 0xf1, 0xf6, 0xf0, 0x3e, + 0xd4, 0x4f, 0x71, 0xe5, 0x23, 0x8c, 0xf6, 0xa6, 0x11, 0xc3, 0xf8, 0x7b, + 0xc7, 0xea, 0x1a, 0x6a, 0xc7, 0x13, 0x2e, 0x5a, 0xf6, 0x61, 0x9b, 0x71, + 0x61, 0x3b, 0x66, 0x37, 0xd4, 0x28, 0xa6, 0xbf, 0xd6, 0xc6, 0x2e, 0x29, + 0xd6, 0x38, 0xb5, 0x9c, 0x58, 0x75, 0xfa, 0x2a, 0x6c, 0x2f, 0xa3, 0x8b, + 0x02, 0xbe, 0xdd, 0x38, 0xdb, 0x4f, 0xca, 0x25, 0x43, 0x09, 0x44, 0x79, + 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x45, 0xaa, 0x53, 0x29, 0x8e, 0xd7, 0x81, + 0x74, 0xdd, 0xfa, 0x65, 0x18, 0xd5, 0xc5, 0xae, 0x4f, 0xa8, 0x57, 0xf6, + 0x04, 0xf5, 0xcd, 0xd8, 0xa0, 0x26, 0xb4, 0x41, 0xe3, 0x02, 0xc9, 0x95, + 0xfe, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0xe6, 0x35, 0xff, 0x03, + 0x5f, 0x8c, 0xac, 0x56, 0x1e, 0xec, 0x29, 0xfc, 0x45, 0x97, 0x61, 0x74, + 0xa6, 0xed, 0x7c, 0x67, 0x7a, 0xf5, 0xdd, 0x80, 0xaf, 0x42, 0x04, 0x7f, + 0x82, 0x46, 0x15, 0x56, 0xea, 0xb1, 0x0a, 0xab, 0x3f, 0x40, 0xa4, 0x47, + 0x69, 0xaf, 0x39, 0xce, 0xd1, 0xf6, 0x65, 0x6e, 0xf0, 0x45, 0x5e, 0xfc, + 0x51, 0xac, 0xf5, 0xcb, 0xd9, 0xf9, 0x78, 0xd3, 0x25, 0xc7, 0x44, 0xf3, + 0x74, 0x85, 0x42, 0x4d, 0xdf, 0xe9, 0xb5, 0xea, 0xdf, 0x8f, 0x3f, 0xc3, + 0xaa, 0xbb, 0x2f, 0x53, 0xf3, 0x1f, 0x67, 0xb6, 0x0b, 0x2f, 0xf8, 0xfb, + 0x78, 0x1f, 0x6a, 0x27, 0xb8, 0xf2, 0x91, 0xc6, 0x7b, 0x53, 0x08, 0xe1, + 0xfc, 0x3d, 0xe3, 0xf5, 0x0d, 0x35, 0x63, 0x89, 0x97, 0x2d, 0x7b, 0x30, + 0xcd, 0xb8, 0xb0, 0x9d, 0xb3, 0x1b, 0xea, 0x14, 0x53, 0x5f, 0xeb, 0x63, + 0x17, 0x14, 0xeb, 0x1c, 0x5a, 0xce, 0x2c, 0x3a, 0xfd, 0x15, 0x36, 0x17, + 0xd1, 0xc5, 0x81, 0x5f, 0x6e, 0x9c, 0x6d, 0xa7, 0xe5, 0x12, 0xa1, 0x84, + 0xa2, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x22, 0xd5, 0x29, 0x94, 0xc7, + 0x6b, 0xc0, 0xba, 0x6e, 0xfd, 0x32, 0x8c, 0x6a, 0xe2, 0xd7, 0x27, 0xd4, + 0x2b, 0xfb, 0x02, 0x7a, 0xe6, 0xec, 0x50, 0x13, 0x5a, 0x20, 0xf1, 0x81, + 0x64, 0xca, 0xff, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x73, 0x1a, + 0xff, 0x81, 0xaf, 0xc6, 0x56, 0x2b, 0x0f, 0x76, 0x14, 0xfe, 0x22, 0xcb, + 0xb0, 0xba, 0x53, 0x76, 0xbe, 0x33, 0xbd, 0x7a, 0xee, 0xc0, 0x57, 0xa1, + 0x02, 0x3f, 0xc1, 0x23, 0x0a, 0xab, 0x75, 
0x58, 0x85, 0x55, 0x9f, 0xa0, + 0x52, 0x23, 0xb4, 0xd7, 0x9c, 0xe7, 0x68, 0xfb, 0x32, 0xb7, 0x78, 0x22, + 0xaf, 0x7e, 0x28, 0xd6, 0x7a, 0xe5, 0xec, 0xfc, 0xbc, 0x69, 0x92, 0xe3, + 0xa2, 0x79, 0xba, 0x42, 0xa1, 0x26, 0xef, 0xf4, 0xda, 0xf5, 0x6f, 0xc7, + 0x9f, 0xe1, 0xd5, 0x5d, 0x97, 0xa9, 0xf9, 0x8f, 0xb3, 0xdb, 0x05, 0x97, + 0xfc, 0x7d, 0xbc, 0x0f, 0xb5, 0x13, 0xdc, 0x79, 0x48, 0xe3, 0x3d, 0xa9, + 0x84, 0x70, 0xfe, 0x1e, 0xf1, 0xfa, 0x86, 0x9a, 0xb1, 0xc4, 0xcb, 0x96, + 0xbd, 0x98, 0x66, 0xdc, 0x58, 0x4e, 0xd9, 0x8d, 0xf5, 0x0a, 0x29, 0xaf, + 0xf5, 0xb1, 0x8b, 0x8a, 0x75, 0x8e, 0x2d, 0x67, 0x16, 0x1d, 0x7e, 0x8a, + 0x9b, 0x0b, 0xe8, 0xe2, 0xc0, 0xaf, 0xb7, 0x4e, 0x36, 0xd3, 0xf2, 0x89, + 0x50, 0xc2, 0x51, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x91, 0x6a, 0x94, + 0xca, 0x63, 0xb5, 0xe0, 0x5d, 0x37, 0x7e, 0x99, 0x46, 0x35, 0x71, 0x6b, + 0x93, 0xea, 0x15, 0xfd, 0x81, 0x3d, 0x73, 0x76, 0x28, 0x09, 0xad, 0x10, + 0x78, 0xc0, 0xb2, 0x65, 0x7f, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, + 0x39, 0x8d, 0x7f, 0xc0, 0xd7, 0xe3, 0x2b, 0x15, 0x87, 0xbb, 0x0a, 0x7f, + 0x11, 0x65, 0xd8, 0x5d, 0x29, 0xbb, 0x5f, 0x19, 0xde, 0xbd, 0x77, 0x60, + 0x2b, 0xd0, 0x81, 0x1f, 0xe0, 0x91, 0x85, 0x55, 0xba, 0xac, 0x42, 0xaa, + 0xcf, 0xd0, 0x29, 0x11, 0xda, 0x6b, 0xce, 0x73, 0xb4, 0x7d, 0x99, 0x5b, + 0xbc, 0x11, 0x57, 0xbf, 0x14, 0x6b, 0x3d, 0x72, 0xf6, 0x7e, 0x5e, 0x34, + 0xc9, 0x71, 0xd1, 0x3c, 0xdd, 0x21, 0x50, 0x93, 0x77, 0xfa, 0x6d, 0x7a, + 0xb7, 0xe3, 0xcf, 0xf0, 0xea, 0xae, 0xe7, 0x1d, 0xfb, 0x2a, 0x2f, 0x0e, + 0xe3, 0xde, 0xf4, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, + 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, + 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, + 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, + 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, + 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, + 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, + 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, + 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, + 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, + 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, + 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, + 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, + 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, + 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, + 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, + 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, + 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, + 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, + 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, + 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, + 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, + 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, + 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, + 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, + 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, + 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, + 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 
0x08, 0x71, 0x1d, 0x97, 0x13, + 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, + 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, + 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, + 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, + 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, + 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, + 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, + 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, + 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, + 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, + 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, + 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, + 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, + 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, + 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, + 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, + 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, + 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, + 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, + 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, + 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, + 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, + 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, + 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, + 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, + 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, + 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, + 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, + 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, + 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, + 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, + 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, + 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, + 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, + 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, + 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, + 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, + 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, + 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, + 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, + 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, + 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, + 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, + 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, + 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, + 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, + 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, + 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 
0x6c, 0x93, 0xd3, 0x31, 0x0d, + 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, + 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, + 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, + 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, + 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, + 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, + 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, + 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, + 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, + 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, + 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, + 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, + 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, + 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, + 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, + 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, + 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, + 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, + 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, + 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, + 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, + 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, + 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, + 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, + 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, + 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, + 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, + 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, + 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, + 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, + 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, + 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, + 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, + 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, + 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, + 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, + 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, + 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, + 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, + 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, + 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, + 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, + 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, + 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, + 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, + 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, + 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, + 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 
0x94, 0xd1, 0x6c, 0x7a, 0xc2, + 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, + 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, + 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, + 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, + 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, + 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, + 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, + 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, + 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, + 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, + 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, + 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, + 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, + 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, + 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, + 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, + 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, + 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, + 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, + 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, + 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, + 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, + 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, + 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, + 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, + 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, + 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, + 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, + 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, + 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, + 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, + 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, + 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, + 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, + 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, + 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, + 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, + 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, + 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, + 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, + 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, + 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, + 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, + 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, + 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, + 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, + 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, + 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 
0xcf, 0x74, 0xb3, 0x26, 0xd2, + 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, + 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, + 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, + 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, + 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, + 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, + 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, + 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, + 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, + 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, + 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, + 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, + 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, + 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, + 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, + 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, + 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, + 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, + 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, + 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, + 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, + 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, + 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, + 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, + 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, + 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, + 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, + 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, + 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, + 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, + 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, + 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, + 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, + 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, + 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, + 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, + 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, + 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, + 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, + 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, + 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, + 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, + 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, + 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, + 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, + 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, + 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, + 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 
0x6d, 0x42, 0xdd, 0x62, 0x0a, + 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, + 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, + 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, + 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, + 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, + 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, + 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, + 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, + 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, + 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, + 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, + 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, + 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, + 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, + 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, + 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, + 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, + 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, + 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, + 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, + 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, + 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, + 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, + 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, + 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, + 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, + 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, + 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, + 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, + 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, + 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, + 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, + 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, + 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, + 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, + 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, + 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, + 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, + 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, + 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, + 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, + 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, + 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, + 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, + 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, + 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, + 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, + 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 
0xc3, 0x14, 0xdf, 0x60, 0x08, + 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, + 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, + 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, + 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, + 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, + 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, + 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, + 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, + 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, + 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, + 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, + 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, + 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, + 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, + 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, + 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, + 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, + 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, + 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, + 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, + 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, + 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, + 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, + 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, + 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, + 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, + 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, + 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, + 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, + 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, + 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, + 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, + 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, + 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, + 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, + 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, + 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, + 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, + 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, + 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, + 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, + 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, + 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, + 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, + 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, + 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, + 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, + 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 
0x99, 0x69, 0xeb, 0xde, 0x97, + 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, + 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, + 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, + 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, + 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, + 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, + 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, + 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, + 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, + 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, + 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, + 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, + 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, + 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, + 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, + 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, + 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, + 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, + 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, + 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, + 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, + 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, + 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, + 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, + 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, + 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, + 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, + 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, + 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, + 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, + 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, + 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, + 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, + 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, + 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, + 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, + 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, + 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, + 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, + 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, + 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, + 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, + 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, + 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, + 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, + 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, + 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, + 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 
0xcc, 0xaa, 0x10, 0x1f, 0x60, + 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, + 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, + 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, + 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, + 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, + 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, + 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, + 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, + 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, + 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, + 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, + 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, + 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, + 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, + 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, + 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, + 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, + 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, + 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, + 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, + 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, + 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, + 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, + 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, + 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, + 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, + 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, + 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, + 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, + 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, + 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, + 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, + 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, + 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, + 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, + 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, + 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, + 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, + 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, + 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, + 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, + 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, + 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, + 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, + 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, + 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, + 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, + 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 
0xb8, 0x41, 0x1c, 0x08, 0xd6, + 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, + 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, + 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, + 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, + 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, + 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, + 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, + 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, + 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, + 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, + 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, + 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, + 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, + 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, + 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, + 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, + 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, + 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, + 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, + 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, + 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, + 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, + 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, + 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, + 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, + 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, + 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, + 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, + 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, + 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, + 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, + 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, + 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, + 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, + 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, + 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, + 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, + 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, + 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, + 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, + 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, + 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, + 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, + 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, + 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, + 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, + 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, + 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 
0x93, 0x9f, 0x42, 0x47, 0x99, + 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, + 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, + 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, + 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, + 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, + 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, + 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, + 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, + 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, + 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, + 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, + 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, + 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, + 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, + 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, + 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, + 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, + 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, + 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, + 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, + 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, + 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, + 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, + 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, + 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, + 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, + 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, + 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, + 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, + 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, + 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, + 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, + 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, + 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, + 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, + 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, + 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, + 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, + 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, + 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, + 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, + 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, + 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, + 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, + 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, + 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, + 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, + 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 
0x94, 0x7d, 0x47, 0x4e, 0xbc, + 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, + 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, + 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, + 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, + 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, + 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, + 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, + 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, + 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, + 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, + 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, + 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, + 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, + 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, + 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, + 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, + 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, + 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, + 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, + 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, + 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, + 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, + 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, + 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, + 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, + 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, + 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, + 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, + 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, + 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, + 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, + 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, + 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, + 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, + 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, + 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, + 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, + 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, + 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, + 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, + 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, + 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, + 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, + 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, + 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, + 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, + 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, + 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 
0x44, 0xcf, 0x21, 0xcf, 0x06, + 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, + 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, + 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, + 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, + 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, + 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, + 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, + 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, + 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, + 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, + 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, + 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, + 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, + 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, + 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, + 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, + 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, + 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, + 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, + 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, + 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, + 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, + 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, + 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, + 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, + 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, + 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, + 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, + 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, + 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, + 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, + 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, + 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, + 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, + 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, + 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, + 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, + 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, + 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, + 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, + 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, + 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, + 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, + 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, + 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, + 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, + 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, + 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 
0xb8, 0x9c, 0x30, 0x75, 0x2c, + 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, + 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, + 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, + 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, + 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, + 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, + 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, + 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, + 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, + 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, + 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, + 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, + 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, + 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, + 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, + 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, + 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, + 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, + 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, + 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, + 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, + 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, + 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, + 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, + 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, + 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, + 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, + 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, + 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, + 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, + 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, + 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, + 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, + 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, + 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, + 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, + 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, + 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, + 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, + 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, + 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, + 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, + 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, + 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, + 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, + 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, + 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, + 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 
0x88, 0x6d, 0x0c, 0x3e, 0x99, + 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, + 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, + 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, + 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, + 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, + 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, + 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, + 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, + 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, + 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, + 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, + 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, + 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, + 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, + 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, + 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, + 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, + 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, + 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, + 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, + 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, + 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, + 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, + 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, + 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, + 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, + 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, + 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, + 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, + 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, + 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, + 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, + 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, + 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, + 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, + 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, + 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, + 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, + 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, + 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, + 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, + 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, + 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, + 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, + 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, + 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, + 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, + 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 
0xd6, 0x10, 0xc5, 0x5f, 0xbc, + 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, + 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, + 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, + 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, + 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, + 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, + 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, + 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, + 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, + 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, + 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, + 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, + 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, + 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, + 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, + 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, + 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, + 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, + 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, + 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, + 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, + 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, + 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, + 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, + 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, + 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, + 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, + 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, + 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, + 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, + 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, + 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, + 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, + 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, + 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, + 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, + 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, + 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, + 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, + 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, + 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, + 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, + 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, + 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, + 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, + 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, + 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, + 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 
0x36, 0x90, 0x5f, 0xf6, 0xbb, + 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, + 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, + 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, + 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, + 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, + 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, + 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, + 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, + 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, + 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, + 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, + 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, + 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, + 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, + 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, + 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, + 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, + 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, + 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, + 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, + 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, + 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, + 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, + 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, + 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, + 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, + 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, + 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, + 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, + 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, + 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, + 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, + 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, + 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, + 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, + 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, + 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, + 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, + 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, + 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, + 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, + 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, + 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, + 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, + 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, + 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, + 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, + 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 
0x10, 0x51, 0xf6, 0x42, 0xa9, + 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, + 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, + 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, + 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, + 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, + 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, + 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, + 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, + 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, + 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, + 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, + 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, + 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, + 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, + 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, + 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, + 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, + 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, + 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, + 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, + 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, + 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, + 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, + 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, + 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, + 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, + 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, + 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, + 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, + 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, + 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, + 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, + 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, + 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, + 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, + 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, + 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, + 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, + 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, + 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, + 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, + 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, + 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, + 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, + 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, + 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, + 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, + 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 
0x00, 0x41, 0xfb, 0xcd, 0xc9, + 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, + 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, + 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, + 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, + 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, + 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, + 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, + 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, + 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, + 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, + 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, + 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, + 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, + 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, + 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, + 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, + 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, + 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, + 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, + 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, + 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, + 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, + 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, + 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, + 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, + 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, + 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, + 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, + 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, + 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, + 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, + 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, + 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, + 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, + 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, + 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, + 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, + 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, + 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, + 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, + 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, + 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, + 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, + 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, + 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, + 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, + 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, + 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 
0xf4, 0xbf, 0x96, 0x63, 0x34, + 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, + 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, + 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, + 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, + 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, + 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, + 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, + 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, + 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, + 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, + 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, + 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, + 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, + 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, + 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, + 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, + 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, + 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, + 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, + 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, + 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, + 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, + 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, + 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, + 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, + 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, + 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, + 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, + 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, + 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, + 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, + 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, + 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, + 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, + 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, + 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, + 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, + 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, + 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, + 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, + 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, + 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, + 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, + 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, + 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, + 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, + 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, + 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 
0xfb, 0x04, 0x38, 0x8e, 0xcb, + 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, + 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, + 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, + 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, + 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, + 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, + 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, + 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, + 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, + 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, + 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, + 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, + 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, + 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, + 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, + 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, + 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, + 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, + 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, + 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, + 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, + 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, + 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, + 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, + 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, + 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, + 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, + 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, + 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, + 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, + 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, + 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, + 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, + 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, + 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, + 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, + 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, + 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, + 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, + 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, + 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, + 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, + 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, + 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, + 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, + 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, + 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, + 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 
0x46, 0xb6, 0x49, 0xe9, 0x98, + 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, + 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, + 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, + 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, + 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, + 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, + 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, + 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, + 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, + 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, + 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, + 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, + 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, + 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, + 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, + 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, + 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, + 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, + 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, + 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, + 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, + 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, + 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, + 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, + 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, + 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, + 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, + 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, + 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, + 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, + 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, + 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, + 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, + 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, + 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, + 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, + 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, + 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, + 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, + 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, + 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, + 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, + 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, + 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, + 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, + 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, + 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, + 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 
0x3c, 0xca, 0x68, 0xb6, 0x3d, + 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, + 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, + 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, + 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, + 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, + 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, + 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, + 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, + 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, + 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, + 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, + 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, + 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, + 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, + 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, + 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, + 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, + 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, + 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, + 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, + 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, + 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, + 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, + 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, + 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, + 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, + 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, + 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, + 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, + 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, + 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, + 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, + 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, + 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, + 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, + 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, + 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, + 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, + 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, + 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, + 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, + 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, + 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, + 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, + 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, + 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, + 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, + 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 
0x75, 0xe7, 0xba, 0x59, 0x93, + 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, + 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, + 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, + 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, + 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, + 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, + 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, + 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, + 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, + 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, + 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, + 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, + 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, + 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, + 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, + 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, + 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, + 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, + 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, + 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, + 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, + 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, + 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, + 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, + 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, + 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, + 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, + 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, + 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, + 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, + 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, + 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, + 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, + 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, + 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, + 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, + 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, + 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, + 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, + 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, + 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, + 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, + 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, + 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, + 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, + 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, + 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, + 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 
0x78, 0x36, 0xa1, 0x6e, 0xb1, + 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, + 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, + 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, + 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, + 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, + 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, + 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, + 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, + 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, + 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, + 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, + 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, + 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, + 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, + 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, + 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, + 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, + 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, + 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, + 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, + 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, + 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, + 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, + 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, + 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, + 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, + 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, + 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, + 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, + 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, + 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, + 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, + 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, + 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, + 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, + 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, + 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, + 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, + 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, + 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, + 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, + 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, + 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, + 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, + 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, + 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, + 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, + 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 
0xa9, 0x61, 0x8a, 0x6f, 0xb0, + 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, + 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, + 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, + 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, + 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, + 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, + 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, + 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, + 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, + 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, + 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, + 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, + 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, + 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, + 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, + 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, + 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, + 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, + 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, + 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, + 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, + 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, + 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, + 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, + 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, + 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, + 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, + 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, + 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, + 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, + 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, + 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, + 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, + 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, + 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, + 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, + 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, + 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, + 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, + 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, + 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, + 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, + 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, + 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, + 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, + 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, + 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, + 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 
0xf4, 0xcc, 0xb4, 0xf5, 0xef, + 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, + 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, + 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, + 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, + 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, + 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, + 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, + 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, + 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, + 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, + 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, + 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, + 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, + 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, + 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, + 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, + 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, + 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, + 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, + 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, + 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, + 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, + 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, + 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, + 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, + 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, + 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, + 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, + 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, + 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, + 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, + 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, + 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, + 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, + 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, + 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, + 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, + 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, + 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, + 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, + 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, + 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, + 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, + 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, + 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, + 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, + 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, + 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 
0xfd, 0xe6, 0x55, 0x08, 0x0f, + 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, + 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, + 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, + 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, + 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, + 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, + 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, + 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, + 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, + 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, + 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, + 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, + 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, + 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, + 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, + 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, + 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, + 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, + 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, + 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, + 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, + 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, + 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, + 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, + 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, + 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, + 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, + 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, + 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, + 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, + 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, + 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, + 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, + 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, + 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, + 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, + 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, + 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, + 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, + 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, + 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, + 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, + 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, + 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, + 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, + 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, + 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, + 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 
0xb5, 0xdc, 0x20, 0x8e, 0x04, + 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, + 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, + 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, + 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, + 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, + 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, + 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, + 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, + 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, + 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, + 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, + 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, + 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, + 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, + 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, + 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, + 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, + 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, + 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, + 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, + 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, + 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, + 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, + 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, + 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, + 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, + 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, + 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, + 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, + 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, + 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, + 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, + 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, + 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, + 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, + 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, + 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, + 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, + 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, + 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, + 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, + 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, + 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, + 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, + 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, + 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, + 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, + 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, 0xb2, 
0x15, 0x49, 0xcf, 0xa1, 0x23, + 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, + 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, + 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, + 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, + 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, + 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, + 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, + 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, + 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, + 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, + 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, + 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, + 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, + 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, + 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, + 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, + 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, + 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, + 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, + 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, + 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, + 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, + 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, + 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, + 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, + 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, + 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, + 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, + 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, + 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, + 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, + 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, + 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, + 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, + 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, + 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, + 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, + 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, + 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, + 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, + 0xc4, 0x76, 0x5c, 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, + 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, + 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, + 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, + 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, + 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, + 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, + 0xb0, 0xc5, 0x37, 0xd8, 0x02, 0x0f, 0xde, 
0x6e, 0x4a, 0x3e, 0xa3, 0xa7, + 0x5e, 0x7b, 0xa5, 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, + 0x8d, 0x6c, 0x93, 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, + 0xbd, 0x2f, 0xe5, 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, + 0x58, 0x82, 0x8f, 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, + 0x63, 0xd6, 0x10, 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, + 0x1d, 0x97, 0x13, 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, + 0xf3, 0x72, 0x51, 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, + 0xff, 0xb5, 0xdc, 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, + 0x0c, 0x3e, 0x99, 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, + 0xe4, 0x39, 0xe0, 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, + 0x7d, 0x09, 0x1e, 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, + 0x55, 0x08, 0x0f, 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, + 0x31, 0x4d, 0xf6, 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, + 0x9e, 0xe9, 0x66, 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, + 0x5b, 0x24, 0xf4, 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, + 0x4b, 0xf9, 0x66, 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, + 0x20, 0xa3, 0xec, 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, + 0xf5, 0x84, 0x31, 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, + 0x65, 0xc4, 0xe1, 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, + 0xdc, 0x94, 0x7d, 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, + 0xed, 0x77, 0x08, 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, + 0x0f, 0xa6, 0x65, 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, + 0x0e, 0x78, 0x36, 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, + 0x42, 0x47, 0x99, 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, + 0x42, 0x03, 0xec, 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, + 0x53, 0x7d, 0x80, 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, + 0xba, 0x59, 0x93, 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, + 0xc9, 0x3d, 0x33, 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, + 0xfe, 0x59, 0x8c, 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, + 0x28, 0xfb, 0x21, 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, + 0x61, 0x0c, 0x55, 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, + 0x71, 0x38, 0x60, 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, + 0x25, 0x1f, 0x51, 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, + 0x5d, 0xc2, 0x08, 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, + 0xe9, 0x99, 0x69, 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, + 0x9e, 0x0d, 0xa8, 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, + 0x91, 0xe6, 0x53, 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, + 0x80, 0xfb, 0x04, 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, + 0xdf, 0x60, 0x08, 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, + 0x96, 0x64, 0xda, 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, + 0x4f, 0x4c, 0xc4, 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, + 0x96, 0x63, 0x34, 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, + 0x3e, 0xc8, 0x55, 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, + 0x43, 0x15, 0x7e, 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 0x76, 0x5c, + 0x4e, 0x18, 0x3a, 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, + 0x47, 0xd4, 0x74, 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, + 0x70, 0x82, 0x38, 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, + 0x66, 0x5a, 0x7a, 0xf7, 0xa5, 0xfc, 0xb3, 
0x19, 0xa2, 0x67, 0x90, 0xe7, + 0x83, 0x6a, 0x16, 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, + 0x79, 0x94, 0xd1, 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, + 0x3e, 0xc1, 0x0e, 0x23, 0xb2, 0xe2, 0x70, 0xc1, 0xd4, 0xb0, 0xc5, 0x37, + 0xd8, 0x02, 0x0f, 0xde, 0x6e, 0x4a, 0x3e, 0xa3, 0xa7, 0x5e, 0x7b, 0xa5, + 0x99, 0x36, 0x90, 0x5f, 0xf6, 0xbb, 0x84, 0x11, 0xc0, 0x8d, 0x6c, 0x93, + 0xd3, 0x31, 0x0d, 0xa1, 0x87, 0xd3, 0x32, 0xd3, 0xd7, 0xbd, 0x2f, 0xe5, + 0x98, 0xcd, 0x13, 0x3c, 0x87, 0x3c, 0x1b, 0x50, 0xb7, 0x58, 0x82, 0x8f, + 0xb2, 0x15, 0x49, 0xcf, 0xa1, 0x23, 0xcc, 0xa6, 0x8b, 0x63, 0xd6, 0x10, + 0xc5, 0x5f, 0xbc, 0xca, 0xa1, 0x01, 0xf6, 0x08, 0x71, 0x1d, 0x97, 0x13, + 0x86, 0x0e, 0xa5, 0x86, 0x29, 0xbe, 0xc0, 0x10, 0x7e, 0xf3, 0x72, 0x51, + 0xf5, 0x1d, 0x3a, 0xf3, 0xdd, 0x2c, 0xc9, 0xb4, 0x82, 0xff, 0xb5, 0xdc, + 0x20, 0x8e, 0x04, 0x6b, 0x64, 0x9e, 0x99, 0x88, 0x6d, 0x0c, 0x3e, 0x99, + 0x96, 0x9e, 0xbd, 0xe9, 0x7f, 0x2c, 0xc6, 0x68, 0x99, 0xe4, 0x39, 0xe0, + 0xda, 0x85, 0xba, 0xc4, 0x14, 0x7d, 0x90, 0xaa, 0x4e, 0x7d, 0x09, 0x1e, + 0x65, 0x34, 0x5b, 0x1e, 0xb0, 0x86, 0x2a, 0xfd, 0xe6, 0x55, 0x08, 0x0f, + 0xb0, 0x43, 0x88, 0xec, 0xb8, 0x9c, 0x30, 0x75, 0x2c, 0x31, 0x4d, 0xf6, + 0x00, 0x83, 0xf7, 0x9b, 0x92, 0x8f, 0xa8, 0xe9, 0xd7, 0x9e, 0xe9, 0x66, + 0x4d, 0xa4, 0x17, 0xfd, 0xae, 0xe1, 0x04, 0x70, 0x23, 0x5b, 0x24, 0xf4, + 0xcc, 0x43, 0x68, 0x61, 0xf4, 0xcc, 0xb4, 0xf5, 0xef, 0x4b, 0xf9, 0x66, + 0x33, 0x44, 0xcf, 0x21, 0xcf, 0x06, 0xd4, 0x2d, 0xd6, 0x20, 0xa3, 0xec, + 0x85, 0x52, 0x73, 0xe8, 0x48, 0xf3, 0x29, 0xa2, 0xd8, 0xf5, 0x84, 0x31, + 0x57, 0xef, 0x32, 0xa8, 0x40, 0x7d, 0x82, 0x1c, 0x47, 0x65, 0xc4, 0xe1, + 0x83, 0xa9, 0x61, 0x8a, 0x6f, 0xb0, 0x04, 0x1f, 0xbc, 0xdc, 0x94, 0x7d, + 0x47, 0x4e, 0xbc, 0xf7, 0x4b, 0x32, 0x6d, 0x20, 0xbf, 0xed, 0x77, 0x08, + 0x23, 0x81, 0x1a, 0xd9, 0x27, 0xa6, 0x62, 0x1b, 0x43, 0x0f, 0xa6, 0x65, + 0xa7, 0xaf, 0x7a, 0x5f, 0xcb, 0x31, 0x9a, 0x26, 0x79, 0x0e, 0x78, 0x36, + 0xa1, 0x6e, 0xb1, 0x05, 0x1f, 0x64, 0x2a, 0x93, 0x9f, 0x42, 0x47, 0x99, + 0x4d, 0x16, 0xc7, 0xac, 0x21, 0x8a, 0xbf, 0x79, 0x95, 0x42, 0x03, 0xec, + 0x10, 0xe2, 0x3b, 0x2e, 0x27, 0x0c, 0x1d, 0x4b, 0x0c, 0x53, 0x7d, 0x80, + 0x20, 0xfd, 0xe6, 0xe4, 0xa3, 0xea, 0x3a, 0x75, 0xe7, 0xba, 0x59, 0x93, + 0x69, 0x05, 0xff, 0x6b, 0xb8, 0x41, 0x1c, 0x08, 0xd6, 0xc9, 0x3d, 0x33, + 0x10, 0xda, 0x18, 0x7d, 0x33, 0x2d, 0x3d, 0x7b, 0xd2, 0xfe, 0x59, 0x8c, + 0xd1, 0x33, 0xc8, 0x73, 0xc1, 0xb5, 0x0b, 0x75, 0x88, 0x28, 0xfb, 0x21, + 0x54, 0x9c, 0xfa, 0x12, 0x3c, 0xca, 0x68, 0xb6, 0x3d, 0x61, 0x0c, 0x55, + 0xfb, 0xcc, 0xaa, 0x10, 0x1f, 0x60, 0x87, 0x11, 0xd9, 0x71, 0x38, 0x60, + 0xea, 0x58, 0x62, 0x9b, 0xec, 0x01, 0x07, 0xef, 0x37, 0x25, 0x1f, 0x51, + 0xd3, 0xaf, 0x3d, 0xd2, 0xcc, 0x9b, 0x48, 0x2f, 0xfb, 0x5d, 0xc2, 0x08, + 0xe0, 0x46, 0xb6, 0x49, 0xe9, 0x98, 0x86, 0xd0, 0xc3, 0xe9, 0x99, 0x69, + 0xeb, 0xde, 0x97, 0xf2, 0xcc, 0x66, 0x89, 0x9e, 0x43, 0x9e, 0x0d, 0xa8, + 0x5b, 0xac, 0x41, 0x47, 0xd9, 0x0a, 0xa4, 0xe7, 0xd0, 0x91, 0xe6, 0x53, + 0x45, 0xb1, 0xeb, 0x08, 0x62, 0xaf, 0xde, 0x65, 0x50, 0x80, 0xfb, 0x04, + 0x38, 0x8e, 0xcb, 0x89, 0xc3, 0x07, 0x52, 0xc3, 0x14, 0xdf, 0x60, 0x08, + 0x3f, 0x79, 0xb9, 0x28, 0xfa, 0x8e, 0x9d, 0x79, 0xee, 0x96, 0x64, 0xda, + 0x41, 0x7f, 0xda, 0xee, 0x10, 0x47, 0x02, 0x35, 0xb2, 0x4f, 0x4c, 0xc4, + 0x36, 0x86, 0x1f, 0x4c, 0xcb, 0x4f, 0x5e, 0xf4, 0xbf, 0x96, 0x63, 0x34, + 0x4c, 0xf2, 0x1c, 0xf0, 0x6d, 0x42, 0xdd, 0x62, 0x0a, 0x3e, 0xc8, 0x55, + 0x27, 0x3e, 0x84, 0x8f, 0x32, 0x9a, 0x2d, 0x8f, 0x58, 0x43, 0x15, 0x7e, + 0xf3, 0x2a, 0x84, 0x07, 0xd8, 0x21, 0xc4, 
0x76, 0x5c, 0x4e, 0x18, 0x3a, + 0x96, 0x18, 0xa6, 0xfb, 0x00, 0x41, 0xfb, 0xcd, 0xc9, 0x47, 0xd4, 0x74, + 0xeb, 0xcf, 0x74, 0xb3, 0x26, 0xd2, 0x0b, 0xfe, 0xd7, 0x70, 0x82, 0x38, + 0x11, 0xad, 0x92, 0x7a, 0x66, 0x21, 0xb4, 0x30, 0xfa, 0x66, 0x5a, 0x7a, + 0xf7, 0xa5, 0xfc, 0xb3, 0x19, 0xa2, 0x67, 0x90, 0xe7, 0x83, 0x6a, 0x16, + 0xeb, 0x10, 0x51, 0xf6, 0x42, 0xa9, 0x39, 0xf4, 0x24, 0x79, 0x94, 0xd1, + 0x6c, 0x7a, 0xc2, 0x18, 0xab, 0xf7, 0x99, 0x54, 0x20, 0x3e, 0xc1, 0x0e, + 0x23, 0xb3, +}; +static_assert(sizeof(kBytesTestReadSymbol7) == kNumBytesTestReadSymbol7, ""); + +// The kBytesTestReadSymbol8[] array was encoded by using the following libaom +// code: +// +// aom_cdf_prob cdf[4][9] = { +// // pdf: 1/8, 1/8, 1/8, 1/8, 1/8, 1/8, 1/8, 1/8 +// { 32768 - 4096, 32768 - 8192, 32768 - 12288, 32768 - 16384, +// 32768 - 20480, 32768 - 24576, 32768 - 28672, 0, 0 }, +// // pdf: 3/16, 2/16, 2/16, 2/16, 2/16, 2/16, 2/16, 1/16 +// { 32768 - 6144, 32768 - 10240, 32768 - 14336, 32768 - 18432, +// 32768 - 22528, 32768 - 26624, 32768 - 30720, 0, 0 }, +// // pdf: 1/16, 1/16, 2/16, 2/16, 2/16, 2/16, 3/16, 3/16 +// { 32768 - 2048, 32768 - 4096, 32768 - 8192, 32768 - 12288, +// 32768 - 16384, 32768 - 20480, 32768 - 26624, 0, 0 }, +// // pdf: 1/16, 1/16, 3/16, 3/16, 3/16, 3/16, 1/16, 1/16 +// { 32768 - 2048, 32768 - 4096, 32768 - 10240, 32768 - 16384, +// 32768 - 22528, 32768 - 28672, 32768 - 30720, 0, 0 }, +// }; +// constexpr int kSymbols[16][4] = { { 0, 4, 7, 3 }, // +// { 1, 5, 6, 2 }, // +// { 2, 6, 5, 1 }, // +// { 3, 7, 4, 0 }, // +// { 4, 0, 3, 7 }, // +// { 5, 1, 2, 6 }, // +// { 6, 2, 1, 5 }, // +// { 7, 3, 0, 4 }, // +// { 0, 0, 6, 5 }, // +// { 2, 1, 4, 3 }, // +// { 4, 3, 6, 4 }, // +// { 6, 5, 2, 2 }, // +// { 1, 0, 7, 3 }, // +// { 3, 2, 5, 5 }, // +// { 5, 4, 7, 2 }, // +// { 7, 6, 3, 4 } }; +// const unsigned int kBufferSize = 65536; +// uint8_t bw_buffer[kBufferSize]; +// aom_writer bw; +// bw.allow_update_cdf = 1; +// aom_start_encode(&bw, bw_buffer); +// for (int i = 0; i < 1024; ++i) { +// for (int j = 0; j < 16; ++j) { +// for (int k = 0; k < 4; ++k) { +// aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 8); +// } +// } +// } +// aom_stop_encode(&bw); +// printf(" constexpr size_t kNumBytesTestReadSymbol8 = %u;\n", bw.pos); +// printf(" constexpr uint8_t kBytesTestReadSymbol8[] = {"); +// int count = 0; +// for (unsigned int i = 0; i < bw.pos; ++i) { +// if (count++ % 12 == 0) { +// printf("\n "); +// } else { +// printf(" "); +// } +// printf("0x%02x,", bw_buffer[i]); +// } +// printf("\n };\n"); + +constexpr size_t kNumBytesTestReadSymbol8 = 24195; +constexpr uint8_t kBytesTestReadSymbol8[] = { + 0x15, 0x60, 0xa8, 0x52, 0xf4, 0x88, 0xdd, 0x23, 0x40, 0xb1, 0xd6, 0xd2, + 0xc2, 0xa2, 0x4c, 0x0a, 0x5d, 0xba, 0xfe, 0xd2, 0x36, 0xd9, 0xcd, 0x51, + 0x10, 0x25, 0x13, 0x29, 0xfa, 0x0d, 0x87, 0xf9, 0xd1, 0x6f, 0xf2, 0x0d, + 0x3a, 0xbe, 0xd9, 0x83, 0x99, 0xd1, 0xdf, 0x24, 0x70, 0x28, 0xdb, 0x63, + 0xf6, 0x7c, 0x07, 0x2b, 0x68, 0xa3, 0x7a, 0x85, 0xd1, 0x47, 0xba, 0x59, + 0x18, 0x7e, 0x64, 0x3b, 0xac, 0xaf, 0xe3, 0x3a, 0x99, 0x82, 0x30, 0x92, + 0x7a, 0x93, 0x67, 0x9f, 0xac, 0x53, 0xf8, 0xdb, 0x03, 0x71, 0xc7, 0x4a, + 0xa9, 0xec, 0x10, 0xc9, 0xed, 0x5b, 0xa6, 0xd5, 0xc3, 0xdd, 0x81, 0x8d, + 0x25, 0xbe, 0x57, 0xcd, 0x01, 0x65, 0x33, 0x6c, 0x12, 0xe1, 0x37, 0x8b, + 0xf1, 0x08, 0x27, 0x3c, 0x5a, 0x30, 0x9f, 0x2d, 0x41, 0x2e, 0x75, 0x49, + 0xab, 0xa6, 0xb6, 0x4c, 0xbe, 0xe0, 0xd0, 0x20, 0x74, 0xeb, 0x05, 0x79, + 0x91, 0x60, 0xfd, 0xb2, 0x39, 0x54, 0xd9, 0x0c, 0x11, 0x04, 0x1f, 0x7b, + 0x5d, 0x2d, 0xe3, 
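+// A minimal decoding-side sketch (not part of the generated data, and only
+// a comment so it does not affect this initializer): assuming libgav1's
+// EntropyDecoder interface in src/utils/entropy_decoder.h, i.e. a
+// constructor EntropyDecoder(data, size, allow_update_cdf) and an overload
+// int ReadSymbol(uint16_t* cdf, int symbol_count), a test can replay the
+// encoding loop above and expect each decoded value to equal kSymbols[j][k].
+// allow_update_cdf should be true to mirror bw.allow_update_cdf = 1, and the
+// uint16_t cdf[4][9] tables must start from the same initial values listed
+// above. Each cdf entry is 32768 minus a cumulative count out of 32768; for
+// example, cdf[1][0] = 32768 - 6144 encodes a first-symbol pdf of
+// 6144/32768 = 3/16, matching the pdf comment on that table.
+//
+//   libgav1::EntropyDecoder reader(kBytesTestReadSymbol8,
+//                                  kNumBytesTestReadSymbol8,
+//                                  /*allow_update_cdf=*/true);
+//   for (int i = 0; i < 1024; ++i) {
+//     for (int j = 0; j < 16; ++j) {
+//       for (int k = 0; k < 4; ++k) {
+//         EXPECT_EQ(reader.ReadSymbol(cdf[k], 8), kSymbols[j][k]);
+//       }
+//     }
+//   }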
0x3f, 0x48, 0xe4, 0x56, 0x11, 0x3d, 0x48, 0xdb, 0x5c, + 0x1c, 0x8b, 0x81, 0xbb, 0x8a, 0x53, 0xb7, 0x48, 0x5b, 0x15, 0x9b, 0x35, + 0xc1, 0x18, 0x0f, 0xc3, 0x1e, 0x1c, 0x16, 0x7e, 0x0a, 0xbf, 0x16, 0x0a, + 0xf5, 0x3f, 0xbe, 0x19, 0xc0, 0x0f, 0xa4, 0x59, 0xae, 0x0a, 0xcf, 0xf4, + 0x00, 0xb2, 0xff, 0x3a, 0xd8, 0x7f, 0x6c, 0xcf, 0x4f, 0xca, 0xa1, 0x40, + 0x47, 0x8e, 0xd0, 0x44, 0x49, 0x5a, 0x48, 0xe6, 0x86, 0x80, 0xbb, 0x57, + 0x36, 0x6e, 0x80, 0xf1, 0xd1, 0xd8, 0xb8, 0xad, 0xb7, 0x6b, 0x11, 0x79, + 0x02, 0x95, 0x20, 0xcf, 0x6f, 0x21, 0xe6, 0x5c, 0x65, 0x69, 0x4a, 0xf2, + 0x6f, 0x87, 0x68, 0xf1, 0xda, 0x3b, 0xe1, 0x64, 0x5c, 0xfc, 0x21, 0x02, + 0x7b, 0xf6, 0x39, 0x77, 0x36, 0x29, 0x3d, 0xda, 0x16, 0x2e, 0xdb, 0x55, + 0xac, 0x5a, 0x3a, 0x94, 0x9c, 0x79, 0x2c, 0x92, 0xa4, 0xe3, 0xe2, 0x87, + 0xd8, 0x14, 0x21, 0x76, 0xae, 0xf1, 0x8d, 0x7d, 0xdc, 0xde, 0x46, 0xd9, + 0xbd, 0xb6, 0x5f, 0xae, 0x77, 0xd0, 0xd7, 0x01, 0xed, 0xbe, 0x5f, 0xee, + 0x1a, 0x20, 0x0f, 0x88, 0x5c, 0x8a, 0x44, 0xad, 0x8f, 0x8f, 0x66, 0x9d, + 0x43, 0xf4, 0x41, 0x0a, 0xa1, 0xc8, 0x5c, 0xbc, 0x37, 0xe2, 0xca, 0xd2, + 0xd8, 0x27, 0x54, 0xdb, 0xdf, 0x7f, 0x0a, 0xd7, 0x65, 0x19, 0x99, 0x1a, + 0x92, 0x53, 0xdd, 0x1e, 0x5f, 0xad, 0x24, 0x8a, 0x8d, 0x76, 0xc4, 0xf7, + 0x7e, 0x74, 0xfe, 0x68, 0x99, 0x42, 0xfa, 0xaa, 0x6e, 0xdd, 0x91, 0xd4, + 0x71, 0x10, 0xb7, 0x45, 0xa8, 0x5f, 0x84, 0x0d, 0xeb, 0x38, 0x3e, 0xaa, + 0xf1, 0xad, 0x86, 0x8f, 0x1a, 0x3e, 0x9a, 0x29, 0xc7, 0x7b, 0xa7, 0xdf, + 0x51, 0x3d, 0x49, 0x08, 0x09, 0x69, 0x40, 0x9d, 0x45, 0xb8, 0x55, 0xce, + 0x96, 0x6c, 0x8b, 0xc6, 0xc9, 0x25, 0x70, 0xc9, 0xb3, 0xa8, 0xa8, 0x08, + 0x33, 0x7b, 0xca, 0x21, 0x9e, 0x5b, 0xb5, 0x02, 0x7f, 0xa3, 0x34, 0x7c, + 0x3d, 0xba, 0x91, 0x2e, 0xae, 0xc3, 0x1f, 0x9e, 0xc2, 0x4f, 0xdf, 0xa9, + 0x39, 0x9b, 0x9d, 0x6e, 0xc7, 0x90, 0xeb, 0x2b, 0xb0, 0x3f, 0xde, 0x37, + 0xb7, 0x94, 0x3d, 0x4b, 0x2c, 0x42, 0x3f, 0x47, 0xad, 0xc9, 0x23, 0xcb, + 0x4d, 0xc4, 0xdd, 0x5e, 0x67, 0x11, 0x9d, 0x45, 0xb8, 0x55, 0xce, 0x98, + 0x05, 0xce, 0x97, 0x99, 0x57, 0x84, 0x8d, 0x79, 0x97, 0x81, 0x4b, 0x8a, + 0x9c, 0x76, 0x73, 0x9a, 0xf7, 0x59, 0x54, 0x07, 0x6c, 0x11, 0x41, 0x44, + 0xf0, 0xa6, 0x2a, 0x5e, 0xb1, 0x48, 0x47, 0x39, 0xbb, 0x1b, 0xf0, 0x25, + 0x07, 0xe7, 0xd2, 0xbb, 0x9b, 0x9b, 0xd7, 0x7e, 0xc8, 0xdd, 0xae, 0xb6, + 0x23, 0x5e, 0xe0, 0xa5, 0xb0, 0xc6, 0xb6, 0x81, 0xe9, 0x51, 0x20, 0xe9, + 0x2f, 0x89, 0xcd, 0x13, 0x96, 0x21, 0x19, 0xc5, 0xd1, 0x65, 0x65, 0x88, + 0xd9, 0x7b, 0x87, 0xdc, 0xfb, 0x38, 0x54, 0x22, 0x27, 0xc4, 0xc4, 0x16, + 0x56, 0xff, 0x76, 0x69, 0xa6, 0x3b, 0xa0, 0x6d, 0xab, 0xb8, 0xdf, 0xc1, + 0xc2, 0xff, 0x65, 0x8f, 0x85, 0xbc, 0x69, 0xc0, 0xa5, 0x9a, 0xef, 0xf1, + 0x37, 0x57, 0x99, 0xc4, 0x67, 0x51, 0x6e, 0xdf, 0x30, 0xa4, 0x86, 0x47, + 0x34, 0x5f, 0x5e, 0x3c, 0xde, 0x6e, 0x96, 0x74, 0x5c, 0xbd, 0xca, 0xa3, + 0x50, 0xe4, 0xe8, 0x63, 0xdf, 0xb0, 0xf1, 0xbe, 0xa2, 0x58, 0x23, 0x7a, + 0x4a, 0x29, 0x62, 0x1f, 0x03, 0xf1, 0xe9, 0x19, 0xdd, 0x68, 0xe8, 0x1a, + 0x7a, 0x9b, 0x40, 0x0d, 0xb0, 0x15, 0x8b, 0x14, 0x63, 0x08, 0xa4, 0x21, + 0xa6, 0x0b, 0x34, 0x8a, 0x3e, 0x76, 0x7a, 0xa8, 0x11, 0x81, 0x16, 0x12, + 0xa5, 0xc6, 0x7a, 0xf1, 0xa0, 0x20, 0xff, 0x33, 0x3b, 0xa5, 0x43, 0xc7, + 0x42, 0xd3, 0x22, 0x90, 0x16, 0xa2, 0x28, 0x18, 0xa4, 0xc7, 0x24, 0xa3, + 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, + 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, + 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, + 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, + 0x8a, 0xb0, 0xfe, 
0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, + 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, + 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, + 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, + 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, + 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, + 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, + 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, + 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, + 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, + 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, + 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, + 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, + 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, + 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, + 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, + 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, + 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, + 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, + 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, + 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, + 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, + 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, + 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, + 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, + 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, + 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, + 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, + 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, + 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, + 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, + 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, + 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, + 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, + 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, + 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, + 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, + 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, + 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, + 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, + 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, + 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, + 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, + 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, + 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, + 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, + 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, + 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, + 0x8e, 0xbc, 0x46, 
0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, + 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, + 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, + 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, + 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, + 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, + 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, + 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, + 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, + 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, + 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, + 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, + 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, + 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, + 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, + 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, + 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, + 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, + 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, + 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, + 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, + 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, + 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, + 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, + 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, + 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, + 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, + 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, + 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, + 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, + 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, + 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, + 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, + 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, + 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, + 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, + 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, + 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, + 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, + 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, + 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, + 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, + 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, + 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, + 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, + 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, + 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, + 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, + 0xea, 0x5e, 0xbd, 
0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, + 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, + 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, + 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, + 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, + 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, + 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, + 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, + 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, + 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, + 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, + 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, + 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, + 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, + 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, + 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, + 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, + 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, + 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, + 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, + 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, + 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, + 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, + 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, + 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, + 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, + 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, + 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, + 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, + 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, + 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, + 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, + 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, + 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, + 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, + 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, + 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, + 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, + 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, + 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, + 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, + 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, + 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, + 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, + 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, + 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, + 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, + 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, + 0x30, 0x09, 0x14, 
0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, + 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, + 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, + 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, + 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, + 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, + 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, + 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, + 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, + 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, + 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, + 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, + 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, + 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, + 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, + 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, + 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, + 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, + 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, + 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, + 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, + 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, + 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, + 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, + 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, + 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, + 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, + 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, + 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, + 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, + 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, + 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, + 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, + 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, + 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, + 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, + 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, + 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, + 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, + 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, + 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, + 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, + 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, + 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, + 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, + 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, + 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, + 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, + 0x6f, 0xd0, 0x59, 
0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, + 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, + 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, + 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, + 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, + 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, + 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, + 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, + 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, + 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, + 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, + 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, + 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, + 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, + 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, + 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, + 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, + 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, + 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, + 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, + 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, + 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, + 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, + 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, + 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, + 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, + 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, + 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, + 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, + 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, + 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, + 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, + 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, + 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, + 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, + 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, + 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, + 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, + 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, + 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, + 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, + 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, + 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, + 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, + 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, + 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, + 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, + 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, + 0xba, 0x39, 0x22, 
0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, + 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, + 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, + 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, + 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, + 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, + 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, + 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, + 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, + 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, + 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, + 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, + 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, + 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, + 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, + 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, + 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, + 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, + 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, + 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, + 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, + 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, + 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, + 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, + 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, + 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, + 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, + 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, + 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, + 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, + 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, + 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, + 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, + 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, + 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, + 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, + 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, + 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, + 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, + 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, + 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, + 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, + 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, + 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, + 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, + 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, + 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, + 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, + 0x80, 0xc6, 0x63, 
0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, + 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, + 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, + 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, + 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, + 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, + 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, + 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, + 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, + 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, + 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, + 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, + 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, + 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, + 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, + 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, + 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, + 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, + 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, + 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, + 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, + 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, + 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, + 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, + 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, + 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, + 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, + 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, + 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, + 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, + 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, + 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, + 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, + 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, + 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, + 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, + 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, + 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, + 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, + 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, + 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, + 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, + 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, + 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, + 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, + 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, + 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, + 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, + 0x57, 0x33, 0xfa, 
0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, + 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, + 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, + 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, + 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, + 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, + 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, + 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, + 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, + 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, + 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, + 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, + 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, + 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, + 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, + 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, + 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, + 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, + 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, + 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, + 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, + 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, + 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, + 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, + 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, + 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, + 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, + 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, + 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, + 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, + 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, + 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, + 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, + 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, + 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, + 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, + 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, + 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, + 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, + 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, + 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, + 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, + 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, + 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, + 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, + 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, + 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, + 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, + 0x61, 0xfc, 0xcc, 
0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, + 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, + 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, + 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, + 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, + 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, + 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, + 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, + 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, + 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, + 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, + 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, + 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, + 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, + 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, + 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, + 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, + 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, + 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, + 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, + 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, + 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, + 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, + 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, + 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, + 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, + 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, + 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, + 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, + 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, + 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, + 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, + 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, + 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, + 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, + 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, + 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, + 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, + 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, + 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, + 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, + 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, + 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, + 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, + 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, + 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, + 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, + 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, + 0x78, 0x8c, 0x9b, 
0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, + 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, + 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, + 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, + 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, + 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, + 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, + 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, + 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, + 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, + 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, + 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, + 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, + 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, + 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, + 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, + 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, + 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, + 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, + 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, + 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, + 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, + 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, + 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, + 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, + 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, + 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, + 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, + 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, + 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, + 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, + 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, + 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, + 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, + 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, + 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, + 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, + 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, + 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, + 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, + 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, + 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, + 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, + 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, + 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, + 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, + 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, + 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, + 0xbd, 0x7b, 0xae, 
0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, + 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, + 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, + 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, + 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, + 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, + 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, + 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, + 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, + 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, + 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, + 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, + 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, + 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, + 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, + 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, + 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, + 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, + 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, + 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, + 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, + 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, + 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, + 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, + 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, + 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, + 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, + 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, + 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, + 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, + 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, + 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, + 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, + 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, + 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, + 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, + 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, + 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, + 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, + 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, + 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, + 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, + 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, + 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, + 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, + 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, + 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, + 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, + 0x12, 0x28, 0x20, 
0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, + 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, + 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, + 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, + 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, + 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, + 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, + 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, + 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, + 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, + 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, + 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, + 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, + 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, + 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, + 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, + 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, + 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, + 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, + 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, + 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, + 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, + 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, + 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, + 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, + 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, + 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, + 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, + 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, + 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, + 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, + 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, + 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, + 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, + 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, + 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, + 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, + 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, + 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, + 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, + 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, + 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, + 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, + 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, + 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, + 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, + 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, + 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, + 0xa0, 0xb2, 0x15, 
0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, + 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, + 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, + 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, + 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, + 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, + 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, + 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, + 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, + 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, + 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, + 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, + 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, + 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, + 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, + 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, + 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, + 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, + 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, + 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, + 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, + 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, + 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, + 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, + 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, + 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, + 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, + 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, + 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, + 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, + 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, + 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, + 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, + 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, + 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, + 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, + 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, + 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, + 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, + 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, + 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, + 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, + 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, + 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, + 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, + 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, + 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, + 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, + 0x72, 0x45, 0x58, 
0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, + 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, + 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, + 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, + 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, + 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, + 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, + 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, + 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, + 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, + 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, + 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, + 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, + 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, + 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, + 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, + 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, + 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, + 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, + 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, + 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, + 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, + 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, + 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, + 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, + 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, + 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, + 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, + 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, + 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, + 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, + 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, + 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, + 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, + 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, + 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, + 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, + 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, + 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, + 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, + 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, + 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, + 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, + 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, + 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, + 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, + 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, + 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, + 0x8c, 0xc7, 0x5e, 
0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, + 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, + 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, + 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, + 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, + 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, + 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, + 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, + 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, + 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, + 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, + 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, + 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, + 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, + 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, + 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, + 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, + 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, + 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, + 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, + 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, + 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, + 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, + 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, + 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, + 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, + 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, + 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, + 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, + 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, + 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, + 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, + 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, + 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, + 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, + 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, + 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, + 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, + 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, + 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, + 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, + 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, + 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, + 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, + 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, + 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, + 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, + 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, + 0x67, 0xf5, 0x2f, 
0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, + 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, + 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, + 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, + 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, + 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, + 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, + 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, + 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, + 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, + 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, + 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, + 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, + 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, + 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, + 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, + 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, + 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, + 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, + 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, + 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, + 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, + 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, + 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, + 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, + 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, + 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, + 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, + 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, + 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, + 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, + 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, + 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, + 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, + 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, + 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, + 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, + 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, + 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, + 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, + 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, + 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, + 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, + 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, + 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, + 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, + 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, + 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, + 0xf9, 0x98, 0x04, 
0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, + 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, + 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, + 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, + 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, + 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, + 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, + 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, + 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, + 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, + 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, + 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, + 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, + 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, + 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, + 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, + 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, + 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, + 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, + 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, + 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, + 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, + 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, + 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, + 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, + 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, + 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, + 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, + 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, + 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, + 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, + 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, + 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, + 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, + 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, + 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, + 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, + 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, + 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, + 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, + 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, + 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, + 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, + 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, + 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, + 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, + 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, + 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, + 0x19, 0x37, 0xe8, 
0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, + 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, + 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, + 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, + 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, + 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, + 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, + 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, + 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, + 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, + 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, + 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, + 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, + 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, + 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, + 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, + 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, + 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, + 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, + 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, + 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, + 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, + 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, + 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, + 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, + 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, + 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, + 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, + 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, + 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, + 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, + 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, + 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, + 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, + 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, + 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, + 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, + 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, + 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, + 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, + 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, + 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, + 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, + 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, + 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, + 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, + 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, + 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, + 0xf7, 0x5d, 0x1c, 
0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, + 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, + 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, + 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, + 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, + 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, + 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, + 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, + 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, + 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, + 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, + 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, + 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, + 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, + 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, + 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, + 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, + 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, + 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, + 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, + 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, + 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, + 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, + 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, + 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, + 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, + 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, + 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, + 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, + 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, + 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, + 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, + 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, + 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, + 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, + 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, + 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, + 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, + 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, + 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, + 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, + 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, + 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, + 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, + 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, + 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, + 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, + 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, + 0x50, 0x40, 0x63, 
0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, + 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, + 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, + 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, + 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, + 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, + 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, + 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, + 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, + 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, + 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, + 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, + 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, + 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, + 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, + 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, + 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, + 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, + 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, + 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, + 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, + 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, + 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, + 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, + 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, + 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, + 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, + 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, + 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, + 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, + 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, + 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, + 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, + 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, + 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, + 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, + 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, + 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, + 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, + 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, + 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, + 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, + 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, + 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, + 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, + 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, + 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, + 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, + 0x64, 0x2b, 0x99, 
0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, + 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, + 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, + 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, + 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, + 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, + 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, + 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, + 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, + 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, + 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, + 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, + 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, + 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, + 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, + 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, + 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, + 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, + 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, + 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, + 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, + 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, + 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, + 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, + 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, + 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, + 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, + 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, + 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, + 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, + 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, + 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, + 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, + 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, + 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, + 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, + 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, + 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, + 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, + 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, + 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, + 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, + 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, + 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, + 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, + 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, + 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, + 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, + 0x8a, 0xb0, 0xfe, 
0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, + 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, + 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, + 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, + 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, + 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, + 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, + 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, + 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, + 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, + 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, + 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, + 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, + 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, + 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, + 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, + 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, + 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, + 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, + 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, + 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, + 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, + 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, + 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, + 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, + 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, + 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, + 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, + 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, + 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, + 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, + 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, + 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, + 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, + 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, + 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, + 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, + 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, + 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, + 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, + 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, + 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, + 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, + 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, + 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, + 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, + 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, + 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, + 0x8e, 0xbc, 0x46, 
0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, + 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, + 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, + 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, + 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, + 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, + 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, + 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, + 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, + 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, + 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, + 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, + 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, + 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, + 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, + 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, + 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, + 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, + 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, + 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, + 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, + 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, + 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, + 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, + 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, + 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, + 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, + 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, + 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, + 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, + 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, + 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, + 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, + 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, + 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, + 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, + 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, + 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, + 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, + 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, + 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, + 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, + 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, + 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, + 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, + 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, + 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, + 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, + 0xea, 0x5e, 0xbd, 
0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, + 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, + 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, + 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, + 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, + 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, + 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, + 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, + 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, + 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, + 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, + 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, + 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, + 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, + 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, + 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, + 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, + 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, + 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, + 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, + 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, + 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, + 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, + 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, + 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, + 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, + 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, + 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, + 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, + 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, + 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, + 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, + 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, + 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, + 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, + 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, + 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, + 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, + 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, + 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, + 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, + 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, + 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, + 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, + 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, + 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, + 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, + 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, + 0x30, 0x09, 0x14, 
0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, + 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, + 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, + 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, + 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, + 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, + 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, + 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, + 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, + 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, + 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, + 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, + 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, + 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, + 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, + 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, + 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, + 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, + 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, + 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, + 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, + 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, + 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, + 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, + 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, + 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, + 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, + 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, + 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, + 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, + 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, + 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, + 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, + 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, + 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, + 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, + 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, + 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, + 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, + 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, + 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, + 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, + 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, + 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, + 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, + 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, + 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, + 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, + 0x6f, 0xd0, 0x59, 
0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, + 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, + 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, + 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, + 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, + 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, + 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, + 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, + 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, + 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, + 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, + 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, + 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, + 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, + 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, + 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, + 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, + 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, + 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, + 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, + 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, + 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, + 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, + 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, + 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, + 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, + 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, + 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, + 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, + 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, + 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, + 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, + 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, + 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, + 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, + 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, + 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, + 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, + 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, + 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, + 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, + 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, + 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, + 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, + 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, + 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, + 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, + 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, + 0xba, 0x39, 0x22, 
0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, + 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, + 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, + 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, + 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, + 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, + 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, + 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, + 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, + 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, + 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, + 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, + 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, + 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, + 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, + 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, + 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, + 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, + 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, + 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, + 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, + 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, + 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, + 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, + 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, + 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, + 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, + 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, + 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, + 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, + 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, + 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, + 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, + 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, + 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, + 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, + 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, + 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, + 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, + 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, + 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, + 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, + 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, + 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, + 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, + 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, + 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, + 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, + 0x80, 0xc6, 0x63, 
0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, + 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, + 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, + 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, + 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, + 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, + 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, + 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, + 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, + 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, + 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, + 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, + 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, + 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, + 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, + 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, + 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, + 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, + 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, + 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, + 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, + 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, + 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, + 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, + 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, + 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, + 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, + 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, + 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, + 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, + 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, + 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, + 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, + 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, + 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, + 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, + 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, + 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, + 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, + 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, + 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, + 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, + 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, + 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, + 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, + 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, + 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, + 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, + 0x57, 0x33, 0xfa, 
0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, + 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, + 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, + 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, + 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, + 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, + 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, + 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, + 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, + 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, + 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, + 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, + 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, + 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, + 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, + 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, + 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, + 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, + 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, + 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, + 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, + 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, + 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, + 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, + 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, + 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, + 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, + 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, + 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, + 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, + 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, + 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, + 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, + 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, + 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, + 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, + 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, + 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, + 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, + 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, + 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, + 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, + 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, + 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, + 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, + 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, + 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, + 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, + 0x61, 0xfc, 0xcc, 
0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, + 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, + 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, + 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, + 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, + 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, + 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, + 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, + 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, + 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, + 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, + 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, + 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, + 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, + 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, + 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, + 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, + 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, + 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, + 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, + 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, + 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, + 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, + 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, + 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, + 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, + 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, + 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, + 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, + 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, + 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, + 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, + 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, + 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, + 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, + 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, + 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, + 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, + 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, + 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, + 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, + 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, + 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, + 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, + 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, + 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, + 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, + 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, + 0x78, 0x8c, 0x9b, 
0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, + 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, + 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, + 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, + 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, + 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, + 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, + 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, + 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, + 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, + 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, + 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, + 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, + 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, + 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, + 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, + 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, + 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, + 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, + 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, + 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, + 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, + 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, + 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, + 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, + 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, + 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, + 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, + 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, + 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, + 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, + 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, + 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, + 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, + 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, + 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, + 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, + 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, + 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, + 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, + 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, + 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, + 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, + 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, + 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, + 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, + 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, + 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, + 0xbd, 0x7b, 0xae, 
0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, + 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, + 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, + 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, + 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, + 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, + 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, + 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, + 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, + 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, + 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, + 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, + 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, + 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, + 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, + 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, + 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, + 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, + 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, + 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, + 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, + 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, + 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, + 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, + 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, + 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, + 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, + 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, + 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, + 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, + 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, + 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, + 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, + 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, + 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, + 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, + 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, + 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, + 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, + 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, + 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, + 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, + 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, + 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, + 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, + 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, + 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, + 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, + 0x12, 0x28, 0x20, 
0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, + 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, + 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, + 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, + 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, + 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, + 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, + 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, + 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, + 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, + 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, + 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, + 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, + 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, + 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, + 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, + 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, + 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, + 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, + 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, + 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, + 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, + 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, + 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, + 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, + 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, + 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, + 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, + 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, + 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, + 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, + 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, + 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, + 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, + 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, + 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, + 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, + 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, + 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, + 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, + 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, + 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, + 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, + 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, + 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, + 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, + 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, + 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, + 0xa0, 0xb2, 0x15, 
0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, + 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, + 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, + 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, + 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, + 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, + 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, + 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, + 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, + 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, + 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, + 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, + 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, + 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, + 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, + 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, + 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, + 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, + 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, + 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, + 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, + 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, + 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, + 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, + 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, + 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, + 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, + 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, + 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, + 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, + 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, + 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, + 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, + 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, + 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, + 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, + 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, + 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, + 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, + 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, + 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, + 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, + 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, + 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, + 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, + 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, + 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, + 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, + 0x72, 0x45, 0x58, 
0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, + 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, + 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, + 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, + 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, + 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, + 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, + 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, + 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, + 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, + 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, + 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, + 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, + 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, + 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, + 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, + 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, + 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, + 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, + 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, + 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, + 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, + 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, + 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, + 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, + 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, + 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, + 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, + 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, + 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, + 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, + 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, + 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, + 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, + 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, + 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, + 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, + 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, + 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, + 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, + 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, + 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, + 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, + 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, + 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, + 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, + 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, + 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, + 0x8c, 0xc7, 0x5e, 
0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, + 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, + 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, + 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, + 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, + 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, + 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, + 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, + 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, + 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, + 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, + 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, + 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, + 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, + 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, + 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, + 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, + 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, + 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, + 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, + 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, + 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, + 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, + 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, + 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, + 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, + 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, + 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, + 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, + 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, + 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, + 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, + 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, + 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, + 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, + 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, + 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, + 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, + 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, + 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, + 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, + 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, + 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, + 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, + 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, + 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, + 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, + 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, + 0x67, 0xf5, 0x2f, 
0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, + 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, + 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, + 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, + 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, + 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, + 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, + 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, + 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, + 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, + 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, + 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, + 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, + 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, + 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, + 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, + 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, + 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, + 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, + 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, + 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, + 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, + 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, + 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, + 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, + 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, + 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, + 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, + 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, + 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, + 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, + 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, + 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, + 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, + 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, + 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, + 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, + 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, + 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, + 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, + 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, + 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, + 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, + 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, + 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, + 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, + 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, + 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, + 0xf9, 0x98, 0x04, 
0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, + 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, + 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, + 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, + 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, + 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, + 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, + 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, + 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, + 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, + 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, + 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, + 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, + 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, + 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, + 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, + 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, + 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, + 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, + 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, + 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, + 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, + 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, + 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, + 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, + 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, + 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, + 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, + 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, + 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, + 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, + 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, + 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, + 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, + 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, + 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, + 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, + 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, + 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, + 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, + 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, + 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, + 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, + 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, + 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, + 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, + 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, + 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, + 0x19, 0x37, 0xe8, 
0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, + 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, + 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, + 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, + 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, + 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, + 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, + 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, + 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, + 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, + 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, + 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, + 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, + 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, + 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, + 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, + 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, + 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, + 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, + 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, + 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, + 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, + 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, + 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, + 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, + 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, + 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, + 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, + 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, + 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, + 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, + 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, + 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, + 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, + 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, + 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, + 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, + 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, + 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, + 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, + 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, + 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, + 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, + 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, + 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, + 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, + 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, + 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, + 0xf7, 0x5d, 0x1c, 
0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, + 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, + 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, + 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, + 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, + 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, + 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, + 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, + 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, + 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, + 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, + 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, + 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, + 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, + 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, + 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, + 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, + 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, + 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, + 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, + 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, + 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, + 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, + 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, + 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, + 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, + 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, + 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, + 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, + 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, + 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, + 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, + 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, + 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, + 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, + 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, + 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, + 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, + 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, + 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, + 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, + 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, + 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, + 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, + 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, + 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, + 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, + 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, + 0x50, 0x40, 0x63, 
+    0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99,
+    0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22,
+    0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf,
+    0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14,
+    0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f,
+    0x52, 0xf5, 0xee, 0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0,
+    0x80, 0xc6, 0x63, 0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa,
+    0x97, 0xaf, 0x75, 0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04,
+    0x06, 0x33, 0x1d, 0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4,
+    0xbd, 0x7b, 0xae, 0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20,
+    0x31, 0x98, 0xeb, 0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5,
+    0xeb, 0xdd, 0x74, 0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01,
+    0x8c, 0xc7, 0x5e, 0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f,
+    0x5e, 0xeb, 0xa3, 0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c,
+    0x66, 0x3a, 0xf1, 0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a,
+    0xf7, 0x5d, 0x1c, 0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63,
+    0x31, 0xd7, 0x88, 0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7,
+    0xba, 0xe8, 0xe4, 0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19,
+    0x8e, 0xbc, 0x46, 0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd,
+    0xd7, 0x47, 0x24, 0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc,
+    0x75, 0xe2, 0x32, 0x6f, 0xd0, 0x59, 0x0a, 0xe6, 0x7f, 0x52, 0xf5, 0xee,
+    0xba, 0x39, 0x22, 0xac, 0x3f, 0x99, 0x80, 0x48, 0xa0, 0x80, 0xc6, 0x63,
+    0xaf, 0x11, 0x93, 0x7e, 0x82, 0xc8, 0x57, 0x33, 0xfa, 0x97, 0xaf, 0x75,
+    0xd1, 0xc9, 0x15, 0x61, 0xfc, 0xcc, 0x02, 0x45, 0x04, 0x06, 0x33, 0x1d,
+    0x78, 0x8c, 0x9b, 0xf4, 0x16, 0x42, 0xb9, 0x9f, 0xd4, 0xbd, 0x7b, 0xae,
+    0x8e, 0x48, 0xab, 0x0f, 0xe6, 0x60, 0x12, 0x28, 0x20, 0x31, 0x98, 0xeb,
+    0xc4, 0x64, 0xdf, 0xa0, 0xb2, 0x15, 0xcc, 0xfe, 0xa5, 0xeb, 0xdd, 0x74,
+    0x72, 0x45, 0x58, 0x7f, 0x33, 0x00, 0x91, 0x41, 0x01, 0x8c, 0xc7, 0x5e,
+    0x23, 0x26, 0xfd, 0x05, 0x90, 0xae, 0x67, 0xf5, 0x2f, 0x5e, 0xeb, 0xa3,
+    0x92, 0x2a, 0xc3, 0xf9, 0x98, 0x04, 0x8a, 0x08, 0x0c, 0x66, 0x3a, 0xf1,
+    0x19, 0x37, 0xe8, 0x2c, 0x85, 0x73, 0x3f, 0xa9, 0x7a, 0xf7, 0x5d, 0x1c,
+    0x91, 0x56, 0x1f, 0xcc, 0xc0, 0x24, 0x50, 0x40, 0x63, 0x31, 0xd7, 0x88,
+    0xc9, 0xbf, 0x41, 0x64, 0x2b, 0x99, 0xfd, 0x4b, 0xd7, 0xba, 0xe8, 0xe4,
+    0x8a, 0xb0, 0xfe, 0x66, 0x01, 0x22, 0x82, 0x03, 0x19, 0x8e, 0xbc, 0x46,
+    0x4d, 0xfa, 0x0b, 0x21, 0x5c, 0xcf, 0xea, 0x5e, 0xbd, 0xd7, 0x47, 0x24,
+    0x55, 0x87, 0xf3, 0x30, 0x09, 0x14, 0x10, 0x18, 0xcc, 0x75, 0xe2, 0x32,
+    0x6f, 0xd0, 0xc0,
+};
+static_assert(sizeof(kBytesTestReadSymbol8) == kNumBytesTestReadSymbol8, "");
+
+// The kBytesTestReadSymbol9[] array was encoded by using the following libaom
+// code:
+//
+// aom_cdf_prob cdf[4][10] = {
+//   // pmf: 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9
+//   { 32768 - 3641, 32768 - 7282, 32768 - 10923, 32768 - 14564, 32768 - 18204,
+//     32768 - 21845, 32768 - 25486, 32768 - 29127, 0, 0 },
+//   // pmf: 3/18, 2/18, 2/18, 2/18, 2/18, 2/18, 2/18, 2/18, 1/18
+//   { 32768 - 5461, 32768 - 9102, 32768 - 12743, 32768 - 16384, 32768 - 20025,
+//     32768 - 23666, 32768 - 27307, 32768 - 30948, 0, 0 },
+//   // pmf: 1/18, 2/18, 2/18, 2/18, 2/18, 2/18, 2/18, 2/18, 3/18
+//   { 32768 - 1820, 32768 - 5461, 32768 - 9102, 32768 - 12743, 32768 - 16384,
+//     32768 - 20025, 32768 - 23666, 32768 - 27307, 0, 0 },
+//   // pmf: 1/18, 2/18, 2/18, 2/18, 4/18, 2/18, 2/18, 2/18, 1/18
+//   { 32768 - 1820, 32768 - 5461, 32768 - 9102, 32768 - 12743, 32768 - 20025,
+//     32768 - 23666, 32768 - 27307, 32768 - 30948, 0, 0 },
+// };
+// constexpr int kSymbols[18][4] = { { 0, 4, 8, 3 },  //
+//                                   { 1, 5, 7, 2 },  //
+//                                   { 2, 6, 6, 1 },  //
+//                                   { 3, 7, 5, 0 },  //
+//                                   { 4, 8, 4, 8 },  //
+//                                   { 5, 0, 3, 7 },  //
+//                                   { 6, 1, 2, 6 },  //
+//                                   { 7, 2, 1, 5 },  //
+//                                   { 8, 3, 0, 4 },  //
+//                                   { 0, 0, 8, 7 },  //
+//                                   { 2, 1, 6, 5 },  //
+//                                   { 4, 3, 4, 3 },  //
+//                                   { 6, 5, 2, 1 },  //
+//                                   { 8, 7, 7, 6 },  //
+//                                   { 1, 0, 5, 4 },  //
+//                                   { 3, 2, 3, 2 },  //
+//                                   { 5, 4, 1, 4 },  //
+//                                   { 7, 6, 8, 4 } };
+// const unsigned int kBufferSize = 65536;
+// uint8_t bw_buffer[kBufferSize];
+// aom_writer bw;
+// bw.allow_update_cdf = 1;
+// aom_start_encode(&bw, bw_buffer);
+// for (int i = 0; i < 128; ++i) {
+//   for (int j = 0; j < 18; ++j) {
+//     for (int k = 0; k < 4; ++k) {
+//       aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 9);
+//     }
+//   }
+// }
+// aom_stop_encode(&bw);
+// printf("constexpr size_t kNumBytes = %u;\n", bw.pos);
+// printf("constexpr uint8_t kBytes[] = {");
+// int count = 0;
+// for (unsigned int i = 0; i < bw.pos; ++i) {
+//   if (count++ % 12 == 0) {
+//     printf("\n ");
+//   } else {
+//     printf(" ");
+//   }
+//   printf("0x%02x,", bw_buffer[i]);
+// }
+// printf("\n};\n");
+
+constexpr size_t kNumBytesTestReadSymbol9 = 3650;
+constexpr uint8_t kBytesTestReadSymbol9[] = {
+    0x10, 0xe6, 0x62, 0x17, 0x4c, 0x5e, 0xe0, 0x8c, 0x41, 0x75, 0x38, 0xda,
+    0xb6, 0x33, 0xc7, 0x0e, 0x0f, 0x62, 0x87, 0x29, 0xbe, 0x28, 0x8b, 0x81,
+    0x71, 0xab, 0x0d, 0xfe, 0x61, 0xf9, 0x96, 0x85, 0xfe, 0x78, 0x18, 0xe6,
+    0x57, 0xa7, 0xf0, 0xd3, 0xd5, 0x62, 0x37, 0x9a, 0x3d, 0xc4, 0xad, 0x75,
+    0x35, 0xc1, 0xe9, 0x63, 0xeb, 0x9c, 0xd3, 0xf4, 0xdb, 0xc0, 0xf3, 0x67,
+    0x14, 0xbd, 0xde, 0xf7, 0xd1, 0x51, 0xf1, 0x62, 0x28, 0xd5, 0x39, 0x99,
+    0x82, 0x5b, 0x9c, 0x3a, 0x37, 0x85, 0xe7, 0x48, 0x28, 0x02, 0x2d, 0xf1,
+    0x15, 0x55, 0x77, 0x02, 0x2e, 0x62, 0x53, 0xf6, 0x8a, 0x53, 0x44, 0xfa,
+    0xe0, 0xff, 0x05, 0xae, 0xdc, 0x30, 0xee, 0x36, 0x29, 0x80, 0xd5, 0x0a,
+    0xa6, 0x5f, 0x53, 0xa2, 0x31, 0xc0, 0x5b, 0x2a, 0xa5, 0xa5, 0xd2, 0xc0,
+    0x8d, 0x96, 0x66, 0x25, 0x93, 0x9e, 0xdc, 0x0b, 0x2f, 0xea, 0xe2, 0x51,
+    0x0b, 0x12, 0x87, 0x90, 0x79, 0xe7, 0x8e, 0x6f, 0xc6, 0x99, 0x4b, 0x6a,
+    0x50, 0x06, 0xf3, 0x3d, 0xf5, 0x25, 0x72, 0xc5, 0x9e, 0xab, 0x7b, 0x5b,
+    0x15, 0xf5, 0xeb, 0xae, 0x02, 0xe4, 0x90, 0x2b, 0x15, 0x66, 0xf7, 0x50,
+    0xfa, 0x46, 0x74, 0xae, 0xd4, 0x7f, 0xd4, 0x0b, 0xbf, 0xbc, 0x83, 0x60,
+    0x6f, 0x25, 0x87, 0xde, 0xce, 0xb3, 0x86, 0x5a, 0x13, 0x00, 0x31, 0xf2,
+    0x75, 0xca, 0x08, 0x71, 0xd2, 0xf4, 0xa9, 0xf9, 0x40, 0x23, 0xa7, 0x5e,
+    0x50, 0x63, 0x64, 0x1d, 0xa2, 0x50, 0x2f, 0x01, 0x4c, 0x11, 0x8b, 0xcb,
+    0x92, 0x40, 0x9d, 0x94, 0x50, 0x0a, 0xf5, 0x3b, 0xfc, 0x32, 0x1a, 0xbd,
+    0x48, 0x73, 0xe7, 0x93, 0x0f, 0x53, 0xb2, 0x8e, 0xac, 0xef, 0x22, 0x2f,
+    0x3e, 0xb0, 0x81, 0xc0, 0x06, 0x9b, 0x14, 0x5c, 0xa6, 0x16, 0xca, 0xa5,
+    0x79, 0xd2, 0x6a, 0xd3, 0xfe, 0x93, 0x33, 0x2f, 0xdb, 0xcb, 0xca, 0xb3,
+    0x1d, 0xc5, 0x56, 0x65, 0x53, 0x7f, 0xb9, 0x41, 0xe1, 0x54, 0x31, 0xa2,
+    0x8c, 0x92, 0xc8, 0x04, 0xf7, 0x9d, 0x26, 0xad, 0x35, 0x00, 0x5a, 0xb2,
+    0x78, 0x43, 0x14, 0xc2, 0xeb, 0x3a, 0x26, 0x4d, 0x49, 0x5d, 0x33, 0xe4,
+    0xa9, 0xea, 0xd3, 0x67, 0xbf, 0xbc, 0xb6, 0x2e, 0x1c, 0xf7, 0xd0, 0x98,
+    0x13, 0x0d, 0x7c, 0x94, 0x02, 0x28, 0x3e, 0x8a, 0xe5, 0x0c, 0x75, 0x82,
+    0xe5, 0x81, 0x98, 0x87, 0x88, 0x97, 0x86, 0xd6, 0x46, 0x2c, 0x9c, 0x85,
+    0xc2, 0x99, 0xfd, 0x0a, 0x68, 0xbf, 0x67, 0xfc, 0x17, 0xc7, 0x11, 0x54,
+    0xd1, 0x20, 0x9d, 0x83, 0x52, 0x84, 0x5d, 0x4b,
0x62, 0xbf, 0x16, 0x5d, + 0x8e, 0x72, 0x46, 0xde, 0xb1, 0x77, 0xfb, 0x39, 0x98, 0xf0, 0x4d, 0xa6, + 0x7a, 0x7d, 0x1c, 0x16, 0xe9, 0x1e, 0x86, 0x7e, 0xf9, 0x22, 0x58, 0x93, + 0xea, 0x2e, 0x26, 0xc7, 0xfb, 0xd1, 0xb3, 0xc7, 0x99, 0xb1, 0x91, 0x67, + 0xf1, 0xa3, 0xe0, 0xd2, 0xe8, 0x17, 0x17, 0xd7, 0x0b, 0x7a, 0xd4, 0xed, + 0x9e, 0x72, 0x4e, 0xa2, 0x37, 0xc9, 0xd2, 0x16, 0x5d, 0x8b, 0xda, 0xdb, + 0x5c, 0x46, 0x05, 0x3e, 0xf7, 0xc8, 0x3a, 0xd5, 0xaf, 0xd9, 0x72, 0x82, + 0xbf, 0x96, 0xea, 0x09, 0xd3, 0xd5, 0xfe, 0x43, 0x24, 0xae, 0x95, 0x3d, + 0x6c, 0x68, 0x54, 0xad, 0xb5, 0xc4, 0x60, 0x54, 0x08, 0x3c, 0x57, 0x61, + 0xa1, 0x11, 0x21, 0x7f, 0xca, 0x48, 0x59, 0xb4, 0x1c, 0x39, 0x0d, 0xf2, + 0xdc, 0x62, 0xf0, 0xbb, 0x95, 0x39, 0x51, 0xe9, 0xdb, 0xf1, 0x5d, 0xd1, + 0x43, 0x83, 0x8a, 0xb1, 0x8d, 0x36, 0x39, 0x83, 0xc6, 0x94, 0x30, 0xbe, + 0xb6, 0x2f, 0x39, 0x05, 0xad, 0xcd, 0xf9, 0x4c, 0xc2, 0x34, 0xc7, 0x81, + 0x68, 0xb1, 0x20, 0x1d, 0xea, 0xd3, 0x8c, 0xca, 0xff, 0x4d, 0x94, 0xe1, + 0x3e, 0xc2, 0x74, 0x90, 0xed, 0x56, 0x3c, 0x1b, 0x5b, 0xf6, 0x40, 0xf9, + 0x3b, 0x94, 0x94, 0x23, 0xc6, 0x48, 0x6a, 0x59, 0xef, 0x04, 0xb7, 0x9f, + 0x55, 0x9c, 0x6f, 0x81, 0x73, 0xec, 0x27, 0x49, 0x0e, 0xd5, 0x63, 0xc1, + 0xb5, 0xbf, 0x64, 0x0f, 0x93, 0xb9, 0x49, 0x42, 0x3c, 0x64, 0x86, 0xa5, + 0x9e, 0xf0, 0x4b, 0x79, 0xf5, 0x59, 0xc7, 0xc5, 0x01, 0x6f, 0xbd, 0x6a, + 0x66, 0x93, 0x99, 0x47, 0xb6, 0xf7, 0xfa, 0x21, 0x72, 0x81, 0x71, 0x40, + 0x36, 0x81, 0xde, 0x5d, 0xdf, 0xdf, 0x30, 0x53, 0x03, 0x70, 0xfb, 0xb2, + 0x2d, 0x37, 0xeb, 0x19, 0xbc, 0xd2, 0x90, 0x44, 0x25, 0x42, 0x06, 0x30, + 0xc8, 0xcf, 0x4b, 0x0a, 0x01, 0x13, 0x5e, 0x17, 0x91, 0xc7, 0xcb, 0x79, + 0xed, 0x06, 0x39, 0xc1, 0x2e, 0x92, 0x29, 0xf5, 0xff, 0x24, 0xe7, 0x2b, + 0x3f, 0x19, 0x35, 0x6b, 0x3d, 0x69, 0xa2, 0x19, 0x20, 0x53, 0xd4, 0xca, + 0x08, 0x35, 0x6e, 0xe0, 0x5a, 0x9a, 0x9d, 0x48, 0xf5, 0x20, 0x24, 0x20, + 0x33, 0x94, 0x6b, 0x33, 0xdd, 0x78, 0xbf, 0x62, 0xf1, 0x43, 0x08, 0x97, + 0x53, 0x98, 0xe4, 0x17, 0x27, 0xfc, 0xe8, 0xf1, 0xb8, 0x4c, 0xb3, 0x79, + 0xc8, 0x05, 0x21, 0x1b, 0xe8, 0x56, 0xd2, 0x5f, 0xb6, 0x90, 0x14, 0x0c, + 0x96, 0x38, 0xc6, 0xc3, 0x6d, 0x10, 0xbf, 0xc6, 0x28, 0xfe, 0x1f, 0x13, + 0x81, 0x04, 0xeb, 0x37, 0x9c, 0x80, 0x52, 0x47, 0x0f, 0xa0, 0x6e, 0xcd, + 0x9c, 0x44, 0xdd, 0x61, 0x9c, 0x8f, 0xb2, 0xf5, 0xe0, 0xa0, 0x2b, 0x2f, + 0xe7, 0x67, 0xd0, 0xd7, 0x29, 0x08, 0x72, 0xee, 0xd5, 0x60, 0xb9, 0xbb, + 0x1b, 0x12, 0xce, 0x60, 0x98, 0xb9, 0x40, 0xd3, 0xd9, 0x77, 0x5d, 0x6b, + 0x78, 0xaa, 0x9a, 0x47, 0x2a, 0xf5, 0x38, 0xbb, 0xbe, 0x3a, 0x82, 0x6a, + 0xbf, 0x8b, 0x67, 0x7e, 0xa4, 0x78, 0xbf, 0xcf, 0x58, 0xce, 0x86, 0x2e, + 0x34, 0xb7, 0x76, 0x99, 0xa5, 0xf1, 0x0c, 0xa9, 0x1c, 0x9f, 0xad, 0xcb, + 0xac, 0xf4, 0x03, 0x60, 0xe0, 0x22, 0xfe, 0x02, 0x34, 0x9a, 0x14, 0xb9, + 0x11, 0xea, 0x4c, 0x3a, 0x59, 0xaa, 0xec, 0x8f, 0x82, 0x49, 0x23, 0xa2, + 0xd0, 0xf7, 0xc3, 0xf0, 0xaa, 0x2d, 0xb2, 0xb8, 0xce, 0x02, 0x2f, 0xe0, + 0x23, 0x49, 0xa1, 0x38, 0x12, 0xba, 0xab, 0x9f, 0x60, 0xe4, 0x0d, 0xfa, + 0x2b, 0xcc, 0xad, 0x6a, 0x06, 0xca, 0x38, 0x82, 0xc5, 0x88, 0x10, 0xb6, + 0xf5, 0xf6, 0x06, 0x7b, 0x03, 0x9c, 0xe4, 0x89, 0xaf, 0xdb, 0x66, 0x45, + 0xeb, 0x2c, 0x28, 0xe2, 0x40, 0x08, 0x44, 0xe2, 0x8a, 0x91, 0x19, 0x04, + 0x29, 0x46, 0xa7, 0xb5, 0x78, 0xae, 0x05, 0xcc, 0x38, 0x9f, 0xd8, 0x58, + 0xc9, 0x79, 0xf9, 0xad, 0x77, 0x66, 0x49, 0x62, 0xef, 0x13, 0x72, 0xee, + 0xda, 0x37, 0xb5, 0xd7, 0xf1, 0x51, 0x5d, 0x16, 0x11, 0xf3, 0x91, 0xf2, + 0x13, 0x49, 0x09, 0x50, 0x15, 0xc6, 0x48, 0xe6, 0xe9, 0x4c, 0xf0, 0x06, + 0x14, 0x3f, 0xef, 0x46, 0x15, 0xaf, 0x96, 0x0d, 
0x17, 0x51, 0x08, 0xf2, + 0xe1, 0xc9, 0xb9, 0x1d, 0x8d, 0x8f, 0x74, 0x25, 0x04, 0x1f, 0x2c, 0x62, + 0x67, 0xe4, 0x4b, 0xdc, 0x67, 0x39, 0x2c, 0x7d, 0x3a, 0x1e, 0x6f, 0x5b, + 0x0b, 0xab, 0x0b, 0x1f, 0x64, 0x37, 0x19, 0x4f, 0x6b, 0x07, 0x05, 0xff, + 0x6e, 0x89, 0x8f, 0x22, 0x7d, 0x28, 0xd9, 0x3b, 0x9a, 0xe2, 0x3f, 0xff, + 0xc2, 0xb1, 0xca, 0x05, 0xbc, 0x05, 0xa5, 0xe7, 0x2d, 0x66, 0xf7, 0x37, + 0x92, 0xd2, 0xb4, 0x35, 0x26, 0x3f, 0x8c, 0x0c, 0x22, 0xa5, 0x5f, 0x5e, + 0x9c, 0x01, 0x46, 0x91, 0xe7, 0xa2, 0x92, 0x97, 0x0a, 0x19, 0x85, 0x2f, + 0x54, 0xe3, 0xa8, 0x26, 0xab, 0xe6, 0xb5, 0xd9, 0x71, 0x19, 0xb7, 0x41, + 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, + 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, + 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, + 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, + 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, + 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, + 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, + 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, + 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, + 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, + 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, + 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, + 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, + 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, + 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, + 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, + 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, + 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, + 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, + 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, + 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, + 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, + 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, + 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, + 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, + 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, + 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, + 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, + 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, + 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, + 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, + 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, + 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, + 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, + 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, + 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, + 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, + 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, + 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, + 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 
0x0e, 0x00, 0xac, 0x09, + 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, + 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, + 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, + 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, + 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, + 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, + 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, + 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, + 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, + 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, + 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, + 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, + 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, + 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, + 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, + 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, + 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, + 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, + 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, + 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, + 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, + 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, + 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, + 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, + 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, + 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, + 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, + 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, + 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, + 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, + 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, + 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, + 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, + 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, + 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, + 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, + 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, + 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, + 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, + 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, + 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, + 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, + 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, + 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, + 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, + 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, + 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, + 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 
0x9b, 0x74, 0x11, 0x1e, + 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, + 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, + 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, + 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, + 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, + 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, + 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, + 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, + 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, + 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, + 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, + 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, + 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, + 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, + 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, + 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, + 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, + 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, + 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, + 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, + 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, + 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, + 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, + 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, + 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, + 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, + 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, + 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, + 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, + 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, + 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, + 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, + 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, + 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, + 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, + 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, + 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, + 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, + 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, + 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, + 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, + 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, + 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, + 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, + 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, + 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, + 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, + 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 
0xe7, 0xed, 0xdf, 0xa8, + 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, + 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, + 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, + 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, + 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, + 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, + 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, + 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, + 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, + 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, + 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, + 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, + 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, + 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, + 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, + 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, + 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, + 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, + 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, + 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, + 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, + 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, + 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, + 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, + 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, + 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, + 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, + 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, + 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, + 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, + 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, + 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, + 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, + 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, + 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, + 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, + 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, + 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, + 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, + 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, + 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, + 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, + 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, + 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, + 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, + 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, + 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, + 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 
0xb8, 0xae, 0xb0, 0xbe, + 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, + 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, + 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, + 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, + 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, + 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, + 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, + 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, + 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, + 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, + 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, + 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, + 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, + 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, + 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, + 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, + 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, + 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, + 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, + 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, + 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, + 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, + 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, + 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, + 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, + 0x11, 0xea, 0x4b, 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, + 0x0b, 0xe2, 0x7d, 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, + 0x06, 0x99, 0x9b, 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, + 0xf8, 0x39, 0xb8, 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, + 0x60, 0xe0, 0x0a, 0xc0, 0x90, 0x69, 0x99, 0xb7, 0x41, 0x11, 0xea, 0x4b, + 0x6e, 0x7e, 0xdd, 0xfa, 0x8f, 0x83, 0x9b, 0x8a, 0xeb, 0x0b, 0xe2, 0x7d, + 0xe6, 0xee, 0x8f, 0x40, 0x36, 0x0e, 0x00, 0xac, 0x09, 0x06, 0x99, 0x9b, + 0x74, 0x11, 0x1e, 0xa4, 0xb6, 0xe7, 0xed, 0xdf, 0xa8, 0xf8, 0x39, 0xb8, + 0xae, 0xb0, 0xbe, 0x27, 0xde, 0x6e, 0xe8, 0xf4, 0x03, 0x60, 0xe0, 0x0a, + 0xc0, 0x98, +}; +static_assert(sizeof(kBytesTestReadSymbol9) == kNumBytesTestReadSymbol9, ""); + +// The kBytesTestReadSymbol10[] array was encoded by using the following libaom +// code: +// +// aom_cdf_prob cdf[4][11] = { +// // pmf: 1/10, 1/10, 1/10, 1/10, 1/10, 1/10, 1/10, 1/10, 1/10, 1/10 +// { 32768 - 3277, 32768 - 6554, 32768 - 9830, 32768 - 13107, 32768 - 16384, +// 32768 - 19661, 32768 - 22938, 32768 - 26214, 32768 - 29491, 0, 0 }, +// // pmf: 3/20, 2/20, 2/20, 2/20, 2/20, 2/20, 2/20, 2/20, 2/20, 1/20 +// { 32768 - 4915, 32768 - 8192, 32768 - 11469, 32768 - 14746, 32768 - 18022, +// 32768 - 21299, 32768 - 24576, 32768 - 27853, 32768 - 31130, 0, 0 }, +// // pmf: 1/20, 2/20, 2/20, 2/20, 2/20, 2/20, 2/20, 2/20, 2/20, 3/20 +// { 32768 - 1638, 32768 - 4915, 32768 - 8192, 32768 - 11469, 32768 - 14746, +// 32768 - 18022, 32768 - 21299, 32768 - 24576, 32768 - 27853, 0, 0 }, +// // pmf: 1/20, 2/20, 2/20, 2/20, 3/20, 3/20, 2/20, 2/20, 2/20, 1/20 +// { 32768 - 1638, 32768 - 4915, 32768 - 8192, 32768 
- 11469, 32768 - 16384, +// 32768 - 21299, 32768 - 24576, 32768 - 27853, 32768 - 31130, 0, 0 }, +// }; +// constexpr int kSymbols[20][4] = { { 0, 5, 9, 4 }, // +// { 1, 6, 8, 3 }, // +// { 2, 7, 7, 2 }, // +// { 3, 8, 6, 1 }, // +// { 4, 9, 5, 0 }, // +// { 5, 0, 4, 9 }, // +// { 6, 1, 3, 8 }, // +// { 7, 2, 2, 7 }, // +// { 8, 3, 1, 6 }, // +// { 9, 4, 0, 5 }, // +// { 0, 0, 9, 7 }, // +// { 2, 1, 8, 5 }, // +// { 4, 3, 6, 3 }, // +// { 6, 5, 4, 1 }, // +// { 8, 7, 2, 8 }, // +// { 1, 0, 9, 6 }, // +// { 3, 2, 7, 4 }, // +// { 5, 4, 5, 2 }, // +// { 7, 6, 3, 5 }, // +// { 9, 8, 1, 4 } }; +// const unsigned int kBufferSize = 65536; +// uint8_t bw_buffer[kBufferSize]; +// aom_writer bw; +// bw.allow_update_cdf = 1; +// aom_start_encode(&bw, bw_buffer); +// for (int i = 0; i < 96; ++i) { +// for (int j = 0; j < 20; ++j) { +// for (int k = 0; k < 4; ++k) { +// aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 10); +// } +// } +// } +// aom_stop_encode(&bw); +// printf("constexpr size_t kNumBytes = %u;\n", bw.pos); +// printf("constexpr uint8_t kBytes[] = {"); +// int count = 0; +// for (unsigned int i = 0; i < bw.pos; ++i) { +// if (count++ % 12 == 0) { +// printf("\n "); +// } else { +// printf(" "); +// } +// printf("0x%02x,", bw_buffer[i]); +// } +// printf("\n};\n"); + +constexpr size_t kNumBytesTestReadSymbol10 = 3204; +constexpr uint8_t kBytesTestReadSymbol10[] = { + 0x10, 0x84, 0xe2, 0xe0, 0x0f, 0x08, 0xd6, 0x01, 0xd0, 0xaa, 0xd8, 0xb5, + 0x60, 0x4f, 0xb9, 0xb3, 0x73, 0x01, 0x8c, 0x92, 0xe6, 0xa0, 0xab, 0xe8, + 0xe4, 0x95, 0x85, 0x03, 0x5f, 0xbb, 0x3b, 0x1f, 0x27, 0xb1, 0x44, 0x95, + 0x50, 0x1f, 0xad, 0xc8, 0x35, 0xde, 0x44, 0xf3, 0xb6, 0x8d, 0xa2, 0x39, + 0xc3, 0xb6, 0xee, 0x3c, 0x10, 0x33, 0x27, 0x7a, 0x29, 0xcc, 0x7c, 0x08, + 0xcb, 0x94, 0xbe, 0xef, 0x96, 0x47, 0x30, 0x49, 0x47, 0x9c, 0xb7, 0x7e, + 0x23, 0x0c, 0x27, 0x8e, 0x1b, 0xdc, 0x6c, 0x92, 0x40, 0x98, 0xbf, 0x20, + 0xd4, 0x01, 0x72, 0x55, 0x8c, 0x3f, 0x3c, 0x76, 0x24, 0xd2, 0x2d, 0xba, + 0xa4, 0x54, 0x29, 0x80, 0xe9, 0x06, 0x2c, 0x68, 0xbd, 0xa7, 0xc5, 0xf7, + 0x44, 0xdf, 0x7e, 0x94, 0x90, 0x3f, 0x94, 0x7d, 0x9e, 0x36, 0xb8, 0x82, + 0x1d, 0x4a, 0x47, 0x1f, 0x6c, 0x29, 0x51, 0xd2, 0x84, 0xa8, 0xcd, 0x98, + 0xc0, 0xd2, 0xea, 0x4a, 0x25, 0x3c, 0xd7, 0x34, 0x64, 0x96, 0xd4, 0x06, + 0xed, 0x00, 0x98, 0xc3, 0x65, 0x10, 0xd4, 0xac, 0x6b, 0xab, 0xd7, 0x35, + 0x04, 0x89, 0xbf, 0x24, 0xcc, 0xfc, 0xc9, 0xe8, 0x87, 0x3d, 0xdb, 0x55, + 0xf0, 0xc9, 0x97, 0x71, 0x99, 0x00, 0x54, 0x50, 0x24, 0x66, 0xca, 0x24, + 0xfd, 0x1c, 0xb1, 0x71, 0x0e, 0xb5, 0x9c, 0x27, 0xfc, 0x7f, 0x95, 0x98, + 0xc8, 0x99, 0x9f, 0x9b, 0xc7, 0xf6, 0x69, 0xfa, 0xb2, 0x11, 0x77, 0x8d, + 0x02, 0x53, 0x32, 0x4e, 0x20, 0x2c, 0x21, 0x2b, 0x99, 0x9a, 0xec, 0x63, + 0x0b, 0xe2, 0x8f, 0x30, 0xf8, 0x3c, 0xd1, 0xb1, 0xbc, 0x52, 0x73, 0xce, + 0x85, 0x54, 0xdd, 0xe6, 0xf6, 0x9c, 0x2d, 0xca, 0x3d, 0xa8, 0x09, 0x34, + 0xa8, 0x41, 0x9c, 0x03, 0x78, 0xbc, 0x67, 0x11, 0x9f, 0xbe, 0xde, 0x9a, + 0x98, 0x8a, 0x8d, 0x0b, 0x88, 0x7f, 0xea, 0x82, 0x77, 0x61, 0x7a, 0xde, + 0xb0, 0xb1, 0x46, 0x8d, 0x23, 0x69, 0x2f, 0x17, 0x05, 0xff, 0x4a, 0x9e, + 0xf9, 0xb3, 0x9a, 0xd0, 0xc4, 0x81, 0xcf, 0xbc, 0xe6, 0x26, 0x2c, 0x37, + 0x55, 0xec, 0xdc, 0x23, 0x05, 0xdf, 0x30, 0xcf, 0x5a, 0x4a, 0x0c, 0x08, + 0xc0, 0xd7, 0x9d, 0x80, 0xc0, 0xa3, 0x56, 0x49, 0x41, 0xc4, 0xdd, 0xc5, + 0x69, 0x5c, 0xe5, 0x6c, 0xc5, 0xae, 0x4c, 0x95, 0x45, 0xf2, 0xf6, 0xd6, + 0x12, 0x25, 0xcc, 0x24, 0x56, 0x8c, 0x2b, 0x32, 0x51, 0x18, 0x1a, 0xec, + 0xb0, 0x62, 0x40, 0x82, 0x59, 0xb8, 0x38, 0x9f, 0x9f, 0x73, 0xf5, 0xb3, + 0xc3, 0x93, 0xa5, 
0x4e, 0xab, 0x7f, 0x97, 0x56, 0x51, 0xb0, 0xff, 0x69, + 0x73, 0xc2, 0xd0, 0x60, 0x93, 0x59, 0x2f, 0xc7, 0x84, 0x14, 0x7e, 0x68, + 0xa7, 0x2b, 0x37, 0xb4, 0x2e, 0x69, 0x58, 0x55, 0x3c, 0xd2, 0xf1, 0xa8, + 0x2b, 0x6e, 0xd5, 0x11, 0x1c, 0x1d, 0x17, 0xd5, 0xf1, 0xfa, 0x8b, 0xd1, + 0x6c, 0xc2, 0x32, 0x9e, 0x66, 0x3e, 0x6a, 0x4a, 0x0e, 0xb8, 0xf9, 0xa8, + 0x1c, 0x23, 0xb1, 0x7e, 0xe7, 0xa0, 0x27, 0x5b, 0x1e, 0x8f, 0x8a, 0xb1, + 0x1e, 0x50, 0x99, 0x9c, 0x39, 0x5b, 0xa0, 0x76, 0xa2, 0x90, 0x20, 0xd5, + 0x61, 0xf8, 0x96, 0x5a, 0xbc, 0x91, 0x5d, 0xfc, 0x1e, 0xed, 0xea, 0xd8, + 0x10, 0x5d, 0x15, 0xfa, 0x2b, 0xa7, 0x77, 0xaf, 0xae, 0x64, 0xef, 0x06, + 0xa4, 0xf7, 0x65, 0x58, 0xb8, 0x64, 0x47, 0xcd, 0xfa, 0x12, 0x8e, 0x7d, + 0x5b, 0x96, 0x27, 0xda, 0xb9, 0x2a, 0x14, 0xfe, 0x3e, 0x57, 0xd7, 0x4e, + 0x86, 0xb3, 0x36, 0xd7, 0x77, 0x2d, 0xf6, 0x1e, 0xf3, 0xfd, 0xdb, 0x9a, + 0x92, 0x78, 0x0a, 0xa4, 0x17, 0xf1, 0x78, 0xfc, 0xc3, 0x6d, 0xa0, 0xf8, + 0x07, 0x6a, 0x68, 0xb1, 0x1b, 0x00, 0x27, 0x65, 0x68, 0x76, 0x10, 0x39, + 0x4b, 0x8a, 0x51, 0x7a, 0x53, 0x69, 0x79, 0xfc, 0xbc, 0xe6, 0xf4, 0x26, + 0xc3, 0xbf, 0x3a, 0x64, 0x56, 0x7d, 0x5f, 0x76, 0xa2, 0x42, 0xd1, 0xad, + 0x3f, 0xb8, 0xce, 0xfb, 0x79, 0x38, 0xf3, 0x85, 0x2a, 0x67, 0xf4, 0x71, + 0xfe, 0x0b, 0x79, 0xee, 0x85, 0xe0, 0x61, 0x9c, 0x9d, 0xd5, 0xe0, 0x0a, + 0xd7, 0xa6, 0x21, 0xc3, 0x60, 0xbf, 0xbd, 0x16, 0xca, 0xa0, 0x16, 0x9d, + 0xc4, 0x14, 0x99, 0x03, 0x7e, 0xe6, 0x62, 0x6e, 0xbe, 0x18, 0x45, 0x5e, + 0x15, 0x42, 0xac, 0x5b, 0x60, 0x9f, 0xbd, 0x1e, 0x8a, 0x58, 0x55, 0x75, + 0xcf, 0xbb, 0x12, 0xcb, 0xc2, 0xf4, 0x01, 0xfc, 0x96, 0x8d, 0x97, 0x67, + 0x94, 0x65, 0x6b, 0xd0, 0xeb, 0xff, 0x26, 0x30, 0x3a, 0xa0, 0xe9, 0x9b, + 0xa7, 0x5e, 0x81, 0x2b, 0x8e, 0xf7, 0xd6, 0xbf, 0x6f, 0xe4, 0x33, 0xd5, + 0xaa, 0x5a, 0x27, 0x18, 0x24, 0x76, 0x72, 0x72, 0x50, 0x72, 0x92, 0x88, + 0x9f, 0x88, 0x81, 0x0f, 0x33, 0xa7, 0x99, 0x83, 0x53, 0x03, 0x8c, 0x2d, + 0x36, 0x43, 0x52, 0x27, 0x27, 0x74, 0xcd, 0xf1, 0x1b, 0x76, 0x95, 0x11, + 0xdf, 0x4e, 0xb3, 0xa5, 0x2e, 0xe4, 0xac, 0x3a, 0xfd, 0x9f, 0xab, 0x96, + 0x7e, 0xb1, 0xf0, 0x19, 0x22, 0xc4, 0x06, 0x9b, 0xe7, 0xe2, 0xf8, 0xb4, + 0x17, 0xbd, 0x9d, 0x14, 0xac, 0x11, 0xc9, 0x79, 0x8e, 0x01, 0x23, 0xc9, + 0x6e, 0x5f, 0x96, 0x1e, 0x99, 0xe1, 0x19, 0x2c, 0xb1, 0x1b, 0x54, 0x30, + 0x3a, 0xb1, 0xe7, 0xbf, 0xbf, 0x17, 0x3d, 0x9b, 0x86, 0xd7, 0x4b, 0x68, + 0x46, 0xa6, 0xb0, 0x05, 0x66, 0x4b, 0x8a, 0xdc, 0x60, 0x60, 0x29, 0x95, + 0x35, 0x4b, 0x6f, 0xf5, 0x73, 0x51, 0x52, 0xb6, 0xec, 0xef, 0x74, 0xcb, + 0x0b, 0x00, 0x04, 0x15, 0xff, 0xb3, 0x13, 0xdd, 0x70, 0x5e, 0x65, 0xfc, + 0xa6, 0xb1, 0x13, 0x59, 0x29, 0xd0, 0x2e, 0xc4, 0x55, 0xcb, 0x99, 0xac, + 0xca, 0x48, 0x67, 0x3e, 0xfb, 0xfb, 0x54, 0xb7, 0x53, 0x32, 0xb4, 0x17, + 0xf6, 0x78, 0xd1, 0x64, 0x67, 0x76, 0x33, 0x3a, 0xe9, 0x13, 0x8c, 0x9c, + 0xf1, 0x74, 0xb7, 0xd1, 0x35, 0x41, 0xf2, 0x4d, 0x68, 0x53, 0x25, 0x57, + 0x97, 0x33, 0x18, 0xea, 0x96, 0xea, 0x66, 0x56, 0x82, 0xfe, 0xcf, 0x1a, + 0x2c, 0x8c, 0xee, 0xc6, 0x67, 0x5d, 0x22, 0x71, 0x93, 0x9e, 0x2e, 0x96, + 0xfa, 0x26, 0xa8, 0x3e, 0x49, 0xad, 0x0a, 0x64, 0xaa, 0xf2, 0xe6, 0x63, + 0x1d, 0x52, 0xfb, 0x67, 0x7e, 0x17, 0x91, 0x70, 0xef, 0x48, 0xe1, 0x2e, + 0x48, 0xe4, 0x8a, 0xc2, 0x4c, 0x5f, 0x77, 0x7f, 0x03, 0x45, 0xf0, 0x8d, + 0x44, 0xad, 0x1e, 0xef, 0xb5, 0x1f, 0x3c, 0x3c, 0x4e, 0x43, 0x87, 0xdd, + 0xec, 0xd9, 0x6e, 0xd0, 0xe8, 0x47, 0x75, 0x5b, 0xe5, 0xc0, 0x76, 0xb1, + 0x9c, 0x5b, 0x72, 0xeb, 0x15, 0x9c, 0x5a, 0xa1, 0x31, 0xc2, 0x46, 0xb4, + 0xe7, 0x9b, 0x5d, 0x86, 0x23, 0x3f, 0x47, 0xd9, 0x9b, 0x31, 0x4e, 0xa6, + 0x65, 0xe9, 0x2f, 
0xa3, 0xf8, 0x34, 0x68, 0xf7, 0x61, 0xf5, 0x08, 0xc4, + 0x8a, 0x10, 0xa1, 0x9b, 0xa9, 0x30, 0x25, 0x8d, 0xaf, 0x67, 0x07, 0x8e, + 0x84, 0x62, 0xa5, 0xc3, 0x2f, 0x5d, 0x06, 0xaa, 0xd4, 0x02, 0x04, 0x77, + 0xed, 0xf4, 0xe0, 0xa9, 0xca, 0x95, 0xa2, 0x91, 0xe0, 0x56, 0x64, 0xb6, + 0xb8, 0x39, 0xda, 0x83, 0xc5, 0x10, 0x7e, 0xa6, 0x08, 0x10, 0x01, 0x15, + 0x2b, 0x6e, 0xce, 0xfe, 0x43, 0x01, 0xa9, 0xcb, 0xfd, 0xd9, 0x1b, 0x7e, + 0x11, 0x74, 0x96, 0x4a, 0x89, 0x3f, 0x07, 0xac, 0x74, 0xf9, 0x93, 0xb2, + 0xf6, 0xed, 0xb3, 0x29, 0xab, 0xc5, 0x0a, 0x90, 0xb3, 0x71, 0x51, 0xa5, + 0xba, 0x16, 0x01, 0xd4, 0x35, 0x11, 0xdc, 0xba, 0x27, 0xc3, 0x01, 0x05, + 0x65, 0x91, 0x6b, 0xff, 0x33, 0xb9, 0x9d, 0x84, 0xf7, 0xc0, 0x2d, 0x4b, + 0xf4, 0xb2, 0x39, 0xe4, 0x7d, 0x0f, 0xf6, 0x8d, 0xa4, 0x2c, 0xa2, 0x4d, + 0x4e, 0x8a, 0x2e, 0xff, 0x84, 0x5f, 0x43, 0x93, 0xa3, 0x43, 0xa2, 0xe3, + 0x23, 0x92, 0xf3, 0x57, 0xd2, 0x2e, 0x8e, 0xea, 0xff, 0x2c, 0x3d, 0x1f, + 0xc6, 0x94, 0x77, 0x19, 0xf6, 0xdb, 0x16, 0x4e, 0xd0, 0x3f, 0x32, 0xf3, + 0x7b, 0x89, 0x50, 0xc5, 0x5c, 0xfe, 0x86, 0xcf, 0xf6, 0x89, 0x88, 0xa3, + 0xa8, 0xd9, 0x52, 0x23, 0x68, 0x31, 0x90, 0xe2, 0xd4, 0x3a, 0x62, 0xb4, + 0xe6, 0x4e, 0xfa, 0x20, 0x21, 0xbf, 0xe5, 0x4e, 0x86, 0x6d, 0xbe, 0xbe, + 0xc6, 0x25, 0x4b, 0xf2, 0x20, 0x6c, 0x4e, 0xfc, 0x93, 0x41, 0x3f, 0x8b, + 0x29, 0x34, 0xb9, 0xd1, 0x61, 0xe0, 0x34, 0x83, 0x8e, 0x1f, 0x8c, 0x44, + 0xe2, 0x95, 0x2e, 0x73, 0x48, 0x8f, 0xeb, 0xd0, 0x6c, 0xec, 0xc4, 0xf6, + 0x48, 0x5e, 0xf7, 0x53, 0x3e, 0xa6, 0x77, 0x33, 0xb0, 0x9e, 0xf8, 0x05, + 0xa9, 0x7e, 0x96, 0x47, 0x3c, 0x8f, 0xa1, 0xfe, 0xd1, 0xb4, 0x85, 0x94, + 0x49, 0xa9, 0xd1, 0x45, 0xdf, 0xf0, 0x8b, 0xe8, 0x72, 0x74, 0x68, 0x74, + 0x5c, 0x67, 0xc2, 0xbb, 0xcd, 0x7b, 0x6a, 0x2f, 0x6b, 0x0a, 0x1d, 0xec, + 0x03, 0x48, 0xd2, 0x8e, 0xe3, 0x3e, 0xdb, 0x62, 0xc9, 0xda, 0x07, 0xe6, + 0x5e, 0x6f, 0x71, 0x2a, 0x18, 0xab, 0x9f, 0xd0, 0xd9, 0xfe, 0xd1, 0xac, + 0xf0, 0x21, 0xab, 0xd9, 0x70, 0x1e, 0xb9, 0x99, 0xa0, 0xcc, 0xeb, 0xe7, + 0x87, 0xee, 0xd9, 0x8e, 0xd0, 0xe5, 0xc0, 0x58, 0x75, 0x37, 0x3d, 0x03, + 0x4e, 0x18, 0x08, 0x27, 0xdd, 0x18, 0x38, 0x1b, 0xad, 0xf1, 0xd3, 0xcc, + 0xa1, 0x65, 0x26, 0x97, 0x3a, 0x2c, 0x3c, 0x06, 0x90, 0x71, 0xc3, 0xf1, + 0x88, 0x9c, 0x52, 0xa5, 0xce, 0x69, 0x11, 0xfd, 0x7a, 0x0d, 0x9d, 0x98, + 0x9e, 0xc9, 0x0b, 0xde, 0xea, 0x67, 0xd4, 0xce, 0xe6, 0x76, 0x13, 0xdf, + 0x00, 0xb5, 0x2f, 0xd2, 0xc8, 0xe7, 0x91, 0xf4, 0x3f, 0xda, 0x36, 0x90, + 0xb2, 0x89, 0x35, 0x3a, 0x28, 0xbb, 0xfe, 0x11, 0x7d, 0x0e, 0x4e, 0x8d, + 0x0e, 0x8b, 0x8c, 0xf8, 0x57, 0x79, 0xaf, 0x6d, 0x45, 0xed, 0x61, 0x43, + 0xbd, 0x80, 0x69, 0x1a, 0x51, 0xdc, 0x67, 0xdb, 0x6c, 0x59, 0x3b, 0x40, + 0xfc, 0xcb, 0xcd, 0xee, 0x25, 0x43, 0x15, 0x73, 0xfa, 0x1b, 0x3f, 0xda, + 0x35, 0x9e, 0x04, 0x35, 0x7b, 0x2e, 0x03, 0xd7, 0x33, 0x34, 0x19, 0x9d, + 0x7c, 0xf0, 0xfd, 0xdb, 0x31, 0xda, 0x1c, 0xb8, 0x0b, 0x0e, 0xa6, 0xe7, + 0xa0, 0x69, 0xc3, 0x01, 0x04, 0xfb, 0xa3, 0x07, 0x03, 0x75, 0xbe, 0x3a, + 0x79, 0x94, 0x2c, 0xa4, 0xd2, 0xe7, 0x45, 0x87, 0x80, 0xd2, 0x0e, 0x38, + 0x7e, 0x31, 0x13, 0x8a, 0x54, 0xb9, 0xcd, 0x22, 0x3f, 0xaf, 0x41, 0xb3, + 0xb3, 0x13, 0xd9, 0x21, 0x7b, 0xdd, 0x4c, 0xfa, 0x99, 0xdc, 0xce, 0xc2, + 0x7b, 0xe0, 0x16, 0xa5, 0xfa, 0x59, 0x1c, 0xf2, 0x3e, 0x87, 0xfb, 0x46, + 0xd2, 0x16, 0x51, 0x26, 0xa7, 0x45, 0x17, 0x7f, 0xc2, 0x2f, 0xa1, 0xc9, + 0xd1, 0xa1, 0xd1, 0x71, 0x9f, 0x0a, 0xef, 0x35, 0xed, 0xa8, 0xbd, 0xac, + 0x28, 0x77, 0xb0, 0x0d, 0x23, 0x4a, 0x3b, 0x8c, 0xfb, 0x6d, 0x8b, 0x27, + 0x68, 0x1f, 0x99, 0x79, 0xbd, 0xc4, 0xa8, 0x62, 0xae, 0x7f, 0x43, 0x67, + 0xfb, 0x46, 0xb3, 
0xc0, 0x86, 0xaf, 0x65, 0xc0, 0x7a, 0xe6, 0x66, 0x83, + 0x33, 0xaf, 0x9e, 0x1f, 0xbb, 0x66, 0x3b, 0x43, 0x97, 0x01, 0x61, 0xd4, + 0xdc, 0xf4, 0x0d, 0x38, 0x60, 0x20, 0x9f, 0x74, 0x60, 0xe0, 0x6e, 0xb7, + 0xc7, 0x4f, 0x32, 0x85, 0x94, 0x9a, 0x5c, 0xe8, 0xb0, 0xf0, 0x1a, 0x41, + 0xc7, 0x0f, 0xc6, 0x22, 0x71, 0x4a, 0x97, 0x39, 0xa4, 0x47, 0xf5, 0xe8, + 0x36, 0x76, 0x62, 0x7b, 0x24, 0x2f, 0x7b, 0xa9, 0x9f, 0x53, 0x3b, 0x99, + 0xd8, 0x4f, 0x7c, 0x02, 0xd4, 0xbf, 0x4b, 0x23, 0x9e, 0x47, 0xd0, 0xff, + 0x68, 0xda, 0x42, 0xca, 0x24, 0xd4, 0xe8, 0xa2, 0xef, 0xf8, 0x45, 0xf4, + 0x39, 0x3a, 0x34, 0x3a, 0x2e, 0x33, 0xe1, 0x5d, 0xe6, 0xbd, 0xb5, 0x17, + 0xb5, 0x85, 0x0e, 0xf6, 0x01, 0xa4, 0x69, 0x47, 0x71, 0x9f, 0x6d, 0xb1, + 0x64, 0xed, 0x03, 0xf3, 0x2f, 0x37, 0xb8, 0x95, 0x0c, 0x55, 0xcf, 0xe8, + 0x6c, 0xff, 0x68, 0xd6, 0x78, 0x10, 0xd5, 0xec, 0xb8, 0x0f, 0x5c, 0xcc, + 0xd0, 0x66, 0x75, 0xf3, 0xc3, 0xf7, 0x6c, 0xc7, 0x68, 0x72, 0xe0, 0x2c, + 0x3a, 0x9b, 0x9e, 0x81, 0xa7, 0x0c, 0x04, 0x13, 0xee, 0x8c, 0x1c, 0x0d, + 0xd6, 0xf8, 0xe9, 0xe6, 0x50, 0xb2, 0x93, 0x4b, 0x9d, 0x16, 0x1e, 0x03, + 0x48, 0x38, 0xe1, 0xf8, 0xc4, 0x4e, 0x29, 0x52, 0xe7, 0x34, 0x88, 0xfe, + 0xbd, 0x06, 0xce, 0xcc, 0x4f, 0x64, 0x85, 0xef, 0x75, 0x33, 0xea, 0x67, + 0x73, 0x3b, 0x09, 0xef, 0x80, 0x5a, 0x97, 0xe9, 0x64, 0x73, 0xc8, 0xfa, + 0x1f, 0xed, 0x1b, 0x48, 0x59, 0x44, 0x9a, 0x9d, 0x14, 0x5d, 0xff, 0x08, + 0xbe, 0x87, 0x27, 0x46, 0x87, 0x45, 0xc6, 0x7c, 0x2b, 0xbc, 0xd7, 0xb6, + 0xa2, 0xf6, 0xb0, 0xa1, 0xde, 0xc0, 0x34, 0x8d, 0x28, 0xee, 0x33, 0xed, + 0xb6, 0x2c, 0x9d, 0xa0, 0x7e, 0x65, 0xe6, 0xf7, 0x12, 0xa1, 0x8a, 0xb9, + 0xfd, 0x0d, 0x9f, 0xed, 0x1a, 0xcf, 0x02, 0x1a, 0xbd, 0x97, 0x01, 0xeb, + 0x99, 0x9a, 0x0c, 0xce, 0xbe, 0x78, 0x7e, 0xed, 0x98, 0xed, 0x0e, 0x5c, + 0x05, 0x87, 0x53, 0x73, 0xd0, 0x34, 0xe1, 0x80, 0x82, 0x7d, 0xd1, 0x83, + 0x81, 0xba, 0xdf, 0x1d, 0x3c, 0xca, 0x16, 0x52, 0x69, 0x73, 0xa2, 0xc3, + 0xc0, 0x69, 0x07, 0x1c, 0x3f, 0x18, 0x89, 0xc5, 0x2a, 0x5c, 0xe6, 0x91, + 0x1f, 0xd7, 0xa0, 0xd9, 0xd9, 0x89, 0xec, 0x90, 0xbd, 0xee, 0xa6, 0x7d, + 0x4c, 0xee, 0x67, 0x61, 0x3d, 0xf0, 0x0b, 0x52, 0xfd, 0x2c, 0x8e, 0x79, + 0x1f, 0x43, 0xfd, 0xa3, 0x69, 0x0b, 0x28, 0x93, 0x53, 0xa2, 0x8b, 0xbf, + 0xe1, 0x17, 0xd0, 0xe4, 0xe8, 0xd0, 0xe8, 0xb8, 0xcf, 0x85, 0x77, 0x9a, + 0xf6, 0xd4, 0x5e, 0xd6, 0x14, 0x3b, 0xd8, 0x06, 0x91, 0xa5, 0x1d, 0xc6, + 0x7d, 0xb6, 0xc5, 0x93, 0xb4, 0x0f, 0xcc, 0xbc, 0xde, 0xe2, 0x54, 0x31, + 0x57, 0x3f, 0xa1, 0xb3, 0xfd, 0xa3, 0x59, 0xe0, 0x43, 0x57, 0xb2, 0xe0, + 0x3d, 0x73, 0x33, 0x41, 0x99, 0xd7, 0xcf, 0x0f, 0xdd, 0xb3, 0x1d, 0xa1, + 0xcb, 0x80, 0xb0, 0xea, 0x6e, 0x7a, 0x06, 0x9c, 0x30, 0x10, 0x4f, 0xba, + 0x30, 0x70, 0x37, 0x5b, 0xe3, 0xa7, 0x99, 0x42, 0xca, 0x4d, 0x2e, 0x74, + 0x58, 0x78, 0x0d, 0x20, 0xe3, 0x87, 0xe3, 0x11, 0x38, 0xa5, 0x4b, 0x9c, + 0xd2, 0x23, 0xfa, 0xf4, 0x1b, 0x3b, 0x31, 0x3d, 0x92, 0x17, 0xbd, 0xd4, + 0xcf, 0xa9, 0x9d, 0xcc, 0xec, 0x27, 0xbe, 0x01, 0x6a, 0x5f, 0xa5, 0x91, + 0xcf, 0x23, 0xe8, 0x7f, 0xb4, 0x6d, 0x21, 0x65, 0x12, 0x6a, 0x74, 0x51, + 0x77, 0xfc, 0x22, 0xfa, 0x1c, 0x9d, 0x1a, 0x1d, 0x17, 0x19, 0xf0, 0xae, + 0xf3, 0x5e, 0xda, 0x8b, 0xda, 0xc2, 0x87, 0x7b, 0x00, 0xd2, 0x34, 0xa3, + 0xb8, 0xcf, 0xb6, 0xd8, 0xb2, 0x76, 0x81, 0xf9, 0x97, 0x9b, 0xdc, 0x4a, + 0x86, 0x2a, 0xe7, 0xf4, 0x36, 0x7f, 0xb4, 0x6b, 0x3c, 0x08, 0x6a, 0xf6, + 0x5c, 0x07, 0xae, 0x66, 0x68, 0x33, 0x3a, 0xf9, 0xe1, 0xfb, 0xb6, 0x63, + 0xb4, 0x39, 0x70, 0x16, 0x1d, 0x4d, 0xcf, 0x40, 0xd3, 0x86, 0x02, 0x09, + 0xf7, 0x46, 0x0e, 0x06, 0xda, 0x64, 0x9a, 0x09, 0xfc, 0x59, 0x49, 0xa5, + 0xce, 0x8b, 0x0f, 
0x01, 0xa4, 0x1c, 0x70, 0xfc, 0x62, 0x27, 0x14, 0xa9, + 0x73, 0x9a, 0x44, 0x7f, 0x5e, 0x83, 0x67, 0x66, 0x27, 0xb2, 0x42, 0xf7, + 0xba, 0x97, 0x1c, 0xed, 0x58, 0x51, 0xe8, 0xc8, 0xed, 0xfd, 0x74, 0x54, + 0x95, 0x92, 0xa1, 0xa0, 0xf0, 0xf1, 0x39, 0x0e, 0x1f, 0x77, 0xb3, 0x66, + 0xb2, 0x83, 0x37, 0x4e, 0x1a, 0xd0, 0x2f, 0x9b, 0xb4, 0xe3, 0x30, 0x75, + 0xf5, 0x52, 0x42, 0x65, 0xe3, 0x9e, 0x7d, 0x6b, 0x83, 0x5c, 0xcd, 0xca, + 0xad, 0x28, 0x53, 0xbe, 0xb6, 0xad, 0x46, 0x20, 0x88, 0x54, 0xe4, 0x49, + 0x1d, 0xee, 0xcb, 0x36, 0x69, 0x66, 0x09, 0xa8, 0x9d, 0xe4, 0x9a, 0x09, + 0xfc, 0x59, 0x49, 0xa5, 0xce, 0x8b, 0x0f, 0x01, 0xa4, 0x1c, 0x70, 0xfc, + 0x62, 0x27, 0x14, 0xa9, 0x73, 0x9a, 0x44, 0x7f, 0x5e, 0x83, 0x67, 0x66, + 0x27, 0xb2, 0x42, 0xf7, 0xba, 0x97, 0x1c, 0xed, 0x58, 0x51, 0xe8, 0xc8, + 0xed, 0xfd, 0x74, 0x54, 0x95, 0x92, 0xa1, 0xa0, 0xf0, 0xf1, 0x39, 0x0e, + 0x1f, 0x77, 0xb3, 0x66, 0xb2, 0x83, 0x37, 0x4e, 0x1a, 0xd0, 0x2f, 0x9b, + 0xb4, 0xe3, 0x30, 0x75, 0xf5, 0x52, 0x42, 0x65, 0xe3, 0x9e, 0x7d, 0x6b, + 0x83, 0x5c, 0xcd, 0xca, 0xad, 0x28, 0x53, 0xbe, 0xb6, 0xad, 0x46, 0x20, + 0x88, 0x54, 0xe4, 0x49, 0x1d, 0xee, 0xcb, 0x36, 0x69, 0x66, 0x09, 0xa8, + 0x9d, 0xe4, 0x9a, 0x09, 0xfc, 0x59, 0x49, 0xa5, 0xce, 0x8b, 0x0f, 0x01, + 0xa4, 0x1c, 0x70, 0xfc, 0x62, 0x27, 0x14, 0xa9, 0x73, 0x9a, 0x44, 0x7f, + 0x5e, 0x83, 0x67, 0x66, 0x27, 0xb2, 0x42, 0xf7, 0xba, 0x97, 0x1c, 0xed, + 0x58, 0x51, 0xe8, 0xc8, 0xed, 0xfd, 0x74, 0x54, 0x95, 0x92, 0xa1, 0xa0, + 0xf0, 0xf1, 0x39, 0x0e, 0x1f, 0x77, 0xb3, 0x66, 0xb2, 0x83, 0x37, 0x4e, + 0x1a, 0xd0, 0x2f, 0x9b, 0xb4, 0xe3, 0x30, 0x75, 0xf5, 0x52, 0x42, 0x65, + 0xe3, 0x9e, 0x7d, 0x6b, 0x83, 0x5c, 0xcd, 0xca, 0xad, 0x28, 0x53, 0xbe, + 0xb6, 0xad, 0x46, 0x20, 0x88, 0x54, 0xe4, 0x49, 0x1d, 0xee, 0xcb, 0x36, + 0x69, 0x66, 0x09, 0xa8, 0x9d, 0xe4, 0x9a, 0x09, 0xfc, 0x59, 0x49, 0xa5, + 0xce, 0x8b, 0x0f, 0x01, 0xa4, 0x1c, 0x70, 0xfc, 0x62, 0x27, 0x14, 0xa9, + 0x73, 0x9a, 0x44, 0x7f, 0x5e, 0x83, 0x67, 0x66, 0x27, 0xb2, 0x42, 0xf7, + 0xba, 0x97, 0x1c, 0xed, 0x58, 0x51, 0xe8, 0xc8, 0xed, 0xfd, 0x74, 0x54, + 0x95, 0x92, 0xa1, 0xa0, 0xf0, 0xf1, 0x39, 0x0e, 0x1f, 0x77, 0xb3, 0x66, + 0xb2, 0x83, 0x37, 0x4e, 0x1a, 0xd0, 0x2f, 0x9b, 0xb4, 0xe3, 0x30, 0x75, + 0xf5, 0x52, 0x42, 0x65, 0xe3, 0x9e, 0x7d, 0x6b, 0x83, 0x5c, 0xcd, 0xca, + 0xad, 0x28, 0x53, 0xbe, 0xb6, 0xad, 0x46, 0x20, 0x88, 0x54, 0xe4, 0x49, + 0x1d, 0xee, 0xcb, 0x36, 0x69, 0x66, 0x09, 0xa8, 0x9d, 0xe4, 0x9a, 0x09, + 0xfc, 0x59, 0x49, 0xa5, 0xce, 0x8b, 0x0f, 0x01, 0xa4, 0x1c, 0x70, 0xfc, + 0x62, 0x27, 0x14, 0xa9, 0x73, 0x9a, 0x44, 0x7f, 0x5e, 0x83, 0x67, 0x66, + 0x27, 0xb2, 0x42, 0xf7, 0xba, 0x97, 0x1c, 0xed, 0x58, 0x51, 0xe8, 0xc8, + 0xed, 0xfd, 0x74, 0x54, 0x95, 0x92, 0xa1, 0xa0, 0xf0, 0xf1, 0x39, 0x0e, + 0x1f, 0x77, 0xb3, 0x66, 0xb2, 0x83, 0x37, 0x4e, 0x1a, 0xd0, 0x2f, 0x9b, + 0xb4, 0xe3, 0x30, 0x75, 0xf5, 0x52, 0x42, 0x65, 0xe3, 0x9e, 0x7d, 0x6b, + 0x83, 0x5c, 0xcd, 0xca, 0xad, 0x28, 0x53, 0xbe, 0xb6, 0xad, 0x46, 0x20, + 0x88, 0x54, 0xe4, 0x49, 0x1d, 0xee, 0xcb, 0x36, 0x69, 0x66, 0x09, 0xa8, + 0x9d, 0xe4, 0x9a, 0x09, 0xfc, 0x59, 0x49, 0xa5, 0xce, 0x8b, 0x0f, 0x01, + 0xa4, 0x1c, 0x70, 0xfc, 0x62, 0x27, 0x14, 0xa9, 0x73, 0x9a, 0x44, 0x7f, + 0x5e, 0x83, 0x67, 0x66, 0x27, 0xb2, 0x42, 0xf7, 0xba, 0x97, 0x1c, 0xed, + 0x58, 0x51, 0xe8, 0xc8, 0xed, 0xfd, 0x74, 0x54, 0x95, 0x92, 0xa1, 0xa0, + 0xf0, 0xf1, 0x39, 0x0e, 0x1f, 0x77, 0xb3, 0x66, 0xb2, 0x83, 0x37, 0x4e, + 0x1a, 0xd0, 0x2f, 0x9b, 0xb4, 0xe3, 0x30, 0x75, 0xf5, 0x52, 0x42, 0x65, + 0xe3, 0x9e, 0x7d, 0x6b, 0x83, 0x5c, 0xcd, 0xca, 0xad, 0x28, 0x53, 0xbe, + 0xb6, 0xad, 0x46, 
0x20, 0x88, 0x54, 0xe4, 0x49, 0x1d, 0xee, 0xcb, 0x36, + 0x69, 0x66, 0x09, 0xa8, 0x9d, 0xe4, 0x9a, 0x09, 0xfc, 0x59, 0x49, 0xa5, + 0xce, 0x8b, 0x0f, 0x01, 0xa4, 0x1c, 0x70, 0xfc, 0x62, 0x27, 0x14, 0xa9, + 0x73, 0x9a, 0x44, 0x7f, 0x5e, 0x83, 0x67, 0x66, 0x27, 0xb2, 0x42, 0xf7, + 0xba, 0x97, 0x1c, 0xed, 0x58, 0x51, 0xe8, 0xc8, 0xed, 0xfd, 0x74, 0x54, + 0x95, 0x92, 0xa1, 0xa0, 0xf0, 0xf1, 0x39, 0x0e, 0x1f, 0x77, 0xb3, 0x66, + 0xb2, 0x83, 0x37, 0x4e, 0x1a, 0xd0, 0x2f, 0x9b, 0xb4, 0xe3, 0x30, 0x75, + 0xf5, 0x52, 0x42, 0x65, 0xe3, 0x9e, 0x7d, 0x6b, 0x83, 0x5c, 0xcd, 0xca, + 0xad, 0x28, 0x53, 0xbe, 0xb6, 0xad, 0x46, 0x20, 0x88, 0x54, 0xe4, 0x49, + 0x1d, 0xee, 0xcb, 0x36, 0x69, 0x66, 0x09, 0xa8, 0x9d, 0xe4, 0x9a, 0x09, + 0xfc, 0x59, 0x49, 0xa5, 0xce, 0x8b, 0x0f, 0x01, 0xa4, 0x1c, 0x70, 0xfc, + 0x62, 0x27, 0x14, 0xa9, 0x73, 0x9a, 0x44, 0x7f, 0x5e, 0x83, 0x67, 0x66, + 0x27, 0xb2, 0x42, 0xf7, 0xba, 0x97, 0x1c, 0xed, 0x58, 0x51, 0xe8, 0xc8, + 0xed, 0xfd, 0x74, 0x54, 0x95, 0x92, 0xa1, 0xa0, 0xf0, 0xf1, 0x39, 0x0e, + 0x1f, 0x77, 0xb3, 0x66, 0xb2, 0x83, 0x37, 0x4e, 0x1a, 0xd0, 0x2f, 0x9b, + 0xb4, 0xe3, 0x30, 0x75, 0xf5, 0x52, 0x42, 0x65, 0xe3, 0x9e, 0x7d, 0x6b, + 0x83, 0x5c, 0xcd, 0xca, 0xad, 0x28, 0x53, 0xbe, 0xb6, 0xad, 0x46, 0x20, + 0x88, 0x54, 0xe4, 0x49, 0x1d, 0xee, 0xcb, 0x36, 0x69, 0x66, 0x09, 0xa8, + 0x9d, 0xe4, 0x9a, 0x09, 0xfc, 0x59, 0x49, 0xa5, 0xce, 0x8b, 0x0f, 0x01, + 0xa4, 0x1c, 0x70, 0xfc, 0x62, 0x27, 0x14, 0xa9, 0x73, 0x9a, 0x44, 0x7f, + 0x5e, 0x83, 0x67, 0x66, 0x27, 0xb2, 0x42, 0xf7, 0xba, 0x97, 0x1c, 0xed, + 0x58, 0x51, 0xe8, 0xc8, 0xed, 0xfd, 0x74, 0x54, 0x95, 0x92, 0xa1, 0xa0, + 0xf0, 0xf1, 0x39, 0x0e, 0x1f, 0x77, 0xb3, 0x66, 0xb2, 0x83, 0x37, 0x4e, + 0x1a, 0xd0, 0x2f, 0x9b, 0xb4, 0xe3, 0x30, 0x75, 0xf5, 0x52, 0x42, 0x65, + 0xe3, 0x9e, 0x7d, 0x6b, 0x83, 0x5c, 0xcd, 0xca, 0xad, 0x28, 0x53, 0xbe, + 0xb6, 0xad, 0x46, 0x20, 0x88, 0x54, 0xe4, 0x49, 0x1d, 0xee, 0xcb, 0x36, + 0x69, 0x66, 0x09, 0xa8, 0x9d, 0xe4, 0x9a, 0x09, 0xfc, 0x59, 0x49, 0xa5, + 0xce, 0x8b, 0x0f, 0x01, 0xa4, 0x1c, 0x70, 0xfc, 0x62, 0x27, 0x14, 0xa9, + 0x73, 0x9a, 0x44, 0x7f, 0x5e, 0x83, 0x67, 0x66, 0x27, 0xb2, 0x42, 0xf7, + 0xba, 0x97, 0x1c, 0xed, 0x58, 0x51, 0xe8, 0xc8, 0xed, 0xfd, 0x74, 0x54, + 0x95, 0x92, 0xa1, 0xa0, 0xf0, 0xf1, 0x39, 0x0e, 0x1f, 0x77, 0xb3, 0x66, + 0xb2, 0x83, 0x37, 0x4e, 0x1a, 0xd0, 0x2f, 0x9b, 0xb4, 0xe3, 0x30, 0x75, + 0xf5, 0x52, 0x42, 0x65, 0xe3, 0x9e, 0x7d, 0x6b, 0x83, 0x5c, 0xcd, 0xca, + 0xad, 0x28, 0x53, 0xbe, 0xb6, 0xad, 0x46, 0x20, 0x88, 0x54, 0xe4, 0x49, + 0x1d, 0xee, 0xcb, 0x36, 0x69, 0x66, 0x09, 0xa8, 0x9d, 0xe4, 0x9a, 0x09, + 0xfc, 0x59, 0x49, 0xa5, 0xce, 0x8b, 0x0f, 0x01, 0xa4, 0x1c, 0x70, 0xfc, + 0x62, 0x27, 0x14, 0xa9, 0x73, 0x9a, 0x44, 0x7f, 0x5e, 0x83, 0x67, 0x66, + 0x27, 0xb2, 0x42, 0xf7, 0xba, 0x97, 0x1c, 0xed, 0x58, 0x51, 0xe8, 0xc8, + 0xed, 0xfd, 0x74, 0x54, 0x95, 0x92, 0xa1, 0xa0, 0xf0, 0xf1, 0x39, 0x0e, + 0x1f, 0x77, 0xb3, 0x66, 0xb2, 0x83, 0x37, 0x4e, 0x1a, 0xd0, 0x2f, 0x9b, + 0xb4, 0xe3, 0x30, 0x75, 0xf5, 0x52, 0x42, 0x65, 0xe3, 0x9e, 0x7d, 0x6b, + 0x83, 0x5c, 0xcd, 0xca, 0xad, 0x28, 0x53, 0xbe, 0xb6, 0xad, 0x46, 0x20, + 0x88, 0x54, 0xe4, 0x49, 0x1d, 0xee, 0xcb, 0x36, 0x69, 0x66, 0x09, 0xa8, + 0x9d, 0xe4, 0x9a, 0x09, 0xfc, 0x59, 0x49, 0xa5, 0xce, 0x8b, 0x0f, 0x01, + 0xa4, 0x1c, 0x70, 0xfc, 0x62, 0x27, 0x14, 0xa9, 0x73, 0x9a, 0x44, 0x7f, + 0x5e, 0x83, 0x67, 0x66, 0x27, 0xb2, 0x42, 0xf7, 0xba, 0x97, 0x1d, 0x80, +}; +static_assert(sizeof(kBytesTestReadSymbol10) == kNumBytesTestReadSymbol10, ""); + +// The kBytesTestReadSymbol11[] array was encoded by using the following libaom 
+// code: +// +// aom_cdf_prob cdf[4][12] = { +// // pmf: 1/11, 1/11, 1/11, 1/11, 1/11, 1/11, 1/11, 1/11, 1/11, 1/11, 1/11 +// { 32768 - 2979, 32768 - 5958, 32768 - 8937, 32768 - 11916, 32768 - 14895, +// 32768 - 17873, 32768 - 20852, 32768 - 23831, 32768 - 26810, +// 32768 - 29789, 0, 0 }, +// // pmf: 3/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 1/22 +// { 32768 - 4468, 32768 - 7447, 32768 - 10426, 32768 - 13405, 32768 - 16384, +// 32768 - 19363, 32768 - 22342, 32768 - 25321, 32768 - 28300, +// 32768 - 31279, 0, 0 }, +// // pmf: 1/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 2/22, 3/22 +// { 32768 - 1489, 32768 - 4468, 32768 - 7447, 32768 - 10426, 32768 - 13405, +// 32768 - 16384, 32768 - 19363, 32768 - 22342, 32768 - 25321, +// 32768 - 28300, 0, 0 }, +// // pmf: 1/22, 2/22, 2/22, 2/22, 2/22, 4/22, 2/22, 2/22, 2/22, 2/22, 1/22 +// { 32768 - 1489, 32768 - 4468, 32768 - 7447, 32768 - 10426, 32768 - 13405, +// 32768 - 19363, 32768 - 22342, 32768 - 25321, 32768 - 28300, +// 32768 - 31279, 0, 0 }, +// }; +// constexpr int kSymbols[22][4] = { { 0, 6, 10, 5 }, // +// { 1, 7, 9, 4 }, // +// { 2, 8, 8, 3 }, // +// { 3, 9, 7, 2 }, // +// { 4, 10, 6, 1 }, // +// { 5, 0, 5, 0 }, // +// { 6, 1, 4, 10 }, // +// { 7, 2, 3, 9 }, // +// { 8, 3, 2, 8 }, // +// { 9, 4, 1, 7 }, // +// { 10, 5, 0, 6 }, // +// { 0, 0, 10, 9 }, // +// { 2, 1, 8, 7 }, // +// { 4, 3, 6, 5 }, // +// { 6, 5, 4, 3 }, // +// { 8, 7, 2, 1 }, // +// { 10, 9, 10, 8 }, // +// { 1, 0, 9, 6 }, // +// { 3, 2, 7, 4 }, // +// { 5, 4, 5, 2 }, // +// { 7, 6, 3, 5 }, // +// { 9, 8, 1, 5 } }; +// const unsigned int kBufferSize = 65536; +// uint8_t bw_buffer[kBufferSize]; +// aom_writer bw; +// bw.allow_update_cdf = 1; +// aom_start_encode(&bw, bw_buffer); +// for (int i = 0; i < 96; ++i) { +// for (int j = 0; j < 22; ++j) { +// for (int k = 0; k < 4; ++k) { +// aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 11); +// } +// } +// } +// aom_stop_encode(&bw); +// printf("constexpr size_t kNumBytes = %u;\n", bw.pos); +// printf("constexpr uint8_t kBytes[] = {"); +// int count = 0; +// for (unsigned int i = 0; i < bw.pos; ++i) { +// if (count++ % 12 == 0) { +// printf("\n "); +// } else { +// printf(" "); +// } +// printf("0x%02x,", bw_buffer[i]); +// } +// printf("\n};\n"); + +constexpr size_t kNumBytesTestReadSymbol11 = 3673; +constexpr uint8_t kBytesTestReadSymbol11[] = { + 0x0f, 0xb4, 0x93, 0xdb, 0xbe, 0x10, 0xa5, 0x0b, 0xa6, 0x53, 0x86, 0x25, + 0xaf, 0x5e, 0xf9, 0xd6, 0x10, 0xd8, 0x5e, 0x2b, 0x6d, 0xf2, 0xf8, 0x35, + 0x97, 0xf6, 0x95, 0xeb, 0x67, 0x20, 0x49, 0x0e, 0x21, 0xb4, 0x73, 0x5e, + 0x72, 0x06, 0xdd, 0x76, 0x99, 0x3d, 0x67, 0x37, 0x27, 0xea, 0x21, 0x80, + 0xc6, 0xb8, 0xf7, 0x48, 0x5e, 0x11, 0xe2, 0xe7, 0x10, 0xad, 0x0b, 0x12, + 0x52, 0xd4, 0xe3, 0x63, 0x2a, 0x1d, 0x41, 0xf4, 0xce, 0x5d, 0x58, 0x5f, + 0x79, 0x6d, 0xdd, 0x4b, 0x3d, 0x99, 0xd9, 0x64, 0xdc, 0x08, 0x16, 0x1a, + 0xf3, 0x8f, 0x1e, 0x33, 0xfe, 0x7a, 0x49, 0xaa, 0x98, 0xb9, 0xe2, 0xc6, + 0x14, 0xb8, 0x51, 0x1f, 0x45, 0xce, 0xea, 0x97, 0xcd, 0xd0, 0x0b, 0x5d, + 0x12, 0x31, 0xbe, 0x78, 0x98, 0xa3, 0x77, 0x6a, 0xa0, 0xef, 0x57, 0x3a, + 0xc6, 0xe7, 0x52, 0x22, 0x06, 0x44, 0x35, 0x8e, 0xc9, 0xe8, 0x4f, 0x76, + 0xd9, 0x77, 0x8c, 0x80, 0xc9, 0xfc, 0x20, 0x0d, 0xc0, 0x67, 0x95, 0x21, + 0x93, 0x74, 0x4f, 0xf1, 0xf5, 0xdf, 0x5a, 0x10, 0xde, 0x57, 0xc8, 0x6e, + 0x33, 0x40, 0xae, 0x36, 0x4a, 0xc8, 0x49, 0xbf, 0x0d, 0x6d, 0x74, 0x34, + 0xff, 0xdc, 0x1b, 0xe3, 0xcf, 0xcf, 0xe6, 0xd1, 0xfb, 0x4d, 0xd5, 0x0e, + 0x86, 0x83, 0x21, 0x12, 0xf8, 0x51, 0x2a, 0xc4, 0x87, 0xd8, 0x1b, 
0x1d, + 0xe7, 0x36, 0xb5, 0xc3, 0xf9, 0xf9, 0x8f, 0x0f, 0xc2, 0x21, 0x83, 0x75, + 0x14, 0x81, 0x17, 0xb1, 0x9b, 0x51, 0x56, 0x1d, 0xa1, 0xaa, 0xff, 0xd4, + 0x1f, 0xf3, 0x8d, 0xd1, 0x30, 0x53, 0x92, 0x69, 0xce, 0xf0, 0xc5, 0x75, + 0xcf, 0xd2, 0x6e, 0x37, 0x74, 0x79, 0xc3, 0x50, 0x52, 0x01, 0xc4, 0x0f, + 0x67, 0xe2, 0xb7, 0xe2, 0xf1, 0xcc, 0xd9, 0x49, 0xc4, 0x58, 0xbd, 0x8d, + 0x91, 0xb8, 0x35, 0xbd, 0x64, 0x12, 0x24, 0x20, 0x20, 0x29, 0x23, 0x94, + 0x85, 0xb6, 0xa8, 0x4e, 0xd4, 0x49, 0x09, 0x25, 0xc4, 0xc5, 0xa5, 0x0c, + 0x76, 0xa9, 0x4a, 0x75, 0x0f, 0xb9, 0x57, 0x33, 0xcd, 0xfd, 0xf8, 0x8f, + 0xae, 0x43, 0x48, 0xb8, 0xea, 0x87, 0x17, 0x0d, 0x3d, 0x8b, 0x9a, 0x21, + 0xe8, 0xbf, 0xc8, 0x5e, 0x18, 0x48, 0xa3, 0xcd, 0x08, 0x59, 0x9b, 0xdb, + 0x79, 0x5c, 0xe9, 0xa3, 0xe6, 0xba, 0x58, 0x53, 0x10, 0x9a, 0x2c, 0x2b, + 0x10, 0x5b, 0x96, 0x9a, 0x1f, 0x8f, 0xc2, 0x7d, 0xee, 0xe9, 0xc2, 0xbc, + 0x8f, 0x8b, 0xa7, 0x41, 0xb1, 0x33, 0x58, 0x6e, 0x25, 0x13, 0x3a, 0xd0, + 0x78, 0x53, 0xda, 0xa2, 0x35, 0x23, 0x89, 0x39, 0xa7, 0xef, 0x94, 0xda, + 0x2f, 0xc3, 0x17, 0x80, 0x27, 0xc7, 0x0f, 0xda, 0xfb, 0xda, 0x64, 0x3c, + 0x94, 0x8c, 0x39, 0xd0, 0x06, 0x62, 0x6c, 0x0d, 0x26, 0xba, 0x4f, 0xcb, + 0x8a, 0xa0, 0xbc, 0xeb, 0x3f, 0x65, 0x51, 0x8e, 0x1d, 0x2e, 0x9e, 0x5f, + 0xe3, 0x15, 0x0e, 0x58, 0x4f, 0xb7, 0xb6, 0x64, 0x95, 0xe8, 0x0e, 0x00, + 0x7c, 0x1e, 0xd9, 0xde, 0x35, 0x5a, 0xff, 0xd5, 0xe5, 0xb3, 0x64, 0xcc, + 0x8b, 0x93, 0xbc, 0x2a, 0x25, 0x7d, 0x50, 0x92, 0x3e, 0x23, 0x4c, 0x07, + 0x5e, 0xcf, 0xbb, 0x52, 0xd0, 0xc4, 0xd9, 0x77, 0x66, 0x01, 0x57, 0x1f, + 0xa0, 0x9d, 0xb2, 0x6d, 0x4e, 0x36, 0xc1, 0x9a, 0x70, 0x4e, 0xa3, 0x5f, + 0xf6, 0xf9, 0x50, 0x08, 0xcd, 0xf9, 0xe5, 0x76, 0x81, 0xea, 0x88, 0x2e, + 0xf5, 0x2a, 0xd4, 0x31, 0x39, 0x8d, 0xfe, 0x1c, 0x15, 0x1d, 0x41, 0x2b, + 0x55, 0xc7, 0xe8, 0x27, 0x6f, 0xc3, 0xf0, 0x23, 0x76, 0x9a, 0xb2, 0x87, + 0x0c, 0x71, 0x3c, 0x73, 0xea, 0x20, 0x93, 0xf4, 0x21, 0x56, 0xfb, 0x8e, + 0xd7, 0xaf, 0xc3, 0xd4, 0xf4, 0x31, 0x6f, 0xe8, 0x1f, 0x5b, 0x83, 0xa9, + 0x2b, 0x83, 0x08, 0x2e, 0xa2, 0xf3, 0x6c, 0x06, 0xe5, 0x89, 0x73, 0x73, + 0x98, 0x0e, 0x57, 0x07, 0x49, 0x68, 0xa4, 0xb2, 0x4a, 0x26, 0xd1, 0x91, + 0x49, 0x87, 0x05, 0x55, 0xa4, 0x88, 0x7d, 0x3d, 0x57, 0x7c, 0x20, 0x8c, + 0x2c, 0xea, 0x30, 0x63, 0x3a, 0xe4, 0xab, 0x27, 0x80, 0xab, 0xfb, 0x22, + 0x8a, 0x0f, 0xe0, 0xe9, 0xc5, 0xd5, 0x4f, 0x8a, 0x2c, 0x28, 0x36, 0x63, + 0xbd, 0xa3, 0xc4, 0x90, 0xe4, 0x9e, 0x98, 0xca, 0xce, 0xfc, 0x96, 0xb8, + 0x22, 0x0d, 0x17, 0xc8, 0xad, 0xc7, 0x01, 0x38, 0x6e, 0x95, 0x30, 0x74, + 0xda, 0xb8, 0xa9, 0xa8, 0xe6, 0xf2, 0x03, 0x41, 0xb2, 0x05, 0x37, 0x04, + 0x8b, 0x51, 0xf9, 0xeb, 0x97, 0xdf, 0xe9, 0xa8, 0x5f, 0x11, 0x2f, 0x9f, + 0x4f, 0xbe, 0xc1, 0x53, 0x2c, 0x75, 0x90, 0xca, 0xa3, 0x9b, 0xc1, 0x36, + 0xa3, 0x03, 0x65, 0xab, 0x57, 0xc4, 0x0e, 0x8a, 0x41, 0xfc, 0x60, 0x65, + 0x13, 0x87, 0x6d, 0xda, 0x00, 0xad, 0x56, 0x1c, 0x28, 0x7c, 0x4c, 0xa2, + 0x92, 0xda, 0x23, 0x00, 0xe8, 0x60, 0x20, 0x59, 0x45, 0x4a, 0x26, 0xae, + 0x22, 0x37, 0x7c, 0x14, 0xce, 0xff, 0x0d, 0xa9, 0xef, 0xfc, 0x93, 0xbd, + 0xde, 0x2b, 0x0f, 0xc7, 0xc0, 0x8a, 0x90, 0x06, 0xec, 0x53, 0x9f, 0xc8, + 0x5b, 0x7b, 0xe8, 0x38, 0x22, 0x75, 0xe9, 0x40, 0xbc, 0x62, 0xe9, 0x9d, + 0x49, 0xab, 0x88, 0x8d, 0xdf, 0x05, 0x33, 0xbf, 0xc3, 0x69, 0x6c, 0x36, + 0x71, 0x17, 0x70, 0xc1, 0xe0, 0xd1, 0x71, 0xcf, 0xd5, 0x48, 0x83, 0x50, + 0x74, 0x07, 0xc4, 0xca, 0x29, 0x2d, 0xa2, 0x30, 0x0e, 0x86, 0x02, 0x05, + 0x94, 0x54, 0xa2, 0x6a, 0xe2, 0x23, 0x77, 0xc1, 0x4c, 0xef, 0xa4, 0x8c, + 0xbe, 0x6b, 0x0f, 0x7c, 0x05, 0x30, 0x78, 0x34, 0x5c, 0x73, 0xf5, 
0x52, + 0x20, 0xd4, 0x1d, 0x01, 0xca, 0x9f, 0x89, 0x3b, 0x91, 0x1d, 0x1f, 0x27, + 0xe1, 0xf9, 0xe8, 0xd0, 0xb2, 0x56, 0x32, 0x15, 0x37, 0xa3, 0x08, 0x38, + 0xb7, 0x57, 0xb4, 0x09, 0xfe, 0xf4, 0x72, 0xe1, 0x8f, 0x4b, 0x6b, 0x00, + 0x8c, 0xc5, 0x39, 0xd5, 0x45, 0x45, 0xbb, 0xf6, 0xb7, 0x01, 0xde, 0xef, + 0x8b, 0xaf, 0x85, 0x73, 0xc4, 0x93, 0x3f, 0xbe, 0xf8, 0x69, 0xbd, 0x71, + 0xa9, 0x65, 0x6f, 0x22, 0xa6, 0xca, 0x36, 0xf0, 0x34, 0x1b, 0x20, 0x24, + 0x6c, 0xd2, 0xe3, 0xbb, 0xb5, 0x80, 0xfc, 0xc4, 0x90, 0x54, 0x70, 0xab, + 0xb7, 0xb9, 0xdb, 0xeb, 0x3b, 0x1d, 0x75, 0xc8, 0x82, 0x9a, 0x15, 0x8a, + 0x88, 0xb0, 0x7a, 0x77, 0xcf, 0xdc, 0x96, 0x22, 0x4d, 0x08, 0x47, 0x9a, + 0x06, 0x3e, 0x47, 0xb1, 0x54, 0xdf, 0x22, 0x9d, 0x75, 0x8f, 0xdb, 0xc4, + 0x5a, 0xd0, 0xfe, 0x44, 0xc4, 0xce, 0x9a, 0x57, 0x0b, 0x20, 0x36, 0x07, + 0xb1, 0xcf, 0xfe, 0xb4, 0x3e, 0x03, 0x1b, 0x5d, 0xac, 0x40, 0x54, 0x88, + 0x52, 0x2e, 0x81, 0x8f, 0x3c, 0x52, 0x87, 0x68, 0x00, 0xa5, 0x95, 0xbc, + 0xd9, 0x67, 0x87, 0xa0, 0x75, 0x78, 0xb6, 0xa9, 0xda, 0x76, 0x9d, 0xe4, + 0x5a, 0x6d, 0xd5, 0x78, 0xcd, 0x7b, 0x26, 0x5f, 0xc0, 0x09, 0xab, 0x25, + 0x16, 0x38, 0xa1, 0x86, 0xa7, 0x5e, 0x5e, 0x2d, 0x3e, 0x2f, 0x09, 0xdc, + 0x31, 0x4d, 0x71, 0x2e, 0xec, 0x5f, 0xa0, 0xe0, 0x8f, 0x9c, 0xcd, 0x72, + 0xc8, 0x05, 0xa3, 0xb0, 0xfc, 0x4c, 0xdb, 0x6b, 0x24, 0xf2, 0x92, 0x6b, + 0x13, 0x79, 0x1c, 0x36, 0x90, 0x20, 0x71, 0xaa, 0x8c, 0x1c, 0xe4, 0xbf, + 0x54, 0xf8, 0x48, 0x51, 0xd2, 0x9a, 0x23, 0xa0, 0x55, 0x38, 0x24, 0x17, + 0x39, 0x89, 0x4f, 0xc9, 0x01, 0x77, 0x05, 0x16, 0x97, 0x3e, 0xac, 0x9f, + 0xba, 0x4a, 0xb1, 0x7e, 0x47, 0x0d, 0xa4, 0x08, 0x1c, 0x6a, 0xa3, 0x07, + 0x39, 0x2f, 0xd5, 0x3e, 0x12, 0x14, 0x74, 0xa6, 0x88, 0xe8, 0x15, 0x4e, + 0x09, 0x05, 0xce, 0x62, 0x53, 0xf2, 0x40, 0x7b, 0x49, 0x58, 0xc8, 0x5d, + 0x29, 0x54, 0xb1, 0xfd, 0xb0, 0xb2, 0x75, 0x2c, 0x55, 0x9f, 0xf9, 0x57, + 0x58, 0xec, 0xfb, 0xff, 0xa3, 0xa0, 0x27, 0x02, 0x0e, 0xa7, 0x52, 0xe7, + 0x9e, 0xbd, 0xb6, 0x1d, 0xe6, 0x7e, 0xa2, 0xc0, 0x95, 0xe1, 0x4d, 0xd5, + 0x78, 0xce, 0x08, 0x2d, 0xff, 0x0b, 0xe8, 0x34, 0xa7, 0x53, 0x15, 0x67, + 0xfe, 0x55, 0xd6, 0x3b, 0x3e, 0xff, 0xe8, 0xe8, 0x09, 0xc0, 0x83, 0xa9, + 0xd4, 0xb9, 0xe7, 0xaf, 0x6d, 0x87, 0x79, 0x9f, 0xa8, 0xb0, 0x25, 0x78, + 0x92, 0x0e, 0x9d, 0xf7, 0x55, 0xd9, 0x1a, 0xc5, 0x48, 0x6c, 0xbe, 0x66, + 0xb0, 0xf7, 0xbf, 0x95, 0x75, 0x8e, 0xcf, 0xbf, 0xfa, 0x3a, 0x02, 0x70, + 0x20, 0xde, 0xb0, 0xe4, 0xe4, 0x0e, 0x59, 0x44, 0x11, 0x28, 0xe1, 0x22, + 0xe8, 0x0e, 0x5b, 0x62, 0x69, 0x46, 0xb2, 0x1a, 0x9b, 0x63, 0x75, 0x31, + 0xb9, 0x4a, 0x90, 0x8d, 0x2e, 0xf8, 0xa8, 0xdb, 0x5a, 0x31, 0xcf, 0x9c, + 0x99, 0xd5, 0x85, 0x99, 0x5e, 0x0a, 0x51, 0x8d, 0x0d, 0x77, 0x3c, 0x51, + 0xe1, 0x98, 0x1c, 0x5a, 0xc1, 0xea, 0x38, 0x93, 0x44, 0xd7, 0xb6, 0xbb, + 0xa1, 0x0f, 0x38, 0x75, 0x5e, 0xff, 0x2d, 0x93, 0xfa, 0x7d, 0xca, 0xf6, + 0xb7, 0x4f, 0x5e, 0xbd, 0x3f, 0xbc, 0xb6, 0xc6, 0x7b, 0xae, 0x23, 0x97, + 0xc7, 0xcb, 0xa7, 0x98, 0x37, 0xf4, 0xd6, 0x0c, 0x12, 0xd6, 0xad, 0xc7, + 0x51, 0xb3, 0x0e, 0x88, 0x40, 0xfd, 0xf7, 0x1b, 0x29, 0xcf, 0xb8, 0x7c, + 0x29, 0xa1, 0xa2, 0x72, 0x05, 0xa1, 0x0f, 0x43, 0xa8, 0xc4, 0x24, 0x49, + 0x96, 0xbf, 0x56, 0xe4, 0xbf, 0xc7, 0x71, 0x5a, 0x18, 0x85, 0x65, 0xdd, + 0x17, 0x95, 0x30, 0x18, 0x8b, 0x18, 0xd2, 0xb2, 0x3f, 0x2e, 0xe9, 0x69, + 0x89, 0x90, 0xe0, 0x24, 0x08, 0x13, 0x23, 0x0a, 0x78, 0x59, 0x1e, 0xe6, + 0x33, 0x0f, 0x12, 0x73, 0xba, 0xb3, 0x3c, 0x1d, 0x05, 0x71, 0x7a, 0xd7, + 0x87, 0xd3, 0xaa, 0x7c, 0xb9, 0x3f, 0x74, 0x95, 0x62, 0xfc, 0x85, 0xac, + 0xe0, 0xe9, 0xaa, 0x6f, 0x48, 0x4b, 0xdf, 0xb6, 0x9a, 0x7c, 0x24, 
0x28, + 0xe3, 0x6e, 0x40, 0xbd, 0x03, 0xab, 0xc5, 0xb5, 0x4e, 0xd3, 0xb4, 0xef, + 0x23, 0x1e, 0x6e, 0xab, 0xc6, 0x70, 0x41, 0x6f, 0xf8, 0x5f, 0x41, 0xa5, + 0x3a, 0x98, 0xab, 0x3f, 0xf2, 0xae, 0xb1, 0xd9, 0xf7, 0xff, 0xf0, 0x29, + 0xdf, 0x01, 0xed, 0xe9, 0xa3, 0x49, 0xc6, 0x1a, 0xec, 0xa3, 0x4e, 0x59, + 0x4b, 0xcd, 0x01, 0xcb, 0x6c, 0x4d, 0x28, 0xd6, 0x43, 0x53, 0x6c, 0x6e, + 0xa6, 0x37, 0x29, 0x52, 0x11, 0xa5, 0xdf, 0x15, 0x1b, 0x6b, 0x46, 0x3a, + 0x25, 0x93, 0x5c, 0x76, 0xdc, 0x12, 0xb8, 0x3e, 0xe0, 0xc4, 0xb8, 0xf8, + 0x96, 0x8e, 0xde, 0x49, 0xff, 0x58, 0x3d, 0x47, 0x12, 0x68, 0x9a, 0xf6, + 0xd7, 0x74, 0x21, 0xe7, 0x0e, 0xab, 0xdf, 0xe5, 0xb2, 0x7f, 0x4f, 0xb9, + 0x5e, 0xd6, 0xf7, 0x7a, 0xc8, 0x7e, 0xd7, 0xc0, 0x81, 0x63, 0xff, 0x84, + 0x30, 0x67, 0x40, 0x95, 0xcb, 0x03, 0x6b, 0xfb, 0x08, 0xd3, 0x09, 0xa8, + 0x93, 0x11, 0xf7, 0xf3, 0x68, 0x89, 0x79, 0x0d, 0x74, 0xce, 0xe9, 0xc6, + 0x83, 0xcd, 0xe0, 0x54, 0x51, 0xff, 0xe2, 0x3d, 0x76, 0x94, 0x72, 0xed, + 0xb3, 0x66, 0x98, 0x97, 0xd9, 0x0b, 0x3b, 0x1d, 0x75, 0xc8, 0xfd, 0x9a, + 0x15, 0x8a, 0x7c, 0xe9, 0xb6, 0x8e, 0x59, 0xf1, 0xbe, 0x8f, 0xe4, 0x3d, + 0xdd, 0x72, 0x98, 0x71, 0xe5, 0xef, 0xdc, 0x86, 0x2f, 0x9d, 0x75, 0x8c, + 0xe9, 0xbf, 0xd1, 0x89, 0xae, 0x44, 0xda, 0xa7, 0x69, 0xda, 0x77, 0x91, + 0x8f, 0x37, 0x55, 0xe3, 0x38, 0x20, 0xb7, 0xfc, 0x2f, 0xa0, 0xd2, 0x9d, + 0x4c, 0x55, 0x9f, 0xf9, 0x57, 0x58, 0xec, 0xfb, 0xff, 0xf8, 0x14, 0xef, + 0x80, 0xf6, 0xf4, 0xd1, 0xa4, 0xe3, 0x0d, 0x76, 0x51, 0xa7, 0x2c, 0xa5, + 0xe6, 0x80, 0xe5, 0xb6, 0x26, 0x94, 0x6b, 0x21, 0xa9, 0xb6, 0x37, 0x53, + 0x1b, 0x94, 0xa9, 0x08, 0xd2, 0xef, 0x8a, 0x8d, 0xb5, 0xa3, 0x1d, 0x12, + 0xc9, 0xae, 0x3b, 0x6e, 0x09, 0x5c, 0x1f, 0x70, 0x62, 0x5c, 0x7c, 0x4b, + 0x47, 0x6f, 0x24, 0xff, 0xac, 0x1e, 0xa3, 0x89, 0x34, 0x4d, 0x7b, 0x6b, + 0xba, 0x10, 0xf3, 0x87, 0x55, 0xef, 0xf2, 0xd9, 0x3f, 0xa7, 0xdc, 0xaf, + 0x6b, 0x7b, 0xbd, 0x64, 0x3f, 0x6b, 0xe0, 0x40, 0xb1, 0xff, 0xc2, 0x18, + 0x33, 0xa0, 0x4a, 0xe5, 0x81, 0xb5, 0xfd, 0x84, 0x69, 0x84, 0xd4, 0x49, + 0x88, 0xfb, 0xf9, 0xb4, 0x44, 0xbc, 0x86, 0xba, 0x67, 0x74, 0xe3, 0x41, + 0xe6, 0xf0, 0x2a, 0x28, 0xff, 0xf1, 0x1e, 0xbb, 0x4a, 0x39, 0x76, 0xd9, + 0xb3, 0x4c, 0x4b, 0xec, 0x85, 0x9d, 0x8e, 0xba, 0xe4, 0x7e, 0xcd, 0x0a, + 0xc5, 0x3e, 0x74, 0xdb, 0x47, 0x2c, 0xf8, 0xdf, 0x47, 0xf2, 0x1e, 0xee, + 0xb9, 0x4c, 0x38, 0xf2, 0xf7, 0xee, 0x43, 0x17, 0xce, 0xba, 0xc6, 0x74, + 0xdf, 0xe8, 0xc4, 0xd7, 0x22, 0x6d, 0x53, 0xb4, 0xed, 0x3b, 0xc8, 0xc7, + 0x9b, 0xaa, 0xf1, 0x9c, 0x10, 0x5b, 0xfe, 0x17, 0xd0, 0x69, 0x4e, 0xa6, + 0x2a, 0xcf, 0xfc, 0xab, 0xac, 0x76, 0x7d, 0xff, 0xfc, 0x0a, 0x77, 0xc0, + 0x7b, 0x7a, 0x68, 0xd2, 0x71, 0x86, 0xbb, 0x28, 0xd3, 0x96, 0x52, 0xf3, + 0x40, 0x72, 0xdb, 0x13, 0x4a, 0x35, 0x90, 0xd4, 0xdb, 0x1b, 0xa9, 0x8d, + 0xca, 0x54, 0x84, 0x69, 0x77, 0xc5, 0x46, 0xda, 0xd1, 0x8e, 0x89, 0x64, + 0xd7, 0x1d, 0xb7, 0x04, 0xae, 0x0f, 0xb8, 0x31, 0x2e, 0x3e, 0x25, 0xa3, + 0xb7, 0x92, 0x7f, 0xd6, 0x0f, 0x51, 0xc4, 0x9a, 0x26, 0xbd, 0xb5, 0xdd, + 0x08, 0x79, 0xc3, 0xaa, 0xf7, 0xf9, 0x6c, 0x9f, 0xd3, 0xee, 0x57, 0xb5, + 0xbd, 0xde, 0xb2, 0x1f, 0xb5, 0xf0, 0x20, 0x58, 0xff, 0xe1, 0x0c, 0x19, + 0xd0, 0x25, 0x72, 0xc0, 0xda, 0xfe, 0xc2, 0x34, 0xc2, 0x6a, 0x24, 0xc4, + 0x7d, 0xfc, 0xda, 0x22, 0x5e, 0x43, 0x5d, 0x33, 0xba, 0x71, 0xa0, 0xf3, + 0x78, 0x15, 0x14, 0x7f, 0xf8, 0x8f, 0x5d, 0xa5, 0x1c, 0xbb, 0x6c, 0xd9, + 0xa6, 0x25, 0xf6, 0x42, 0xce, 0xc7, 0x5d, 0x72, 0x3f, 0x66, 0x85, 0x62, + 0x9f, 0x3a, 0x6d, 0xa3, 0x96, 0x7c, 0x6f, 0xa3, 0xf9, 0x0f, 0x77, 0x5c, + 0xa6, 0x1c, 0x79, 0x7b, 0xf7, 0x21, 0x8b, 0xe7, 0x5d, 0x63, 0x3a, 
0x6f, + 0xf4, 0x62, 0x6b, 0x91, 0x36, 0xa9, 0xda, 0x76, 0x9d, 0xe4, 0x63, 0xcd, + 0xd5, 0x78, 0xce, 0x08, 0x2d, 0xff, 0x0b, 0xe8, 0x34, 0xa7, 0x53, 0x15, + 0x67, 0xfe, 0x55, 0xd6, 0x3b, 0x3e, 0xff, 0xfe, 0x05, 0x3b, 0xe0, 0x3d, + 0xbd, 0x34, 0x69, 0x38, 0xc3, 0x5d, 0x94, 0x69, 0xcb, 0x29, 0x79, 0xa0, + 0x39, 0x6d, 0x89, 0xa5, 0x1a, 0xc8, 0x6a, 0x6d, 0x8d, 0xd4, 0xc6, 0xe5, + 0x2a, 0x42, 0x34, 0xbb, 0xe2, 0xa3, 0x6d, 0x68, 0xc7, 0x44, 0xb2, 0x6b, + 0x8e, 0xdb, 0x82, 0x57, 0x07, 0xdc, 0x18, 0x97, 0x1f, 0x12, 0xd1, 0xdb, + 0xc9, 0x3f, 0xeb, 0x07, 0xa8, 0xe2, 0x4d, 0x13, 0x5e, 0xda, 0xee, 0x84, + 0x3c, 0xe1, 0xd5, 0x7b, 0xfc, 0xb6, 0x4f, 0xe9, 0xf7, 0x2b, 0xda, 0xde, + 0xef, 0x59, 0x0f, 0xda, 0xf8, 0x10, 0x2c, 0x7f, 0xf0, 0x86, 0x0c, 0xe8, + 0x12, 0xb9, 0x60, 0x6d, 0x7f, 0x61, 0x1a, 0x61, 0x35, 0x12, 0x62, 0x3e, + 0xfe, 0x6d, 0x11, 0x2f, 0x21, 0xae, 0x99, 0xdd, 0x38, 0xd0, 0x79, 0xbc, + 0x0a, 0x8a, 0x3f, 0xfc, 0x47, 0xae, 0xd2, 0x8e, 0x5d, 0xb6, 0x6c, 0xd3, + 0x12, 0xfb, 0x21, 0x67, 0x63, 0xae, 0xb9, 0x1f, 0xb3, 0x42, 0xb1, 0x4f, + 0x9d, 0x36, 0xd1, 0xcb, 0x3e, 0x37, 0xd1, 0xfc, 0x87, 0xbb, 0xae, 0x53, + 0x0e, 0x3c, 0xbd, 0xfb, 0x90, 0xc5, 0xf3, 0xae, 0xb1, 0x9d, 0x37, 0xfa, + 0x31, 0x35, 0xc8, 0x9b, 0x54, 0xed, 0x3b, 0x4e, 0xf2, 0x31, 0xe6, 0xea, + 0xbc, 0x67, 0x04, 0x16, 0xff, 0x85, 0xf4, 0x1a, 0x53, 0xa9, 0x8a, 0xb3, + 0xff, 0x2a, 0xeb, 0x1d, 0x9f, 0x7f, 0xff, 0x02, 0x9d, 0xf0, 0x1e, 0xde, + 0x9a, 0x34, 0x9c, 0x61, 0xae, 0xca, 0x34, 0xe5, 0x94, 0xbc, 0xd0, 0x1c, + 0xb6, 0xc4, 0xd2, 0x8d, 0x64, 0x35, 0x36, 0xc6, 0xea, 0x63, 0x72, 0x95, + 0x21, 0x1a, 0x5d, 0xf1, 0x51, 0xb6, 0xb4, 0x63, 0xa2, 0x59, 0x35, 0xc7, + 0x6d, 0xc1, 0x2b, 0x83, 0xee, 0x0c, 0x4b, 0x8f, 0x89, 0x68, 0xed, 0xe4, + 0x9f, 0xf5, 0x83, 0xd4, 0x71, 0x26, 0x89, 0xaf, 0x6d, 0x77, 0x42, 0x1e, + 0x70, 0xea, 0xbd, 0xfe, 0x5b, 0x27, 0xf4, 0xfb, 0x95, 0xed, 0x6f, 0x77, + 0xac, 0x87, 0xed, 0x7c, 0x08, 0x16, 0x3f, 0xf8, 0x43, 0x06, 0x74, 0x09, + 0x5c, 0xb0, 0x36, 0xbf, 0xb0, 0x8d, 0x30, 0x9a, 0x89, 0x31, 0x1f, 0x7f, + 0x36, 0x88, 0x97, 0x90, 0xd7, 0x4c, 0xee, 0x9c, 0x68, 0x3c, 0xde, 0x05, + 0x45, 0x1f, 0xfe, 0x23, 0xd7, 0x69, 0x47, 0x2e, 0xdb, 0x36, 0x69, 0x89, + 0x7d, 0x90, 0xb3, 0xb1, 0xd7, 0x5c, 0x8f, 0xd9, 0xa1, 0x58, 0xa7, 0xce, + 0x9b, 0x68, 0xe5, 0x9f, 0x1b, 0xe8, 0xfe, 0x43, 0xdd, 0xd7, 0x29, 0x87, + 0x1e, 0x5e, 0xfd, 0xc8, 0x62, 0xf9, 0xd7, 0x58, 0xce, 0x9b, 0xfd, 0x18, + 0x9a, 0xe4, 0x4d, 0xaa, 0x76, 0x9d, 0xa7, 0x79, 0x18, 0xf3, 0x75, 0x5e, + 0x33, 0x82, 0x0b, 0x7f, 0xc2, 0xfa, 0x0d, 0x29, 0xd4, 0xc5, 0x59, 0xff, + 0x95, 0x75, 0x8e, 0xcf, 0xbf, 0xff, 0x81, 0x4e, 0xf8, 0x0f, 0x6f, 0x4d, + 0x1a, 0x4e, 0x30, 0xd7, 0x65, 0x1a, 0x72, 0xca, 0x5e, 0x68, 0x0e, 0x5b, + 0x62, 0x69, 0x46, 0xb2, 0x1a, 0x9b, 0x63, 0x75, 0x31, 0xb9, 0x4a, 0x90, + 0x8d, 0x2e, 0xf8, 0xa8, 0xdb, 0x5a, 0x31, 0xd1, 0x2c, 0x9a, 0xe3, 0xb6, + 0xe0, 0x95, 0xc1, 0xf7, 0x06, 0x25, 0xc7, 0xc4, 0xb4, 0x76, 0xf2, 0x4f, + 0xfa, 0xc1, 0xea, 0x38, 0x93, 0x44, 0xd7, 0xb6, 0xbb, 0xa1, 0x0f, 0x38, + 0x75, 0x5e, 0xff, 0x2d, 0x93, 0xfa, 0x7d, 0xca, 0xf6, 0xb7, 0xbb, 0xd6, + 0x43, 0xf6, 0xbe, 0x04, 0x0b, 0x1f, 0xfc, 0x21, 0x83, 0x3a, 0x04, 0xae, + 0x58, 0x1b, 0x5f, 0xd8, 0x46, 0x98, 0x4d, 0x44, 0x98, 0x8f, 0xbf, 0x9b, + 0x44, 0x4b, 0xc8, 0x6b, 0xa6, 0x77, 0x4e, 0x34, 0x1e, 0x6f, 0x02, 0xa2, + 0x8f, 0xff, 0x11, 0xeb, 0xb4, 0xa3, 0x97, 0x6d, 0x9b, 0x34, 0xc4, 0xbe, + 0xc8, 0x59, 0xd8, 0xeb, 0xae, 0x47, 0xec, 0xd0, 0xac, 0x53, 0xe7, 0x4d, + 0xb4, 0x72, 0xcf, 0x8d, 0xf4, 0x7f, 0x21, 0xee, 0xeb, 0x94, 0xc3, 0x8f, + 0x2f, 0x7e, 0xe4, 0x31, 0x7c, 0xeb, 0xac, 0x67, 0x4d, 0xfe, 0x8c, 
0x4d, + 0x72, 0x26, 0xd5, 0x3b, 0x4e, 0xd3, 0xbc, 0x8c, 0x79, 0xba, 0xaf, 0x19, + 0xc1, 0x05, 0xbf, 0xe1, 0x7d, 0x06, 0x94, 0xea, 0x62, 0xac, 0xff, 0xca, + 0xba, 0xc7, 0x67, 0xdf, 0xff, 0xc0, 0xa7, 0x7c, 0x07, 0xb7, 0xa6, 0x8d, + 0x27, 0x18, 0x6b, 0xb2, 0x8d, 0x39, 0x65, 0x2f, 0x34, 0x07, 0x2d, 0xb1, + 0x34, 0xa3, 0x59, 0x0d, 0x4d, 0xb1, 0xba, 0x98, 0xdc, 0xa5, 0x48, 0x46, + 0x97, 0x7c, 0x54, 0x6d, 0xad, 0x18, 0xe8, 0x96, 0x4d, 0x71, 0xdb, 0x70, + 0x4a, 0xe0, 0xfb, 0x83, 0x12, 0xe3, 0xe2, 0x5a, 0x3b, 0x79, 0x27, 0xfd, + 0x60, 0xf5, 0x1c, 0x49, 0xa2, 0x6b, 0xdb, 0x5d, 0xd0, 0x87, 0x9c, 0x3a, + 0xaf, 0x7f, 0x96, 0xc9, 0xfd, 0x3e, 0xe5, 0x7b, 0x5b, 0xdd, 0xeb, 0x21, + 0xfb, 0x5f, 0x02, 0x05, 0x8f, 0xfe, 0x10, 0xc1, 0x9d, 0x02, 0x57, 0x2c, + 0x0d, 0xaf, 0xec, 0x23, 0x4c, 0x26, 0xa2, 0x4c, 0x47, 0xdf, 0xcd, 0xa2, + 0x25, 0xe4, 0x35, 0xd3, 0x3b, 0xa7, 0x1a, 0x0f, 0x37, 0x81, 0x51, 0x47, + 0xff, 0x88, 0xf5, 0xda, 0x51, 0xcb, 0xb6, 0xcd, 0x9a, 0x62, 0x5f, 0x64, + 0x2c, 0xec, 0x75, 0xd7, 0x23, 0xf6, 0x68, 0x56, 0x29, 0xf3, 0xa6, 0xda, + 0x39, 0x67, 0xc6, 0xfa, 0x3f, 0x90, 0xf7, 0x75, 0xca, 0x61, 0xc7, 0x97, + 0xbf, 0x72, 0x18, 0xbe, 0x75, 0xd6, 0x33, 0xa6, 0xff, 0x46, 0x26, 0xb9, + 0x13, 0x6a, 0x9d, 0xa7, 0x69, 0xde, 0x46, 0x3c, 0xdd, 0x57, 0x8c, 0xe0, + 0x82, 0xdf, 0xf0, 0xbe, 0x83, 0x4a, 0x75, 0x31, 0x56, 0x7f, 0xe5, 0x5d, + 0x63, 0xb3, 0xef, 0xff, 0xe0, 0x53, 0xbe, 0x03, 0xdb, 0xd3, 0x46, 0x93, + 0x8c, 0x35, 0xd9, 0x46, 0x9c, 0xb2, 0x97, 0x9a, 0x03, 0x96, 0xd8, 0x9a, + 0x51, 0xac, 0x86, 0xa6, 0xd8, 0xdd, 0x4c, 0x6e, 0x52, 0xa4, 0x23, 0x4b, + 0xbe, 0x2a, 0x36, 0xd6, 0x8c, 0x74, 0x4b, 0x26, 0xb8, 0xed, 0xb8, 0x25, + 0x70, 0x7d, 0xc1, 0x89, 0x71, 0xf1, 0x2d, 0x1d, 0xbc, 0x93, 0xfe, 0xb0, + 0x7a, 0x8e, 0x24, 0xd1, 0x35, 0xed, 0xae, 0xe8, 0x43, 0xce, 0x1d, 0x57, + 0xbf, 0xcb, 0x64, 0xfe, 0x9f, 0x72, 0xbd, 0xad, 0xee, 0xf5, 0x90, 0xfd, + 0xaf, 0x81, 0x02, 0xc7, 0xff, 0x08, 0x60, 0xce, 0x81, 0x2b, 0x96, 0x06, + 0xd7, 0xf6, 0x11, 0xa6, 0x13, 0x51, 0x26, 0x23, 0xef, 0xe6, 0xd1, 0x12, + 0xf2, 0x1a, 0xe9, 0x9d, 0xd3, 0x8d, 0x07, 0x9b, 0xc0, 0xa8, 0xa3, 0xff, + 0xc4, 0x7a, 0xed, 0x28, 0xe5, 0xdb, 0x66, 0xcd, 0x31, 0x2f, 0xb2, 0x16, + 0x76, 0x3a, 0xeb, 0x91, 0xfb, 0x34, 0x2b, 0x14, 0xf9, 0xd3, 0x6d, 0x1c, + 0xb3, 0xe3, 0x7d, 0x1f, 0xc8, 0x7b, 0xba, 0xe5, 0x30, 0xe3, 0xcb, 0xdf, + 0xb9, 0x0c, 0x5f, 0x3a, 0xeb, 0x19, 0xd3, 0x7f, 0xa3, 0x13, 0x5c, 0x89, + 0xb5, 0x4e, 0xd3, 0xb4, 0xef, 0x23, 0x1e, 0x6e, 0xab, 0xc6, 0x70, 0x41, + 0x6f, 0xf8, 0x5f, 0x41, 0xa5, 0x3a, 0x98, 0xab, 0x3f, 0xf2, 0xae, 0xb1, + 0xd9, 0xf7, 0xff, 0xf0, 0x29, 0xdf, 0x01, 0xed, 0xe9, 0xa3, 0x49, 0xc6, + 0x1a, 0xec, 0xa3, 0x4e, 0x59, 0x4b, 0xcd, 0x01, 0xcb, 0x6c, 0x4d, 0x28, + 0xd6, 0x43, 0x53, 0x6c, 0x6e, 0xa6, 0x37, 0x29, 0x52, 0x11, 0xa5, 0xdf, + 0x15, 0x1b, 0x6b, 0x46, 0x3a, 0x25, 0x93, 0x5c, 0x76, 0xdc, 0x12, 0xb8, + 0x3e, 0xe0, 0xc4, 0xb8, 0xf8, 0x96, 0x8e, 0xde, 0x49, 0xff, 0x58, 0x3d, + 0x47, 0x12, 0x68, 0x9a, 0xf6, 0xd7, 0x74, 0x21, 0xe7, 0x0e, 0xab, 0xdf, + 0xe5, 0xb2, 0x7f, 0x4f, 0xb9, 0x5e, 0xd6, 0xf7, 0x7a, 0xc8, 0x7e, 0xd7, + 0xc0, 0x81, 0x63, 0xff, 0x84, 0x30, 0x67, 0x40, 0x95, 0xcb, 0x03, 0x6b, + 0xfb, 0x08, 0xd3, 0x09, 0xa8, 0x93, 0x11, 0xf7, 0xf3, 0x68, 0x89, 0x79, + 0x0d, 0x74, 0xce, 0xe9, 0xc6, 0x83, 0xcd, 0xe0, 0x54, 0x51, 0xff, 0xe2, + 0x3d, 0x76, 0x94, 0x72, 0xed, 0xb3, 0x66, 0x98, 0x97, 0xd9, 0x0b, 0x3b, + 0x1d, 0x75, 0xc8, 0xfd, 0x9a, 0x15, 0x8a, 0x7c, 0xe9, 0xb6, 0x8e, 0x59, + 0xf1, 0xbe, 0x8f, 0xe4, 0x3d, 0xdd, 0x72, 0x98, 0x71, 0xe5, 0xef, 0xdc, + 0x86, 0x2f, 0x9d, 0x75, 0x8c, 0xe9, 0xbf, 0xd1, 0x89, 0xae, 0x44, 
0xda, + 0xa7, 0x69, 0xda, 0x77, 0x91, 0x8f, 0x37, 0x55, 0xe3, 0x38, 0x20, 0xb7, + 0xfc, 0x2f, 0xa0, 0xd2, 0x9d, 0x4c, 0x55, 0x9f, 0xf9, 0x57, 0x58, 0xec, + 0xfb, 0xff, 0xf8, 0x14, 0xef, 0x80, 0xf6, 0xf4, 0xd1, 0xa4, 0xe3, 0x0d, + 0x76, 0x51, 0xa7, 0x2c, 0xa5, 0xe6, 0x80, 0xe5, 0xb6, 0x26, 0x94, 0x6b, + 0x21, 0xa9, 0xb6, 0x37, 0x53, 0x1b, 0x94, 0xa9, 0x08, 0xd2, 0xef, 0x8a, + 0x8d, 0xb5, 0xa3, 0x1d, 0x12, 0xc9, 0xae, 0x3b, 0x6e, 0x09, 0x5c, 0x1f, + 0x70, 0x62, 0x5c, 0x7c, 0x4b, 0x47, 0x6f, 0x24, 0xff, 0xac, 0x1e, 0xa3, + 0x89, 0x34, 0x4d, 0x7b, 0x6b, 0xba, 0x10, 0xf3, 0x87, 0x55, 0xef, 0xf2, + 0xd9, 0x3f, 0xa7, 0xdc, 0xaf, 0x6b, 0x7b, 0xbd, 0x64, 0x3f, 0x6b, 0xe0, + 0x40, 0xb1, 0xff, 0xc2, 0x18, 0x33, 0xa0, 0x4a, 0xe5, 0x81, 0xb5, 0xfd, + 0x84, 0x69, 0x84, 0xd4, 0x49, 0x88, 0xfb, 0xf9, 0xb4, 0x44, 0xbc, 0x86, + 0xba, 0x67, 0x74, 0xe3, 0x41, 0xe6, 0xf0, 0x2a, 0x28, 0xff, 0xf1, 0x1e, + 0xbb, 0x4a, 0x39, 0x76, 0xd9, 0xb3, 0x4c, 0x4b, 0xec, 0x85, 0x9d, 0x8e, + 0xba, 0xe4, 0x7e, 0xcd, 0x0a, 0xc5, 0x3e, 0x74, 0xdb, 0x47, 0x2c, 0xf8, + 0xdf, 0x47, 0xf2, 0x1e, 0xee, 0xb9, 0x4c, 0x38, 0xf2, 0xf7, 0xee, 0x43, + 0x17, 0xce, 0xba, 0xc6, 0x74, 0xdf, 0xe8, 0xc4, 0xd7, 0x22, 0x6d, 0x53, + 0xb4, 0xed, 0x3b, 0xc8, 0xc7, 0x9b, 0xaa, 0xf1, 0x9c, 0x10, 0x5b, 0xfe, + 0x17, 0xd0, 0x69, 0x4e, 0xa6, 0x2a, 0xcf, 0xfc, 0xab, 0xac, 0x76, 0x7d, + 0xff, 0xfc, 0x0a, 0x77, 0xc0, 0x7b, 0x7a, 0x68, 0xd2, 0x71, 0x86, 0xbb, + 0x28, 0xd3, 0x96, 0x52, 0xf3, 0x40, 0x72, 0xdb, 0x13, 0x4a, 0x35, 0x90, + 0xd4, 0xdb, 0x1b, 0xa9, 0x8d, 0xca, 0x54, 0x84, 0x69, 0x77, 0xc5, 0x46, + 0xda, 0xd1, 0x8e, 0x89, 0x64, 0xd7, 0x1d, 0xb7, 0x04, 0xae, 0x0f, 0xb8, + 0x31, 0x2e, 0x3e, 0x25, 0xa3, 0xb7, 0x92, 0x7f, 0xd6, 0x0f, 0x51, 0xc4, + 0x9a, 0x26, 0xbd, 0xb5, 0xdd, 0x08, 0x79, 0xc3, 0xaa, 0xf7, 0xf9, 0x6c, + 0x9f, 0xd3, 0xee, 0x57, 0xb5, 0xbd, 0xde, 0xb2, 0x1f, 0xb5, 0xf0, 0x20, + 0x58, 0xff, 0xe1, 0x0c, 0x19, 0xd0, 0x25, 0x72, 0xc0, 0xda, 0xfe, 0xc2, + 0x34, 0xc2, 0x6a, 0x24, 0xc4, 0x7d, 0xfc, 0xda, 0x22, 0x5e, 0x43, 0x5d, + 0x33, 0xba, 0x71, 0xa0, 0xf3, 0x78, 0x15, 0x14, 0x7f, 0xf8, 0x8f, 0x5d, + 0xa5, 0x1c, 0xbb, 0x6c, 0xd9, 0xa6, 0x25, 0xf6, 0x42, 0xce, 0xc7, 0x5d, + 0x72, 0x3f, 0x66, 0x85, 0x62, 0x9f, 0x3a, 0x6d, 0xa3, 0x96, 0x7c, 0x6f, + 0xa3, 0xf9, 0x0f, 0x77, 0x5c, 0xa6, 0x1c, 0x79, 0x7b, 0xf7, 0x21, 0x8b, + 0xe7, 0x5d, 0x63, 0x3a, 0x6f, 0xf4, 0x62, 0x6b, 0x91, 0x36, 0xa9, 0xda, + 0x76, 0x9d, 0xe4, 0x63, 0xcd, 0xd5, 0x78, 0xce, 0x08, 0x2d, 0xff, 0x0b, + 0xe8, 0x34, 0xa7, 0x53, 0x15, 0x67, 0xfe, 0x55, 0xd6, 0x3b, 0x3e, 0xff, + 0xfe, 0x05, 0x3b, 0xe0, 0x3d, 0xbd, 0x34, 0x69, 0x38, 0xc3, 0x5d, 0x94, + 0x69, 0xcb, 0x29, 0x79, 0xa0, 0x39, 0x6d, 0x89, 0xa5, 0x1a, 0xc8, 0x6a, + 0x6d, 0x8d, 0xd4, 0xc6, 0xe5, 0x2a, 0x42, 0x34, 0xbb, 0xe2, 0xa3, 0x6d, + 0x68, 0xc7, 0x44, 0xb2, 0x6b, 0x8e, 0xdb, 0x82, 0x57, 0x07, 0xdc, 0x18, + 0x97, 0x1f, 0x12, 0xd1, 0xdb, 0xc9, 0x3f, 0xeb, 0x07, 0xa8, 0xe2, 0x4d, + 0x13, 0x5e, 0xda, 0xee, 0x84, 0x3c, 0xe1, 0xd5, 0x7b, 0xfc, 0xb6, 0x4f, + 0xe9, 0xf7, 0x2b, 0xda, 0xde, 0xef, 0x59, 0x0f, 0xda, 0xf8, 0x10, 0x2c, + 0x7f, 0xf0, 0x86, 0x0c, 0xe8, 0x12, 0xb9, 0x60, 0x6d, 0x7f, 0x61, 0x1a, + 0x61, 0x35, 0x12, 0x62, 0x3e, 0xfe, 0x6d, 0x11, 0x2f, 0x21, 0xae, 0x99, + 0xdd, 0x38, 0xd0, 0x79, 0xbc, 0x0a, 0x8a, 0x3f, 0xfc, 0x47, 0xae, 0xd2, + 0x8e, 0x5d, 0xb6, 0x6c, 0xd3, 0x12, 0xfb, 0x21, 0x67, 0x63, 0xae, 0xb9, + 0x1f, 0xb3, 0x42, 0xb1, 0x4f, 0x9d, 0x36, 0xd1, 0xcb, 0x3e, 0x37, 0xd1, + 0xfc, 0x87, 0xbb, 0xae, 0x53, 0x0e, 0x3c, 0xbd, 0xfb, 0x90, 0xc5, 0xf3, + 0xae, 0xb1, 0x9d, 0x37, 0xfa, 0x31, 0x35, 0xc8, 0x9b, 0x54, 0xed, 
0x3b, + 0x4e, 0xf2, 0x31, 0xe6, 0xea, 0xbc, 0x67, 0x04, 0x16, 0xff, 0x85, 0xf4, + 0x1a, 0x53, 0xa9, 0x8a, 0xb3, 0xff, 0x2a, 0xeb, 0x1d, 0x9f, 0x7f, 0xff, + 0x08, +}; +static_assert(sizeof(kBytesTestReadSymbol11) == kNumBytesTestReadSymbol11, ""); + +// The kBytesTestReadSymbol12[] array was encoded by using the following libaom +// code: +// +// aom_cdf_prob cdf[4][13] = { +// // pmf: 1/12, 1/12, 1/12, 1/12, 1/12, 1/12, 1/12, 1/12, 1/12, 1/12, 1/12, +// // 1/12 +// { 32768 - 2731, 32768 - 5461, 32768 - 8192, 32768 - 10923, 32768 - 13653, +// 32768 - 16384, 32768 - 19115, 32768 - 21845, 32768 - 24576, +// 32768 - 27307, 32768 - 30037, 0, 0 }, +// // pmf: 3/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, +// // 1/24 +// { 32768 - 4096, 32768 - 6827, 32768 - 9557, 32768 - 12288, 32768 - 15019, +// 32768 - 17749, 32768 - 20480, 32768 - 23211, 32768 - 25941, +// 32768 - 28672, 32768 - 31403, 0, 0 }, +// // pmf: 1/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, 2/24, +// // 3/24 +// { 32768 - 1365, 32768 - 4096, 32768 - 6827, 32768 - 9557, 32768 - 12288, +// 32768 - 15019, 32768 - 17749, 32768 - 20480, 32768 - 23211, +// 32768 - 25941, 32768 - 28672, 0, 0 }, +// // pmf: 1/24, 2/24, 2/24, 2/24, 2/24, 3/24, 3/24, 2/24, 2/24, 2/24, 2/24, +// // 1/24 +// { 32768 - 1365, 32768 - 4096, 32768 - 6827, 32768 - 9557, 32768 - 12288, +// 32768 - 16384, 32768 - 20480, 32768 - 23211, 32768 - 25941, +// 32768 - 28672, 32768 - 31403, 0, 0 }, +// }; +// constexpr int kSymbols[24][4] = { { 0, 6, 11, 5 }, // +// { 1, 7, 10, 4 }, // +// { 2, 8, 9, 3 }, // +// { 3, 9, 8, 2 }, // +// { 4, 10, 7, 1 }, // +// { 5, 11, 6, 0 }, // +// { 6, 0, 5, 11 }, // +// { 7, 1, 4, 10 }, // +// { 8, 2, 3, 9 }, // +// { 9, 3, 2, 8 }, // +// { 10, 4, 1, 7 }, // +// { 11, 5, 0, 6 }, // +// { 0, 0, 11, 9 }, // +// { 2, 1, 10, 7 }, // +// { 4, 3, 8, 5 }, // +// { 6, 5, 6, 3 }, // +// { 8, 7, 4, 1 }, // +// { 10, 9, 2, 10 }, // +// { 1, 0, 11, 8 }, // +// { 3, 2, 9, 6 }, // +// { 5, 4, 7, 4 }, // +// { 7, 6, 5, 2 }, // +// { 9, 8, 3, 6 }, // +// { 11, 10, 1, 5 } }; +// const unsigned int kBufferSize = 65536; +// uint8_t bw_buffer[kBufferSize]; +// aom_writer bw; +// bw.allow_update_cdf = 1; +// aom_start_encode(&bw, bw_buffer); +// for (int i = 0; i < 80; ++i) { +// for (int j = 0; j < 24; ++j) { +// for (int k = 0; k < 4; ++k) { +// aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 12); +// } +// } +// } +// aom_stop_encode(&bw); +// printf("constexpr size_t kNumBytes = %u;\n", bw.pos); +// printf("constexpr uint8_t kBytes[] = {"); +// int count = 0; +// for (unsigned int i = 0; i < bw.pos; ++i) { +// if (count++ % 12 == 0) { +// printf("\n "); +// } else { +// printf(" "); +// } +// printf("0x%02x,", bw_buffer[i]); +// } +// printf("\n};\n"); + +constexpr size_t kNumBytesTestReadSymbol12 = 3473; +constexpr uint8_t kBytesTestReadSymbol12[] = { + 0x0d, 0x17, 0xf5, 0xbd, 0x05, 0xd0, 0x9c, 0x5d, 0x10, 0xc5, 0x9e, 0xc4, + 0x9f, 0xc6, 0xf4, 0x7d, 0xce, 0x67, 0x97, 0x49, 0xd1, 0x05, 0x54, 0xab, + 0xda, 0x22, 0x5b, 0xbc, 0x9c, 0x11, 0xc8, 0x0b, 0xe9, 0x6d, 0xb1, 0x8a, + 0x17, 0x06, 0x92, 0xed, 0xd4, 0x61, 0x48, 0x01, 0x64, 0x43, 0x65, 0x65, + 0xfc, 0x35, 0x9d, 0xbb, 0x68, 0x3f, 0x77, 0xbc, 0x8d, 0xd9, 0x3b, 0x48, + 0x77, 0x58, 0x2f, 0x19, 0xfa, 0x73, 0xa6, 0xc3, 0x65, 0x96, 0x6c, 0x9d, + 0x99, 0xb8, 0x65, 0x2b, 0x94, 0x11, 0x21, 0xf4, 0x95, 0xa4, 0xcd, 0xf2, + 0xbf, 0x65, 0x79, 0x34, 0x4b, 0xf6, 0x5c, 0xeb, 0xca, 0x07, 0x65, 0x4f, + 0xae, 0x67, 0xd8, 0xdf, 0xec, 0xc9, 0xd2, 0x26, 0x2e, 0xac, 0xea, 0xa2, + 0xbd, 0x0d, 0x79, 
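+// (Sketch, for illustration; not part of the generator above: the uniform-pmf
+// row of the cdf table can be reproduced with libaom's inverted-CDF
+// convention, AOM_ICDF(x) == 32768 - x. Each entry is
+// 32768 - round(i * 32768 / 12), which yields exactly 2731, 5461, 8192, ...,
+// 30037 as listed.)
+//
+// for (int i = 1; i <= 12; ++i) {
+//   // i == 12 gives 32768 - 32768 == 0, the terminating entry.
+//   cdf[0][i - 1] = 32768 - (aom_cdf_prob)(i * 32768.0 / 12 + 0.5);
+// }
+// cdf[0][12] = 0;  // adaptation counter, used when allow_update_cdf == 1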
0x27, 0x91, 0xf5, 0x84, 0x89, 0xf9, 0x2a, 0xb3, 0x5e, + 0x48, 0x4b, 0x2b, 0x89, 0xc0, 0xa5, 0x9f, 0x94, 0x07, 0x82, 0x36, 0x11, + 0x65, 0x4d, 0xb0, 0xde, 0xac, 0xde, 0xac, 0xc0, 0x35, 0x7f, 0xf3, 0x9b, + 0x01, 0x0c, 0x35, 0x8b, 0xb5, 0x22, 0xb8, 0xea, 0x1c, 0xab, 0xbe, 0x08, + 0xd9, 0x23, 0x0a, 0x37, 0x95, 0x36, 0x3d, 0x28, 0xb3, 0x19, 0x34, 0x3a, + 0x47, 0xf8, 0x45, 0x33, 0x7a, 0x65, 0xae, 0x80, 0x48, 0x01, 0x20, 0xe8, + 0xcd, 0xb7, 0xce, 0xf7, 0xee, 0xd1, 0x50, 0x39, 0xec, 0xa6, 0x8b, 0xa0, + 0xb5, 0x56, 0x76, 0x1a, 0xb4, 0x6b, 0x31, 0xcf, 0x32, 0x0f, 0xb1, 0xba, + 0xb3, 0xa4, 0xb7, 0x34, 0xfe, 0x86, 0x87, 0xa7, 0x44, 0x70, 0x3b, 0x9e, + 0x94, 0xc5, 0x43, 0x82, 0xf1, 0x1a, 0xa1, 0x10, 0x05, 0x7c, 0x04, 0x63, + 0x5a, 0xfe, 0xc2, 0xb6, 0x15, 0x07, 0x3f, 0xb0, 0x3c, 0x43, 0x74, 0x33, + 0xec, 0xb8, 0xe0, 0xf5, 0x79, 0x48, 0x7c, 0x50, 0x4f, 0x4b, 0xb9, 0x08, + 0x33, 0xfd, 0x54, 0xd5, 0x6f, 0xdf, 0xca, 0xfe, 0x38, 0xa1, 0xeb, 0xa9, + 0xaf, 0xa5, 0x8f, 0xcf, 0xb3, 0xda, 0x77, 0x3f, 0x63, 0xcb, 0x98, 0x2b, + 0x71, 0x56, 0x60, 0xb4, 0x5c, 0x7d, 0x81, 0x85, 0xf3, 0x64, 0x9f, 0xf3, + 0xc2, 0xec, 0x2a, 0x27, 0x9b, 0x5e, 0x39, 0x30, 0x10, 0x0d, 0x43, 0xdb, + 0x9f, 0x7b, 0x8f, 0xb8, 0x09, 0xe2, 0x55, 0xb3, 0xc4, 0xb1, 0xeb, 0x23, + 0xcd, 0x32, 0xde, 0x58, 0xc2, 0x35, 0xda, 0x5c, 0x9a, 0xf8, 0x2d, 0xc6, + 0x19, 0x46, 0x64, 0x66, 0x5a, 0xdb, 0x53, 0xc8, 0x14, 0x41, 0xcc, 0x0c, + 0x3f, 0xff, 0x3e, 0xbe, 0x29, 0xba, 0x5f, 0x68, 0xa9, 0x31, 0x39, 0x79, + 0x2a, 0xfe, 0x14, 0x92, 0x8f, 0x2b, 0x31, 0xf1, 0x0a, 0x25, 0xd8, 0x22, + 0xe1, 0xc7, 0xcd, 0xda, 0xea, 0x88, 0xfa, 0x6a, 0xb0, 0x69, 0x77, 0xf6, + 0xd6, 0x46, 0xb9, 0xe6, 0x53, 0x09, 0x48, 0x65, 0xbd, 0xe6, 0xf8, 0xc0, + 0x04, 0x71, 0x26, 0x21, 0xe8, 0xf9, 0xc1, 0x71, 0x73, 0x6b, 0x3d, 0x73, + 0x16, 0x66, 0x38, 0xae, 0x59, 0xb9, 0xe3, 0x34, 0x8f, 0x17, 0x3c, 0x16, + 0xaa, 0x3f, 0x61, 0x49, 0xb3, 0x06, 0xcc, 0xb3, 0xcb, 0x7e, 0x42, 0xf1, + 0x2a, 0x0e, 0xb2, 0xcb, 0x1d, 0xf0, 0x0f, 0xc9, 0x20, 0xb1, 0x80, 0xce, + 0x08, 0xb9, 0xfa, 0xca, 0x3c, 0xd5, 0x67, 0x47, 0x36, 0x17, 0xc1, 0xf7, + 0x9d, 0x97, 0x79, 0x75, 0xee, 0xb0, 0xed, 0xfc, 0xd0, 0xdf, 0xc8, 0xa2, + 0xc1, 0xae, 0x51, 0x53, 0x88, 0x05, 0x95, 0x73, 0x7e, 0xd9, 0x3b, 0x9d, + 0xb0, 0x08, 0x37, 0xff, 0x51, 0x6f, 0xf9, 0xad, 0x60, 0xa5, 0x3a, 0xd6, + 0xba, 0xea, 0xf6, 0xea, 0x91, 0x2e, 0x5a, 0xa9, 0xbf, 0xe2, 0x52, 0x46, + 0x0c, 0xbd, 0x28, 0x2d, 0xa8, 0x5f, 0xc8, 0x41, 0x31, 0x53, 0x7a, 0x9f, + 0xfa, 0x73, 0x06, 0xc5, 0xae, 0x59, 0x8d, 0xe3, 0x0d, 0xfa, 0x99, 0x7f, + 0xee, 0xe4, 0x82, 0xd4, 0x36, 0x68, 0x09, 0x92, 0x09, 0xef, 0x70, 0x89, + 0xc6, 0xfa, 0xc7, 0x7e, 0x0f, 0x24, 0x8e, 0xad, 0x4e, 0xd9, 0x4c, 0x11, + 0xe7, 0x7d, 0x98, 0xf0, 0x80, 0x42, 0x0b, 0x86, 0x8d, 0x8e, 0x85, 0x97, + 0xd2, 0x11, 0x0f, 0x04, 0x59, 0xaf, 0xa5, 0xec, 0xda, 0x75, 0x64, 0x51, + 0x22, 0x7e, 0x38, 0x4b, 0xca, 0x9e, 0x82, 0x71, 0x72, 0x8d, 0x4c, 0xca, + 0xe1, 0x77, 0xe5, 0xe0, 0x9d, 0x64, 0x01, 0x48, 0x49, 0xcd, 0x3b, 0x90, + 0xd8, 0x9e, 0x15, 0x22, 0x76, 0xe0, 0x57, 0x06, 0x06, 0xaf, 0x2c, 0x09, + 0xce, 0x4c, 0xfa, 0x8b, 0xbf, 0xa1, 0x1b, 0xe3, 0xe7, 0xa5, 0xa0, 0xc0, + 0xc8, 0x4c, 0x79, 0x1b, 0xeb, 0x5d, 0xb8, 0x3b, 0x1c, 0x3f, 0xbc, 0x11, + 0x8f, 0xa0, 0x08, 0x2b, 0xd3, 0xe3, 0xca, 0xbc, 0x41, 0xc2, 0xa4, 0x4e, + 0xdc, 0x0a, 0xe1, 0x06, 0xef, 0x55, 0x13, 0xb3, 0xdd, 0xfd, 0xe2, 0x89, + 0x5f, 0xb5, 0xf6, 0xa9, 0xd7, 0xae, 0xc1, 0x14, 0xb6, 0x19, 0xd8, 0x5b, + 0x0f, 0x9a, 0xb0, 0xed, 0xc5, 0xc7, 0xa8, 0xa6, 0x08, 0x5a, 0x00, 0xad, + 0xf5, 0x9c, 0xb9, 0xd9, 0x45, 0x46, 0xf0, 0x9e, 0x2d, 0x55, 0xc6, 0x08, + 0x60, 0x0d, 0x9e, 
0xa7, 0x68, 0xb6, 0xf7, 0xf3, 0xa9, 0x84, 0x7e, 0x63, + 0xe8, 0x48, 0x03, 0x1c, 0x15, 0x97, 0x94, 0xda, 0x04, 0xb2, 0xd0, 0x09, + 0xa5, 0x62, 0x21, 0x70, 0x88, 0x9f, 0xf5, 0x0c, 0x91, 0x0d, 0xbf, 0x69, + 0xe1, 0x6b, 0x4f, 0xc2, 0xf2, 0x32, 0xe1, 0x4b, 0xad, 0x58, 0xea, 0x0c, + 0x07, 0x13, 0x4a, 0x1b, 0x87, 0x6d, 0x6e, 0x2f, 0xb6, 0xc6, 0x30, 0x1e, + 0x2d, 0x1d, 0x5c, 0xdf, 0xd2, 0x5a, 0x88, 0xc8, 0x1c, 0xd9, 0xc3, 0x91, + 0x04, 0x45, 0x63, 0x11, 0x44, 0x35, 0x7f, 0x46, 0xf4, 0xd0, 0xd1, 0x73, + 0x9c, 0xae, 0x85, 0x5e, 0xda, 0xc7, 0xce, 0xb5, 0xbb, 0x3a, 0xb4, 0x67, + 0xa5, 0xad, 0xc6, 0x5e, 0x12, 0xc7, 0xc5, 0x72, 0xfc, 0x35, 0x2e, 0xae, + 0x46, 0x81, 0x22, 0x56, 0x6d, 0xc9, 0x36, 0x43, 0x17, 0x6b, 0x4d, 0x81, + 0xd6, 0x59, 0x35, 0x90, 0x3a, 0xd2, 0xde, 0x79, 0xbd, 0x21, 0xc4, 0x56, + 0xcb, 0x59, 0x3b, 0xe7, 0xb3, 0xab, 0x92, 0xce, 0x65, 0xc7, 0x20, 0xde, + 0xde, 0xb1, 0x94, 0xac, 0x1a, 0x23, 0xa4, 0x14, 0x56, 0x32, 0xc0, 0x9f, + 0x48, 0x31, 0xa6, 0x95, 0xc4, 0xb8, 0xf3, 0x9c, 0x8d, 0x34, 0x03, 0xc3, + 0x62, 0x63, 0x38, 0x15, 0x71, 0x08, 0x5e, 0x1b, 0xc0, 0xf2, 0x54, 0x13, + 0x66, 0x01, 0xf1, 0x38, 0xd9, 0x61, 0xf3, 0xdb, 0xd4, 0x83, 0x98, 0x3e, + 0xaa, 0xe1, 0xca, 0x2d, 0xfb, 0x6d, 0x02, 0xac, 0xf2, 0xa6, 0x04, 0x09, + 0xeb, 0xcb, 0xaf, 0xd5, 0x9d, 0x3d, 0xd7, 0xc2, 0xc1, 0x6f, 0xec, 0x53, + 0x65, 0x0e, 0x40, 0x77, 0x03, 0xcd, 0x79, 0x0a, 0x94, 0x27, 0x6b, 0x6f, + 0x32, 0xb3, 0xdb, 0x3e, 0x38, 0xe2, 0xd2, 0xca, 0x9b, 0x9e, 0x24, 0xc7, + 0x35, 0xfd, 0xc1, 0x86, 0x78, 0xd9, 0xc3, 0xfe, 0x03, 0xb3, 0x3f, 0xc1, + 0xf8, 0x09, 0x89, 0xdc, 0x3b, 0x08, 0xae, 0x85, 0xfa, 0x8e, 0x51, 0xbb, + 0x6f, 0xf4, 0x73, 0x43, 0xd2, 0xed, 0x6d, 0xfd, 0x2b, 0x23, 0xc3, 0x4f, + 0xc4, 0x1d, 0x25, 0xb9, 0x36, 0xc4, 0x98, 0xe6, 0xbf, 0xb8, 0x30, 0xcf, + 0x1b, 0x38, 0x7f, 0xc0, 0x76, 0x67, 0xf8, 0x3f, 0x01, 0x31, 0x3b, 0x87, + 0x60, 0xf9, 0x90, 0x01, 0x2c, 0x2f, 0xff, 0x6d, 0xfc, 0x8c, 0x3e, 0xeb, + 0x7f, 0x96, 0x41, 0x82, 0xfd, 0xc6, 0x93, 0x8d, 0xfa, 0x4e, 0x48, 0x49, + 0x33, 0x3a, 0xa3, 0x5e, 0x61, 0xdf, 0x88, 0x73, 0x66, 0x04, 0xf5, 0xe5, + 0xd7, 0xea, 0xce, 0x9e, 0xeb, 0xe1, 0x60, 0xb7, 0xf1, 0xcc, 0x0d, 0xc1, + 0xc4, 0xa0, 0x22, 0x0d, 0xe5, 0x8c, 0x8e, 0x26, 0xf9, 0x89, 0xa5, 0x02, + 0xf6, 0x4c, 0x3f, 0x10, 0x74, 0x96, 0xe4, 0xdb, 0x12, 0x63, 0x9a, 0xfe, + 0x70, 0x4e, 0x9a, 0x97, 0xc8, 0xad, 0x5f, 0x39, 0xa0, 0x81, 0x6a, 0xc4, + 0x93, 0x50, 0x94, 0x1e, 0x17, 0xe3, 0x3f, 0x6d, 0x91, 0x01, 0xed, 0x49, + 0x96, 0xed, 0x01, 0xc2, 0x2a, 0xe1, 0xc9, 0x39, 0x76, 0x1f, 0x87, 0xb6, + 0xe3, 0x76, 0xa1, 0xc6, 0x58, 0x1e, 0xdd, 0x2a, 0xdf, 0xbf, 0x82, 0xa3, + 0x6d, 0x87, 0x72, 0x2c, 0x7c, 0xdc, 0x3f, 0x2b, 0x6a, 0xf1, 0x9a, 0xe0, + 0x0e, 0xc3, 0xdc, 0x18, 0x3f, 0xc4, 0xbe, 0x11, 0x76, 0x54, 0xab, 0xe3, + 0xd6, 0x47, 0x90, 0x61, 0x87, 0x66, 0x08, 0x63, 0x95, 0x25, 0x20, 0x43, + 0x6e, 0x05, 0x80, 0xad, 0x01, 0x10, 0xc7, 0x6c, 0x04, 0xbe, 0xaf, 0xc5, + 0x50, 0xa7, 0x48, 0x4a, 0x47, 0x44, 0x71, 0xc9, 0xa5, 0xdb, 0xa2, 0x2b, + 0x12, 0xbc, 0x40, 0x39, 0x31, 0x69, 0x83, 0x03, 0xb9, 0xa0, 0x46, 0xf0, + 0xb4, 0x4b, 0x1b, 0x8d, 0xda, 0x87, 0x19, 0x60, 0x7b, 0x74, 0xab, 0x7e, + 0xfe, 0x0a, 0x8d, 0xb6, 0x1d, 0xc8, 0xb1, 0xf3, 0x70, 0xfc, 0xad, 0xab, + 0xc6, 0x6b, 0x80, 0xc8, 0xbb, 0x74, 0x45, 0x62, 0x57, 0x88, 0x07, 0x26, + 0x2d, 0x30, 0x60, 0x77, 0x34, 0x08, 0xde, 0x16, 0x89, 0x63, 0x71, 0xbb, + 0x50, 0xe3, 0x2c, 0x0f, 0x6e, 0x95, 0x6f, 0xe0, 0xad, 0x52, 0x17, 0x52, + 0x53, 0x83, 0x53, 0xf6, 0x9e, 0x15, 0xb5, 0x78, 0xcd, 0x70, 0x19, 0x17, + 0x6e, 0x88, 0xac, 0x4a, 0xf1, 0x00, 0xe4, 0xc5, 0xa6, 0x0c, 0x0e, 0xe6, + 0x81, 0x1b, 0xc2, 
0xd1, 0x2c, 0x6e, 0x37, 0x6a, 0x1c, 0x65, 0x81, 0xed, + 0xd2, 0xad, 0xfc, 0x15, 0xaa, 0x42, 0xea, 0x4a, 0x70, 0x6a, 0x7e, 0xd3, + 0xc2, 0xb6, 0xaf, 0x19, 0xae, 0x03, 0x22, 0xed, 0xd1, 0x15, 0x89, 0x5e, + 0x20, 0x1c, 0x98, 0xb4, 0xc1, 0x81, 0xdc, 0xd0, 0x23, 0x78, 0x5a, 0x25, + 0x8d, 0xc6, 0xed, 0x43, 0x8c, 0xb0, 0x3d, 0xba, 0x55, 0xbf, 0x82, 0xb5, + 0x48, 0x5d, 0x49, 0x4e, 0x0d, 0x4f, 0xda, 0x78, 0x56, 0xd5, 0xe3, 0x35, + 0xc0, 0x64, 0x5d, 0xba, 0x22, 0xb1, 0x2b, 0xc4, 0x03, 0x93, 0x16, 0x98, + 0x30, 0x3b, 0x9a, 0x04, 0x6f, 0x0b, 0x44, 0xb1, 0xb8, 0xdd, 0xa8, 0x71, + 0x96, 0x07, 0xb7, 0x4a, 0xb7, 0xf0, 0x56, 0xa9, 0x0b, 0xa9, 0x29, 0xc1, + 0xa9, 0xfb, 0x4f, 0x0a, 0xda, 0xbc, 0x66, 0xb8, 0x0c, 0x8b, 0xb7, 0x44, + 0x56, 0x25, 0x78, 0x80, 0x72, 0x62, 0xd3, 0x06, 0x07, 0x73, 0x40, 0x8d, + 0xe1, 0x68, 0x96, 0x37, 0x1b, 0xb5, 0x0e, 0x32, 0xc0, 0xf6, 0xe9, 0x56, + 0xfe, 0x0a, 0xd5, 0x21, 0x75, 0x25, 0x38, 0x35, 0x3f, 0x69, 0xe1, 0x5b, + 0x57, 0x8c, 0xd7, 0x01, 0x91, 0x76, 0xe8, 0x8a, 0xc4, 0xaf, 0x10, 0x0e, + 0x4c, 0x5a, 0x60, 0xc0, 0xee, 0x68, 0x11, 0xbc, 0x2d, 0x12, 0xc6, 0xe3, + 0x76, 0xa1, 0xc6, 0x58, 0x1e, 0xdd, 0x2a, 0xdf, 0xc1, 0x5a, 0xa4, 0x2e, + 0xa4, 0xa7, 0x06, 0xa7, 0xed, 0x3c, 0x2b, 0x6a, 0xf1, 0x9a, 0xe0, 0x32, + 0x2e, 0xdd, 0x11, 0x58, 0x95, 0xe2, 0x01, 0xc9, 0x8b, 0x4c, 0x18, 0x1d, + 0xcd, 0x02, 0x37, 0x85, 0xa2, 0x58, 0xdc, 0x6e, 0xd4, 0x38, 0xcb, 0x03, + 0xdb, 0xa5, 0x5b, 0xf8, 0x2b, 0x54, 0x85, 0xd4, 0x94, 0xe0, 0xd4, 0xfd, + 0xa7, 0x85, 0x6d, 0x5e, 0x33, 0x5c, 0x06, 0x45, 0xdb, 0xa2, 0x2b, 0x12, + 0xbc, 0x40, 0x39, 0x31, 0x69, 0x83, 0x03, 0xb9, 0xa0, 0x46, 0xf0, 0xb4, + 0x4b, 0x1b, 0x8d, 0xda, 0x87, 0x19, 0x60, 0x7b, 0x74, 0xab, 0x7f, 0x05, + 0x6a, 0x90, 0xba, 0x92, 0x6a, 0x83, 0x68, 0x3d, 0x27, 0xd3, 0x43, 0x45, + 0xee, 0xc5, 0xfd, 0xe1, 0xb0, 0x60, 0xff, 0x12, 0xf8, 0x45, 0xd9, 0x47, + 0x09, 0x7b, 0x5c, 0x67, 0x66, 0x36, 0x0f, 0xc3, 0xdb, 0x71, 0xbb, 0x50, + 0xe3, 0x2c, 0x0f, 0x6e, 0x95, 0x6f, 0xe0, 0xad, 0x52, 0x17, 0x52, 0x4d, + 0x50, 0x6d, 0x07, 0xa4, 0xfa, 0x68, 0x68, 0xbd, 0xd8, 0xbf, 0xbc, 0x36, + 0x0c, 0x1f, 0xe2, 0x5f, 0x08, 0xbb, 0x28, 0xe1, 0x2f, 0x6b, 0x8c, 0xec, + 0xc6, 0xc1, 0xf8, 0x7b, 0x6e, 0x37, 0x6a, 0x1c, 0x65, 0x81, 0xed, 0xd2, + 0xad, 0xfc, 0x15, 0xaa, 0x42, 0xea, 0x49, 0xaa, 0x0d, 0xa0, 0xf4, 0x9f, + 0x4d, 0x0d, 0x17, 0xbb, 0x17, 0xf7, 0x86, 0xc1, 0x83, 0xfc, 0x4b, 0xe1, + 0x17, 0x65, 0x1c, 0x25, 0xed, 0x71, 0x9d, 0x98, 0xd8, 0x3f, 0x0f, 0x6d, + 0xc6, 0xed, 0x43, 0x8c, 0xb0, 0x3d, 0xba, 0x55, 0xbf, 0x82, 0xb5, 0x48, + 0x5d, 0x49, 0x35, 0x41, 0xb4, 0x1e, 0x93, 0xe9, 0xa1, 0xa2, 0xf7, 0x62, + 0xfe, 0xf0, 0xd8, 0x30, 0x7f, 0x89, 0x7c, 0x22, 0xec, 0xa3, 0x84, 0xbd, + 0xae, 0x33, 0xb3, 0x1b, 0x07, 0xe1, 0xed, 0xb8, 0xdd, 0xa8, 0x71, 0x96, + 0x07, 0xb7, 0x4a, 0xb7, 0xf0, 0x56, 0xa9, 0x0b, 0xa9, 0x26, 0xa8, 0x36, + 0x83, 0xd2, 0x7d, 0x34, 0x34, 0x5e, 0xec, 0x5f, 0xde, 0x1b, 0x06, 0x0f, + 0xf1, 0x2f, 0x84, 0x5d, 0x94, 0x70, 0x97, 0xb5, 0xc6, 0x76, 0x63, 0x60, + 0xfc, 0x3d, 0xb7, 0x1b, 0xb5, 0x0e, 0x32, 0xc0, 0xf6, 0xe9, 0x56, 0xfe, + 0x0a, 0xd5, 0x21, 0x75, 0x24, 0xd5, 0x06, 0xd0, 0x7a, 0x4f, 0xa6, 0x86, + 0x8b, 0xdd, 0x8b, 0xfb, 0xc3, 0x60, 0xc1, 0xfe, 0x25, 0xf0, 0x8b, 0xb2, + 0x8e, 0x12, 0xf6, 0xb8, 0xce, 0xcc, 0x6c, 0x1f, 0x87, 0xb6, 0xe3, 0x76, + 0xa1, 0xc6, 0x58, 0x1e, 0xdd, 0x2a, 0xdf, 0xc1, 0x5a, 0xa4, 0x2e, 0xa4, + 0x9a, 0xa0, 0xda, 0x0f, 0x49, 0xf4, 0xd0, 0xd1, 0x7b, 0xb1, 0x7f, 0x78, + 0x6c, 0x18, 0x3f, 0xc4, 0xbe, 0x11, 0x76, 0x51, 0xc2, 0x5e, 0xd7, 0x19, + 0xd9, 0x8d, 0x83, 0xf0, 0xf6, 0xdc, 0x6e, 0xd4, 0x38, 0xcb, 0x03, 0xdb, + 0xa5, 0x5b, 0xf8, 
0x2b, 0x54, 0x85, 0xd4, 0x93, 0x54, 0x1b, 0x41, 0xe9, + 0x3e, 0x9a, 0x1a, 0x2f, 0x76, 0x2f, 0xef, 0x0d, 0x83, 0x07, 0xf8, 0x97, + 0xc2, 0x2e, 0xca, 0x38, 0x4b, 0xda, 0xe3, 0x3b, 0x31, 0xb0, 0x7e, 0x1e, + 0xdb, 0x8d, 0xda, 0x87, 0x19, 0x60, 0x7b, 0x74, 0xab, 0x7f, 0x05, 0x6a, + 0x90, 0xba, 0x92, 0x6a, 0x83, 0x68, 0x3d, 0x27, 0xd3, 0x43, 0x45, 0xee, + 0xc5, 0xfd, 0xe1, 0xb0, 0x60, 0xff, 0x12, 0xf8, 0x45, 0xd9, 0x47, 0x09, + 0x7b, 0x5c, 0x67, 0x66, 0x36, 0x0f, 0xc3, 0xdb, 0x71, 0xbb, 0x50, 0xe3, + 0x2c, 0x0f, 0x6e, 0x95, 0x6f, 0xe0, 0xad, 0x52, 0x17, 0x52, 0x4d, 0x50, + 0x6d, 0x07, 0xa4, 0xfa, 0x68, 0x68, 0xbd, 0xd8, 0xbf, 0xbc, 0x36, 0x0c, + 0x1f, 0xe2, 0x5f, 0x08, 0xbb, 0x28, 0xe1, 0x2f, 0x6b, 0x8c, 0xec, 0xc6, + 0xc1, 0xf8, 0x7b, 0x6e, 0x37, 0x6a, 0x1c, 0x65, 0x81, 0xed, 0xd2, 0xad, + 0xfc, 0x15, 0xaa, 0x42, 0xea, 0x49, 0xaa, 0x0d, 0xa0, 0xf4, 0x9f, 0x4d, + 0x0d, 0x17, 0xbb, 0x17, 0xf7, 0x86, 0xc1, 0x83, 0xfc, 0x4b, 0xe1, 0x17, + 0x65, 0x1c, 0x25, 0xed, 0x71, 0x9d, 0x98, 0xd8, 0x3f, 0x0f, 0x6d, 0xc6, + 0xed, 0x43, 0x8c, 0xb0, 0x3d, 0xba, 0x55, 0xbf, 0x82, 0xb5, 0x48, 0x5d, + 0x49, 0x35, 0x41, 0xb4, 0x1e, 0x93, 0xe9, 0xa1, 0xa2, 0xf7, 0x62, 0xfe, + 0xf0, 0xd8, 0x30, 0x7f, 0x89, 0x7c, 0x22, 0xec, 0xa3, 0x84, 0xbd, 0xae, + 0x33, 0xb3, 0x1b, 0x07, 0xe1, 0xed, 0xb8, 0xdd, 0xa8, 0x71, 0x96, 0x07, + 0xb7, 0x4a, 0xb7, 0xf0, 0x56, 0xa9, 0x0b, 0xa9, 0x26, 0xa8, 0x36, 0x83, + 0xd2, 0x7d, 0x34, 0x34, 0x5e, 0xec, 0x5f, 0xde, 0x1b, 0x06, 0x0f, 0xf1, + 0x2f, 0x84, 0x5d, 0x94, 0x70, 0x97, 0xb5, 0xc6, 0x76, 0x63, 0x60, 0xfc, + 0x3d, 0xb7, 0x1b, 0xb5, 0x0e, 0x32, 0xc0, 0xf6, 0xe9, 0x56, 0xfe, 0x0a, + 0xd5, 0x21, 0x75, 0x24, 0xd5, 0x06, 0xd0, 0x7a, 0x4f, 0xa6, 0x86, 0x8b, + 0xdd, 0x8b, 0xfb, 0xc3, 0x60, 0xc1, 0xfe, 0x25, 0xf0, 0x8b, 0xb2, 0x8e, + 0x12, 0xf6, 0xb8, 0xce, 0xcc, 0x6c, 0x1f, 0x87, 0xb6, 0xe3, 0x76, 0xa1, + 0xc6, 0x58, 0x1e, 0xdd, 0x2a, 0xdf, 0xc1, 0x5a, 0xa4, 0x2e, 0xa4, 0x9a, + 0xa0, 0xda, 0x0f, 0x49, 0xf4, 0xd0, 0xd1, 0x7b, 0xb1, 0x7f, 0x78, 0x6c, + 0x18, 0x3f, 0xc4, 0xbe, 0x11, 0x76, 0x51, 0xc2, 0x5e, 0xd7, 0x19, 0xd9, + 0x8d, 0x83, 0xf0, 0xf6, 0xdc, 0x6e, 0xd4, 0x38, 0xcb, 0x03, 0xdb, 0xa5, + 0x5b, 0xf8, 0x2b, 0x54, 0x85, 0xd4, 0x93, 0x54, 0x1b, 0x41, 0xe9, 0x3e, + 0x9a, 0x1a, 0x2f, 0x76, 0x2f, 0xef, 0x0d, 0x83, 0x07, 0xf8, 0x97, 0xc2, + 0x2e, 0xca, 0x38, 0x4b, 0xda, 0xe3, 0x3b, 0x31, 0xb0, 0x7e, 0x1e, 0xdb, + 0x8d, 0xda, 0x87, 0x19, 0x60, 0x7b, 0x74, 0xab, 0x7f, 0x05, 0x6a, 0x90, + 0xba, 0x92, 0x6a, 0x83, 0x68, 0x3d, 0x27, 0xd3, 0x43, 0x45, 0xee, 0xc5, + 0xfd, 0xe1, 0xb0, 0x60, 0xff, 0x12, 0xf8, 0x45, 0xd9, 0x47, 0x09, 0x7b, + 0x5c, 0x67, 0x66, 0x36, 0x0f, 0xc3, 0xdb, 0x71, 0xbb, 0x50, 0xe3, 0x2c, + 0x0f, 0x6e, 0x95, 0x6f, 0xe0, 0xad, 0x52, 0x17, 0x52, 0x4d, 0x50, 0x6d, + 0x07, 0xa4, 0xfa, 0x68, 0x68, 0xbd, 0xd8, 0xbf, 0xbc, 0x36, 0x0c, 0x1f, + 0xe2, 0x5f, 0x08, 0xbb, 0x28, 0xe1, 0x2f, 0x6b, 0x8c, 0xec, 0xc6, 0xc1, + 0xf8, 0x7b, 0x6e, 0x37, 0x6a, 0x1c, 0x65, 0x81, 0xed, 0xd2, 0xad, 0xfc, + 0x15, 0xaa, 0x42, 0xea, 0x49, 0xaa, 0x0d, 0xa0, 0xf4, 0x9f, 0x4d, 0x0d, + 0x17, 0xbb, 0x17, 0xf7, 0x86, 0xc1, 0x83, 0xfc, 0x4b, 0xe1, 0x17, 0x65, + 0x1c, 0x25, 0xed, 0x71, 0x9d, 0x98, 0xd8, 0x3f, 0x0f, 0x6d, 0xc6, 0xed, + 0x43, 0x8c, 0xb0, 0x3d, 0xba, 0x55, 0xbf, 0x82, 0xb5, 0x48, 0x5d, 0x49, + 0x35, 0x41, 0xb4, 0x1e, 0x93, 0xe9, 0xa1, 0xa2, 0xf7, 0x62, 0xfe, 0xf0, + 0xd8, 0x30, 0x7f, 0x89, 0x7c, 0x22, 0xec, 0xa3, 0x84, 0xbd, 0xae, 0x33, + 0xb3, 0x1b, 0x07, 0xe1, 0xed, 0xb8, 0xdd, 0xa8, 0x71, 0x96, 0x07, 0xb7, + 0x4a, 0xb7, 0xf0, 0x56, 0xa9, 0x0b, 0xa9, 0x26, 0xa8, 0x36, 0x83, 0xd2, + 0x7d, 0x34, 0x34, 
0x5e, 0xec, 0x5f, 0xde, 0x1b, 0x06, 0x0f, 0xf1, 0x2f, + 0x84, 0x5d, 0x94, 0x70, 0x97, 0xb5, 0xc6, 0x76, 0x63, 0x60, 0xfc, 0x3d, + 0xb7, 0x1b, 0xb5, 0x0e, 0x32, 0xc0, 0xf6, 0xe9, 0x56, 0xfe, 0x0a, 0xd5, + 0x21, 0x75, 0x24, 0xd5, 0x06, 0xd0, 0x7a, 0x4f, 0xa6, 0x86, 0x8b, 0xdd, + 0x8b, 0xfb, 0xc3, 0x60, 0xc1, 0xfe, 0x25, 0xf0, 0x8b, 0xb2, 0x8e, 0x12, + 0xf6, 0xb8, 0xce, 0xcc, 0x6c, 0x1f, 0x87, 0xb6, 0xe3, 0x76, 0xa1, 0xc6, + 0x58, 0x1e, 0xdd, 0x2a, 0xdf, 0xc1, 0x5a, 0xa4, 0x2e, 0xa4, 0x9a, 0xa0, + 0xda, 0x0f, 0x49, 0xf4, 0xd0, 0xd1, 0x7b, 0xb1, 0x7f, 0x78, 0x6c, 0x18, + 0x3f, 0xc4, 0xbe, 0x11, 0x76, 0x51, 0xc2, 0x5e, 0xd7, 0x19, 0xd9, 0x8d, + 0x83, 0xf0, 0xf6, 0xdc, 0x6e, 0xd4, 0x38, 0xcb, 0x03, 0xdb, 0xa5, 0x5b, + 0xf8, 0x2b, 0x54, 0x85, 0xd4, 0x93, 0x54, 0x1b, 0x41, 0xe9, 0x3e, 0x9a, + 0x1a, 0x2f, 0x76, 0x2f, 0xef, 0x0d, 0x83, 0x07, 0xf8, 0x97, 0xc2, 0x2e, + 0xca, 0x38, 0x4b, 0xda, 0xe3, 0x3b, 0x31, 0xb0, 0x7e, 0x1e, 0xdb, 0x8d, + 0xda, 0x87, 0x19, 0x60, 0x7b, 0x74, 0xab, 0x7f, 0x05, 0x6a, 0x90, 0xba, + 0x92, 0x6a, 0x83, 0x68, 0x3d, 0x27, 0xd3, 0x43, 0x45, 0xee, 0xc5, 0xfd, + 0xe1, 0xb0, 0x60, 0xff, 0x12, 0xf8, 0x45, 0xd9, 0x47, 0x09, 0x7b, 0x5c, + 0x67, 0x66, 0x36, 0x0f, 0xc3, 0xdb, 0x71, 0xbb, 0x50, 0xe3, 0x2c, 0x0f, + 0x6e, 0x95, 0x6f, 0xe0, 0xad, 0x52, 0x17, 0x52, 0x4d, 0x50, 0x6d, 0x07, + 0xa4, 0xfa, 0x68, 0x68, 0xbd, 0xd8, 0xbf, 0xbc, 0x36, 0x0c, 0x1f, 0xe2, + 0x5f, 0x08, 0xbb, 0x28, 0xe1, 0x2f, 0x6b, 0x8c, 0xec, 0xc6, 0xc1, 0xf8, + 0x7b, 0x6e, 0x37, 0x6a, 0x1c, 0x65, 0x81, 0xed, 0xd2, 0xad, 0xfc, 0x15, + 0xaa, 0x42, 0xea, 0x49, 0xaa, 0x0d, 0xa0, 0xf4, 0x9f, 0x4d, 0x0d, 0x17, + 0xbb, 0x17, 0xf7, 0x86, 0xc1, 0x83, 0xfc, 0x4b, 0xe1, 0x17, 0x65, 0x1c, + 0x25, 0xed, 0x71, 0x9d, 0x98, 0xd8, 0x3f, 0x0f, 0x6d, 0xc6, 0xed, 0x43, + 0x8c, 0xb0, 0x3d, 0xba, 0x55, 0xbf, 0x82, 0xb5, 0x48, 0x5d, 0x49, 0x35, + 0x41, 0xb4, 0x1e, 0x93, 0xe9, 0xa1, 0xa2, 0xf7, 0x62, 0xfe, 0xf0, 0xd8, + 0x30, 0x7f, 0x89, 0x7c, 0x22, 0xec, 0xa3, 0x84, 0xbd, 0xae, 0x33, 0xb3, + 0x1b, 0x07, 0xe1, 0xed, 0xb8, 0xdd, 0xa8, 0x71, 0x96, 0x07, 0xb7, 0x4a, + 0xb7, 0xf0, 0x56, 0xa9, 0x0b, 0xa9, 0x26, 0xa8, 0x36, 0x83, 0xd2, 0x7d, + 0x34, 0x34, 0x5e, 0xec, 0x5f, 0xde, 0x1b, 0x06, 0x0f, 0xf1, 0x2f, 0x84, + 0x5d, 0x94, 0x70, 0x97, 0xb5, 0xc6, 0x76, 0x63, 0x60, 0xfc, 0x3d, 0xb7, + 0x1b, 0xb5, 0x0e, 0x32, 0xc0, 0xf6, 0xe9, 0x56, 0xfe, 0x0a, 0xd5, 0x21, + 0x75, 0x24, 0xd5, 0x06, 0xd0, 0x7a, 0x4f, 0xa6, 0x86, 0x8b, 0xdd, 0x8b, + 0xfb, 0xc3, 0x60, 0xc1, 0xfe, 0x25, 0xf0, 0x8b, 0xb2, 0x8e, 0x12, 0xf6, + 0xb8, 0xce, 0xcc, 0x6c, 0x1f, 0x87, 0xb6, 0xe3, 0x76, 0xa1, 0xc6, 0x58, + 0x1e, 0xdd, 0x2a, 0xdf, 0xc1, 0x5a, 0xa4, 0x2e, 0xa4, 0x9a, 0xa0, 0xda, + 0x0f, 0x49, 0xf4, 0xd0, 0xd1, 0x7b, 0xb1, 0x7f, 0x78, 0x6c, 0x18, 0x3f, + 0xc4, 0xbe, 0x11, 0x76, 0x51, 0xc2, 0x5e, 0xd7, 0x19, 0xd9, 0x8d, 0x83, + 0xf0, 0xf6, 0xdc, 0x6e, 0xd4, 0x38, 0xcb, 0x03, 0xdb, 0xa5, 0x5b, 0xf8, + 0x2b, 0x54, 0x85, 0xd4, 0x93, 0x54, 0x1b, 0x41, 0xe9, 0x3e, 0x9a, 0x1a, + 0x2f, 0x76, 0x2f, 0xef, 0x0d, 0x83, 0x07, 0xf8, 0x97, 0xc2, 0x2e, 0xca, + 0x38, 0x4b, 0xda, 0xe3, 0x3b, 0x31, 0xb0, 0x7e, 0x1e, 0xdb, 0x8d, 0xda, + 0x87, 0x19, 0x60, 0x7b, 0x74, 0xab, 0x7f, 0x05, 0x6a, 0x90, 0xba, 0x92, + 0x6a, 0x83, 0x68, 0x3d, 0x27, 0xd3, 0x43, 0x45, 0xee, 0xc5, 0xfd, 0xe1, + 0xb0, 0x60, 0xff, 0x12, 0xf8, 0x45, 0xd9, 0x47, 0x09, 0x7b, 0x5c, 0x67, + 0x66, 0x36, 0x0f, 0xc3, 0xdb, 0x71, 0xbb, 0x50, 0xe3, 0x2c, 0x0f, 0x6e, + 0x95, 0x6f, 0xe0, 0xad, 0x52, 0x17, 0x52, 0x4d, 0x50, 0x6d, 0x07, 0xa4, + 0xfa, 0x68, 0x68, 0xbd, 0xd8, 0xbf, 0xbc, 0x36, 0x0c, 0x1f, 0xe2, 0x5f, + 0x08, 0xbb, 0x28, 
0xe1, 0x2f, 0x6b, 0x8c, 0xec, 0xc6, 0xc1, 0xf8, 0x7b, + 0x6e, 0x37, 0x6a, 0x1c, 0x65, 0x81, 0xed, 0xd2, 0xad, 0xfc, 0x15, 0xaa, + 0x42, 0xea, 0x49, 0xaa, 0x0d, 0xa0, 0xf4, 0x9f, 0x4d, 0x0d, 0x17, 0xbb, + 0x17, 0xf7, 0x86, 0xc1, 0x83, 0xfc, 0x4b, 0xe1, 0x17, 0x65, 0x1c, 0x25, + 0xed, 0x71, 0x9d, 0x98, 0xd8, 0x3f, 0x0f, 0x6d, 0xc6, 0xed, 0x43, 0x8c, + 0xb0, 0x3d, 0xba, 0x55, 0xbf, 0x82, 0xb5, 0x48, 0x5d, 0x49, 0x35, 0x41, + 0xb4, 0x1e, 0x93, 0xe9, 0xa1, 0xa2, 0xf7, 0x62, 0xfe, 0xf0, 0xd8, 0x30, + 0x7f, 0x89, 0x7c, 0x22, 0xec, 0xa3, 0x84, 0xbd, 0xae, 0x33, 0xb3, 0x1b, + 0x07, 0xe1, 0xed, 0xb8, 0xdd, 0xa8, 0x71, 0x96, 0x07, 0xb7, 0x4a, 0xb7, + 0xf0, 0x56, 0xa9, 0x0b, 0xa9, 0x26, 0xa8, 0x36, 0x83, 0xd2, 0x7d, 0x34, + 0x34, 0x5e, 0xec, 0x5f, 0xde, 0x1b, 0x06, 0x0f, 0xf1, 0x2f, 0x84, 0x5d, + 0x94, 0x70, 0x97, 0xb5, 0xc6, 0x76, 0x63, 0x60, 0xfc, 0x3d, 0xb7, 0x1b, + 0xb5, 0x0e, 0x32, 0xc0, 0xf6, 0xe9, 0x56, 0xfe, 0x0a, 0xd5, 0x21, 0x75, + 0x24, 0xd5, 0x06, 0xd0, 0x7a, 0x4f, 0xa6, 0x86, 0x8b, 0xdd, 0x8b, 0xfb, + 0xc3, 0x60, 0xc1, 0xfe, 0x25, 0xf0, 0x8b, 0xb2, 0x8e, 0x12, 0xf6, 0xb8, + 0xce, 0xcc, 0x6c, 0x1f, 0x87, 0xb6, 0xe3, 0x76, 0xa1, 0xc6, 0x58, 0x1e, + 0xdd, 0x2a, 0xdf, 0xc1, 0x5a, 0xa4, 0x2e, 0xa4, 0x9a, 0xa0, 0xda, 0x0f, + 0x49, 0xf4, 0xd0, 0xd1, 0x7b, 0xb1, 0x7f, 0x78, 0x6c, 0x18, 0x3f, 0xc4, + 0xbe, 0x11, 0x76, 0x51, 0xc2, 0x5e, 0xd7, 0x19, 0xd9, 0x8d, 0x83, 0xf0, + 0xf6, 0xdc, 0x6e, 0xd4, 0x38, 0xcb, 0x03, 0xdb, 0xa5, 0x5b, 0xf8, 0x2b, + 0x54, 0x85, 0xd4, 0x93, 0x54, 0x1b, 0x41, 0xe9, 0x3e, 0x9a, 0x1a, 0x2f, + 0x76, 0x2f, 0xef, 0x0d, 0x83, 0x07, 0xf8, 0x97, 0xc2, 0x2e, 0xca, 0x38, + 0x4b, 0xda, 0xe3, 0x3b, 0x31, 0xb0, 0x7e, 0x1e, 0xdb, 0x8d, 0xda, 0x87, + 0x19, 0x60, 0x7b, 0x74, 0xab, 0x7f, 0x05, 0x6a, 0x90, 0xba, 0x92, 0x6a, + 0x83, 0x68, 0x3d, 0x27, 0xd3, 0x43, 0x45, 0xee, 0xc5, 0xfd, 0xe1, 0xb0, + 0x60, 0xff, 0x12, 0xf8, 0x45, 0xd9, 0x47, 0x09, 0x7b, 0x5c, 0x67, 0x66, + 0x36, 0x0f, 0xc3, 0xdb, 0x71, 0xbb, 0x50, 0xe3, 0x2c, 0x0f, 0x6e, 0x95, + 0x6f, 0xe0, 0xad, 0x52, 0x17, 0x52, 0x4d, 0x50, 0x6d, 0x07, 0xa4, 0xfa, + 0x68, 0x68, 0xbd, 0xd8, 0xbf, 0xbc, 0x36, 0x0c, 0x1f, 0xe2, 0x5f, 0x08, + 0xbb, 0x28, 0xe1, 0x2f, 0x6b, 0x8c, 0xec, 0xc6, 0xc1, 0xf8, 0x7b, 0x6e, + 0x37, 0x6a, 0x1c, 0x65, 0x81, 0xed, 0xd2, 0xad, 0xfc, 0x15, 0xaa, 0x42, + 0xea, 0x49, 0xaa, 0x0d, 0xa0, 0xf4, 0x9f, 0x4d, 0x0d, 0x17, 0xbb, 0x17, + 0xf7, 0x86, 0xc1, 0x83, 0xfc, 0x4b, 0xe1, 0x17, 0x65, 0x1c, 0x25, 0xed, + 0x71, 0x9d, 0x98, 0xd8, 0x3f, 0x0f, 0x6d, 0xc6, 0xed, 0x43, 0x8c, 0xb0, + 0x3d, 0xba, 0x55, 0xbf, 0x82, 0xb5, 0x48, 0x5d, 0x49, 0x35, 0x41, 0xb4, + 0x1e, 0x93, 0xe9, 0xa1, 0xa2, 0xf7, 0x62, 0xfe, 0xf0, 0xd8, 0x30, 0x7f, + 0x89, 0x7c, 0x22, 0xec, 0xa3, 0x84, 0xbd, 0xae, 0x33, 0xb3, 0x1b, 0x07, + 0xe1, 0xed, 0xb8, 0xdd, 0xa8, 0x71, 0x96, 0x07, 0xb7, 0x4a, 0xb7, 0xf0, + 0x56, 0xa9, 0x0b, 0xa9, 0x26, 0xa8, 0x36, 0x83, 0xd2, 0x7d, 0x34, 0x34, + 0x5e, 0xec, 0x5f, 0xde, 0x1b, 0x06, 0x0f, 0xf1, 0x2f, 0x84, 0x5d, 0x94, + 0x70, 0x97, 0xb5, 0xc6, 0x7c, +}; +static_assert(sizeof(kBytesTestReadSymbol12) == kNumBytesTestReadSymbol12, ""); + +// The kBytesTestReadSymbol13[] array was encoded by using the following libaom +// code: +// +// aom_cdf_prob cdf[4][14] = { +// // pmf: 1/13, 1/13, 1/13, 1/13, 1/13, 1/13, 1/13, 1/13, 1/13, 1/13, 1/13, +// // 1/13, 1/13 +// { 32768 - 2521, 32768 - 5041, 32768 - 7562, 32768 - 10082, 32768 - 12603, +// 32768 - 15124, 32768 - 17644, 32768 - 20165, 32768 - 22686, +// 32768 - 25206, 32768 - 27727, 32768 - 30247, 0, 0 }, +// // pmf: 3/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 
2/26, +// // 2/26, 1/26 +// { 32768 - 3781, 32768 - 6302, 32768 - 8822, 32768 - 11343, 32768 - 13863, +// 32768 - 16384, 32768 - 18905, 32768 - 21425, 32768 - 23946, +// 32768 - 26466, 32768 - 28987, 32768 - 31508, 0, 0 }, +// // pmf: 1/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, 2/26, +// // 2/26, 3/26 +// { 32768 - 1260, 32768 - 3781, 32768 - 6302, 32768 - 8822, 32768 - 11343, +// 32768 - 13863, 32768 - 16384, 32768 - 18905, 32768 - 21425, +// 32768 - 23946, 32768 - 26466, 32768 - 28987, 0, 0 }, +// // pmf: 1/26, 2/26, 2/26, 2/26, 2/26, 2/26, 4/26, 2/26, 2/26, 2/26, 2/26, +// // 2/26, 1/26 +// { 32768 - 1260, 32768 - 3781, 32768 - 6302, 32768 - 8822, 32768 - 11343, +// 32768 - 13863, 32768 - 18905, 32768 - 21425, 32768 - 23946, +// 32768 - 26466, 32768 - 28987, 32768 - 31508, 0, 0 }, +// }; +// constexpr int kSymbols[26][4] = { { 0, 6, 12, 5 }, // +// { 1, 7, 11, 4 }, // +// { 2, 8, 10, 3 }, // +// { 3, 9, 9, 2 }, // +// { 4, 10, 8, 1 }, // +// { 5, 11, 7, 0 }, // +// { 6, 12, 6, 12 }, // +// { 7, 0, 5, 11 }, // +// { 8, 1, 4, 10 }, // +// { 9, 2, 3, 9 }, // +// { 10, 3, 2, 8 }, // +// { 11, 4, 1, 7 }, // +// { 12, 5, 0, 6 }, // +// { 0, 0, 12, 11 }, // +// { 2, 1, 10, 9 }, // +// { 4, 3, 8, 7 }, // +// { 6, 5, 6, 5 }, // +// { 8, 7, 4, 3 }, // +// { 10, 9, 2, 1 }, // +// { 12, 11, 12, 10 }, // +// { 1, 0, 11, 8 }, // +// { 3, 2, 9, 6 }, // +// { 5, 4, 7, 4 }, // +// { 7, 6, 5, 2 }, // +// { 9, 8, 3, 6 }, // +// { 11, 10, 1, 6 } }; +// const unsigned int kBufferSize = 65536; +// uint8_t bw_buffer[kBufferSize]; +// aom_writer bw; +// bw.allow_update_cdf = 1; +// aom_start_encode(&bw, bw_buffer); +// for (int i = 0; i < 64; ++i) { +// for (int j = 0; j < 26; ++j) { +// for (int k = 0; k < 4; ++k) { +// aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 13); +// } +// } +// } +// aom_stop_encode(&bw); +// printf("constexpr size_t kNumBytes = %u;\n", bw.pos); +// printf("constexpr uint8_t kBytes[] = {"); +// int count = 0; +// for (unsigned int i = 0; i < bw.pos; ++i) { +// if (count++ % 12 == 0) { +// printf("\n "); +// } else { +// printf(" "); +// } +// printf("0x%02x,", bw_buffer[i]); +// } +// printf("\n};\n"); + +constexpr size_t kNumBytesTestReadSymbol13 = 3110; +constexpr uint8_t kBytesTestReadSymbol13[] = { + 0x0b, 0x38, 0xa7, 0x3e, 0xde, 0x47, 0x2e, 0xe6, 0x9e, 0xe0, 0xa8, 0xc4, + 0x77, 0xda, 0x41, 0x64, 0x49, 0x60, 0xc4, 0x26, 0x68, 0xac, 0xf4, 0xa6, + 0x8c, 0x6e, 0xa6, 0xd3, 0xd9, 0x4b, 0xb9, 0x35, 0xb6, 0x53, 0x6c, 0x73, + 0x13, 0xd7, 0xfb, 0xbf, 0x96, 0xac, 0xea, 0x86, 0xb5, 0x24, 0x14, 0x2a, + 0x5a, 0x41, 0x38, 0xab, 0xfb, 0x92, 0x74, 0xf4, 0x0f, 0x24, 0xde, 0x2d, + 0x2d, 0x12, 0xd7, 0xb8, 0x2f, 0x4a, 0x4c, 0xd6, 0xc0, 0x4b, 0x01, 0x98, + 0xca, 0x7e, 0xde, 0x03, 0x75, 0x27, 0x59, 0x4f, 0x32, 0x54, 0xa5, 0xb5, + 0x79, 0xc3, 0xc4, 0x3c, 0x76, 0xa3, 0x2f, 0xaf, 0x2f, 0x0a, 0x84, 0xb5, + 0x60, 0xf5, 0x73, 0x88, 0xc0, 0x24, 0x1c, 0xfb, 0xff, 0x90, 0xb6, 0x05, + 0xe9, 0x43, 0x90, 0xc8, 0xd3, 0xfd, 0x3f, 0xc2, 0x0b, 0xb5, 0xfe, 0x12, + 0x55, 0x23, 0xa1, 0xf4, 0xba, 0xc7, 0x1f, 0xc3, 0xe5, 0xe3, 0x76, 0x68, + 0x3c, 0x57, 0xb9, 0x92, 0xea, 0x25, 0x93, 0x4e, 0x72, 0xff, 0x63, 0x28, + 0x0c, 0x90, 0x1d, 0xb6, 0x42, 0xb2, 0x25, 0x79, 0x8e, 0xee, 0x0c, 0x56, + 0x3d, 0x94, 0x3d, 0x80, 0xf2, 0x25, 0x6f, 0xd4, 0x93, 0x31, 0x18, 0x80, + 0x5a, 0x3a, 0xbb, 0x4d, 0xbb, 0x77, 0xc3, 0xb0, 0x20, 0x0e, 0xd3, 0xd8, + 0x10, 0x05, 0xb2, 0x81, 0x57, 0xf5, 0x8c, 0xe5, 0xac, 0x46, 0xc0, 0xae, + 0x9c, 0x08, 0x9d, 0x51, 0xf3, 0x16, 0xb9, 0xd7, 0x90, 0xa7, 0x9f, 0x40, + 0x5d, 0x14, 0xd1, 0xbd, 0xa2, 
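+// (Sketch, for illustration only: the weighted rows above follow the same
+// AOM_ICDF pattern, with cumulative symbol counts in place of the uniform
+// ramp. For the 3/26, 2/26, ..., 2/26, 1/26 row, each entry is
+// 32768 - round(c * 32768 / 26) for the running count c:)
+//
+// const int counts[13] = { 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1 };
+// int c = 0;
+// for (int i = 0; i < 13; ++i) {
+//   c += counts[i];  // c reaches 26 on the last entry, giving 0.
+//   cdf[1][i] = 32768 - (aom_cdf_prob)(c * 32768.0 / 26 + 0.5);
+// }
+// cdf[1][13] = 0;  // adaptation counter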
0x0b, 0xf3, 0xae, 0x3b, 0xfb, 0x0f, 0xe1, + 0x1a, 0x6e, 0x63, 0x3b, 0xdb, 0x41, 0x8e, 0xe8, 0x1f, 0x20, 0x18, 0xbe, + 0x69, 0x10, 0x86, 0x06, 0x06, 0x23, 0x3a, 0x40, 0xc1, 0x7f, 0x2e, 0x32, + 0xb4, 0x23, 0xac, 0x4b, 0x25, 0x6b, 0xef, 0xaf, 0xec, 0x5c, 0xf2, 0xd0, + 0x61, 0xb2, 0x3a, 0xa5, 0x3d, 0xcd, 0xf7, 0x99, 0x6b, 0x4e, 0xbb, 0x58, + 0x6a, 0x4c, 0xd7, 0xc0, 0x77, 0xd9, 0xae, 0x15, 0x7e, 0xde, 0xc9, 0xd8, + 0x24, 0x39, 0x3f, 0xa4, 0xf3, 0x24, 0x7e, 0xe0, 0x22, 0x19, 0x40, 0x3d, + 0x0c, 0xb0, 0xb7, 0xe3, 0x4b, 0x82, 0x6f, 0x82, 0x0e, 0xb1, 0x91, 0xef, + 0x84, 0x98, 0x69, 0x66, 0x24, 0xe7, 0x90, 0x13, 0x0d, 0xbd, 0x6b, 0x92, + 0xee, 0x1c, 0x0f, 0xe7, 0xfa, 0xb9, 0xb4, 0x6c, 0x68, 0x98, 0x4c, 0x27, + 0x42, 0xad, 0x5f, 0x8f, 0xe5, 0x25, 0xf9, 0x67, 0x84, 0x86, 0x2e, 0xf6, + 0x51, 0x71, 0x0d, 0x6c, 0x45, 0x8f, 0x96, 0x15, 0x73, 0xab, 0xff, 0xc0, + 0x87, 0x14, 0xba, 0x00, 0x67, 0x2c, 0x27, 0x03, 0xff, 0xa6, 0xe3, 0x09, + 0xae, 0xbb, 0xa5, 0x49, 0xee, 0x5f, 0x47, 0xc0, 0x30, 0x4a, 0x93, 0x28, + 0x48, 0x4d, 0x30, 0x49, 0xe7, 0xe6, 0x79, 0x96, 0x75, 0x6c, 0x62, 0xbc, + 0x9f, 0xaa, 0x39, 0x63, 0x1d, 0x33, 0xce, 0xd2, 0xa3, 0xd1, 0x93, 0xed, + 0x8d, 0xa6, 0xbd, 0x02, 0xf0, 0x44, 0xd5, 0x9e, 0x29, 0x02, 0x46, 0x87, + 0xaf, 0xdb, 0xfb, 0x20, 0x29, 0x26, 0xb7, 0x8c, 0x75, 0xee, 0xe9, 0x29, + 0x53, 0x01, 0x4a, 0xaa, 0xc2, 0x9f, 0x6c, 0x30, 0x21, 0x83, 0xa6, 0x09, + 0x32, 0x1d, 0xaa, 0x00, 0x6c, 0xea, 0x9c, 0x84, 0x16, 0x16, 0x0c, 0x06, + 0xcc, 0xf0, 0x19, 0xce, 0x57, 0xb3, 0x9f, 0x57, 0xf0, 0xdc, 0xda, 0x86, + 0x85, 0x2f, 0x09, 0x33, 0x8d, 0x59, 0xb8, 0xc1, 0x08, 0x4c, 0xee, 0xf8, + 0x33, 0x3d, 0x23, 0x13, 0x78, 0xa3, 0x98, 0xbf, 0xab, 0xef, 0x15, 0xe2, + 0x8d, 0xdb, 0xb4, 0xd0, 0x4b, 0x2f, 0x04, 0x3f, 0x6b, 0x11, 0xf0, 0x05, + 0xc7, 0x53, 0x1e, 0xc9, 0x73, 0x11, 0x81, 0xd3, 0xde, 0x21, 0xd8, 0x14, + 0x10, 0xbe, 0x30, 0xb2, 0x48, 0x55, 0x9b, 0x8c, 0x10, 0x84, 0xce, 0xef, + 0x83, 0x2f, 0x03, 0x10, 0x09, 0x0f, 0x70, 0xa8, 0x84, 0xea, 0x15, 0xdb, + 0xc7, 0xdf, 0x6f, 0x67, 0x5d, 0x1c, 0xc7, 0x1a, 0x1c, 0x15, 0xa6, 0x92, + 0xed, 0x63, 0xf0, 0xed, 0x77, 0x5d, 0x12, 0x1b, 0x8c, 0xab, 0x3e, 0xfa, + 0x12, 0xf6, 0x83, 0xda, 0x41, 0xbc, 0x97, 0x76, 0xb9, 0x1f, 0xc9, 0x36, + 0xc7, 0xe3, 0x9f, 0x93, 0x2e, 0x27, 0xdc, 0x90, 0x84, 0x6d, 0x81, 0x04, + 0x09, 0x4f, 0x10, 0xb9, 0x53, 0xd9, 0x8f, 0x99, 0x2b, 0x8b, 0x53, 0x4f, + 0xe8, 0x3e, 0x82, 0x1b, 0x0c, 0x3d, 0xbc, 0xe5, 0x5c, 0x13, 0xed, 0x4b, + 0x0b, 0x05, 0x72, 0xaa, 0xd2, 0xcf, 0xfc, 0x9f, 0xd0, 0xfd, 0xc7, 0xc6, + 0xc0, 0xa3, 0xa7, 0x05, 0xbb, 0x9e, 0xae, 0x63, 0xc0, 0x3d, 0x73, 0x92, + 0xe1, 0x98, 0xe4, 0xa5, 0xb3, 0xc4, 0x36, 0x90, 0x35, 0x6b, 0xab, 0x35, + 0x06, 0x98, 0xca, 0x35, 0x20, 0x5a, 0x6a, 0x84, 0x5c, 0x88, 0xca, 0x64, + 0x43, 0x87, 0xf2, 0x3c, 0x13, 0x58, 0x1c, 0x35, 0x2c, 0xf2, 0x1d, 0x5e, + 0xe0, 0x1b, 0x2c, 0x59, 0xc2, 0xcd, 0xf2, 0x96, 0x1a, 0x75, 0x3c, 0x10, + 0xe7, 0xe3, 0xa1, 0xbc, 0xec, 0x03, 0x79, 0x58, 0x26, 0x4d, 0xcf, 0xb4, + 0x00, 0xd3, 0x46, 0xee, 0x99, 0x52, 0x2f, 0x54, 0xcb, 0xa1, 0x75, 0xa1, + 0xa0, 0xf4, 0xaa, 0xe9, 0x4a, 0xe1, 0x74, 0xcc, 0xd1, 0x47, 0xda, 0x48, + 0x8b, 0x2e, 0xf9, 0x54, 0x98, 0x4e, 0x4f, 0x5a, 0x1b, 0xf5, 0x66, 0x62, + 0xa0, 0xc2, 0x0e, 0x1a, 0x91, 0xbd, 0x7a, 0x33, 0xfd, 0x7c, 0xfc, 0x8b, + 0xc0, 0x92, 0xd8, 0x97, 0x48, 0x6f, 0xf4, 0xe0, 0x6c, 0xcf, 0x17, 0xc9, + 0x44, 0x04, 0xcf, 0x50, 0x0d, 0x8f, 0xbc, 0x4f, 0x4e, 0x1d, 0x38, 0x38, + 0x5c, 0xb7, 0x8e, 0xe7, 0x52, 0xbe, 0x04, 0x68, 0x79, 0x9e, 0x68, 0x32, + 0x3b, 0xe4, 0xee, 0x65, 0x76, 0xf6, 0xb4, 0x47, 0x1c, 0xa5, 0xd0, 0x20, + 0x0f, 0x94, 0xe1, 0x2f, 0xa8, 
0x87, 0xeb, 0xda, 0x2c, 0x54, 0xc4, 0x07, + 0x08, 0x89, 0xdc, 0xcf, 0x73, 0x0c, 0x1f, 0xea, 0xb4, 0x6d, 0xea, 0x17, + 0x70, 0x82, 0xb5, 0x18, 0x2f, 0x38, 0xc5, 0x47, 0x47, 0xd6, 0x37, 0x20, + 0x8d, 0x71, 0xd6, 0x16, 0x4d, 0x16, 0xd5, 0x77, 0x36, 0xb5, 0xd0, 0x20, + 0x5f, 0x4d, 0x89, 0x6c, 0x49, 0xc4, 0x13, 0x6c, 0x26, 0x8c, 0x8f, 0x6f, + 0x17, 0xab, 0xdf, 0x57, 0xa8, 0xab, 0xed, 0x8d, 0xa9, 0x00, 0x6b, 0xfc, + 0xf6, 0x72, 0xaf, 0x32, 0xc2, 0x0b, 0xb6, 0x6b, 0x7a, 0xac, 0xa9, 0x77, + 0x52, 0x87, 0x98, 0x43, 0x21, 0x72, 0x35, 0x6c, 0x27, 0x12, 0xbe, 0xf0, + 0x62, 0x16, 0x2a, 0xc6, 0xf7, 0x48, 0xd2, 0xc3, 0x25, 0xb4, 0x6a, 0x57, + 0x65, 0xd6, 0x07, 0xa0, 0xde, 0x9f, 0x3b, 0x3d, 0xdd, 0x27, 0x0e, 0x4c, + 0xe8, 0x4b, 0xe1, 0xd6, 0x33, 0xa7, 0x85, 0x75, 0x44, 0x7e, 0xf9, 0xfd, + 0xb9, 0x98, 0xa8, 0x30, 0x82, 0xdf, 0xd9, 0x97, 0x5c, 0x3f, 0x52, 0x20, + 0xd4, 0x38, 0x88, 0xc1, 0x53, 0x11, 0x14, 0x25, 0x6f, 0xeb, 0x4e, 0xf5, + 0xed, 0xf4, 0xba, 0x34, 0x23, 0x74, 0xbc, 0x46, 0x51, 0x96, 0x1b, 0x50, + 0x32, 0x03, 0xe5, 0x6d, 0xd7, 0xcf, 0xca, 0x60, 0xb2, 0xbc, 0xb6, 0x4b, + 0xc0, 0xee, 0x8b, 0x96, 0xa9, 0x4c, 0x1d, 0x9b, 0x2d, 0x11, 0xc7, 0x29, + 0x74, 0x08, 0x03, 0xe5, 0x1c, 0xe2, 0x6c, 0x21, 0x1e, 0x02, 0x4d, 0xb1, + 0x4e, 0x70, 0xb3, 0xfc, 0x06, 0xa5, 0xf9, 0xfb, 0x35, 0x1c, 0x89, 0xe3, + 0x1e, 0x27, 0xe0, 0x93, 0xd6, 0xd5, 0x15, 0x94, 0x40, 0x88, 0x71, 0xfd, + 0xaa, 0xbd, 0xf6, 0xae, 0x61, 0x52, 0x49, 0x33, 0x99, 0x85, 0xcd, 0x13, + 0x70, 0x7e, 0x1b, 0x76, 0x3a, 0x69, 0x9e, 0xfe, 0x3c, 0x65, 0x22, 0xf0, + 0x1f, 0x91, 0x57, 0x00, 0x5b, 0x28, 0xac, 0x1e, 0x1e, 0x24, 0xc7, 0xd8, + 0xdb, 0x3a, 0xd0, 0x85, 0x04, 0x4d, 0xf7, 0xe8, 0x3b, 0xdc, 0xa1, 0x5b, + 0x5e, 0xe3, 0x7a, 0xae, 0x72, 0x70, 0x7c, 0x52, 0x07, 0xf5, 0x1c, 0xda, + 0xd7, 0x40, 0x81, 0x7d, 0x36, 0x0a, 0x97, 0x8e, 0x0c, 0x25, 0xe7, 0xd3, + 0x81, 0xb0, 0xe2, 0xd0, 0x56, 0x16, 0x9c, 0x9d, 0x0e, 0xc7, 0x97, 0x8f, + 0xff, 0x68, 0xd4, 0x4f, 0x1a, 0x4c, 0x58, 0x6f, 0xe4, 0xd5, 0xc1, 0x07, + 0x7f, 0x31, 0x8c, 0x59, 0x02, 0x6f, 0xa7, 0x54, 0x1b, 0x02, 0x35, 0xe5, + 0x14, 0xec, 0x35, 0x3d, 0x17, 0x72, 0x11, 0x0c, 0x38, 0x62, 0x99, 0x4a, + 0x6a, 0x46, 0xcb, 0x36, 0x1b, 0x4b, 0x38, 0xff, 0x1d, 0xa4, 0xf7, 0x21, + 0xda, 0x73, 0x42, 0xc4, 0x2b, 0xf8, 0xd8, 0x43, 0x73, 0x60, 0x11, 0x22, + 0xc9, 0xe6, 0x07, 0xca, 0xa0, 0x29, 0x2a, 0x20, 0xd9, 0xdd, 0x7d, 0xed, + 0x28, 0x10, 0xde, 0xbe, 0x5e, 0xfd, 0x0c, 0x06, 0x4b, 0x1c, 0xc4, 0x56, + 0xc4, 0x12, 0x25, 0x5a, 0xd1, 0xfe, 0x03, 0x5e, 0x5e, 0xe0, 0x42, 0x8e, + 0x44, 0xf1, 0x8f, 0x13, 0xf0, 0x49, 0xeb, 0x59, 0xf3, 0x5b, 0x61, 0xd9, + 0xa4, 0xdf, 0x2e, 0x2a, 0x70, 0xc2, 0xf0, 0xef, 0x16, 0xf4, 0x1b, 0x5c, + 0xbd, 0x77, 0x42, 0xb9, 0x4c, 0x56, 0x8d, 0xc8, 0xf8, 0x05, 0xbd, 0x52, + 0xba, 0x6e, 0xe1, 0x89, 0xe1, 0xf2, 0xdb, 0xa7, 0xdf, 0xe0, 0xee, 0xc1, + 0x5c, 0x9e, 0x90, 0x11, 0x17, 0xd5, 0xc1, 0xb9, 0x2c, 0x08, 0x62, 0x0d, + 0x75, 0x05, 0xb2, 0xad, 0x22, 0xd6, 0x5c, 0x6e, 0xed, 0xa4, 0x06, 0x5a, + 0x42, 0x4f, 0xbf, 0x84, 0x53, 0xfa, 0x0b, 0xb7, 0x47, 0x6c, 0xba, 0x07, + 0xc9, 0xe4, 0x8c, 0xe4, 0xa3, 0x40, 0xdc, 0xcb, 0x58, 0xeb, 0xba, 0xc5, + 0xcc, 0x56, 0x74, 0x1e, 0x7b, 0x0f, 0x2a, 0xce, 0x35, 0x46, 0x39, 0x6d, + 0x81, 0x91, 0xb2, 0x05, 0x76, 0xfa, 0x8f, 0x43, 0x46, 0x25, 0xb7, 0x98, + 0x4e, 0x5f, 0x63, 0xf4, 0x0e, 0x4f, 0x5d, 0x85, 0x29, 0x9d, 0xdb, 0xa8, + 0xeb, 0x0a, 0xbb, 0xc4, 0xf8, 0x5a, 0xda, 0xe1, 0x9b, 0x1f, 0x9b, 0x4d, + 0x62, 0x65, 0x41, 0x34, 0x5b, 0x6c, 0x19, 0xa5, 0x3c, 0x35, 0x8e, 0x14, + 0x02, 0xcd, 0x1d, 0xf3, 0xfb, 0x70, 0x93, 0x46, 0xe2, 0x49, 0xc8, 0x31, + 0xfd, 0x47, 0x35, 0xfc, 0x7d, 
0xb9, 0x79, 0xf7, 0x0d, 0xed, 0x98, 0x47, + 0xd2, 0xcf, 0x26, 0x8b, 0x10, 0x6f, 0x86, 0xca, 0xda, 0xb8, 0x41, 0xdb, + 0x0c, 0xc7, 0xc3, 0x56, 0xc5, 0x0f, 0xc7, 0xf2, 0xda, 0x45, 0xdf, 0x94, + 0xc1, 0x65, 0x79, 0x6c, 0x97, 0x81, 0xbd, 0xf1, 0x1e, 0x26, 0x6e, 0xfc, + 0x4f, 0x2e, 0x1e, 0x9c, 0xa2, 0x69, 0x54, 0x7a, 0xc3, 0x15, 0x44, 0x64, + 0x73, 0x11, 0x5b, 0x10, 0x48, 0x95, 0x6b, 0x49, 0x4e, 0xcb, 0x2b, 0x12, + 0x90, 0xaf, 0xf5, 0x5a, 0xfa, 0xf5, 0x0b, 0xb8, 0x49, 0x0a, 0x7d, 0xc4, + 0x6b, 0x0a, 0xa5, 0x6d, 0x32, 0xb2, 0x33, 0x3c, 0xb3, 0x65, 0x9c, 0x1f, + 0x7e, 0x50, 0xd3, 0x6a, 0xa2, 0xc1, 0xb9, 0xd9, 0xfa, 0x25, 0xfe, 0x1c, + 0x3f, 0x88, 0x47, 0x0a, 0x7e, 0x62, 0xa2, 0xf3, 0x3e, 0xae, 0x9f, 0x7f, + 0x83, 0xbb, 0x05, 0x72, 0x7a, 0x40, 0x44, 0x5f, 0x57, 0x06, 0xe4, 0xb0, + 0x21, 0x88, 0x35, 0xd4, 0x16, 0xca, 0xb4, 0x8b, 0x59, 0x71, 0xbb, 0xb6, + 0x90, 0x19, 0x69, 0x09, 0x3e, 0xfe, 0x11, 0x4f, 0xe8, 0x2e, 0xdd, 0x1d, + 0xb2, 0xe8, 0x1f, 0x27, 0x92, 0x33, 0x92, 0x8d, 0x04, 0x2e, 0x19, 0x16, + 0xb4, 0xb5, 0xcf, 0x52, 0x98, 0xcc, 0x2b, 0x85, 0x0c, 0x2d, 0x88, 0x38, + 0x24, 0x06, 0xf2, 0x47, 0xec, 0xce, 0xc6, 0xf7, 0x4e, 0xe4, 0x8b, 0xb5, + 0x4f, 0xbe, 0xae, 0x13, 0xd5, 0x0c, 0xe6, 0x13, 0x44, 0xa4, 0x76, 0x19, + 0x8c, 0x25, 0x28, 0x0f, 0x15, 0x8e, 0xa6, 0x9c, 0xee, 0x6e, 0xf0, 0x55, + 0x9d, 0x5a, 0x8f, 0xf6, 0x08, 0x27, 0x92, 0x1f, 0xcb, 0x4c, 0x8c, 0x2c, + 0xeb, 0x44, 0x26, 0x48, 0xec, 0x2e, 0x9b, 0xb3, 0xd9, 0x17, 0xee, 0x52, + 0x7d, 0x32, 0x47, 0x88, 0x4d, 0xf9, 0x11, 0xfc, 0xac, 0xa3, 0xb0, 0xc9, + 0x5e, 0x38, 0xa3, 0x8d, 0x56, 0xc8, 0x83, 0x7c, 0x53, 0x38, 0xe1, 0xd0, + 0x28, 0x7d, 0xc1, 0x65, 0x99, 0x39, 0x58, 0x36, 0xa3, 0x66, 0x71, 0x4c, + 0x28, 0xcb, 0x9f, 0xb5, 0x58, 0x4b, 0xa3, 0x5c, 0x4e, 0xf9, 0x8d, 0x5b, + 0x0c, 0xf1, 0x32, 0xbb, 0xe3, 0xb4, 0x47, 0xe8, 0x1c, 0x9e, 0xbb, 0x0a, + 0x53, 0x3b, 0xb7, 0x51, 0xd6, 0x15, 0x77, 0x89, 0xf0, 0xb5, 0xba, 0x71, + 0x84, 0x16, 0x81, 0xb0, 0xdf, 0x67, 0x12, 0x9f, 0xe7, 0x43, 0x70, 0x3a, + 0xb1, 0xdc, 0x40, 0x31, 0xe7, 0xdd, 0x6b, 0x74, 0xfc, 0x18, 0x7d, 0x0d, + 0xba, 0xda, 0x67, 0x66, 0x56, 0x43, 0x42, 0x80, 0xc6, 0x7c, 0xb3, 0x6c, + 0x89, 0x2e, 0xc7, 0x0d, 0x97, 0x8a, 0xbe, 0x1a, 0x36, 0x05, 0x10, 0x85, + 0x96, 0xa8, 0xbd, 0x29, 0x85, 0x52, 0xdc, 0xa3, 0x92, 0x20, 0xa1, 0xb0, + 0x45, 0x5a, 0x7e, 0xc3, 0x4c, 0x0b, 0x6f, 0x3a, 0xe4, 0xfe, 0x55, 0x01, + 0x49, 0x51, 0x06, 0xe7, 0xbb, 0x91, 0xd2, 0x77, 0x80, 0x1e, 0x07, 0xc7, + 0xe8, 0x60, 0x32, 0x58, 0xe6, 0x22, 0xb6, 0x20, 0x91, 0x2a, 0xd6, 0x92, + 0x9d, 0x96, 0x56, 0x25, 0x21, 0x5f, 0xea, 0xb5, 0xf5, 0xea, 0x17, 0x70, + 0x92, 0x14, 0xfb, 0x88, 0xd6, 0x15, 0x4a, 0xda, 0x65, 0x64, 0x66, 0x79, + 0x66, 0xcb, 0x38, 0x3e, 0xfc, 0xa1, 0xa0, 0x96, 0xf7, 0xb0, 0x4d, 0x87, + 0x80, 0x05, 0x1e, 0x85, 0xd8, 0xb8, 0xf8, 0x50, 0x3e, 0x9d, 0xc1, 0x83, + 0x81, 0x15, 0x59, 0x5d, 0x49, 0xd0, 0xed, 0x25, 0x2a, 0xf3, 0x59, 0xe4, + 0xc6, 0x4b, 0xc2, 0x0f, 0x19, 0x92, 0x2f, 0x7f, 0x96, 0xd0, 0x90, 0x08, + 0xef, 0x4f, 0x57, 0xa5, 0x3e, 0xec, 0xbe, 0xa5, 0x31, 0xd5, 0xcb, 0xbb, + 0xab, 0xde, 0x3b, 0xc8, 0x62, 0x8e, 0x35, 0x5b, 0x22, 0x0d, 0xf1, 0x4c, + 0xe3, 0x87, 0x40, 0xa1, 0xf7, 0x05, 0x96, 0x64, 0xe5, 0x60, 0xda, 0x8d, + 0x99, 0xc5, 0x30, 0xa3, 0x2e, 0x7e, 0xd5, 0x61, 0x2e, 0x8d, 0x71, 0x3b, + 0xe6, 0x35, 0x6c, 0x33, 0xc4, 0xca, 0xef, 0x8e, 0xd1, 0x1f, 0xa0, 0x72, + 0x7a, 0xec, 0x29, 0x4c, 0xee, 0xdd, 0x47, 0x58, 0x55, 0xde, 0x27, 0xc2, + 0xd6, 0xe9, 0xc6, 0x10, 0x5a, 0x06, 0xc3, 0x7d, 0x9c, 0x4a, 0x7f, 0x9d, + 0x0d, 0xc0, 0xea, 0xc7, 0x71, 0x00, 0xc7, 0x9f, 0x75, 0xad, 0xd3, 0xf0, + 0x61, 0xf4, 0x36, 0xeb, 0x69, 
0x9d, 0x99, 0x59, 0x0d, 0x0a, 0x03, 0x19, + 0xf2, 0xcd, 0xb2, 0x24, 0xbb, 0x1c, 0x36, 0x5e, 0x2a, 0xf8, 0x68, 0xd8, + 0x14, 0x42, 0x16, 0x5a, 0xa2, 0xf4, 0xa6, 0x15, 0x4b, 0x72, 0x8e, 0x48, + 0x82, 0x86, 0xc1, 0x15, 0x69, 0xfb, 0x0d, 0x30, 0x2d, 0xbc, 0xeb, 0x93, + 0xf9, 0x54, 0x05, 0x25, 0x44, 0x1b, 0x9e, 0xee, 0x47, 0x49, 0xde, 0x00, + 0x78, 0x1f, 0x1f, 0xa1, 0x80, 0xc9, 0x63, 0x98, 0x8a, 0xd8, 0x82, 0x44, + 0xab, 0x5a, 0x4a, 0x76, 0x59, 0x58, 0x94, 0x85, 0x7f, 0xaa, 0xd7, 0xd7, + 0xa8, 0x5d, 0xc2, 0x48, 0x53, 0xee, 0x23, 0x58, 0x55, 0x2b, 0x69, 0x95, + 0x91, 0x99, 0xe5, 0x9b, 0x2c, 0xe0, 0xfb, 0xf2, 0x86, 0x82, 0x5b, 0xde, + 0xc1, 0x36, 0x1e, 0x00, 0x14, 0x7a, 0x17, 0x62, 0xe3, 0xe1, 0x40, 0xfa, + 0x77, 0x06, 0x0e, 0x04, 0x55, 0x65, 0x75, 0x27, 0x43, 0xb4, 0x94, 0xab, + 0xcd, 0x67, 0x93, 0x19, 0x2f, 0x08, 0x3c, 0x66, 0x48, 0xbd, 0xfe, 0x5b, + 0x42, 0x40, 0x23, 0xbd, 0x3d, 0x5e, 0x94, 0xfb, 0xb2, 0xfa, 0x94, 0xc7, + 0x57, 0x2e, 0xee, 0xaf, 0x78, 0xef, 0x21, 0x8a, 0x38, 0xd5, 0x6c, 0x88, + 0x37, 0xc5, 0x33, 0x8e, 0x1d, 0x02, 0x87, 0xdc, 0x16, 0x59, 0x93, 0x95, + 0x83, 0x6a, 0x36, 0x67, 0x14, 0xc2, 0x8c, 0xb9, 0xfb, 0x55, 0x84, 0xba, + 0x35, 0xc4, 0xef, 0x98, 0xd5, 0xb0, 0xcf, 0x13, 0x2b, 0xbe, 0x3b, 0x44, + 0x7e, 0x81, 0xc9, 0xeb, 0xb0, 0xa5, 0x33, 0xbb, 0x75, 0x1d, 0x61, 0x57, + 0x78, 0x9f, 0x0b, 0x5b, 0xa7, 0x18, 0x41, 0x68, 0x1b, 0x0d, 0xf6, 0x71, + 0x29, 0xfe, 0x74, 0x37, 0x03, 0xab, 0x1d, 0xc4, 0x03, 0x1e, 0x7d, 0xd6, + 0xb7, 0x4f, 0xc1, 0x87, 0xd0, 0xdb, 0xad, 0xa6, 0x76, 0x65, 0x64, 0x34, + 0x28, 0x0c, 0x67, 0xcb, 0x36, 0xc8, 0x92, 0xec, 0x70, 0xd9, 0x78, 0xab, + 0xe1, 0xa3, 0x60, 0x51, 0x08, 0x59, 0x6a, 0x8b, 0xd2, 0x98, 0x55, 0x2d, + 0xca, 0x39, 0x22, 0x0a, 0x1b, 0x04, 0x55, 0xa7, 0xec, 0x34, 0xc0, 0xb6, + 0xf3, 0xae, 0x4f, 0xe5, 0x50, 0x14, 0x95, 0x10, 0x6e, 0x7b, 0xb9, 0x1d, + 0x27, 0x78, 0x01, 0xe0, 0x7c, 0x7e, 0x86, 0x03, 0x25, 0x8e, 0x62, 0x2b, + 0x62, 0x09, 0x12, 0xad, 0x69, 0x29, 0xd9, 0x65, 0x62, 0x52, 0x15, 0xfe, + 0xab, 0x5f, 0x5e, 0xa1, 0x77, 0x09, 0x21, 0x4f, 0xb8, 0x8d, 0x61, 0x54, + 0xad, 0xa6, 0x56, 0x46, 0x67, 0x96, 0x6c, 0xb3, 0x83, 0xef, 0xca, 0x1a, + 0x09, 0x6f, 0x7b, 0x04, 0xd8, 0x78, 0x00, 0x51, 0xe8, 0x5d, 0x8b, 0x8f, + 0x85, 0x03, 0xe9, 0xdc, 0x18, 0x38, 0x11, 0x55, 0x95, 0xd4, 0x9d, 0x0e, + 0xd2, 0x52, 0xaf, 0x35, 0x9e, 0x4c, 0x64, 0xbc, 0x20, 0xf1, 0x99, 0x22, + 0xf7, 0xf9, 0x6d, 0x09, 0x00, 0x8e, 0xf4, 0xf5, 0x7a, 0x53, 0xee, 0xcb, + 0xea, 0x53, 0x1d, 0x5c, 0xbb, 0xba, 0xbd, 0xe3, 0xbc, 0x86, 0x28, 0xe3, + 0x55, 0xb2, 0x20, 0xdf, 0x14, 0xce, 0x38, 0x74, 0x0a, 0x1f, 0x70, 0x59, + 0x66, 0x4e, 0x56, 0x0d, 0xa8, 0xd9, 0x9c, 0x53, 0x0a, 0x32, 0xe7, 0xed, + 0x56, 0x12, 0xe8, 0xd7, 0x13, 0xbe, 0x63, 0x56, 0xc3, 0x3c, 0x4c, 0xae, + 0xf8, 0xed, 0x11, 0xfa, 0x07, 0x27, 0xae, 0xc2, 0x94, 0xce, 0xed, 0xd4, + 0x75, 0x85, 0x5d, 0xe2, 0x7c, 0x2d, 0x6e, 0x9c, 0x61, 0x05, 0xa0, 0x6c, + 0x37, 0xd9, 0xc4, 0xa7, 0xf9, 0xd0, 0xdc, 0x0e, 0xac, 0x77, 0x10, 0x0c, + 0x79, 0xf7, 0x5a, 0xdd, 0x3f, 0x06, 0x1f, 0x43, 0x6e, 0xb6, 0x99, 0xd9, + 0x95, 0x90, 0xd0, 0xa0, 0x31, 0x9f, 0x2c, 0xdb, 0x22, 0x4b, 0xb1, 0xc3, + 0x65, 0xe2, 0xaf, 0x86, 0x8d, 0x81, 0x44, 0x21, 0x65, 0xaa, 0x2f, 0x4a, + 0x61, 0x54, 0xb7, 0x28, 0xe4, 0x88, 0x28, 0x6c, 0x11, 0x56, 0x9f, 0xb0, + 0xd3, 0x02, 0xdb, 0xce, 0xb9, 0x3f, 0x95, 0x40, 0x52, 0x54, 0x41, 0xb9, + 0xee, 0xe4, 0x74, 0x9d, 0xe0, 0x07, 0x81, 0xf1, 0xfa, 0x18, 0x0c, 0x96, + 0x39, 0x88, 0xad, 0x88, 0x24, 0x4a, 0xb5, 0xa4, 0xa7, 0x65, 0x95, 0x89, + 0x48, 0x57, 0xfa, 0xad, 0x7d, 0x7a, 0x85, 0xdc, 0x24, 0x85, 0x3e, 0xe2, + 0x35, 0x85, 0x52, 0xb6, 0x99, 
0x59, 0x19, 0x9e, 0x59, 0xb2, 0xce, 0x0f, + 0xbf, 0x28, 0x68, 0x25, 0xbd, 0xec, 0x13, 0x61, 0xe0, 0x01, 0x47, 0xa1, + 0x76, 0x2e, 0x3e, 0x14, 0x0f, 0xa7, 0x70, 0x60, 0xe0, 0x45, 0x56, 0x57, + 0x52, 0x74, 0x3b, 0x49, 0x4a, 0xbc, 0xd6, 0x79, 0x31, 0x92, 0xf0, 0x83, + 0xc6, 0x64, 0x8b, 0xdf, 0xe5, 0xb4, 0x24, 0x02, 0x3b, 0xd3, 0xd5, 0xe9, + 0x4f, 0xbb, 0x2f, 0xa9, 0x4c, 0x75, 0x72, 0xee, 0xea, 0xf7, 0x8e, 0xf2, + 0x18, 0xa3, 0x8d, 0x56, 0xc8, 0x83, 0x7c, 0x53, 0x38, 0xe1, 0xd0, 0x28, + 0x7d, 0xc1, 0x65, 0x99, 0x39, 0x58, 0x36, 0xa3, 0x66, 0x71, 0x4c, 0x28, + 0xcb, 0x9f, 0xb5, 0x58, 0x4b, 0xa3, 0x5c, 0x4e, 0xf9, 0x8d, 0x5b, 0x0c, + 0xf1, 0x32, 0xbb, 0xe3, 0xb4, 0x47, 0xe8, 0x1c, 0x9e, 0xbb, 0x0a, 0x53, + 0x3b, 0xb7, 0x51, 0xd6, 0x15, 0x77, 0x89, 0xf0, 0xb5, 0xba, 0x71, 0x84, + 0x16, 0x81, 0xb0, 0xdf, 0x67, 0x12, 0x9f, 0xe7, 0x43, 0x70, 0x3a, 0xb1, + 0xdc, 0x40, 0x31, 0xe7, 0xdd, 0x6b, 0x74, 0xfc, 0x18, 0x7d, 0x0d, 0xba, + 0xda, 0x67, 0x66, 0x56, 0x43, 0x42, 0x80, 0xc6, 0x7c, 0xb3, 0x6c, 0x89, + 0x2e, 0xc7, 0x0d, 0x97, 0x8a, 0xbe, 0x1a, 0x36, 0x05, 0x10, 0x85, 0x96, + 0xa8, 0xbd, 0x29, 0x85, 0x52, 0xdc, 0xa3, 0x92, 0x20, 0xa1, 0xb0, 0x45, + 0x5a, 0x7e, 0xc3, 0x4c, 0x0b, 0x6f, 0x3a, 0xe4, 0xfe, 0x55, 0x01, 0x49, + 0x51, 0x06, 0xe7, 0xbb, 0x91, 0xd2, 0x77, 0x80, 0x1e, 0x07, 0xc7, 0xe8, + 0x60, 0x32, 0x58, 0xe6, 0x22, 0xb6, 0x20, 0x91, 0x2a, 0xd6, 0x92, 0x9d, + 0x96, 0x56, 0x25, 0x21, 0x5f, 0xea, 0xb5, 0xf5, 0xea, 0x17, 0x70, 0x92, + 0x14, 0xfb, 0x88, 0xd6, 0x15, 0x4a, 0xda, 0x65, 0x64, 0x66, 0x79, 0x66, + 0xcb, 0x38, 0x3e, 0xfc, 0xa1, 0xa0, 0x96, 0xf7, 0xb0, 0x4d, 0x87, 0x80, + 0x05, 0x1e, 0x85, 0xd8, 0xb8, 0xf8, 0x50, 0x3e, 0x9d, 0xc1, 0x83, 0x81, + 0x15, 0x59, 0x5d, 0x49, 0xd0, 0xed, 0x25, 0x2a, 0xf3, 0x59, 0xe4, 0xc6, + 0x4b, 0xc2, 0x0f, 0x19, 0x92, 0x2f, 0x7f, 0x96, 0xd0, 0x90, 0x08, 0xef, + 0x4f, 0x57, 0xa5, 0x3e, 0xec, 0xbe, 0xa5, 0x31, 0xd5, 0xcb, 0xbb, 0xab, + 0xde, 0x3b, 0xc8, 0x62, 0x8e, 0x35, 0x5b, 0x22, 0x0d, 0xf1, 0x4c, 0xe3, + 0x87, 0x40, 0xa1, 0xf7, 0x05, 0x96, 0x64, 0xe5, 0x60, 0xda, 0x8d, 0x99, + 0xc5, 0x30, 0xa3, 0x2e, 0x7e, 0xd5, 0x61, 0x2e, 0x8d, 0x71, 0x3b, 0xe6, + 0x35, 0x6c, 0x33, 0xc4, 0xca, 0xef, 0x8e, 0xd1, 0x1f, 0xa0, 0x72, 0x7a, + 0xec, 0x29, 0x4c, 0xee, 0xdd, 0x47, 0x58, 0x55, 0xde, 0x27, 0xc2, 0xd6, + 0xe9, 0xc6, 0x10, 0x5a, 0x06, 0xc3, 0x7d, 0x9c, 0x4a, 0x7f, 0x9d, 0x0d, + 0xc0, 0xea, 0xc7, 0x71, 0x00, 0xc7, 0x9f, 0x75, 0xad, 0xd3, 0xf0, 0x61, + 0xf4, 0x36, 0xeb, 0x69, 0x9d, 0x99, 0x59, 0x0d, 0x0a, 0x03, 0x19, 0xf2, + 0xcd, 0xb2, 0x24, 0xbb, 0x1c, 0x36, 0x5e, 0x2a, 0xf8, 0x68, 0xd8, 0x14, + 0x42, 0x16, 0x5a, 0xa2, 0xf4, 0xa6, 0x15, 0x4b, 0x72, 0x8e, 0x48, 0x82, + 0x86, 0xc1, 0x15, 0x69, 0xfb, 0x0d, 0x30, 0x2d, 0xbc, 0xeb, 0x93, 0xf9, + 0x54, 0x05, 0x25, 0x44, 0x1b, 0x9e, 0xee, 0x47, 0x49, 0xde, 0x00, 0x78, + 0x1f, 0x1f, 0xa1, 0x80, 0xc9, 0x63, 0x98, 0x8a, 0xd8, 0x82, 0x44, 0xab, + 0x5a, 0x4a, 0x76, 0x59, 0x58, 0x94, 0x85, 0x7f, 0xaa, 0xd7, 0xd7, 0xa8, + 0x5d, 0xc2, 0x48, 0x53, 0xee, 0x23, 0x58, 0x55, 0x2b, 0x69, 0x95, 0x91, + 0x99, 0xe5, 0x9b, 0x2c, 0xe0, 0xfb, 0xf2, 0x86, 0x82, 0x5b, 0xde, 0xc1, + 0x36, 0x1e, 0x00, 0x14, 0x7a, 0x17, 0x62, 0xe3, 0xe1, 0x40, 0xfa, 0x77, + 0x06, 0x0e, 0x04, 0x55, 0x65, 0x75, 0x27, 0x43, 0xb4, 0x94, 0xab, 0xcd, + 0x67, 0x93, 0x19, 0x2f, 0x08, 0x3c, 0x66, 0x48, 0xbd, 0xfe, 0x5b, 0x42, + 0x40, 0x23, 0xbd, 0x3d, 0x5e, 0x94, 0xfb, 0xb2, 0xfa, 0x94, 0xc7, 0x57, + 0x2e, 0xee, 0xaf, 0x78, 0xef, 0x21, 0x8a, 0x38, 0xd5, 0x6c, 0x88, 0x37, + 0xc5, 0x33, 0x8e, 0x1d, 0x02, 0x87, 0xdc, 0x16, 0x59, 0x93, 0x95, 0x83, + 0x6a, 0x36, 0x67, 0x14, 0xc2, 
0x8c, 0xb9, 0xfb, 0x55, 0x84, 0xba, 0x35, + 0xc4, 0xef, 0x98, 0xd5, 0xb0, 0xcf, 0x13, 0x2b, 0xbe, 0x3b, 0x44, 0x7e, + 0x81, 0xca, +}; +static_assert(sizeof(kBytesTestReadSymbol13) == kNumBytesTestReadSymbol13, ""); + +// The kBytesTestReadSymbol14[] array was encoded by using the following libaom +// code: +// +// aom_cdf_prob cdf[4][15] = { +// // pmf: 1/14, 1/14, 1/14, 1/14, 1/14, 1/14, 1/14, 1/14, 1/14, 1/14, 1/14, +// // 1/14, 1/14, 1/14 +// { 32768 - 2341, 32768 - 4681, 32768 - 7022, 32768 - 9362, 32768 - 11703, +// 32768 - 14043, 32768 - 16384, 32768 - 18725, 32768 - 21065, +// 32768 - 23406, 32768 - 25746, 32768 - 28087, 32768 - 30427, 0, 0 }, +// // pmf: 3/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, +// // 2/28, 2/28, 1/28 +// { 32768 - 3511, 32768 - 5851, 32768 - 8192, 32768 - 10533, 32768 - 12873, +// 32768 - 15214, 32768 - 17554, 32768 - 19895, 32768 - 22235, +// 32768 - 24576, 32768 - 26917, 32768 - 29257, 32768 - 31598, 0, 0 }, +// // pmf: 1/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, 2/28, +// // 2/28, 2/28, 3/28 +// { 32768 - 1170, 32768 - 3511, 32768 - 5851, 32768 - 8192, 32768 - 10533, +// 32768 - 12873, 32768 - 15214, 32768 - 17554, 32768 - 19895, +// 32768 - 22235, 32768 - 24576, 32768 - 26917, 32768 - 29257, 0, 0 }, +// // pmf: 1/28, 2/28, 2/28, 2/28, 2/28, 2/28, 3/28, 3/28, 2/28, 2/28, 2/28, +// // 2/28, 2/28, 1/28 +// { 32768 - 1170, 32768 - 3511, 32768 - 5851, 32768 - 8192, 32768 - 10533, +// 32768 - 12873, 32768 - 16384, 32768 - 19895, 32768 - 22235, +// 32768 - 24576, 32768 - 26917, 32768 - 29257, 32768 - 31598, 0, 0 }, +// }; +// constexpr int kSymbols[28][4] = { { 0, 7, 13, 6 }, // +// { 1, 8, 12, 5 }, // +// { 2, 9, 11, 4 }, // +// { 3, 10, 10, 3 }, // +// { 4, 11, 9, 2 }, // +// { 5, 12, 8, 1 }, // +// { 6, 13, 7, 0 }, // +// { 7, 0, 6, 13 }, // +// { 8, 1, 5, 12 }, // +// { 9, 2, 4, 11 }, // +// { 10, 3, 3, 10 }, // +// { 11, 4, 2, 9 }, // +// { 12, 5, 1, 8 }, // +// { 13, 6, 0, 7 }, // +// { 0, 0, 13, 11 }, // +// { 2, 1, 12, 9 }, // +// { 4, 3, 10, 7 }, // +// { 6, 5, 8, 5 }, // +// { 8, 7, 6, 3 }, // +// { 10, 9, 4, 1 }, // +// { 12, 11, 2, 12 }, // +// { 1, 0, 13, 10 }, // +// { 3, 2, 11, 8 }, // +// { 5, 4, 9, 6 }, // +// { 7, 6, 7, 4 }, // +// { 9, 8, 5, 2 }, // +// { 11, 10, 3, 7 }, // +// { 13, 12, 1, 6 } }; +// const unsigned int kBufferSize = 65536; +// uint8_t bw_buffer[kBufferSize]; +// aom_writer bw; +// bw.allow_update_cdf = 1; +// aom_start_encode(&bw, bw_buffer); +// for (int i = 0; i < 64; ++i) { +// for (int j = 0; j < 28; ++j) { +// for (int k = 0; k < 4; ++k) { +// aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 14); +// } +// } +// } +// aom_stop_encode(&bw); +// printf("constexpr size_t kNumBytes = %u;\n", bw.pos); +// printf("constexpr uint8_t kBytes[] = {"); +// int count = 0; +// for (unsigned int i = 0; i < bw.pos; ++i) { +// if (count++ % 12 == 0) { +// printf("\n "); +// } else { +// printf(" "); +// } +// printf("0x%02x,", bw_buffer[i]); +// } +// printf("\n};\n"); + +constexpr size_t kNumBytesTestReadSymbol14 = 3455; +constexpr uint8_t kBytesTestReadSymbol14[] = { + 0x0a, 0xef, 0xeb, 0xb5, 0x78, 0x91, 0x0b, 0x9d, 0xee, 0x99, 0x14, 0x9c, + 0xf4, 0x58, 0x86, 0xe8, 0x69, 0x7f, 0x06, 0x07, 0x60, 0xb0, 0x79, 0xbe, + 0xea, 0xe5, 0x69, 0x1c, 0x67, 0x7a, 0x75, 0x91, 0x2f, 0x1d, 0x49, 0x4e, + 0x15, 0x40, 0x56, 0x15, 0xa1, 0xff, 0x72, 0x2d, 0xa5, 0x40, 0x81, 0x21, + 0x3d, 0x06, 0x78, 0xd2, 0x62, 0x8a, 0xf2, 0x63, 0x50, 0x9d, 0xbd, 0xa0, + 0xd4, 0x14, 0x42, 0x76, 0x4f, 0x44, 0xbe, 0xb2, 0xa1, 0x0d, 0x4c, 0x75, 
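+// (Sketch with hypothetical names, not the actual test body: a symmetric
+// read-back of the bytes above. Assuming a reader type with an interface
+// like EntropyDecoder(data, size, allow_update_cdf) and
+// ReadSymbol(cdf, symbol_count) -- both names are assumptions here --
+// decoding with an identically initialized cdf must return kSymbols in the
+// same i/j/k order the libaom writer emitted them:)
+//
+// EntropyDecoder reader(kBytesTestReadSymbol14, kNumBytesTestReadSymbol14,
+//                       /*allow_update_cdf=*/true);
+// for (int i = 0; i < 64; ++i) {
+//   for (int j = 0; j < 28; ++j) {
+//     for (int k = 0; k < 4; ++k) {
+//       assert(reader.ReadSymbol(cdf[k], 14) == kSymbols[j][k]);
+//     }
+//   }
+// }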
+ 0xe4, 0x4a, 0xed, 0xf9, 0x7e, 0xb8, 0x7b, 0x5a, 0x26, 0x78, 0x5f, 0xe3, + 0x86, 0x72, 0x64, 0x48, 0x76, 0x51, 0x7a, 0x77, 0x3b, 0xcf, 0xa2, 0x8d, + 0x31, 0xec, 0xc1, 0xa7, 0xf9, 0x9a, 0x76, 0x00, 0x7c, 0x17, 0x40, 0x03, + 0x12, 0xe8, 0xed, 0xbf, 0x39, 0xe2, 0xdd, 0x6d, 0xdc, 0xe2, 0x34, 0xdf, + 0x0d, 0xa6, 0x86, 0x22, 0xca, 0x86, 0x5f, 0x57, 0x25, 0xc6, 0x57, 0x60, + 0xc3, 0x06, 0xe9, 0xf0, 0x06, 0xd4, 0xc0, 0xb3, 0xfc, 0x5b, 0xcd, 0xa9, + 0xc0, 0x51, 0x6e, 0x10, 0x0a, 0x5a, 0xfd, 0xbf, 0x92, 0xc8, 0x21, 0x0e, + 0x83, 0x74, 0xfe, 0x01, 0xec, 0x24, 0x61, 0x9d, 0x9e, 0xb8, 0xb2, 0x04, + 0xa7, 0xe9, 0xd6, 0xc7, 0x79, 0x5b, 0xaa, 0xdd, 0x94, 0x5d, 0x26, 0x61, + 0x0b, 0xee, 0x66, 0xf4, 0xb2, 0xd1, 0x9b, 0xf0, 0xb4, 0x9b, 0x50, 0x4c, + 0x4a, 0x57, 0xbc, 0xfe, 0x7e, 0xca, 0xfe, 0xa8, 0x22, 0x1b, 0x2f, 0x4a, + 0x26, 0x32, 0x96, 0xfd, 0x03, 0x02, 0x1b, 0x7c, 0x1d, 0x6d, 0x42, 0x48, + 0x2b, 0x11, 0x0d, 0x8f, 0x40, 0xb8, 0x15, 0xf1, 0xdd, 0x06, 0xf7, 0xa0, + 0x1f, 0x0f, 0x75, 0xb1, 0x53, 0x73, 0x1f, 0xbf, 0x97, 0xf7, 0xa0, 0xcb, + 0x5b, 0x98, 0xb7, 0x50, 0xa7, 0xc5, 0x23, 0x9b, 0x16, 0x0a, 0x2e, 0x03, + 0x68, 0x3a, 0x92, 0x75, 0xb8, 0xb0, 0xd8, 0xda, 0x2e, 0x82, 0x61, 0x3f, + 0xa0, 0x6e, 0x78, 0xe5, 0x7d, 0x14, 0xe5, 0x1f, 0x7b, 0xec, 0xb5, 0x14, + 0xb7, 0xa0, 0x72, 0xdc, 0x1a, 0x23, 0xa4, 0x5b, 0xc5, 0xc2, 0x75, 0x6a, + 0x7c, 0x36, 0xef, 0xf0, 0xd1, 0x5a, 0x34, 0x31, 0x0b, 0xae, 0x4c, 0x07, + 0xc2, 0xb7, 0xab, 0xd5, 0x67, 0xed, 0x65, 0x5e, 0xa0, 0x7e, 0x16, 0x04, + 0xc6, 0x1b, 0x74, 0x0f, 0xa9, 0x35, 0xe8, 0x71, 0x83, 0xca, 0xc3, 0x21, + 0x74, 0xf5, 0xee, 0x71, 0xd1, 0x4c, 0xa2, 0x1d, 0xce, 0x16, 0x4b, 0x9b, + 0xb0, 0x9f, 0x42, 0x08, 0x49, 0x6a, 0x82, 0x66, 0xe8, 0xb2, 0xce, 0xfd, + 0x8e, 0xdb, 0x9e, 0x9e, 0xeb, 0x4b, 0x3d, 0xbb, 0xab, 0x61, 0xe4, 0x0d, + 0x87, 0x8e, 0xe9, 0x7b, 0xe8, 0x57, 0x70, 0x8c, 0xab, 0x0c, 0x0f, 0x05, + 0x4b, 0xca, 0x6d, 0xe7, 0x94, 0x2b, 0x29, 0x28, 0xfd, 0xfa, 0x11, 0x4c, + 0x08, 0x51, 0xce, 0x45, 0x70, 0x87, 0x2b, 0xcf, 0x88, 0x80, 0x87, 0x38, + 0x80, 0x5d, 0x2e, 0x8f, 0x47, 0xd8, 0x5e, 0x75, 0x66, 0xa7, 0x86, 0x5e, + 0x98, 0xd4, 0x1b, 0x00, 0x11, 0xcf, 0x7b, 0xef, 0x8b, 0x17, 0x93, 0xe0, + 0x3a, 0x90, 0x7d, 0x0b, 0x45, 0x34, 0x2a, 0x67, 0xa4, 0x0e, 0xab, 0xc3, + 0x3b, 0x27, 0x68, 0x03, 0x4d, 0xcb, 0xd5, 0x87, 0x53, 0x37, 0xe5, 0xcc, + 0xc3, 0x73, 0x4a, 0x2c, 0x5f, 0xdc, 0x8d, 0xba, 0x6c, 0x11, 0xa0, 0x35, + 0xc6, 0xbe, 0xd9, 0xd6, 0x64, 0x2e, 0x4b, 0x85, 0xbf, 0x50, 0xdd, 0xa6, + 0xa0, 0xa4, 0x23, 0xd7, 0x82, 0xb6, 0x65, 0x4e, 0xa8, 0xd4, 0x19, 0xa1, + 0xe4, 0xc8, 0x4d, 0x69, 0x2a, 0x41, 0x4f, 0x1e, 0x46, 0xb1, 0xde, 0x64, + 0x0b, 0xf8, 0x62, 0xfe, 0x27, 0xc5, 0x2e, 0x31, 0x0f, 0x40, 0xae, 0x64, + 0x86, 0x2a, 0x36, 0x7e, 0x03, 0x01, 0x37, 0xf3, 0x36, 0x42, 0x3f, 0xaa, + 0x0b, 0xdd, 0xa9, 0x3e, 0x09, 0xe2, 0xe9, 0xea, 0x15, 0x5b, 0x0d, 0x4b, + 0xcc, 0x47, 0xa5, 0x24, 0xed, 0x0b, 0x3c, 0xb3, 0x6e, 0xc6, 0x1d, 0x47, + 0x39, 0x30, 0xe6, 0xf6, 0xc7, 0xae, 0x6b, 0x25, 0x09, 0xce, 0xf2, 0x2f, + 0xaf, 0x4d, 0x32, 0xac, 0x4f, 0xa4, 0xff, 0x39, 0x48, 0xbb, 0xe6, 0xdf, + 0x93, 0x41, 0x00, 0x2a, 0x82, 0xd9, 0x81, 0x79, 0xc4, 0x65, 0xf3, 0x62, + 0x17, 0x18, 0x37, 0xcf, 0xa0, 0xaa, 0xe5, 0xc6, 0x97, 0x84, 0x14, 0x1c, + 0x7e, 0x36, 0x72, 0xe2, 0x35, 0x84, 0x39, 0x43, 0x7b, 0xbf, 0xaf, 0x94, + 0x9a, 0xa2, 0xeb, 0xf9, 0xc4, 0x5c, 0x49, 0x5a, 0xef, 0x6b, 0xe6, 0x19, + 0x0e, 0xac, 0x08, 0x43, 0x4d, 0x5a, 0x14, 0x7e, 0x27, 0x4a, 0xd1, 0x4a, + 0x9b, 0x3f, 0xdc, 0x98, 0x5a, 0xcb, 0x40, 0x90, 0xdf, 0x56, 0xa1, 0x76, + 0x12, 0x71, 0xe1, 0x20, 0x5e, 0xf1, 0xaa, 0xd7, 0xba, 0x6c, 0xfb, 0x1d, + 
0x20, 0xfe, 0xa0, 0x41, 0x65, 0x09, 0x5f, 0x8b, 0xde, 0x20, 0xb7, 0x26, + 0xd5, 0xce, 0x83, 0x14, 0x0d, 0x28, 0x36, 0x86, 0xe1, 0x02, 0x86, 0xde, + 0xf3, 0xc6, 0x44, 0x10, 0x04, 0x84, 0x9f, 0x18, 0x9b, 0xf1, 0x0a, 0xca, + 0x41, 0x53, 0xa9, 0xa9, 0x6b, 0xa5, 0x95, 0x22, 0x1d, 0x17, 0x3b, 0xc0, + 0x5f, 0xb7, 0x5e, 0xac, 0x73, 0x4e, 0x76, 0xaf, 0x4c, 0xb4, 0x4f, 0xf6, + 0x3f, 0xa1, 0x20, 0x2e, 0xf7, 0xa8, 0x14, 0x0d, 0xc3, 0x50, 0x97, 0x25, + 0xe0, 0xc4, 0x5c, 0x3e, 0xe6, 0xbe, 0xe9, 0xa4, 0x1e, 0x1d, 0xdb, 0x06, + 0xc1, 0x15, 0xf2, 0x6d, 0xbf, 0x71, 0xf2, 0x0b, 0xd9, 0x75, 0x4b, 0x38, + 0xf5, 0xe2, 0x69, 0x0d, 0x93, 0xa5, 0x8e, 0x4c, 0xc5, 0x2a, 0xb6, 0x45, + 0x60, 0x77, 0xd6, 0x14, 0x39, 0x5e, 0x70, 0x9e, 0x8d, 0x07, 0x20, 0x1c, + 0x05, 0xc9, 0xb0, 0x46, 0xf7, 0x6c, 0x3e, 0xf8, 0xf8, 0x0a, 0xad, 0x0b, + 0x22, 0x5e, 0x32, 0xbd, 0x46, 0xbc, 0x06, 0x7b, 0x92, 0x36, 0x5a, 0x2b, + 0xac, 0x68, 0x2d, 0x5a, 0xf4, 0xc2, 0x61, 0xe3, 0x9d, 0xf4, 0x5d, 0x59, + 0x59, 0x98, 0xb7, 0x5a, 0x73, 0x08, 0xf6, 0x4f, 0x0a, 0x75, 0x04, 0x93, + 0xc1, 0xe1, 0x9b, 0xe0, 0xb0, 0x2a, 0xf7, 0xdd, 0x8b, 0xae, 0xf5, 0x55, + 0x28, 0x6b, 0x21, 0x9b, 0x02, 0x43, 0xbd, 0x36, 0x4d, 0xa5, 0x17, 0xbb, + 0x97, 0xd4, 0x78, 0x1f, 0xe8, 0xd9, 0x98, 0x0e, 0x41, 0x96, 0x52, 0xab, + 0xad, 0x91, 0x92, 0xae, 0x62, 0x5c, 0xe7, 0xeb, 0x24, 0x1b, 0xe8, 0x2a, + 0xb2, 0xe8, 0xdc, 0x34, 0x7f, 0xe9, 0xa1, 0x4c, 0x4c, 0x13, 0xeb, 0x31, + 0x29, 0xc3, 0xc4, 0xf5, 0xb4, 0x50, 0xb1, 0x8b, 0x08, 0xc3, 0x30, 0xf8, + 0x40, 0xd8, 0x76, 0xd5, 0x4d, 0xf0, 0xc2, 0xd8, 0x67, 0x75, 0x01, 0x81, + 0x2a, 0xe0, 0x6b, 0xc0, 0xf5, 0x30, 0x55, 0xb6, 0xa9, 0x52, 0x19, 0xc4, + 0x73, 0x78, 0xc4, 0x9e, 0x13, 0x5f, 0xa7, 0x56, 0xb4, 0x07, 0x2c, 0x92, + 0x85, 0x66, 0x5d, 0x00, 0x47, 0x32, 0x3c, 0x8b, 0xbf, 0x86, 0x9e, 0xe2, + 0xfd, 0xf1, 0xf0, 0x15, 0x5a, 0x16, 0x44, 0xbc, 0x65, 0x7a, 0x8d, 0x78, + 0x0c, 0xf9, 0x94, 0x1d, 0x83, 0x7c, 0xee, 0xc7, 0x71, 0x23, 0x42, 0x2d, + 0xb3, 0xe4, 0x68, 0x31, 0xec, 0x17, 0x63, 0x27, 0xe3, 0x52, 0x9d, 0xd0, + 0xcd, 0xd8, 0xd8, 0x86, 0xb4, 0x91, 0x8a, 0xa3, 0xcb, 0xa3, 0x76, 0xc7, + 0x98, 0xda, 0xd6, 0xb8, 0x34, 0x1c, 0xf6, 0x72, 0x23, 0xd8, 0x1b, 0xbe, + 0x2d, 0x05, 0xe1, 0x83, 0x01, 0x74, 0xc7, 0xe3, 0x54, 0x85, 0xec, 0xec, + 0xfb, 0x3a, 0xa2, 0xf3, 0x21, 0x7a, 0x0b, 0x68, 0x91, 0x02, 0xd2, 0xa4, + 0x40, 0x21, 0xef, 0x4f, 0xe5, 0x3d, 0x6d, 0x6e, 0xfb, 0xba, 0xb1, 0x90, + 0x4f, 0x81, 0x07, 0x27, 0x5e, 0xa8, 0xab, 0xa8, 0x87, 0x38, 0x3c, 0xe5, + 0x48, 0x29, 0x9e, 0x77, 0x4c, 0xb4, 0x9d, 0x91, 0x2d, 0x8a, 0x0a, 0x84, + 0xdd, 0x93, 0x95, 0xdf, 0xd4, 0xa3, 0x8f, 0xb7, 0xaf, 0x07, 0xd3, 0x81, + 0xbb, 0x0d, 0x89, 0x42, 0x92, 0x0b, 0x66, 0x39, 0x8b, 0x99, 0x36, 0x61, + 0xbb, 0xe1, 0x05, 0xca, 0x68, 0xc8, 0x0f, 0xae, 0x9e, 0x7d, 0x75, 0x7f, + 0x24, 0xef, 0xdc, 0x97, 0x8d, 0xb9, 0xa5, 0x7a, 0x3c, 0xc4, 0x49, 0x79, + 0x47, 0x47, 0x61, 0x88, 0xaf, 0x96, 0x08, 0x11, 0x22, 0xff, 0xb7, 0x14, + 0x12, 0x15, 0x14, 0x26, 0xa3, 0x03, 0x0e, 0xb2, 0xff, 0x57, 0x9e, 0xc0, + 0x92, 0x4f, 0x4c, 0x69, 0xd4, 0xfe, 0xc1, 0x46, 0xc4, 0xe8, 0x64, 0x7f, + 0x08, 0x38, 0x90, 0x15, 0x8f, 0xc2, 0xc8, 0xa8, 0x50, 0x7f, 0x74, 0x4a, + 0xc3, 0x37, 0x52, 0x44, 0x25, 0x78, 0x19, 0x48, 0x00, 0xd1, 0x39, 0x43, + 0x3a, 0x14, 0x72, 0x8c, 0x8e, 0xa2, 0xf8, 0x95, 0x1e, 0x56, 0x07, 0xdd, + 0xcd, 0x89, 0xde, 0x71, 0xc3, 0x85, 0xc3, 0xcf, 0xe4, 0x6c, 0xf4, 0x43, + 0x95, 0x49, 0x27, 0x25, 0x35, 0x1a, 0xb9, 0xf7, 0xc8, 0x20, 0xeb, 0x01, + 0xbb, 0x49, 0x8d, 0xf4, 0xc0, 0x32, 0xbe, 0x74, 0x42, 0x07, 0x53, 0xd0, + 0xf4, 0x4c, 0x79, 0xa8, 0xb7, 0xf9, 0x09, 0xfd, 0xeb, 0x02, 0x83, 0x26, + 
0x3b, 0x88, 0x1a, 0x41, 0x70, 0x95, 0x2f, 0x53, 0xc1, 0xc1, 0xa5, 0xbe, + 0x23, 0x32, 0x8b, 0x48, 0xb8, 0xff, 0x4c, 0x6b, 0x6e, 0xbf, 0xd7, 0xe0, + 0xf1, 0x3a, 0xfd, 0xd2, 0x1e, 0xa2, 0x11, 0x50, 0xa0, 0xfe, 0xd2, 0x3d, + 0x20, 0xa6, 0x79, 0xdd, 0x32, 0xd2, 0x76, 0x44, 0xb6, 0x28, 0x2a, 0x13, + 0x76, 0x4e, 0x57, 0x92, 0xa5, 0x01, 0x64, 0x30, 0x06, 0xf1, 0xba, 0x62, + 0x5a, 0x59, 0xab, 0xf2, 0x15, 0xef, 0x3c, 0x24, 0x96, 0x14, 0x6f, 0xd4, + 0x51, 0xee, 0x6d, 0xeb, 0x77, 0xad, 0xba, 0x03, 0xe0, 0xd2, 0x30, 0xbd, + 0xbf, 0x06, 0x14, 0xa3, 0xad, 0xd7, 0x97, 0x20, 0x89, 0x63, 0x8f, 0x84, + 0x0d, 0x87, 0x6d, 0x5b, 0xdf, 0x0c, 0x2d, 0x86, 0x77, 0x6b, 0x73, 0xd6, + 0x34, 0x83, 0xe5, 0x15, 0x88, 0x3e, 0xbc, 0x4d, 0x2c, 0x96, 0xd1, 0x1a, + 0x81, 0xf1, 0xb4, 0x6c, 0xaa, 0x52, 0x3a, 0x53, 0x52, 0xc6, 0x73, 0x1b, + 0xe6, 0xaa, 0xd5, 0xc8, 0x91, 0xee, 0x72, 0xad, 0x66, 0x25, 0x61, 0xbd, + 0xa7, 0x15, 0x46, 0x5d, 0x76, 0x4a, 0x47, 0x9b, 0x03, 0x44, 0xe5, 0x0c, + 0xe8, 0x51, 0xca, 0x32, 0x3a, 0x8b, 0xe2, 0x54, 0x79, 0x4d, 0x51, 0x4e, + 0xbb, 0x44, 0x2c, 0x30, 0xd1, 0xe6, 0xa1, 0xc9, 0x2c, 0x28, 0xdf, 0xa8, + 0xa3, 0xdc, 0xdb, 0xd6, 0xef, 0x5b, 0x74, 0x07, 0xc1, 0xa4, 0x55, 0x37, + 0xc6, 0xfc, 0xde, 0xf2, 0x35, 0xb3, 0xf2, 0x3f, 0xe8, 0x0c, 0xbe, 0x60, + 0x72, 0x56, 0xde, 0x5f, 0x0d, 0xdd, 0x2e, 0x67, 0x63, 0x31, 0x23, 0xbc, + 0xbe, 0x8d, 0x47, 0xdd, 0xa0, 0x38, 0xab, 0x04, 0xd7, 0xb7, 0x07, 0xf9, + 0x5d, 0x5e, 0x27, 0xd0, 0x6e, 0xda, 0x01, 0xda, 0x8b, 0x3d, 0xe9, 0x89, + 0xe4, 0xbb, 0xeb, 0x3d, 0xd2, 0xb1, 0x16, 0x16, 0xe6, 0x49, 0xb6, 0x28, + 0x02, 0xc3, 0xd0, 0x57, 0x17, 0x4f, 0x2a, 0x9b, 0x42, 0x74, 0x1d, 0x38, + 0xc4, 0x19, 0xdd, 0xad, 0xcf, 0x58, 0xd2, 0x0f, 0x94, 0x56, 0x20, 0xfa, + 0xf1, 0x34, 0xb2, 0x5b, 0x44, 0x6a, 0x07, 0xc6, 0xd1, 0xb2, 0xa9, 0x48, + 0xe9, 0x4d, 0x4b, 0x19, 0xcc, 0x6f, 0x9a, 0xab, 0x57, 0x22, 0x47, 0xb9, + 0xca, 0xb5, 0x98, 0x88, 0x58, 0x15, 0xe1, 0x37, 0x7b, 0x18, 0xdc, 0xea, + 0x45, 0xad, 0xc7, 0xc3, 0xb4, 0xeb, 0xcb, 0x85, 0x2c, 0x31, 0xa6, 0x5e, + 0x6a, 0x9d, 0xb6, 0x45, 0x19, 0x42, 0x5a, 0x2d, 0xe7, 0x15, 0x99, 0x8d, + 0xe5, 0x5b, 0x09, 0x52, 0x8e, 0x4d, 0xf1, 0xec, 0xb3, 0xb1, 0xf5, 0xfe, + 0x79, 0xb0, 0x4a, 0x4f, 0xb6, 0xbe, 0x18, 0x84, 0xe6, 0xaa, 0xb0, 0xe5, + 0x76, 0x3c, 0x35, 0x51, 0xd2, 0xa6, 0xf3, 0xfb, 0xe3, 0x1b, 0xf5, 0xc4, + 0x4f, 0x56, 0x3a, 0xc7, 0x41, 0x8d, 0xd7, 0x9e, 0x1e, 0xc9, 0x9c, 0xd8, + 0xd4, 0xe3, 0x4f, 0xb5, 0xfd, 0x78, 0x5e, 0x60, 0xff, 0xd3, 0xdc, 0x00, + 0xd6, 0x02, 0xba, 0x09, 0x8b, 0x93, 0xc9, 0xb4, 0x8e, 0x4e, 0x21, 0x27, + 0x5e, 0x89, 0x6c, 0x31, 0x79, 0xfc, 0xf0, 0xd8, 0xac, 0x48, 0x52, 0x7d, + 0xae, 0xc8, 0x4b, 0xef, 0x06, 0xde, 0xa4, 0xd3, 0x01, 0x46, 0xb2, 0xd6, + 0x28, 0x45, 0xd9, 0xcb, 0x63, 0x32, 0x19, 0x3e, 0xbf, 0x13, 0x99, 0x7f, + 0xdd, 0x0b, 0x25, 0x72, 0x57, 0x7a, 0x89, 0x68, 0xa4, 0xde, 0x98, 0xfc, + 0xa8, 0xbc, 0xf2, 0xc1, 0x82, 0x28, 0x59, 0xf7, 0x6b, 0x83, 0x60, 0x57, + 0x84, 0xdd, 0xec, 0x63, 0x73, 0xa9, 0x16, 0xb7, 0x1f, 0x0e, 0xd3, 0xaf, + 0x2e, 0x14, 0xb0, 0xc6, 0x99, 0x79, 0xaa, 0x76, 0xd9, 0x14, 0x65, 0x09, + 0x68, 0xb7, 0x9c, 0x56, 0x66, 0x37, 0x95, 0x6c, 0x25, 0x4a, 0x39, 0x37, + 0xc7, 0xb2, 0xce, 0xc7, 0xd7, 0xf9, 0xe6, 0xc1, 0x29, 0x3e, 0xda, 0xf8, + 0x62, 0x13, 0x9a, 0xaa, 0xc3, 0x95, 0xd8, 0xf0, 0xd5, 0x47, 0x4a, 0x9b, + 0xcf, 0xef, 0x8c, 0x6f, 0xd7, 0x11, 0x3d, 0x58, 0xeb, 0x1d, 0x06, 0x37, + 0x5e, 0x78, 0x7b, 0x26, 0x73, 0x63, 0x53, 0x8d, 0x3e, 0xd7, 0xf5, 0xe1, + 0x79, 0x83, 0xff, 0x4f, 0x70, 0x03, 0x58, 0x0a, 0xe8, 0x26, 0x2e, 0x4f, + 0x26, 0xd2, 0x39, 0x38, 0x84, 0x9d, 0x7a, 0x25, 0xb0, 0xc5, 0xe7, 0xf3, + 
0xc3, 0x62, 0xb1, 0x21, 0x49, 0xf6, 0xbb, 0x21, 0x2f, 0xbc, 0x1b, 0x7a, + 0x93, 0x4c, 0x05, 0x1a, 0xcb, 0x58, 0xa1, 0x17, 0x67, 0x2d, 0x8c, 0xc8, + 0x64, 0xfa, 0xfc, 0x4e, 0x65, 0xff, 0x74, 0x2c, 0x95, 0xc9, 0x5d, 0xea, + 0x25, 0xa2, 0x93, 0x7a, 0x63, 0xf2, 0xa2, 0xf3, 0xcb, 0x06, 0x08, 0xa1, + 0x67, 0xdd, 0xae, 0x0d, 0x81, 0x5e, 0x13, 0x77, 0xb1, 0x8d, 0xce, 0xa4, + 0x5a, 0xdc, 0x7c, 0x3b, 0x4e, 0xbc, 0xb8, 0x52, 0xc3, 0x1a, 0x65, 0xe6, + 0xa9, 0xdb, 0x64, 0x51, 0x94, 0x25, 0xa2, 0xde, 0x71, 0x59, 0x98, 0xde, + 0x55, 0xb0, 0x95, 0x28, 0xe4, 0xdf, 0x1e, 0xcb, 0x3b, 0x1f, 0x5f, 0xe7, + 0x9b, 0x04, 0xa4, 0xfb, 0x6b, 0xe1, 0x88, 0x4e, 0x6a, 0xab, 0x0e, 0x57, + 0x63, 0xc3, 0x55, 0x1d, 0x2a, 0x6f, 0x3f, 0xbe, 0x31, 0xbf, 0x5c, 0x44, + 0xf5, 0x63, 0xac, 0x74, 0x18, 0xdd, 0x79, 0xe1, 0xec, 0x99, 0xcd, 0x8d, + 0x4e, 0x34, 0xfb, 0x5f, 0xd7, 0x85, 0xe6, 0x0f, 0xfd, 0x3d, 0xc0, 0x0d, + 0x60, 0x2b, 0xa0, 0x98, 0xb9, 0x3c, 0x9b, 0x48, 0xe4, 0xe2, 0x12, 0x75, + 0xe8, 0x96, 0xc3, 0x17, 0x9f, 0xcf, 0x0d, 0x8a, 0xc4, 0x85, 0x27, 0xda, + 0xec, 0x84, 0xbe, 0xf0, 0x6d, 0xea, 0x4d, 0x30, 0x14, 0x6b, 0x2d, 0x62, + 0x84, 0x5d, 0x9c, 0xb6, 0x33, 0x21, 0x93, 0xeb, 0xf1, 0x39, 0x97, 0xfd, + 0xd0, 0xb2, 0x57, 0x25, 0x77, 0xa8, 0x96, 0x8a, 0x4d, 0xe9, 0x8f, 0xca, + 0x8b, 0xcf, 0x2c, 0x18, 0x22, 0x85, 0x9f, 0x76, 0xb8, 0x36, 0x05, 0x78, + 0x4d, 0xde, 0xc6, 0x37, 0x3a, 0x91, 0x6b, 0x71, 0xf0, 0xed, 0x3a, 0xf2, + 0xe1, 0x4b, 0x0c, 0x69, 0x97, 0x9a, 0xa7, 0x6d, 0x91, 0x46, 0x50, 0x96, + 0x8b, 0x79, 0xc5, 0x66, 0x63, 0x79, 0x56, 0xc2, 0x54, 0xa3, 0x93, 0x7c, + 0x7b, 0x2c, 0xec, 0x7d, 0x7f, 0x9e, 0x6c, 0x12, 0x93, 0xed, 0xaf, 0x86, + 0x21, 0x39, 0xaa, 0xac, 0x39, 0x5d, 0x8f, 0x0d, 0x54, 0x74, 0xa9, 0xbc, + 0xfe, 0xf8, 0xc6, 0xfd, 0x71, 0x13, 0xd5, 0x8e, 0xb1, 0xd0, 0x63, 0x75, + 0xe7, 0x87, 0xb2, 0x67, 0x36, 0x35, 0x38, 0xd3, 0xed, 0x7f, 0x5e, 0x17, + 0x98, 0x3f, 0xf4, 0xf7, 0x00, 0x35, 0x80, 0xae, 0x82, 0x62, 0xe4, 0xf2, + 0x6d, 0x23, 0x93, 0x88, 0x49, 0xd7, 0xa2, 0x5b, 0x0c, 0x5e, 0x7f, 0x3c, + 0x36, 0x2b, 0x12, 0x14, 0x9f, 0x6b, 0xb2, 0x12, 0xfb, 0xc1, 0xb7, 0xa9, + 0x34, 0xc0, 0x51, 0xac, 0xb5, 0x8a, 0x11, 0x76, 0x72, 0xd8, 0xcc, 0x86, + 0x4f, 0xaf, 0xc4, 0xe6, 0x5f, 0xf7, 0x42, 0xc9, 0x5c, 0x95, 0xde, 0xa2, + 0x5a, 0x29, 0x37, 0xa6, 0x3f, 0x2a, 0x2f, 0x3c, 0xb0, 0x60, 0x8a, 0x16, + 0x7d, 0xda, 0xe0, 0xd8, 0x15, 0xe1, 0x37, 0x7b, 0x18, 0xdc, 0xea, 0x45, + 0xad, 0xc7, 0xc3, 0xb4, 0xeb, 0xcb, 0x85, 0x2c, 0x31, 0xa6, 0x5e, 0x6a, + 0x9d, 0xb6, 0x45, 0x19, 0x42, 0x5a, 0x2d, 0xe7, 0x15, 0x99, 0x8d, 0xe5, + 0x5b, 0x09, 0x52, 0x8e, 0x4d, 0xf1, 0xec, 0xb3, 0xb1, 0xf5, 0xfe, 0x79, + 0xb0, 0x4a, 0x4f, 0xb6, 0xbe, 0x18, 0x84, 0xe6, 0xaa, 0xb0, 0xe5, 0x76, + 0x3c, 0x35, 0x51, 0xd2, 0xa6, 0xf3, 0xfb, 0xe3, 0x1b, 0xf5, 0xc4, 0x4f, + 0x56, 0x3a, 0xc7, 0x41, 0x8d, 0xd7, 0x9e, 0x1e, 0xc9, 0x9c, 0xd8, 0xd4, + 0xe3, 0x4f, 0xb5, 0xfd, 0x78, 0x5e, 0x60, 0xff, 0xd3, 0xdc, 0x00, 0xd6, + 0x02, 0xba, 0x09, 0x8b, 0x93, 0xc9, 0xb4, 0x8e, 0x4e, 0x21, 0x27, 0x5e, + 0x89, 0x6c, 0x31, 0x79, 0xfc, 0xf0, 0xd8, 0xac, 0x48, 0x52, 0x7d, 0xae, + 0xc8, 0x4b, 0xef, 0x06, 0xde, 0xa4, 0xd3, 0x01, 0x46, 0xb2, 0xd6, 0x28, + 0x45, 0xd9, 0xcb, 0x63, 0x32, 0x19, 0x3e, 0xbf, 0x13, 0x99, 0x7f, 0xdd, + 0x0b, 0x25, 0x72, 0x57, 0x7a, 0x89, 0x68, 0xa4, 0xde, 0x98, 0xfc, 0xa8, + 0xbc, 0xf2, 0xc1, 0x82, 0x28, 0x59, 0xf7, 0x6b, 0x83, 0x60, 0x57, 0x84, + 0xdd, 0xec, 0x63, 0x73, 0xa9, 0x16, 0xb7, 0x1f, 0x0e, 0xd3, 0xaf, 0x2e, + 0x14, 0xb0, 0xc6, 0x99, 0x79, 0xaa, 0x76, 0xd9, 0x14, 0x65, 0x09, 0x68, + 0xb7, 0x9c, 0x56, 0x66, 0x37, 0x95, 0x6c, 0x25, 0x4a, 0x39, 0x37, 0xc7, + 
0xb2, 0xce, 0xc7, 0xd7, 0xf9, 0xe6, 0xc1, 0x29, 0x3e, 0xda, 0xf8, 0x62, + 0x13, 0x9a, 0xaa, 0xc3, 0x95, 0xd8, 0xf0, 0xd5, 0x47, 0x4a, 0x9b, 0xcf, + 0xef, 0x8c, 0x6f, 0xd7, 0x11, 0x3d, 0x58, 0xeb, 0x1d, 0x06, 0x37, 0x5e, + 0x78, 0x7b, 0x26, 0x73, 0x63, 0x53, 0x8d, 0x3e, 0xd7, 0xf5, 0xe1, 0x79, + 0x83, 0xff, 0x4f, 0x70, 0x03, 0x58, 0x0a, 0xe8, 0x26, 0x2e, 0x4f, 0x26, + 0xd2, 0x39, 0x38, 0x84, 0x9d, 0x7a, 0x25, 0xb0, 0xc5, 0xe7, 0xf3, 0xc3, + 0x62, 0xb1, 0x21, 0x49, 0xf6, 0xbb, 0x21, 0x2f, 0xbc, 0x1b, 0x7a, 0x93, + 0x4c, 0x05, 0x1a, 0xcb, 0x58, 0xa1, 0x17, 0x67, 0x2d, 0x8c, 0xc8, 0x64, + 0xfa, 0xfc, 0x4e, 0x65, 0xff, 0x74, 0x2c, 0x95, 0xc9, 0x5d, 0xea, 0x25, + 0xa2, 0x93, 0x7a, 0x63, 0xf2, 0xa2, 0xf3, 0xcb, 0x06, 0x08, 0xa1, 0x67, + 0xdd, 0xae, 0x0d, 0x81, 0x5e, 0x13, 0x77, 0xb1, 0x8d, 0xce, 0xa4, 0x5a, + 0xdc, 0x7c, 0x3b, 0x4e, 0xbc, 0xb8, 0x52, 0xc3, 0x1a, 0x65, 0xe6, 0xa9, + 0xdb, 0x64, 0x51, 0x94, 0x25, 0xa2, 0xde, 0x71, 0x59, 0x98, 0xde, 0x55, + 0xb0, 0x95, 0x28, 0xe4, 0xdf, 0x1e, 0xcb, 0x3b, 0x1f, 0x5f, 0xe7, 0x9b, + 0x04, 0xa4, 0xfb, 0x6b, 0xe1, 0x88, 0x4e, 0x6a, 0xab, 0x0e, 0x57, 0x63, + 0xc3, 0x55, 0x1d, 0x2a, 0x6f, 0x3f, 0xbe, 0x31, 0xbf, 0x5c, 0x44, 0xf5, + 0x63, 0xac, 0x74, 0x18, 0xdd, 0x79, 0xe1, 0xec, 0x99, 0xcd, 0x8d, 0x4e, + 0x34, 0xfb, 0x5f, 0xd7, 0x85, 0xe6, 0x0f, 0xfd, 0x3d, 0xc0, 0x0d, 0x60, + 0x2b, 0xa0, 0x98, 0xb9, 0x3c, 0x9b, 0x48, 0xe4, 0xe2, 0x12, 0x75, 0xe8, + 0x96, 0xc3, 0x17, 0x9f, 0xcf, 0x0d, 0x8a, 0xc4, 0x85, 0x27, 0xda, 0xec, + 0x84, 0xbe, 0xf0, 0x6d, 0xea, 0x4d, 0x30, 0x14, 0x6b, 0x2d, 0x62, 0x84, + 0x5d, 0x9c, 0xb6, 0x33, 0x21, 0x93, 0xeb, 0xf1, 0x39, 0x97, 0xfd, 0xd0, + 0xb2, 0x57, 0x25, 0x77, 0xa8, 0x96, 0x8a, 0x4d, 0xe9, 0x8f, 0xca, 0x8b, + 0xcf, 0x2c, 0x18, 0x22, 0x85, 0x9f, 0x76, 0xb8, 0x36, 0x05, 0x78, 0x4d, + 0xde, 0xc6, 0x37, 0x3a, 0x91, 0x6b, 0x71, 0xf0, 0xed, 0x3a, 0xf2, 0xe1, + 0x4b, 0x0c, 0x69, 0x97, 0x9a, 0xa7, 0x6d, 0x91, 0x46, 0x50, 0x96, 0x8b, + 0x79, 0xc5, 0x66, 0x63, 0x79, 0x56, 0xc2, 0x54, 0xa3, 0x93, 0x7c, 0x7b, + 0x2c, 0xec, 0x7d, 0x7f, 0x9e, 0x6c, 0x12, 0x93, 0xed, 0xaf, 0x86, 0x21, + 0x39, 0xaa, 0xac, 0x39, 0x5d, 0x8f, 0x0d, 0x54, 0x74, 0xa9, 0xbc, 0xfe, + 0xf8, 0xc6, 0xfd, 0x71, 0x13, 0xd5, 0x8e, 0xb1, 0xd0, 0x63, 0x75, 0xe7, + 0x87, 0xb2, 0x67, 0x36, 0x35, 0x38, 0xd3, 0xed, 0x7f, 0x5e, 0x17, 0x98, + 0x3f, 0xf4, 0xf7, 0x00, 0x35, 0x80, 0xae, 0x82, 0x62, 0xe4, 0xf2, 0x6d, + 0x23, 0x93, 0x88, 0x49, 0xd7, 0xa2, 0x5b, 0x0c, 0x5e, 0x7f, 0x3c, 0x36, + 0x2b, 0x12, 0x14, 0x9f, 0x6b, 0xb2, 0x12, 0xfb, 0xc1, 0xb7, 0xa9, 0x34, + 0xc0, 0x51, 0xac, 0xb5, 0x8a, 0x11, 0x76, 0x72, 0xd8, 0xcc, 0x86, 0x4f, + 0xaf, 0xc4, 0xe6, 0x5f, 0xf7, 0x42, 0xc9, 0x5c, 0x95, 0xde, 0xa2, 0x5a, + 0x29, 0x37, 0xa6, 0x3f, 0x2a, 0x2f, 0x3c, 0xb0, 0x60, 0x8a, 0x16, 0x7d, + 0xda, 0xe0, 0xd8, 0x15, 0xe1, 0x37, 0x7b, 0x18, 0xdc, 0xea, 0x45, 0xad, + 0xc7, 0xc3, 0xb4, 0xeb, 0xcb, 0x85, 0x2c, 0x31, 0xa6, 0x5e, 0x6a, 0x9d, + 0xb6, 0x45, 0x19, 0x42, 0x5a, 0x2d, 0xe7, 0x15, 0x99, 0x8d, 0xe5, 0x5b, + 0x09, 0x52, 0x8e, 0x4d, 0xf1, 0xec, 0xb3, 0xb1, 0xf5, 0xfe, 0x79, 0xb0, + 0x4a, 0x4f, 0xb6, 0xbe, 0x18, 0x84, 0xe6, 0xaa, 0xb0, 0xe5, 0x76, 0x3c, + 0x35, 0x51, 0xd2, 0xa6, 0xf3, 0xfb, 0xe3, 0x1b, 0xf5, 0xc4, 0x4f, 0x56, + 0x3a, 0xc7, 0x41, 0x8d, 0xd7, 0x9e, 0x1e, 0xc9, 0x9c, 0xd8, 0xd4, 0xe3, + 0x4f, 0xb5, 0xfd, 0x78, 0x5e, 0x60, 0xff, 0xd3, 0xdc, 0x00, 0xd6, 0x02, + 0xba, 0x09, 0x8b, 0x93, 0xc9, 0xb4, 0x8e, 0x4e, 0x21, 0x27, 0x5e, 0x89, + 0x6c, 0x31, 0x79, 0xfc, 0xf0, 0xd8, 0xac, 0x48, 0x52, 0x7d, 0xae, 0xc8, + 0x4b, 0xef, 0x06, 0xde, 0xa4, 0xd3, 0x01, 0x46, 0xb2, 0xd6, 0x28, 0x45, + 
0xd9, 0xcb, 0x63, 0x32, 0x19, 0x3e, 0xbf, 0x13, 0x99, 0x7f, 0xdd, 0x0b, + 0x25, 0x72, 0x57, 0x7a, 0x89, 0x68, 0xa4, 0xde, 0x98, 0xfc, 0xa8, 0xbc, + 0xf2, 0xc1, 0x82, 0x28, 0x59, 0xf7, 0x6b, 0x83, 0x60, 0x57, 0x84, 0xdd, + 0xec, 0x63, 0x73, 0xa9, 0x16, 0xb7, 0x1f, 0x0e, 0xd3, 0xaf, 0x2e, 0x14, + 0xb0, 0xc6, 0x99, 0x79, 0xaa, 0x76, 0xd9, 0x14, 0x65, 0x09, 0x68, 0xb7, + 0x9c, 0x56, 0x66, 0x37, 0x95, 0x6c, 0x25, 0x4a, 0x39, 0x37, 0xc7, 0xb2, + 0xce, 0xc7, 0xd7, 0xf9, 0xe6, 0xc1, 0x29, 0x3e, 0xda, 0xf8, 0x62, 0x13, + 0x9a, 0xaa, 0xc3, 0x95, 0xd8, 0xf0, 0xd5, 0x47, 0x4a, 0x9b, 0xcf, 0xef, + 0x8c, 0x6f, 0xd7, 0x11, 0x3d, 0x58, 0xeb, 0x1d, 0x06, 0x37, 0x5e, 0x78, + 0x7b, 0x26, 0x73, 0x63, 0x53, 0x8d, 0x3e, 0xd7, 0xf5, 0xe1, 0x79, 0x83, + 0xff, 0x4f, 0x70, 0x03, 0x58, 0x0a, 0xe8, 0x26, 0x2e, 0x4f, 0x26, 0xd2, + 0x39, 0x38, 0x84, 0x9d, 0x7a, 0x25, 0xb0, 0xc5, 0xe7, 0xf3, 0xc3, 0x62, + 0xb1, 0x21, 0x49, 0xf6, 0xbb, 0x21, 0x2f, 0xbc, 0x1b, 0x7a, 0x93, 0x4c, + 0x05, 0x1a, 0xcb, 0x58, 0xa1, 0x17, 0x67, 0x2d, 0x8c, 0xc8, 0x64, 0xfa, + 0xfc, 0x4e, 0x65, 0xff, 0x74, 0x2c, 0x95, 0xc9, 0x5d, 0xea, 0x25, 0xa2, + 0x93, 0x7a, 0x63, 0xf2, 0xa2, 0xf3, 0xcb, 0x06, 0x08, 0xa1, 0x67, 0xdd, + 0xae, 0x0d, 0x81, 0x5e, 0x13, 0x77, 0xb1, 0x8d, 0xce, 0xa4, 0x5a, 0xdc, + 0x7c, 0x3b, 0x4e, 0xbc, 0xb8, 0x52, 0xc3, 0x1a, 0x65, 0xe6, 0xa9, 0xdb, + 0x64, 0x51, 0x94, 0x25, 0xa2, 0xde, 0x71, 0x59, 0x98, 0xde, 0x55, 0xb0, + 0x95, 0x28, 0xe4, 0xdf, 0x1e, 0xcb, 0x3b, 0x1f, 0x5f, 0xe7, 0x9b, 0x04, + 0xa4, 0xfb, 0x6b, 0xe1, 0x88, 0x4e, 0x6a, 0xab, 0x0e, 0x57, 0x63, 0xc3, + 0x55, 0x1d, 0x2a, 0x6f, 0x3f, 0xbe, 0x31, 0xbf, 0x5c, 0x44, 0xf5, 0x63, + 0xac, 0x74, 0x18, 0xdd, 0x79, 0xe1, 0xec, 0x99, 0xcd, 0x8d, 0x4e, 0x34, + 0xfb, 0x5f, 0xd7, 0x85, 0xe6, 0x0f, 0xfd, 0x3d, 0xc0, 0x0d, 0x60, 0x2b, + 0xa0, 0x98, 0xb9, 0x3c, 0x9b, 0x48, 0xe4, 0xe2, 0x12, 0x75, 0xe8, 0x96, + 0xc3, 0x17, 0x9f, 0xcf, 0x0d, 0x8a, 0xc4, 0x85, 0x27, 0xda, 0xec, 0x84, + 0xbe, 0xf0, 0x6d, 0xea, 0x4d, 0x30, 0x14, 0x6b, 0x2d, 0x62, 0x84, 0x5d, + 0x9c, 0xb6, 0x33, 0x21, 0x93, 0xeb, 0xf1, 0x39, 0x97, 0xfd, 0xd0, 0xb2, + 0x57, 0x25, 0x77, 0xa8, 0x96, 0x8a, 0x4d, 0xe9, 0x8f, 0xca, 0x8b, 0xcf, + 0x2c, 0x18, 0x22, 0x85, 0x9f, 0x76, 0xb8, 0x36, 0x05, 0x78, 0x4d, 0xde, + 0xc6, 0x37, 0x3a, 0x91, 0x6b, 0x71, 0xf0, 0xed, 0x3a, 0xf2, 0xe1, 0x4b, + 0x0c, 0x69, 0x97, 0x9a, 0xa7, 0x6d, 0x91, 0x46, 0x50, 0x96, 0x8b, 0x79, + 0xc5, 0x66, 0x63, 0x79, 0x56, 0xc2, 0x54, 0xa3, 0x93, 0x7c, 0x7b, 0x2c, + 0xec, 0x7d, 0x7f, 0x9e, 0x6c, 0x12, 0x93, 0xed, 0xaf, 0x86, 0x21, 0x39, + 0xaa, 0xac, 0x39, 0x5d, 0x8f, 0x0d, 0x54, 0x74, 0xa9, 0xbc, 0xfe, 0xf8, + 0xc6, 0xfd, 0x71, 0x13, 0xd5, 0x8e, 0xb1, 0xd0, 0x63, 0x75, 0xe7, 0x87, + 0xb2, 0x67, 0x36, 0x35, 0x38, 0xd3, 0xed, 0x7f, 0x5e, 0x17, 0x98, 0x3f, + 0xf4, 0xf7, 0x00, 0x35, 0x80, 0xae, 0x82, 0x62, 0xe4, 0xf2, 0x6d, 0x23, + 0x93, 0x88, 0x49, 0xd7, 0xa2, 0x5b, 0x0c, 0x5e, 0x7f, 0x3c, 0x36, 0x2b, + 0x12, 0x14, 0x9f, 0x6b, 0xb2, 0x12, 0xfb, 0xc1, 0xb7, 0xa9, 0x34, 0xc0, + 0x51, 0xac, 0xb5, 0x8a, 0x11, 0x76, 0x72, 0xd8, 0xcc, 0x86, 0x4f, 0xaf, + 0xc4, 0xe6, 0x5f, 0xf7, 0x42, 0xc9, 0x5c, 0x95, 0xde, 0xa2, 0x70, +}; +static_assert(sizeof(kBytesTestReadSymbol14) == kNumBytesTestReadSymbol14, ""); + +// The kBytesTestReadSymbol16[] array was encoded by using the following libaom +// code: +// +// aom_cdf_prob cdf[4][17] = { +// // pmf: 1/16, 1/16, 1/16, 1/16, 1/16, 1/16, 1/16, 1/16, 1/16, 1/16, 1/16, +// // 1/16, 1/16, 1/16, 1/16, 1/16 +// { 32768 - 2048, 32768 - 4096, 32768 - 6144, 32768 - 8192, 32768 - 10240, +// 32768 - 12288, 32768 - 14336, 32768 - 16384, 
32768 - 18432, +// 32768 - 20480, 32768 - 22528, 32768 - 24576, 32768 - 26624, +// 32768 - 28672, 32768 - 30720, 0, 0 }, +// // pmf: 3/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, +// // 2/32, 2/32, 2/32, 2/32, 1/32 +// { 32768 - 3072, 32768 - 5120, 32768 - 7168, 32768 - 9216, 32768 - 11264, +// 32768 - 13312, 32768 - 15360, 32768 - 17408, 32768 - 19456, +// 32768 - 21504, 32768 - 23552, 32768 - 25600, 32768 - 27648, +// 32768 - 29696, 32768 - 31744, 0, 0 }, +// // pmf: 1/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, +// // 2/32, 2/32, 2/32, 2/32, 3/32 +// { 32768 - 1024, 32768 - 3072, 32768 - 5120, 32768 - 7168, 32768 - 9216, +// 32768 - 11264, 32768 - 13312, 32768 - 15360, 32768 - 17408, +// 32768 - 19456, 32768 - 21504, 32768 - 23552, 32768 - 25600, +// 32768 - 27648, 32768 - 29696, 0, 0 }, +// // pmf: 1/32, 2/32, 2/32, 2/32, 2/32, 2/32, 2/32, 3/32, 3/32, 2/32, 2/32, +// // 2/32, 2/32, 2/32, 2/32, 1/32 +// { 32768 - 1024, 32768 - 3072, 32768 - 5120, 32768 - 7168, 32768 - 9216, +// 32768 - 11264, 32768 - 13312, 32768 - 16384, 32768 - 19456, +// 32768 - 21504, 32768 - 23552, 32768 - 25600, 32768 - 27648, +// 32768 - 29696, 32768 - 31744, 0, 0 }, +// }; +// constexpr int kSymbols[32][4] = { { 0, 8, 15, 7 }, // +// { 1, 9, 14, 6 }, // +// { 2, 10, 13, 5 }, // +// { 3, 11, 12, 4 }, // +// { 4, 12, 11, 3 }, // +// { 5, 13, 10, 2 }, // +// { 6, 14, 9, 1 }, // +// { 7, 15, 8, 0 }, // +// { 8, 0, 7, 15 }, // +// { 9, 1, 6, 14 }, // +// { 10, 2, 5, 13 }, // +// { 11, 3, 4, 12 }, // +// { 12, 4, 3, 11 }, // +// { 13, 5, 2, 10 }, // +// { 14, 6, 1, 9 }, // +// { 15, 7, 0, 8 }, // +// { 0, 0, 15, 13 }, // +// { 2, 1, 14, 11 }, // +// { 4, 3, 12, 9 }, // +// { 6, 5, 10, 7 }, // +// { 8, 7, 8, 5 }, // +// { 10, 9, 6, 3 }, // +// { 12, 11, 4, 1 }, // +// { 14, 13, 2, 14 }, // +// { 1, 0, 15, 12 }, // +// { 3, 2, 13, 10 }, // +// { 5, 4, 11, 8 }, // +// { 7, 6, 9, 6 }, // +// { 9, 8, 7, 4 }, // +// { 11, 10, 5, 2 }, // +// { 13, 12, 3, 8 }, // +// { 15, 14, 1, 7 } }; +// const unsigned int kBufferSize = 65536; +// uint8_t bw_buffer[kBufferSize]; +// aom_writer bw; +// bw.allow_update_cdf = 1; +// aom_start_encode(&bw, bw_buffer); +// for (int i = 0; i < 48; ++i) { +// for (int j = 0; j < 32; ++j) { +// for (int k = 0; k < 4; ++k) { +// aom_write_symbol(&bw, kSymbols[j][k], cdf[k], 16); +// } +// } +// } +// aom_stop_encode(&bw); +// printf("constexpr size_t kNumBytes = %u;\n", bw.pos); +// printf("constexpr uint8_t kBytes[] = {"); +// int count = 0; +// for (unsigned int i = 0; i < bw.pos; ++i) { +// if (count++ % 12 == 0) { +// printf("\n "); +// } else { +// printf(" "); +// } +// printf("0x%02x,", bw_buffer[i]); +// } +// printf("\n};\n"); + +constexpr size_t kNumBytesTestReadSymbol16 = 3120; +constexpr uint8_t kBytesTestReadSymbol16[] = { + 0x09, 0x2c, 0xb8, 0x5a, 0xe4, 0xe6, 0xc6, 0x1f, 0x3e, 0xa7, 0x50, 0xbf, + 0x19, 0x26, 0xbf, 0x20, 0xc3, 0xa2, 0x08, 0xdf, 0x44, 0xd9, 0x4d, 0x8c, + 0xf7, 0xbf, 0x6b, 0x6d, 0x22, 0x97, 0x8e, 0xd7, 0x93, 0xad, 0x33, 0xe3, + 0x7f, 0x5b, 0x71, 0x03, 0x6b, 0x4e, 0xbf, 0xf5, 0x38, 0xbe, 0xba, 0x6c, + 0x0d, 0x28, 0xca, 0x74, 0x2d, 0x1d, 0x3f, 0x91, 0xad, 0x7e, 0x98, 0x5c, + 0xa7, 0x39, 0x5e, 0x7c, 0x43, 0x2b, 0x88, 0xb2, 0x81, 0x91, 0xad, 0x62, + 0x14, 0xc6, 0x0a, 0x81, 0x15, 0x1f, 0x4e, 0xd5, 0xc1, 0x5c, 0x43, 0x35, + 0xc3, 0xe6, 0x3d, 0xaa, 0xc3, 0xb5, 0x95, 0x01, 0xbd, 0x2d, 0x21, 0x04, + 0x14, 0x79, 0x7a, 0x02, 0x7e, 0xb8, 0x09, 0x20, 0x06, 0x82, 0xc8, 0x6f, + 0x29, 0x2c, 0xb2, 0x9b, 0xe2, 0x8d, 0xf5, 0x56, 0xf5, 0x64, 0xf4, 
0xd7, + 0xfe, 0x24, 0x29, 0xb6, 0x35, 0x16, 0x08, 0x26, 0xc0, 0xf0, 0xfd, 0x33, + 0x04, 0x6f, 0x70, 0x85, 0x3a, 0xac, 0x8f, 0xab, 0x48, 0xce, 0x04, 0xc1, + 0x0a, 0x4c, 0xb6, 0xaa, 0x83, 0x39, 0xc1, 0xf6, 0x00, 0xb8, 0x56, 0x4e, + 0xa2, 0xd1, 0x19, 0x70, 0x6a, 0x2b, 0x86, 0xef, 0xbd, 0x11, 0x27, 0x54, + 0x52, 0x01, 0xa2, 0x3f, 0x53, 0x0e, 0x5b, 0x23, 0x3c, 0x90, 0x82, 0xaf, + 0x9d, 0x79, 0xb5, 0x5e, 0x7e, 0x2e, 0x6e, 0xad, 0x3d, 0xe9, 0x3a, 0xff, + 0xd7, 0x59, 0x40, 0xa3, 0x56, 0xa9, 0x5e, 0x52, 0xda, 0x04, 0x74, 0x09, + 0x47, 0x7c, 0x6c, 0x4b, 0xad, 0x00, 0x8b, 0xbc, 0x33, 0x16, 0x49, 0xf6, + 0xa5, 0x11, 0x8d, 0xb4, 0xbc, 0x28, 0xea, 0x1b, 0x34, 0x1e, 0xb7, 0x1e, + 0xbf, 0x50, 0xe3, 0x60, 0xad, 0x41, 0xe0, 0x19, 0xfa, 0xa4, 0x23, 0x98, + 0x48, 0x23, 0xad, 0xfa, 0xdb, 0x3c, 0x0a, 0x15, 0xeb, 0xf5, 0xf1, 0x43, + 0xf2, 0xfd, 0x42, 0xf2, 0xd0, 0x3f, 0xa6, 0x3b, 0xc8, 0x81, 0x52, 0xba, + 0xcf, 0x2d, 0xff, 0x2c, 0x24, 0x13, 0x62, 0x78, 0x01, 0xd8, 0xcb, 0xfc, + 0xda, 0x70, 0x58, 0xad, 0xf1, 0xe6, 0x30, 0x47, 0x39, 0xc6, 0xf0, 0xbc, + 0xe4, 0x89, 0x49, 0x46, 0x79, 0xde, 0xac, 0xde, 0xbd, 0x97, 0x18, 0x8f, + 0x17, 0x07, 0xc1, 0xaf, 0xf8, 0xc1, 0x45, 0x95, 0x50, 0x36, 0x4d, 0x16, + 0x35, 0x92, 0x2b, 0x5a, 0x71, 0x81, 0x59, 0xe5, 0x7f, 0xba, 0x10, 0xc9, + 0x49, 0xd4, 0xeb, 0x64, 0x08, 0x54, 0x8b, 0xfa, 0xb3, 0xc8, 0x3a, 0xd7, + 0xa6, 0xa9, 0xf2, 0xae, 0x04, 0xf8, 0x55, 0x5c, 0xff, 0x2d, 0x17, 0x53, + 0x37, 0xc5, 0x36, 0xd8, 0x42, 0xd7, 0x47, 0xd8, 0x00, 0x99, 0x9c, 0x5d, + 0x9f, 0x34, 0xc2, 0x09, 0x6b, 0x1a, 0xf3, 0x2f, 0xb0, 0xf8, 0x49, 0x54, + 0x9d, 0x4b, 0xb8, 0xcf, 0xc5, 0x3b, 0x7f, 0x49, 0x9b, 0x40, 0xa9, 0xd3, + 0x96, 0xe1, 0x6b, 0x87, 0x2d, 0x50, 0x76, 0x15, 0xd9, 0x9f, 0x87, 0x4f, + 0x13, 0x26, 0xf2, 0xf8, 0xae, 0xd4, 0x63, 0x02, 0x0c, 0xcb, 0xe5, 0x63, + 0x1c, 0x73, 0xdf, 0x57, 0x55, 0x16, 0x57, 0x3b, 0xfb, 0x9a, 0x06, 0x70, + 0xfc, 0x9f, 0x29, 0x16, 0xec, 0x63, 0x34, 0x6f, 0x40, 0x1f, 0x54, 0x2a, + 0xe7, 0x4a, 0x6f, 0xde, 0x86, 0xeb, 0x8c, 0x91, 0x3e, 0xfc, 0x6a, 0x48, + 0xd1, 0x51, 0x33, 0xd7, 0xe1, 0x9d, 0xf8, 0x71, 0x21, 0x7b, 0x02, 0x38, + 0x6a, 0xef, 0x30, 0x70, 0x38, 0x01, 0xc3, 0xef, 0x5d, 0x4f, 0xd3, 0x37, + 0x2d, 0xe0, 0x4f, 0x4b, 0x72, 0xbc, 0xde, 0x9f, 0x32, 0x97, 0xe2, 0x55, + 0x5e, 0x59, 0x5d, 0xa2, 0x9f, 0x5a, 0x04, 0x7c, 0x13, 0xe1, 0x35, 0x62, + 0x4a, 0x10, 0x24, 0x55, 0x63, 0xb8, 0x8f, 0x66, 0xbc, 0x04, 0x08, 0x4e, + 0xcc, 0xdc, 0x1f, 0x88, 0xc5, 0xcf, 0x8a, 0x7e, 0x24, 0x3e, 0x6f, 0x58, + 0xcb, 0x44, 0x3c, 0x18, 0x64, 0xd9, 0x84, 0xa8, 0x1c, 0x0b, 0x20, 0xf4, + 0x8b, 0x8b, 0x4b, 0xf8, 0x39, 0x8b, 0x01, 0x3a, 0x0b, 0x27, 0x67, 0xf8, + 0x0f, 0xbd, 0xb3, 0x32, 0xce, 0xef, 0xbc, 0x8c, 0xa3, 0x31, 0xee, 0x0b, + 0xdb, 0xc7, 0xc3, 0x43, 0x80, 0xe4, 0x7c, 0x9b, 0x89, 0xa4, 0x6b, 0x23, + 0x2f, 0xa8, 0x28, 0xe0, 0x55, 0x30, 0x6e, 0xe7, 0xc9, 0x50, 0x1d, 0xbf, + 0x67, 0xc8, 0x74, 0x58, 0x0f, 0xdb, 0xa6, 0x1f, 0xa6, 0xfd, 0xf0, 0x75, + 0xea, 0x62, 0xd5, 0x44, 0xa2, 0x7e, 0xed, 0x63, 0xba, 0x7c, 0x5d, 0xb7, + 0x16, 0x84, 0x30, 0x5d, 0xc2, 0xd3, 0x39, 0x61, 0x60, 0x0a, 0xb9, 0x34, + 0x5e, 0x54, 0xf4, 0x34, 0x77, 0x22, 0x05, 0x41, 0x6b, 0x6a, 0x13, 0xc3, + 0x10, 0x03, 0x8a, 0x78, 0xd2, 0x81, 0xac, 0x49, 0x31, 0xc8, 0xee, 0x15, + 0xc3, 0x42, 0x3b, 0x00, 0xf6, 0x05, 0x92, 0x82, 0x6e, 0x73, 0xb4, 0xfa, + 0xab, 0xe0, 0x2e, 0xe9, 0x5d, 0x89, 0x43, 0x0c, 0x4d, 0x88, 0x0c, 0xf1, + 0xa4, 0x19, 0x59, 0xa0, 0x69, 0x0c, 0xfc, 0xf9, 0x9a, 0xbc, 0x3b, 0x2e, + 0x3b, 0x29, 0xf8, 0xd7, 0x79, 0x11, 0xb2, 0x66, 0x26, 0x57, 0x34, 0x06, + 0xb8, 0x36, 0x41, 0xca, 0x01, 0x10, 0xca, 0x06, 0xee, 0xb6, 0xf7, 
0x1d, + 0x0d, 0x88, 0xab, 0x07, 0xbe, 0x06, 0x8c, 0x1c, 0xa2, 0x76, 0x5e, 0xdb, + 0x60, 0xa4, 0x43, 0x17, 0x31, 0xc3, 0x4b, 0x0a, 0x01, 0x80, 0xa7, 0xf6, + 0xe6, 0x78, 0x64, 0x85, 0xb0, 0x8a, 0x28, 0x34, 0x82, 0x98, 0x29, 0x3f, + 0xde, 0x07, 0x9a, 0x80, 0xcf, 0xe3, 0x6f, 0x23, 0x57, 0x79, 0x11, 0xb2, + 0x61, 0x6d, 0x98, 0x26, 0xeb, 0x3b, 0xbf, 0xaa, 0x98, 0x62, 0xbb, 0xfd, + 0x21, 0x76, 0xe5, 0xc5, 0xe0, 0x09, 0x21, 0x65, 0x72, 0x94, 0xd3, 0x8a, + 0xcd, 0xfb, 0xec, 0x6e, 0x57, 0xd4, 0x2a, 0x92, 0xd1, 0xe9, 0x16, 0x46, + 0xa2, 0x38, 0xae, 0x4b, 0x7e, 0xa7, 0x0c, 0x26, 0x9d, 0x96, 0xd7, 0x49, + 0xa7, 0x02, 0x2b, 0x22, 0x9a, 0x39, 0x38, 0x11, 0xb8, 0xb3, 0xd5, 0x09, + 0xf9, 0x70, 0xb4, 0x1c, 0x4e, 0xe3, 0xba, 0xa0, 0x78, 0x76, 0x6d, 0xc4, + 0xab, 0x96, 0x3e, 0x98, 0x04, 0x4e, 0x50, 0x20, 0xd9, 0xfa, 0xea, 0xe2, + 0x99, 0x50, 0x84, 0x20, 0x18, 0x69, 0xbb, 0x6e, 0x41, 0x9d, 0x18, 0x71, + 0x15, 0x19, 0xd2, 0xf2, 0xa5, 0x69, 0x54, 0x8e, 0x60, 0x75, 0xd4, 0xe7, + 0xdb, 0xe1, 0x43, 0xfd, 0x2e, 0x21, 0x4f, 0xff, 0x98, 0x8b, 0x08, 0x74, + 0xca, 0x29, 0x7e, 0x3f, 0x2f, 0x6a, 0xf9, 0xe6, 0x49, 0x1d, 0xc6, 0x0b, + 0x76, 0xc9, 0x22, 0xc3, 0x4f, 0xaf, 0xa8, 0xf9, 0xd6, 0x9c, 0x9a, 0x64, + 0xec, 0xb3, 0x2c, 0x0f, 0x3e, 0x93, 0xc4, 0xb6, 0xd7, 0x36, 0x28, 0x04, + 0xe5, 0x81, 0x48, 0x14, 0x9f, 0x4e, 0xc5, 0x9b, 0xd7, 0xc0, 0x0e, 0x35, + 0xab, 0x49, 0xd3, 0x84, 0x9f, 0x5c, 0x93, 0x94, 0xa6, 0xd2, 0xb5, 0x83, + 0x9d, 0x38, 0x0f, 0x85, 0x04, 0xa3, 0xb7, 0x23, 0x20, 0x93, 0x85, 0x48, + 0x14, 0x0c, 0x22, 0x80, 0x92, 0x6c, 0xca, 0x3c, 0xc7, 0xfc, 0xa9, 0x88, + 0x62, 0xbc, 0x2a, 0x91, 0x08, 0x5b, 0xb4, 0x60, 0xd1, 0x0f, 0x3c, 0x33, + 0xc6, 0xe1, 0xf7, 0xca, 0xf7, 0xf9, 0xa1, 0x9b, 0xfa, 0xf7, 0x34, 0xe0, + 0x54, 0xac, 0x53, 0x42, 0x30, 0x76, 0xc8, 0xc2, 0xcd, 0x61, 0x49, 0x87, + 0x9c, 0x47, 0xf5, 0x98, 0xb5, 0x41, 0xf0, 0xad, 0xdb, 0x37, 0x06, 0xb8, + 0x54, 0xa5, 0x26, 0x11, 0x4b, 0x18, 0xbb, 0xa4, 0xfb, 0x24, 0xd3, 0x14, + 0x31, 0xfb, 0x56, 0x18, 0xd8, 0xc2, 0xd0, 0xd2, 0xab, 0xde, 0xdf, 0xa9, + 0xdf, 0x9e, 0xa6, 0x56, 0x0d, 0x9f, 0xe4, 0x19, 0x15, 0x58, 0x18, 0xc6, + 0x5e, 0x47, 0x05, 0x3a, 0x0e, 0x73, 0x68, 0x81, 0x39, 0x8c, 0x51, 0x1d, + 0x04, 0x4e, 0x18, 0x54, 0xa5, 0x3e, 0x13, 0x4a, 0x15, 0xc2, 0x43, 0x90, + 0xc2, 0x71, 0x8d, 0x53, 0x1b, 0xab, 0xe9, 0xbc, 0x69, 0x3e, 0x11, 0x46, + 0x9d, 0xa4, 0xd3, 0x15, 0x80, 0xec, 0xe8, 0x31, 0x4f, 0x5a, 0x2a, 0x15, + 0x3e, 0x7e, 0x7a, 0x44, 0x0e, 0x4a, 0xac, 0x9b, 0x46, 0x2f, 0x86, 0xf9, + 0xea, 0x59, 0x4f, 0x15, 0xa0, 0x4b, 0xd1, 0xaa, 0xd8, 0x3a, 0x83, 0xb6, + 0x25, 0x82, 0xb0, 0x44, 0x4a, 0x98, 0xbd, 0x10, 0xa2, 0xb0, 0x95, 0x02, + 0xfa, 0x1f, 0xd3, 0x54, 0x1c, 0x0a, 0xb1, 0x31, 0x28, 0xec, 0x4c, 0xd2, + 0x0c, 0xb9, 0xb0, 0xf4, 0x7a, 0x89, 0x63, 0x3c, 0x5f, 0xcf, 0x3c, 0xe8, + 0xba, 0x21, 0x66, 0x20, 0x01, 0xcb, 0x1b, 0xc6, 0xf9, 0x54, 0x0f, 0xda, + 0x4a, 0xcc, 0x81, 0x7b, 0x41, 0x81, 0xc0, 0x1f, 0xea, 0x9a, 0x9b, 0x96, + 0x0d, 0x47, 0xdd, 0x16, 0x52, 0x5c, 0xaf, 0xae, 0x82, 0x3d, 0x18, 0x60, + 0xfa, 0x34, 0xc2, 0x57, 0x2d, 0xc4, 0x2b, 0x2e, 0x41, 0xfe, 0xe7, 0x95, + 0xcd, 0x1f, 0xbe, 0x88, 0x31, 0xc1, 0x07, 0x2c, 0xd3, 0xb1, 0xbb, 0xeb, + 0x1d, 0xa3, 0x03, 0x1e, 0x70, 0xcc, 0x84, 0xe0, 0x65, 0x41, 0x0f, 0xf1, + 0x7c, 0x95, 0x4b, 0x41, 0x43, 0x62, 0xad, 0x5d, 0xff, 0x4f, 0x92, 0xc8, + 0xaa, 0x21, 0x23, 0xba, 0xa9, 0x90, 0xb5, 0xae, 0xc0, 0x1f, 0xae, 0x43, + 0xf1, 0x79, 0x14, 0x30, 0x16, 0x1d, 0x2a, 0x6c, 0xd1, 0xd8, 0xb3, 0x38, + 0x25, 0xd1, 0x66, 0xa5, 0x89, 0xc0, 0x8d, 0xc5, 0xa0, 0x6a, 0x7c, 0x64, + 0xf8, 0x45, 0x1a, 0x76, 0x93, 0x4c, 0x56, 0x03, 0xb3, 0xa0, 0xc5, 
0x40, + 0xbc, 0x84, 0x98, 0x8d, 0xa4, 0xfe, 0x0b, 0x8c, 0x47, 0xa2, 0x88, 0x85, + 0x2a, 0x89, 0xad, 0xd3, 0x16, 0x5b, 0x20, 0x02, 0x70, 0xbf, 0x72, 0x29, + 0x0c, 0x0a, 0x9c, 0xac, 0x9c, 0x4d, 0xfa, 0x02, 0x5e, 0xe9, 0xe3, 0x52, + 0x84, 0x54, 0x1f, 0xb7, 0xea, 0xb1, 0xc4, 0x2f, 0x69, 0xd1, 0x33, 0xc6, + 0xb3, 0xee, 0xb0, 0x35, 0x1f, 0x19, 0x68, 0x2d, 0xef, 0xc1, 0xd3, 0x1c, + 0xa8, 0x84, 0x54, 0x3c, 0x21, 0xed, 0x78, 0x35, 0x3f, 0x82, 0xb2, 0xa8, + 0xe4, 0x25, 0x71, 0xfc, 0x1e, 0x1d, 0x36, 0xf4, 0xf4, 0x0f, 0x6f, 0x5b, + 0xd9, 0x21, 0x13, 0x3a, 0x3d, 0x17, 0x45, 0x31, 0x78, 0x97, 0x99, 0x15, + 0x87, 0xa9, 0xa6, 0x36, 0xf0, 0x20, 0xfa, 0xd5, 0x10, 0x01, 0x91, 0xa0, + 0x4f, 0x28, 0x6a, 0x13, 0x04, 0xff, 0x97, 0x96, 0xf1, 0xfc, 0x1c, 0xc8, + 0xcd, 0xe4, 0xbd, 0xe5, 0x40, 0x9a, 0x37, 0xc2, 0x01, 0x11, 0x2a, 0xc0, + 0x0e, 0x58, 0x69, 0x29, 0xd0, 0x72, 0x26, 0x7c, 0x23, 0xec, 0x58, 0xfe, + 0xbd, 0x15, 0x97, 0xe8, 0x29, 0x9f, 0x79, 0xb1, 0xfa, 0xac, 0x59, 0xe0, + 0x78, 0x1c, 0xb4, 0x29, 0xee, 0x00, 0x39, 0x11, 0x0a, 0x2a, 0xb9, 0x98, + 0x4e, 0xbf, 0x75, 0x9e, 0xe8, 0xbb, 0x4b, 0xe0, 0x6b, 0xab, 0x5b, 0x2f, + 0x2d, 0xe3, 0xf8, 0x39, 0x91, 0x9b, 0xc9, 0x7b, 0xca, 0x81, 0x34, 0x6f, + 0x84, 0x02, 0x22, 0x55, 0x80, 0x1c, 0xb0, 0xd2, 0x53, 0xa0, 0xe4, 0x4c, + 0xf8, 0x47, 0xd8, 0xb1, 0xfd, 0x7a, 0x2b, 0x2f, 0xd0, 0x53, 0x3e, 0xf3, + 0x63, 0xf5, 0x58, 0xb3, 0xc0, 0xf0, 0x39, 0x00, 0x08, 0x97, 0x4b, 0xe2, + 0x46, 0x04, 0xa2, 0x39, 0x9c, 0xf2, 0x57, 0x17, 0x4a, 0xdd, 0x9f, 0x5e, + 0xb1, 0x8b, 0x6b, 0x5d, 0x6e, 0x3e, 0x85, 0x34, 0x04, 0x96, 0x56, 0xe7, + 0x4f, 0x6f, 0xd0, 0x31, 0xe7, 0x0c, 0xc8, 0x88, 0xdd, 0x5b, 0x14, 0x00, + 0x60, 0x2a, 0x06, 0x18, 0xcd, 0x7f, 0xc9, 0xee, 0xd2, 0xd0, 0x8c, 0xc0, + 0xed, 0x8f, 0x4a, 0x3e, 0x83, 0x52, 0x2e, 0x4a, 0xe9, 0xfa, 0x1f, 0x1a, + 0xd5, 0xc0, 0x59, 0x4c, 0x8a, 0x2a, 0xab, 0x40, 0x2f, 0x84, 0xd2, 0x85, + 0x70, 0x90, 0x96, 0xf3, 0x84, 0x6f, 0x1e, 0x81, 0x8c, 0x80, 0x03, 0x03, + 0x2d, 0x36, 0x2e, 0x60, 0x79, 0x13, 0x63, 0x7f, 0xe7, 0xe3, 0x4a, 0x96, + 0x08, 0xd8, 0x35, 0x15, 0x46, 0x8a, 0xe0, 0xb8, 0xc4, 0x7a, 0x28, 0x88, + 0x52, 0xa8, 0x9a, 0xdd, 0x31, 0x65, 0xb2, 0x00, 0x24, 0xd9, 0xf4, 0x07, + 0xea, 0xab, 0x7c, 0xe8, 0xa2, 0xea, 0xa7, 0x23, 0xd1, 0x93, 0x9e, 0xe7, + 0x48, 0x34, 0x89, 0xf5, 0xb4, 0x45, 0x5e, 0xfa, 0xa6, 0xee, 0x32, 0x75, + 0x8c, 0x56, 0x08, 0xcc, 0xeb, 0x5b, 0x05, 0xc2, 0x1d, 0x62, 0xa8, 0x5d, + 0xaa, 0x50, 0xc2, 0x85, 0x85, 0x25, 0xb3, 0x5f, 0x60, 0xe7, 0x90, 0x1b, + 0xa8, 0xb7, 0xf6, 0x83, 0x11, 0x07, 0x1f, 0xfc, 0xce, 0x58, 0x22, 0x8a, + 0x3d, 0xa9, 0x8c, 0x18, 0x66, 0xa8, 0x32, 0x78, 0xa0, 0x16, 0x8a, 0xa2, + 0x5d, 0x2f, 0x89, 0x18, 0x12, 0x88, 0xe6, 0x73, 0xc9, 0x5c, 0x5d, 0x2b, + 0x76, 0x7d, 0x7a, 0xc6, 0x2d, 0xad, 0x75, 0xb8, 0xfa, 0x14, 0xd0, 0x12, + 0x59, 0x5b, 0x9d, 0x3d, 0xbf, 0x40, 0xc7, 0x9c, 0x33, 0x22, 0x23, 0x75, + 0x6c, 0x50, 0x01, 0x80, 0xa8, 0x18, 0x63, 0x35, 0xff, 0x27, 0xbb, 0x4b, + 0x42, 0x33, 0x03, 0xb6, 0x3d, 0x28, 0xfa, 0x0d, 0x48, 0xb9, 0x2b, 0xa7, + 0xe8, 0x7c, 0x6b, 0x57, 0x01, 0x65, 0x32, 0x28, 0xaa, 0xad, 0x00, 0xbe, + 0x13, 0x4a, 0x15, 0xc2, 0x42, 0x5b, 0xce, 0x11, 0xbc, 0x7a, 0x06, 0x32, + 0x00, 0x0c, 0x0c, 0xb4, 0xd8, 0xb9, 0x81, 0xe4, 0x4d, 0x8d, 0xff, 0x9f, + 0x8d, 0x2a, 0x58, 0x23, 0x60, 0xd4, 0x55, 0x1a, 0x2b, 0x82, 0xe3, 0x11, + 0xe8, 0xa2, 0x21, 0x4a, 0xa2, 0x6b, 0x74, 0xc5, 0x96, 0xc8, 0x00, 0x93, + 0x67, 0xd0, 0x1f, 0xaa, 0xad, 0xf3, 0xa2, 0x8b, 0xaa, 0x9c, 0x8f, 0x46, + 0x4e, 0x7b, 0x9d, 0x20, 0xd2, 0x27, 0xd6, 0xd1, 0x15, 0x7b, 0xea, 0x9b, + 0xb8, 0xc9, 0xd6, 0x31, 0x58, 0x23, 0x33, 0xad, 0x6c, 0x17, 0x08, 
0x75, + 0x8a, 0xa1, 0x76, 0xa9, 0x43, 0x0a, 0x16, 0x14, 0x96, 0xcd, 0x7d, 0x83, + 0x9e, 0x40, 0x6e, 0xa2, 0xdf, 0xda, 0x0c, 0x44, 0x1c, 0x7f, 0xf3, 0x39, + 0x60, 0x8a, 0x28, 0xf6, 0xa6, 0x30, 0x61, 0x9a, 0xa0, 0xc9, 0xe2, 0x80, + 0x5a, 0x2a, 0x89, 0x74, 0xbe, 0x24, 0x60, 0x4a, 0x23, 0x99, 0xcf, 0x25, + 0x71, 0x74, 0xad, 0xd9, 0xf5, 0xeb, 0x18, 0xb6, 0xb5, 0xd6, 0xe3, 0xe8, + 0x53, 0x40, 0x49, 0x65, 0x6e, 0x74, 0xf6, 0xfd, 0x03, 0x1e, 0x70, 0xcc, + 0x88, 0x8d, 0xd5, 0xb1, 0x40, 0x06, 0x02, 0xa0, 0x61, 0x8c, 0xd7, 0xfc, + 0x9e, 0xed, 0x2d, 0x08, 0xcc, 0x0e, 0xd8, 0xf4, 0xa3, 0xe9, 0x41, 0x30, + 0x05, 0xc8, 0xbd, 0x3c, 0xa4, 0xb7, 0x09, 0x6f, 0x9c, 0xc8, 0xa2, 0xaa, + 0xb4, 0x02, 0xf8, 0x4d, 0x28, 0x57, 0x09, 0x09, 0x6f, 0x38, 0x46, 0xf1, + 0xe8, 0x18, 0xc8, 0x00, 0x30, 0x32, 0xd3, 0x62, 0xe6, 0x07, 0x91, 0x36, + 0x37, 0xfe, 0x7e, 0x34, 0xa9, 0x60, 0x8d, 0x83, 0x51, 0x54, 0x68, 0xae, + 0x0b, 0x8c, 0x47, 0xa2, 0x88, 0x85, 0x2a, 0x89, 0xad, 0xd3, 0x16, 0x5b, + 0x20, 0x02, 0x4f, 0xc0, 0x04, 0x8e, 0x38, 0xde, 0xd8, 0x95, 0xfc, 0x97, + 0xd9, 0xd2, 0x15, 0xdb, 0x1a, 0xcc, 0x69, 0x02, 0xad, 0x4a, 0x5a, 0x70, + 0x8b, 0xbf, 0xfc, 0x35, 0x6d, 0x3a, 0x0f, 0xc9, 0xea, 0x78, 0x1a, 0xd1, + 0xcb, 0xb7, 0xaa, 0xb8, 0xf2, 0x44, 0xdf, 0xb3, 0xfe, 0x24, 0x83, 0xb9, + 0x53, 0x94, 0x7e, 0xa5, 0xc5, 0x3f, 0xa2, 0x31, 0x3d, 0xdc, 0x0b, 0xb1, + 0x24, 0x2f, 0x99, 0x4a, 0xd4, 0x0e, 0x6b, 0x3a, 0x34, 0x31, 0xc5, 0x87, + 0x68, 0xbd, 0x61, 0xbd, 0xe2, 0xa0, 0xdb, 0x9a, 0x33, 0xfd, 0xc5, 0x10, + 0x3f, 0xfb, 0xeb, 0xbd, 0x29, 0x03, 0x85, 0x8d, 0x08, 0x7b, 0xb6, 0xf7, + 0xf0, 0xf5, 0x13, 0x69, 0x3e, 0x35, 0x68, 0x58, 0x50, 0xdb, 0x50, 0x13, + 0x02, 0x3e, 0x81, 0x4b, 0x44, 0x6c, 0x75, 0x02, 0xe6, 0x90, 0x75, 0x6c, + 0xc6, 0x7c, 0x23, 0xec, 0x58, 0xfe, 0xbd, 0x15, 0x97, 0xe8, 0x29, 0x9f, + 0x80, 0x54, 0x65, 0xb8, 0x3c, 0x40, 0xe6, 0xdb, 0xbe, 0x51, 0x73, 0xe5, + 0xf1, 0x23, 0x02, 0x51, 0x1c, 0xce, 0x79, 0x2b, 0x8b, 0xa5, 0x6e, 0xcf, + 0xaf, 0x58, 0xc5, 0xb5, 0xae, 0xb7, 0x1f, 0x42, 0x9a, 0x02, 0x4b, 0x2b, + 0x73, 0xa7, 0xb7, 0xe8, 0x18, 0xf3, 0x86, 0x64, 0x44, 0x6e, 0xad, 0x8a, + 0x00, 0x30, 0x15, 0x03, 0x0c, 0x66, 0xbf, 0xe4, 0xf7, 0x69, 0x68, 0x46, + 0x60, 0x76, 0xc7, 0xa5, 0x1f, 0x4a, 0x09, 0x80, 0x2e, 0x45, 0xe9, 0xe5, + 0x25, 0xb8, 0x4b, 0x7c, 0xe6, 0x45, 0x15, 0x55, 0xa0, 0x17, 0xc2, 0x69, + 0x42, 0xb8, 0x48, 0x4b, 0x79, 0xc2, 0x37, 0x8f, 0x40, 0xc6, 0x40, 0x01, + 0x81, 0x96, 0x9b, 0x17, 0x30, 0x3c, 0x89, 0xb1, 0xbf, 0xf3, 0xf1, 0xa5, + 0x4b, 0x04, 0x6c, 0x1a, 0x8a, 0xa3, 0x45, 0x70, 0x5c, 0x62, 0x3d, 0x14, + 0x44, 0x29, 0x54, 0x4d, 0x6e, 0x98, 0xb2, 0xd9, 0x00, 0x12, 0x7e, 0x00, + 0x24, 0x71, 0xc6, 0xf6, 0xc4, 0xaf, 0xe4, 0xbe, 0xce, 0x90, 0xae, 0xd8, + 0xd6, 0x63, 0x48, 0x15, 0x6a, 0x52, 0xd3, 0x84, 0x5d, 0xff, 0xe1, 0xab, + 0x69, 0xd0, 0x7e, 0x4f, 0x53, 0xc0, 0xd6, 0x8e, 0x5d, 0xbd, 0x55, 0xc7, + 0x92, 0x26, 0xfd, 0x9f, 0xf1, 0x24, 0x1d, 0xca, 0x9c, 0xa3, 0xf5, 0x2e, + 0x29, 0xfd, 0x11, 0x89, 0xee, 0xe0, 0x5d, 0x89, 0x21, 0x7c, 0xca, 0x56, + 0xa0, 0x73, 0x59, 0xd1, 0xa1, 0x8e, 0x2c, 0x3b, 0x45, 0xeb, 0x0d, 0xef, + 0x15, 0x06, 0xdc, 0xd1, 0x9f, 0xee, 0x28, 0x81, 0xff, 0xdf, 0x5d, 0xe9, + 0x48, 0x1c, 0x2c, 0x68, 0x43, 0xdd, 0xb7, 0xbf, 0x87, 0xa8, 0x9b, 0x49, + 0xf1, 0xab, 0x42, 0xc2, 0x86, 0xda, 0x80, 0x98, 0x11, 0xf4, 0x0a, 0x5a, + 0x23, 0x63, 0xa8, 0x17, 0x34, 0x83, 0xab, 0x66, 0x33, 0xe1, 0x1f, 0x62, + 0xc7, 0xf5, 0xe8, 0xac, 0xbf, 0x41, 0x4c, 0xfc, 0x02, 0xa3, 0x2d, 0xc1, + 0xe2, 0x07, 0x36, 0xdd, 0xf2, 0x8b, 0x9f, 0x2f, 0x89, 0x18, 0x12, 0x88, + 0xe6, 0x73, 0xc9, 0x5c, 0x5d, 0x2b, 0x76, 0x7d, 0x7a, 0xc6, 0x2d, 
0xad, + 0x75, 0xb8, 0xfa, 0x14, 0xd0, 0x12, 0x59, 0x5b, 0x9d, 0x3d, 0xbf, 0x40, + 0xc7, 0x9c, 0x33, 0x22, 0x23, 0x75, 0x6c, 0x50, 0x01, 0x80, 0xa8, 0x83, + 0x06, 0xd4, 0xd6, 0x8d, 0x36, 0x78, 0xf9, 0x03, 0x23, 0xdb, 0x17, 0x90, + 0x52, 0x0c, 0x5f, 0x1b, 0xe6, 0x44, 0x79, 0x52, 0xc5, 0x50, 0x17, 0x81, + 0xf3, 0x1b, 0x88, 0xba, 0xfd, 0xbd, 0xa5, 0x51, 0x65, 0x6d, 0x33, 0x96, + 0xc2, 0x71, 0x8d, 0x53, 0x1b, 0xab, 0xe9, 0xb9, 0xd0, 0x45, 0x61, 0xaf, + 0xf9, 0xb7, 0x38, 0x55, 0x4f, 0xe9, 0x85, 0x1d, 0x4c, 0x0e, 0x40, 0x77, + 0x03, 0xbc, 0x09, 0xd0, 0x37, 0xe3, 0xde, 0xf1, 0x0c, 0xa6, 0xc8, 0xd5, + 0x63, 0x01, 0xfd, 0xe7, 0xc0, 0x9a, 0xe0, 0x98, 0x02, 0xe4, 0x5e, 0x9e, + 0x52, 0x5b, 0x84, 0xb7, 0xce, 0x64, 0x51, 0x55, 0x5a, 0x01, 0x7c, 0x26, + 0x94, 0x2b, 0x84, 0x84, 0xb7, 0x9c, 0x23, 0x78, 0xf4, 0x0c, 0x64, 0x00, + 0x18, 0x19, 0x69, 0xb1, 0x73, 0x03, 0xc8, 0x9b, 0x1b, 0xff, 0x3f, 0x1a, + 0x54, 0xb0, 0x46, 0xc1, 0xa8, 0xaa, 0x34, 0x57, 0x07, 0x13, 0xd3, 0x43, + 0xb1, 0xaa, 0x4b, 0xc4, 0xcb, 0x5a, 0x9b, 0xa2, 0x23, 0x98, 0xa2, 0xd3, + 0x2b, 0x8c, 0x7b, 0xf8, 0xc7, 0xaa, 0xf6, 0xcc, 0xb8, 0xfc, 0xb5, 0x77, + 0xce, 0xff, 0x9d, 0x0e, 0xdb, 0x2b, 0x03, 0xc7, 0x42, 0x86, 0xf1, 0xcb, + 0xa2, 0xa7, 0x85, 0x77, 0x58, 0x1a, 0x8f, 0x8c, 0xb4, 0x16, 0xf7, 0xe0, + 0xe9, 0x8e, 0x54, 0x42, 0x2a, 0x1e, 0x10, 0xf6, 0xbc, 0x1a, 0x9f, 0xa1, + 0xcb, 0xff, 0x13, 0x06, 0x88, 0x6b, 0xb1, 0xeb, 0x37, 0x26, 0xe5, 0x34, + 0x0d, 0x73, 0x87, 0x91, 0x60, 0x6c, 0xd7, 0x2d, 0xc3, 0x5f, 0x40, 0x68, + 0x45, 0x07, 0x6e, 0x62, 0xa9, 0xe3, 0x52, 0x75, 0xef, 0x14, 0xf5, 0x89, + 0x0a, 0x3a, 0x57, 0x8b, 0xac, 0xbe, 0x86, 0x67, 0xd1, 0xd8, 0x35, 0xe5, + 0xe7, 0x75, 0xb8, 0xf8, 0x28, 0x6d, 0xa8, 0x09, 0x81, 0x1f, 0x40, 0xa5, + 0xa2, 0x36, 0x3a, 0x81, 0x73, 0x48, 0x3e, 0x8c, 0x9d, 0x1f, 0x78, 0xc5, + 0x92, 0x36, 0x1a, 0xae, 0xdf, 0xda, 0xf8, 0x0a, 0x7e, 0x69, 0xcb, 0xaf, + 0x74, 0x59, 0x49, 0x72, 0xa7, 0x97, 0x1c, 0x8c, 0xf0, 0x16, 0x01, 0x4a, + 0xcc, 0x1a, 0xa1, 0x24, 0x83, 0x7b, 0x34, 0x65, 0x20, 0x51, 0x11, 0xae, + 0x5d, 0xa7, 0x68, 0x9c, 0xec, 0x29, 0x27, 0xfc, 0x07, 0x49, 0xb4, 0x9b, + 0x65, 0xb2, 0x51, 0x97, 0xae, 0xa5, 0x8a, 0x70, 0xe5, 0x53, 0xd3, 0xa2, + 0x34, 0x35, 0xbd, 0xbf, 0x75, 0x64, 0xda, 0x88, 0x8c, 0xe9, 0xc3, 0x9a, + 0x32, 0xf0, 0x5a, 0x96, 0xae, 0xef, 0x9a, 0xdd, 0x84, 0xc2, 0x97, 0x22, + 0x2f, 0x06, 0x83, 0x32, 0x10, 0xff, 0x1d, 0x61, 0x60, 0x5f, 0x69, 0x10, + 0x5d, 0x23, 0xc6, 0xf3, 0x3f, 0xa9, 0x53, 0xfe, 0xd0, 0x3e, 0x90, 0xe6, + 0x54, 0x48, 0xab, 0x01, 0x76, 0x75, 0x88, 0x7b, 0x4e, 0xc6, 0xd0, 0x9b, + 0x7a, 0xcd, 0x87, 0x36, 0x3e, 0x7e, 0x3d, 0xef, 0x10, 0xca, 0x6c, 0x8d, + 0x56, 0x30, 0x1f, 0xde, 0x7c, 0x09, 0xae, 0x09, 0x80, 0x2e, 0x45, 0xe9, + 0xe5, 0x25, 0xb8, 0x4b, 0x7c, 0xe6, 0x45, 0x15, 0x55, 0xa0, 0x17, 0xc2, + 0x69, 0x42, 0xb8, 0x48, 0x4b, 0x79, 0xc2, 0x37, 0x8f, 0x40, 0xc6, 0x40, + 0x01, 0x81, 0x96, 0x9b, 0x17, 0x30, 0x3c, 0x89, 0xb1, 0xbf, 0xf3, 0xf1, + 0xa5, 0x5c, 0xdc, 0x1e, 0x69, 0xfc, 0xf1, 0xd8, 0x5d, 0xda, 0x13, 0x5b, + 0xbc, 0x1f, 0x41, 0x4a, 0xde, 0x44, 0x3c, 0x5e, 0xbd, 0x46, 0xb7, 0xad, + 0x32, 0xb8, 0xc7, 0xbf, 0x8c, 0x7a, 0xaf, 0x6c, 0xcb, 0x8f, 0xcb, 0x57, + 0x7c, 0xef, 0xf9, 0xd0, 0xed, 0xb2, 0xb0, 0x3c, 0x74, 0x28, 0x6f, 0x1c, + 0xba, 0x2a, 0x78, 0x57, 0x75, 0x81, 0xa8, 0xf8, 0xcb, 0x41, 0x6f, 0x7e, + 0x0e, 0x98, 0xe5, 0x44, 0x22, 0xa2, 0x00, 0x6c, 0xba, 0xaf, 0x51, 0xcc, + 0x9f, 0xba, 0x97, 0x39, 0xbb, 0x41, 0x60, 0xf0, 0xe9, 0xb7, 0xa7, 0xa0, + 0x7b, 0x7a, 0xde, 0xc9, 0x22, 0x13, 0xf4, 0x04, 0xaf, 0x91, 0xf5, 0x37, + 0x53, 0xad, 0x8d, 0x0d, 0x15, 0x7a, 0xf1, 0x81, 0x07, 0xd6, 0xa8, 
0x80,
+    0x0c, 0x8d, 0x02, 0x79, 0x43, 0x50, 0x98, 0x27, 0xfc, 0xbc, 0xb7, 0x8f,
+    0xe0, 0xe6, 0x46, 0x6f, 0x25, 0xef, 0x2a, 0x04, 0xd1, 0xbe, 0x10, 0x3d,
+    0xb4, 0x43, 0x3e, 0xf7, 0xea, 0xf4, 0xb8, 0x24, 0xdc, 0x77, 0x4f, 0x52,
+    0x26, 0x55, 0xae, 0xbc, 0x6f, 0xe0, 0x8e, 0x41, 0x97, 0x82, 0xd4, 0xb5,
+    0x77, 0x7c, 0xd6, 0xec, 0x26, 0x14, 0xb9, 0x11, 0x78, 0x34, 0x19, 0x90,
+    0x87, 0xf8, 0xeb, 0x0b, 0x02, 0xfb, 0x48, 0x82, 0xe9, 0x1e, 0x37, 0x99,
+    0xfd, 0x4a, 0x9f, 0xf6, 0x81, 0xf4, 0x87, 0x32, 0xa2, 0x45, 0x58, 0x0b,
+    0xb3, 0xac, 0x43, 0xda, 0x76, 0x36, 0x84, 0xdb, 0xd6, 0x6c, 0x39, 0xb1,
+    0xf3, 0xf1, 0xef, 0x78, 0x86, 0x53, 0x64, 0x6a, 0xb1, 0x80, 0xfe, 0xf3,
+    0xe0, 0x4d, 0x70, 0x4c, 0x01, 0x72, 0x2f, 0x4f, 0x29, 0x2d, 0xc2, 0x5c,
+};
+static_assert(sizeof(kBytesTestReadSymbol16) == kNumBytesTestReadSymbol16, "");
diff --git a/src/utils/executor.cc b/src/utils/executor.cc
new file mode 100644
index 0000000..6934057
--- /dev/null
+++ b/src/utils/executor.cc
@@ -0,0 +1,21 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/executor.h"
+
+namespace libgav1 {
+
+Executor::~Executor() = default;
+
+}  // namespace libgav1
diff --git a/src/utils/executor.h b/src/utils/executor.h
new file mode 100644
index 0000000..21abdf8
--- /dev/null
+++ b/src/utils/executor.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_EXECUTOR_H_
+#define LIBGAV1_SRC_UTILS_EXECUTOR_H_
+
+#include <functional>
+
+namespace libgav1 {
+
+class Executor {
+ public:
+  virtual ~Executor();
+
+  // Schedules the specified "callback" for execution in this executor.
+  // Depending on the subclass implementation, this may block in some
+  // situations.
+  virtual void Schedule(std::function<void()> callback) = 0;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_EXECUTOR_H_
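Not part of the imported tree: a minimal sketch of an Executor subclass, to show how the interface above is meant to be used. The InlineExecutor name is hypothetical; only Schedule()'s signature comes from executor.h.

  #include <functional>

  #include "src/utils/executor.h"

  // Runs each scheduled callback immediately on the calling thread.
  class InlineExecutor : public libgav1::Executor {
   public:
    void Schedule(std::function<void()> callback) override { callback(); }
  };

A thread-pool-backed implementation would instead enqueue the callback for a worker thread rather than running it inline.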
diff --git a/src/utils/libgav1_utils.cmake b/src/utils/libgav1_utils.cmake
new file mode 100644
index 0000000..587ca5d
--- /dev/null
+++ b/src/utils/libgav1_utils.cmake
@@ -0,0 +1,70 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_UTILS_LIBGAV1_UTILS_CMAKE_)
+  return()
+endif() # LIBGAV1_UTILS_LIBGAV1_UTILS_CMAKE_
+set(LIBGAV1_UTILS_LIBGAV1_UTILS_CMAKE_ 1)
+
+list(APPEND libgav1_utils_sources
+     "${libgav1_source}/utils/array_2d.h"
+     "${libgav1_source}/utils/bit_mask_set.h"
+     "${libgav1_source}/utils/bit_reader.cc"
+     "${libgav1_source}/utils/bit_reader.h"
+     "${libgav1_source}/utils/block_parameters_holder.cc"
+     "${libgav1_source}/utils/block_parameters_holder.h"
+     "${libgav1_source}/utils/blocking_counter.h"
+     "${libgav1_source}/utils/common.h"
+     "${libgav1_source}/utils/compiler_attributes.h"
+     "${libgav1_source}/utils/constants.cc"
+     "${libgav1_source}/utils/constants.h"
+     "${libgav1_source}/utils/cpu.cc"
+     "${libgav1_source}/utils/cpu.h"
+     "${libgav1_source}/utils/dynamic_buffer.h"
+     "${libgav1_source}/utils/entropy_decoder.cc"
+     "${libgav1_source}/utils/entropy_decoder.h"
+     "${libgav1_source}/utils/executor.cc"
+     "${libgav1_source}/utils/executor.h"
+     "${libgav1_source}/utils/logging.cc"
+     "${libgav1_source}/utils/logging.h"
+     "${libgav1_source}/utils/memory.h"
+     "${libgav1_source}/utils/queue.h"
+     "${libgav1_source}/utils/raw_bit_reader.cc"
+     "${libgav1_source}/utils/raw_bit_reader.h"
+     "${libgav1_source}/utils/reference_info.h"
+     "${libgav1_source}/utils/segmentation.cc"
+     "${libgav1_source}/utils/segmentation.h"
+     "${libgav1_source}/utils/segmentation_map.cc"
+     "${libgav1_source}/utils/segmentation_map.h"
+     "${libgav1_source}/utils/stack.h"
+     "${libgav1_source}/utils/threadpool.cc"
+     "${libgav1_source}/utils/threadpool.h"
+     "${libgav1_source}/utils/types.h"
+     "${libgav1_source}/utils/unbounded_queue.h"
+     "${libgav1_source}/utils/vector.h")
+
+macro(libgav1_add_utils_targets)
+  libgav1_add_library(NAME
+                      libgav1_utils
+                      TYPE
+                      OBJECT
+                      SOURCES
+                      ${libgav1_utils_sources}
+                      DEFINES
+                      ${libgav1_defines}
+                      INCLUDES
+                      ${libgav1_include_paths}
+                      ${libgav1_gtest_include_paths})
+
+endmacro()
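As a usage note: the macro above only defines the target. The top-level build is expected to include() this file and then invoke libgav1_add_utils_targets() once, which creates the libgav1_utils OBJECT library from the source list declared here.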
diff --git a/src/utils/logging.cc b/src/utils/logging.cc
new file mode 100644
index 0000000..26e3e15
--- /dev/null
+++ b/src/utils/logging.cc
@@ -0,0 +1,65 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/logging.h"
+
+#include <cstdarg>
+#include <cstdio>
+#include <sstream>
+#include <thread>  // NOLINT (unapproved c++11 header)
+
+#if !defined(LIBGAV1_LOG_LEVEL)
+#define LIBGAV1_LOG_LEVEL (1 << 30)
+#endif
+
+namespace libgav1 {
+namespace internal {
+#if LIBGAV1_ENABLE_LOGGING
+namespace {
+
+const char* LogSeverityName(LogSeverity severity) {
+  switch (severity) {
+    case LogSeverity::kInfo:
+      return "INFO";
+    case LogSeverity::kError:
+      return "ERROR";
+    case LogSeverity::kWarning:
+      return "WARNING";
+  }
+  return "UNKNOWN";
+}
+
+}  // namespace
+
+void Log(LogSeverity severity, const char* file, int line, const char* format,
+         ...) {
+  if (LIBGAV1_LOG_LEVEL < static_cast<int>(severity)) return;
+  std::ostringstream ss;
+  ss << std::hex << std::this_thread::get_id();
+  fprintf(stderr, "%s %s %s:%d] ", LogSeverityName(severity), ss.str().c_str(),
+          file, line);
+
+  va_list ap;
+  va_start(ap, format);
+  vfprintf(stderr, format, ap);
+  va_end(ap);
+  fprintf(stderr, "\n");
+}
+#else   // !LIBGAV1_ENABLE_LOGGING
+void Log(LogSeverity /*severity*/, const char* /*file*/, int /*line*/,
+         const char* /*format*/, ...) {}
+#endif  // LIBGAV1_ENABLE_LOGGING
+
+}  // namespace internal
+}  // namespace libgav1
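For reference, with logging enabled a call such as LIBGAV1_DLOG(ERROR, "unsupported profile: %d", profile) (a hypothetical call site) produces one stderr line of the form

  ERROR 7f2c8e5ae700 obu_parser.cc:123] unsupported profile: 3

matching the fprintf() format above: the severity name, the calling thread's id in hexadecimal, then basename:line followed by the formatted message.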
diff --git a/src/utils/logging.h b/src/utils/logging.h
new file mode 100644
index 0000000..473aebd
--- /dev/null
+++ b/src/utils/logging.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_LOGGING_H_
+#define LIBGAV1_SRC_UTILS_LOGGING_H_
+
+#include <cstddef>
+
+#include "src/utils/compiler_attributes.h"
+
+#if !defined(LIBGAV1_ENABLE_LOGGING)
+#if defined(NDEBUG) || defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
+#define LIBGAV1_ENABLE_LOGGING 0
+#else
+#define LIBGAV1_ENABLE_LOGGING 1
+#endif
+#endif
+
+#if LIBGAV1_ENABLE_LOGGING
+// LIBGAV1_DLOG(severity, printf-format-string)
+// Debug logging that can optionally be enabled in release builds by explicitly
+// setting LIBGAV1_ENABLE_LOGGING.
+// Severity is given as an all-caps version of enum LogSeverity with the
+// leading 'k' removed: LIBGAV1_DLOG(INFO, "...");
+#define LIBGAV1_DLOG(severity, ...)                                      \
+  do {                                                                   \
+    constexpr const char* libgav1_logging_internal_basename =            \
+        libgav1::internal::Basename(__FILE__, sizeof(__FILE__) - 1);     \
+    libgav1::internal::Log(LIBGAV1_LOGGING_INTERNAL_##severity,          \
+                           libgav1_logging_internal_basename, __LINE__,  \
+                           __VA_ARGS__);                                 \
+  } while (0)
+#else
+#define LIBGAV1_DLOG(severity, ...) \
+  do {                              \
+  } while (0)
+#endif  // LIBGAV1_ENABLE_LOGGING
+
+#define LIBGAV1_LOGGING_INTERNAL_ERROR libgav1::internal::LogSeverity::kError
+#define LIBGAV1_LOGGING_INTERNAL_WARNING \
+  libgav1::internal::LogSeverity::kWarning
+#define LIBGAV1_LOGGING_INTERNAL_INFO libgav1::internal::LogSeverity::kInfo
+
+namespace libgav1 {
+namespace internal {
+
+enum class LogSeverity : int {
+  kError,
+  kWarning,
+  kInfo,
+};
+
+// Helper function to implement LIBGAV1_DLOG
+// Logs |format, ...| at |severity| level, reporting it as called from
+// |file|:|line|.
+void Log(libgav1::internal::LogSeverity severity, const char* file, int line,
+         const char* format, ...) LIBGAV1_PRINTF_ATTRIBUTE(4, 5);
+
+// Compile-time function to get the 'base' file_name, that is, the part of
+// a file_name after the last '/' or '\' path separator. The search starts at
+// the end of the string; the second parameter is the length of the string.
+constexpr const char* Basename(const char* file_name, size_t offset) {
+  return (offset == 0 || file_name[offset - 1] == '/' ||
+          file_name[offset - 1] == '\\')
+             ? file_name + offset
+             : Basename(file_name, offset - 1);
+}
+
+}  // namespace internal
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_LOGGING_H_
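Because Basename() is constexpr, the file-name trimming in LIBGAV1_DLOG happens at compile time. For example, Basename("src/utils/logging.cc", 20) (the second argument is the string length, sizeof - 1) recurses back to the last '/' and returns a pointer to the trailing "logging.cc", which is what the macro passes to Log().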
diff --git a/src/utils/memory.h b/src/utils/memory.h
new file mode 100644
index 0000000..d1762a2
--- /dev/null
+++ b/src/utils/memory.h
@@ -0,0 +1,243 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_MEMORY_H_
+#define LIBGAV1_SRC_UTILS_MEMORY_H_
+
+#if defined(__ANDROID__) || defined(_MSC_VER) || defined(__MINGW32__)
+#include <malloc.h>
+#endif
+
+#include <cerrno>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <new>
+
+namespace libgav1 {
+
+enum {
+// The byte alignment required for buffers used with SIMD code to be read or
+// written with aligned operations.
+#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || \
+    defined(_M_X64)
+  kMaxAlignment = 32,  // extended alignment is safe on x86.
+#else
+  kMaxAlignment = alignof(max_align_t),
+#endif
+};
+
+// AlignedAlloc, AlignedFree
+//
+// void* AlignedAlloc(size_t alignment, size_t size);
+//   Allocate aligned memory.
+//   |alignment| must be a power of 2.
+//   Unlike posix_memalign(), |alignment| may be smaller than sizeof(void*).
+//   Unlike aligned_alloc(), |size| does not need to be a multiple of
+//   |alignment|.
+//   The returned pointer should be freed by AlignedFree().
+//
+// void AlignedFree(void* aligned_memory);
+//   Free aligned memory.
+
+#if defined(_MSC_VER) || defined(__MINGW32__)
+
+inline void* AlignedAlloc(size_t alignment, size_t size) {
+  return _aligned_malloc(size, alignment);
+}
+
+inline void AlignedFree(void* aligned_memory) { _aligned_free(aligned_memory); }
+
+#else  // !(defined(_MSC_VER) || defined(__MINGW32__))
+
+inline void* AlignedAlloc(size_t alignment, size_t size) {
+#if defined(__ANDROID__)
+  // Although posix_memalign() was introduced in Android API level 17, it is
+  // more convenient to use memalign(). Unlike glibc, Android does not consider
+  // memalign() an obsolete function.
+  return memalign(alignment, size);
+#else   // !defined(__ANDROID__)
+  void* ptr = nullptr;
+  // posix_memalign requires that the requested alignment be at least
+  // sizeof(void*). In this case, fall back on malloc which should return
+  // memory aligned to at least the size of a pointer.
+  const size_t required_alignment = sizeof(void*);
+  if (alignment < required_alignment) return malloc(size);
+  const int error = posix_memalign(&ptr, alignment, size);
+  if (error != 0) {
+    errno = error;
+    return nullptr;
+  }
+  return ptr;
+#endif  // defined(__ANDROID__)
+}
+
+inline void AlignedFree(void* aligned_memory) { free(aligned_memory); }
+
+#endif  // defined(_MSC_VER) || defined(__MINGW32__)
+
+inline void Memset(uint8_t* const dst, int value, size_t count) {
+  memset(dst, value, count);
+}
+
+inline void Memset(uint16_t* const dst, int value, size_t count) {
+  for (size_t i = 0; i < count; ++i) {
+    dst[i] = static_cast<uint16_t>(value);
+  }
+}
+
+inline void Memset(int16_t* const dst, int value, size_t count) {
+  for (size_t i = 0; i < count; ++i) {
+    dst[i] = static_cast<int16_t>(value);
+  }
+}
+
+struct MallocDeleter {
+  void operator()(void* ptr) const { free(ptr); }
+};
+
+struct AlignedDeleter {
+  void operator()(void* ptr) const { AlignedFree(ptr); }
+};
+
+template <typename T>
+using AlignedUniquePtr = std::unique_ptr<T, AlignedDeleter>;
+
+// Allocates aligned memory for an array of |count| elements of type T.
+template <typename T>
+inline AlignedUniquePtr<T> MakeAlignedUniquePtr(size_t alignment,
+                                                size_t count) {
+  return AlignedUniquePtr<T>(
+      static_cast<T*>(AlignedAlloc(alignment, count * sizeof(T))));
+}
+
+// A base class with custom new and delete operators. The exception-throwing
+// new operators are deleted. The "new (std::nothrow)" form must be used.
+//
+// The new operators return nullptr if the requested size is greater than
+// 0x40000000 bytes (1 GB). TODO(wtc): Make the maximum allocable memory size
+// a compile-time configuration macro.
+//
+// See https://en.cppreference.com/w/cpp/memory/new/operator_new and
+// https://en.cppreference.com/w/cpp/memory/new/operator_delete.
+//
+// NOTE: The allocation and deallocation functions are static member functions
+// whether the keyword 'static' is used or not.
+struct Allocable {
+  // Class-specific allocation functions.
+  static void* operator new(size_t size) = delete;
+  static void* operator new[](size_t size) = delete;
+
+  // Class-specific non-throwing allocation functions
+  static void* operator new(size_t size, const std::nothrow_t& tag) noexcept {
+    if (size > 0x40000000) return nullptr;
+    return ::operator new(size, tag);
+  }
+  static void* operator new[](size_t size,
+                              const std::nothrow_t& tag) noexcept {
+    if (size > 0x40000000) return nullptr;
+    return ::operator new[](size, tag);
+  }
+
+  // Class-specific deallocation functions.
+  static void operator delete(void* ptr) noexcept { ::operator delete(ptr); }
+  static void operator delete[](void* ptr) noexcept {
+    ::operator delete[](ptr);
+  }
+
+  // Only called if new (std::nothrow) is used and the constructor throws an
+  // exception.
+  static void operator delete(void* ptr, const std::nothrow_t& tag) noexcept {
+    ::operator delete(ptr, tag);
+  }
+  // Only called if new[] (std::nothrow) is used and the constructor throws an
+  // exception.
+  static void operator delete[](void* ptr,
+                                const std::nothrow_t& tag) noexcept {
+    ::operator delete[](ptr, tag);
+  }
+};
+
+// A variant of Allocable that forces allocations to be aligned to
+// kMaxAlignment bytes. This is intended for use with classes that use
+// alignas() with this value. C++17 aligned new/delete are used if available,
+// otherwise we use AlignedAlloc/Free.
+struct MaxAlignedAllocable {
+  // Class-specific allocation functions.
+  static void* operator new(size_t size) = delete;
+  static void* operator new[](size_t size) = delete;
+
+  // Class-specific non-throwing allocation functions
+  static void* operator new(size_t size, const std::nothrow_t& tag) noexcept {
+    if (size > 0x40000000) return nullptr;
+#ifdef __cpp_aligned_new
+    return ::operator new(size, std::align_val_t(kMaxAlignment), tag);
+#else
+    static_cast<void>(tag);
+    return AlignedAlloc(kMaxAlignment, size);
+#endif
+  }
+  static void* operator new[](size_t size,
+                              const std::nothrow_t& tag) noexcept {
+    if (size > 0x40000000) return nullptr;
+#ifdef __cpp_aligned_new
+    return ::operator new[](size, std::align_val_t(kMaxAlignment), tag);
+#else
+    static_cast<void>(tag);
+    return AlignedAlloc(kMaxAlignment, size);
+#endif
+  }
+
+  // Class-specific deallocation functions.
+  static void operator delete(void* ptr) noexcept {
+#ifdef __cpp_aligned_new
+    ::operator delete(ptr, std::align_val_t(kMaxAlignment));
+#else
+    AlignedFree(ptr);
+#endif
+  }
+  static void operator delete[](void* ptr) noexcept {
+#ifdef __cpp_aligned_new
+    ::operator delete[](ptr, std::align_val_t(kMaxAlignment));
+#else
+    AlignedFree(ptr);
+#endif
+  }
+
+  // Only called if new (std::nothrow) is used and the constructor throws an
+  // exception.
+  static void operator delete(void* ptr, const std::nothrow_t& tag) noexcept {
+#ifdef __cpp_aligned_new
+    ::operator delete(ptr, std::align_val_t(kMaxAlignment), tag);
+#else
+    static_cast<void>(tag);
+    AlignedFree(ptr);
+#endif
+  }
+  // Only called if new[] (std::nothrow) is used and the constructor throws an
+  // exception.
+  static void operator delete[](void* ptr,
+                                const std::nothrow_t& tag) noexcept {
+#ifdef __cpp_aligned_new
+    ::operator delete[](ptr, std::align_val_t(kMaxAlignment), tag);
+#else
+    static_cast<void>(tag);
+    AlignedFree(ptr);
+#endif
+  }
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_MEMORY_H_
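A minimal usage sketch for the two base classes above (the Foo type is hypothetical, and <new> is assumed to be included for std::nothrow):

  struct Foo : public libgav1::Allocable {
    int value;
  };

  Foo* const foo = new (std::nothrow) Foo;  // nullptr on failure; never throws
  if (foo != nullptr) {
    foo->value = 1;
    delete foo;  // dispatches to Allocable::operator delete
  }

A type that applies alignas(kMaxAlignment) to its members should derive from MaxAlignedAllocable instead, so that the nothrow operators honor the extended alignment.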
diff --git a/src/utils/memory_test.cc b/src/utils/memory_test.cc
new file mode 100644
index 0000000..42f6a15
--- /dev/null
+++ b/src/utils/memory_test.cc
@@ -0,0 +1,184 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/memory.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <new>
+
+#include "absl/base/config.h"
+#include "gtest/gtest.h"
+
+#ifdef ABSL_HAVE_EXCEPTIONS
+#include <exception>
+#endif
+
+namespace libgav1 {
+namespace {
+
+constexpr size_t kMaxAllocableSize = 0x40000000;
+
+struct Small : public Allocable {
+  uint8_t x;
+};
+
+struct Huge : public Allocable {
+  uint8_t x[kMaxAllocableSize + 1];
+};
+
+struct SmallMaxAligned : public MaxAlignedAllocable {
+  alignas(kMaxAlignment) uint8_t x;
+};
+
+struct HugeMaxAligned : public MaxAlignedAllocable {
+  alignas(kMaxAlignment) uint8_t x[kMaxAllocableSize + 1];
+};
+
+#ifdef ABSL_HAVE_EXCEPTIONS
+struct ThrowingConstructor : public Allocable {
+  ThrowingConstructor() { throw std::exception(); }
+
+  uint8_t x;
+};
+
+struct MaxAlignedThrowingConstructor : public MaxAlignedAllocable {
+  MaxAlignedThrowingConstructor() { throw std::exception(); }
+
+  uint8_t x;
+};
+#endif
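+
+// The types above pair one-byte payloads with payloads that exceed the
+// 0x40000000-byte limit enforced by the custom operators, so the tests below
+// can exercise both the success and the nullptr failure paths.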
+
+#include "src/utils/memory.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <new>
+
+#include "absl/base/config.h"
+#include "gtest/gtest.h"
+
+#ifdef ABSL_HAVE_EXCEPTIONS
+#include <exception>
+#endif
+
+namespace libgav1 {
+namespace {
+
+constexpr size_t kMaxAllocableSize = 0x40000000;
+
+struct Small : public Allocable {
+  uint8_t x;
+};
+
+struct Huge : public Allocable {
+  uint8_t x[kMaxAllocableSize + 1];
+};
+
+struct SmallMaxAligned : public MaxAlignedAllocable {
+  alignas(kMaxAlignment) uint8_t x;
+};
+
+struct HugeMaxAligned : public MaxAlignedAllocable {
+  alignas(kMaxAlignment) uint8_t x[kMaxAllocableSize + 1];
+};
+
+#ifdef ABSL_HAVE_EXCEPTIONS
+struct ThrowingConstructor : public Allocable {
+  ThrowingConstructor() { throw std::exception(); }
+
+  uint8_t x;
+};
+
+struct MaxAlignedThrowingConstructor : public MaxAlignedAllocable {
+  MaxAlignedThrowingConstructor() { throw std::exception(); }
+
+  uint8_t x;
+};
+#endif
+
+TEST(MemoryTest, TestAlignedAllocFree) {
+  for (size_t alignment = 1; alignment <= 1 << 20; alignment <<= 1) {
+    void* p = AlignedAlloc(alignment, 1);
+    // Note this additional check is to avoid an incorrect static-analysis
+    // warning for leaked memory with a plain ASSERT_NE().
+    if (p == nullptr) {
+      FAIL() << "AlignedAlloc(" << alignment << ", 1)";
+    }
+    const auto p_value = reinterpret_cast<uintptr_t>(p);
+    EXPECT_EQ(p_value % alignment, 0)
+        << "AlignedAlloc(" << alignment << ", 1) = " << p;
+    AlignedFree(p);
+  }
+}
+
+TEST(MemoryTest, TestAlignedUniquePtrAlloc) {
+  for (size_t alignment = 1; alignment <= 1 << 20; alignment <<= 1) {
+    auto p = MakeAlignedUniquePtr<uint8_t>(alignment, 1);
+    ASSERT_NE(p, nullptr) << "MakeAlignedUniquePtr(" << alignment << ", 1)";
+    const auto p_value = reinterpret_cast<uintptr_t>(p.get());
+    EXPECT_EQ(p_value % alignment, 0)
+        << "MakeAlignedUniquePtr(" << alignment << ", 1) = " << p.get();
+  }
+}
+
+TEST(MemoryTest, TestAllocable) {
+  // Allocable::operator new (std::nothrow) is called.
+  std::unique_ptr<Small> small(new (std::nothrow) Small);
+  EXPECT_NE(small, nullptr);
+  // Allocable::operator delete is called.
+  small = nullptr;
+
+  // Allocable::operator new[] (std::nothrow) is called.
+  std::unique_ptr<Small[]> small_array_of_smalls(new (std::nothrow) Small[10]);
+  EXPECT_NE(small_array_of_smalls, nullptr);
+  // Allocable::operator delete[] is called.
+  small_array_of_smalls = nullptr;
+
+  // Allocable::operator new (std::nothrow) is called.
+  std::unique_ptr<Huge> huge(new (std::nothrow) Huge);
+  EXPECT_EQ(huge, nullptr);
+
+  // Allocable::operator new[] (std::nothrow) is called.
+  std::unique_ptr<Small[]> huge_array_of_smalls(
+      new (std::nothrow) Small[kMaxAllocableSize / sizeof(Small) + 1]);
+  EXPECT_EQ(huge_array_of_smalls, nullptr);
+
+#ifdef ABSL_HAVE_EXCEPTIONS
+  try {
+    // Allocable::operator new (std::nothrow) is called.
+    // The constructor throws an exception.
+    // Allocable::operator delete (std::nothrow) is called.
+    ThrowingConstructor* always = new (std::nothrow) ThrowingConstructor;
+    static_cast<void>(always);
+  } catch (...) {
+  }
+
+  try {
+    // Allocable::operator new[] (std::nothrow) is called.
+    // The constructor throws an exception.
+    // Allocable::operator delete[] (std::nothrow) is called.
+    ThrowingConstructor* always = new (std::nothrow) ThrowingConstructor[2];
+    static_cast<void>(always);
+  } catch (...) {
+  }
+#endif  // ABSL_HAVE_EXCEPTIONS
+}
+
+TEST(MemoryTest, TestMaxAlignedAllocable) {
+  // MaxAlignedAllocable::operator new (std::nothrow) is called.
+  std::unique_ptr<SmallMaxAligned> small(new (std::nothrow) SmallMaxAligned);
+  EXPECT_NE(small, nullptr);
+  // Note this check doesn't guarantee conformance as a suitably aligned
+  // address may be returned from any allocator.
+  EXPECT_EQ(reinterpret_cast<uintptr_t>(small.get()) & (kMaxAlignment - 1), 0);
+  // MaxAlignedAllocable::operator delete is called.
+  small = nullptr;
+
+  // MaxAlignedAllocable::operator new[] (std::nothrow) is called.
+  std::unique_ptr<SmallMaxAligned[]> small_array_of_smalls(
+      new (std::nothrow) SmallMaxAligned[10]);
+  EXPECT_NE(small_array_of_smalls, nullptr);
+  EXPECT_EQ(reinterpret_cast<uintptr_t>(small_array_of_smalls.get()) &
+                (kMaxAlignment - 1),
+            0);
+  // MaxAlignedAllocable::operator delete[] is called.
+  small_array_of_smalls = nullptr;
+
+  // MaxAlignedAllocable::operator new (std::nothrow) is called.
+  std::unique_ptr<HugeMaxAligned> huge(new (std::nothrow) HugeMaxAligned);
+  EXPECT_EQ(huge, nullptr);
+
+  // MaxAlignedAllocable::operator new[] (std::nothrow) is called.
+  std::unique_ptr<SmallMaxAligned[]> huge_array_of_smalls(
+      new (std::nothrow)
+          SmallMaxAligned[kMaxAllocableSize / sizeof(SmallMaxAligned) + 1]);
+  EXPECT_EQ(huge_array_of_smalls, nullptr);
+
+#ifdef ABSL_HAVE_EXCEPTIONS
+  try {
+    // MaxAlignedAllocable::operator new (std::nothrow) is called.
+    // The constructor throws an exception.
+    // MaxAlignedAllocable::operator delete (std::nothrow) is called.
+    auto* always = new (std::nothrow) MaxAlignedThrowingConstructor;
+    static_cast<void>(always);
+  } catch (...) {
+  }
+
+  try {
+    // MaxAlignedAllocable::operator new[] (std::nothrow) is called.
+    // The constructor throws an exception.
+    // MaxAlignedAllocable::operator delete[] (std::nothrow) is called.
+    auto* always = new (std::nothrow) MaxAlignedThrowingConstructor[2];
+    static_cast<void>(always);
+  } catch (...) {
+  }
+#endif  // ABSL_HAVE_EXCEPTIONS
+}
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/utils/queue.h b/src/utils/queue.h
new file mode 100644
index 0000000..fcc7bfe
--- /dev/null
+++ b/src/utils/queue.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_QUEUE_H_
+#define LIBGAV1_SRC_UTILS_QUEUE_H_
+
+#include <cassert>
+#include <cstddef>
+#include <memory>
+#include <new>
+#include <utility>
+
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+
+// A FIFO queue of a fixed capacity.
+//
+// WARNING: No error checking is performed.
+template <typename T>
+class Queue {
+ public:
+  LIBGAV1_MUST_USE_RESULT bool Init(size_t capacity) {
+    elements_.reset(new (std::nothrow) T[capacity]);
+    if (elements_ == nullptr) return false;
+    capacity_ = capacity;
+    return true;
+  }
+
+  // Pushes the element |value| to the end of the queue. It is an error to
+  // call Push() when the queue is full.
+  void Push(T&& value) {
+    assert(size_ < capacity_);
+    elements_[end_++] = std::move(value);
+    if (end_ == capacity_) end_ = 0;
+    ++size_;
+  }
+
+  // Removes the element at the front of the queue. It is an error to call
+  // Pop() when the queue is empty.
+  void Pop() {
+    assert(size_ != 0);
+    const T element = std::move(elements_[begin_++]);
+    static_cast<void>(element);
+    if (begin_ == capacity_) begin_ = 0;
+    --size_;
+  }
+
+  // Returns a reference to the element at the front of the queue. It is an
+  // error to call Front() when the queue is empty.
+  T& Front() {
+    assert(size_ != 0);
+    return elements_[begin_];
+  }
+
+  // Returns a reference to the element at the back of the queue. It is an
+  // error to call Back() when the queue is empty.
+  T& Back() {
+    assert(size_ != 0);
+    const size_t back = ((end_ == 0) ? capacity_ : end_) - 1;
+    return elements_[back];
+  }
+
+  // Clears the queue.
+  void Clear() {
+    while (!Empty()) {
+      Pop();
+    }
+  }
+
+  // Returns true if the queue is empty.
+  bool Empty() const { return size_ == 0; }
+
+  // Returns true if the queue is full.
+  bool Full() const { return size_ >= capacity_; }
+
+  // Returns the number of elements in the queue.
+  size_t Size() const { return size_; }
+
+ private:
+  // An array of |capacity| elements. Used as a circular array.
+  std::unique_ptr<T[]> elements_;
+  size_t capacity_ = 0;
+  // The index of the element to be removed by Pop().
+  size_t begin_ = 0;
+  // The index where the new element is inserted by Push().
+  size_t end_ = 0;
+  size_t size_ = 0;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_QUEUE_H_
diff --git a/src/utils/queue_test.cc b/src/utils/queue_test.cc
new file mode 100644
index 0000000..d84ae5f
--- /dev/null
+++ b/src/utils/queue_test.cc
@@ -0,0 +1,86 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/queue.h"
+
+#include <utility>
+#include <vector>
+
+#include "gtest/gtest.h"
+
+namespace libgav1 {
+namespace {
+
+struct TestClass {
+  TestClass() = default;
+  explicit TestClass(int i) : i(i) {}
+  int i;
+  // The vector exists simply so that the class is not trivially copyable.
+ std::vector dummy; +}; + +TEST(QueueTest, Basic) { + Queue queue; + ASSERT_TRUE(queue.Init(8)); + EXPECT_TRUE(queue.Empty()); + + for (int i = 0; i < 8; ++i) { + EXPECT_FALSE(queue.Full()); + TestClass test(i); + queue.Push(std::move(test)); + EXPECT_EQ(queue.Back().i, i); + EXPECT_FALSE(queue.Empty()); + } + EXPECT_TRUE(queue.Full()); + + for (int i = 0; i < 8; ++i) { + EXPECT_FALSE(queue.Empty()); + EXPECT_EQ(queue.Front().i, i); + queue.Pop(); + EXPECT_FALSE(queue.Full()); + } + EXPECT_TRUE(queue.Empty()); + + for (int i = 0; i < 8; ++i) { + EXPECT_FALSE(queue.Full()); + TestClass test(i); + queue.Push(std::move(test)); + EXPECT_EQ(queue.Back().i, i); + EXPECT_FALSE(queue.Empty()); + } + EXPECT_TRUE(queue.Full()); + queue.Clear(); + EXPECT_TRUE(queue.Empty()); + EXPECT_FALSE(queue.Full()); +} + +TEST(QueueTest, WrapAround) { + Queue queue; + ASSERT_TRUE(queue.Init(8)); + EXPECT_TRUE(queue.Empty()); + + for (int i = 0; i < 100; ++i) { + EXPECT_FALSE(queue.Full()); + TestClass test(i); + queue.Push(std::move(test)); + EXPECT_EQ(queue.Back().i, i); + EXPECT_FALSE(queue.Empty()); + EXPECT_EQ(queue.Front().i, i); + queue.Pop(); + EXPECT_TRUE(queue.Empty()); + } +} + +} // namespace +} // namespace libgav1 diff --git a/src/utils/raw_bit_reader.cc b/src/utils/raw_bit_reader.cc new file mode 100644 index 0000000..15e980d --- /dev/null +++ b/src/utils/raw_bit_reader.cc @@ -0,0 +1,224 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/utils/raw_bit_reader.h" + +#include +#include + +#include "src/utils/common.h" +#include "src/utils/logging.h" + +// Note is only needed when logging is enabled (for the PRI* +// macros). It depends on the definition of LIBGAV1_ENABLE_LOGGING from +// logging.h, thus the non-standard header ordering. +#if LIBGAV1_ENABLE_LOGGING +#include +#endif + +namespace libgav1 { +namespace { + +constexpr int kMaximumLeb128Size = 8; +constexpr uint8_t kLeb128ValueByteMask = 0x7f; +constexpr uint8_t kLeb128TerminationByteMask = 0x80; + +uint8_t Mod8(size_t n) { + // Last 3 bits are the value of mod 8. + return n & 0x07; +} + +size_t DivideBy8(size_t n, bool ceil) { return (n + (ceil ? 
7 : 0)) >> 3; } + +} // namespace + +RawBitReader::RawBitReader(const uint8_t* data, size_t size) + : data_(data), bit_offset_(0), size_(size) { + assert(data_ != nullptr || size_ == 0); +} + +int RawBitReader::ReadBitImpl() { + const size_t byte_offset = DivideBy8(bit_offset_, false); + const uint8_t byte = data_[byte_offset]; + const uint8_t shift = 7 - Mod8(bit_offset_); + ++bit_offset_; + return static_cast((byte >> shift) & 0x01); +} + +int RawBitReader::ReadBit() { + if (Finished()) return -1; + return ReadBitImpl(); +} + +int64_t RawBitReader::ReadLiteral(int num_bits) { + assert(num_bits <= 32); + if (!CanReadLiteral(num_bits)) return -1; + assert(num_bits > 0); + uint32_t literal = 0; + int bit = num_bits - 1; + do { + // ARM can combine a shift operation with a constant number of bits with + // some other operations, such as the OR operation. + // Here is an ARM disassembly example: + // orr w1, w0, w1, lsl #1 + // which left shifts register w1 by 1 bit and OR the shift result with + // register w0. + // The next 2 lines are equivalent to: + // literal |= static_cast(ReadBitImpl()) << bit; + literal <<= 1; + literal |= static_cast(ReadBitImpl()); + } while (--bit >= 0); + return literal; +} + +bool RawBitReader::ReadInverseSignedLiteral(int num_bits, int* const value) { + assert(num_bits + 1 < 32); + *value = static_cast(ReadLiteral(num_bits + 1)); + if (*value == -1) return false; + const int sign_bit = 1 << num_bits; + if ((*value & sign_bit) != 0) { + *value -= 2 * sign_bit; + } + return true; +} + +bool RawBitReader::ReadLittleEndian(int num_bytes, size_t* const value) { + // We must be at a byte boundary. + assert(Mod8(bit_offset_) == 0); + assert(num_bytes <= 4); + static_assert(sizeof(size_t) >= 4, ""); + if (value == nullptr) return false; + size_t byte_offset = DivideBy8(bit_offset_, false); + if (Finished() || byte_offset + num_bytes > size_) { + LIBGAV1_DLOG(ERROR, "Not enough bits to read Little Endian value."); + return false; + } + *value = 0; + for (int i = 0; i < num_bytes; ++i) { + const size_t byte = data_[byte_offset]; + *value |= (byte << (i * 8)); + ++byte_offset; + } + bit_offset_ = byte_offset * 8; + return true; +} + +bool RawBitReader::ReadUnsignedLeb128(size_t* const value) { + // We must be at a byte boundary. 
+ assert(Mod8(bit_offset_) == 0); + if (value == nullptr) return false; + uint64_t value64 = 0; + for (int i = 0; i < kMaximumLeb128Size; ++i) { + if (Finished()) { + LIBGAV1_DLOG(ERROR, "Not enough bits to read LEB128 value."); + return false; + } + const size_t byte_offset = DivideBy8(bit_offset_, false); + const uint8_t byte = data_[byte_offset]; + bit_offset_ += 8; + value64 |= static_cast(byte & kLeb128ValueByteMask) << (i * 7); + if ((byte & kLeb128TerminationByteMask) == 0) { + if (value64 != static_cast(value64) || + value64 > std::numeric_limits::max()) { + LIBGAV1_DLOG( + ERROR, "LEB128 value (%" PRIu64 ") exceeded uint32_t maximum (%u).", + value64, std::numeric_limits::max()); + return false; + } + *value = static_cast(value64); + return true; + } + } + LIBGAV1_DLOG( + ERROR, + "Exceeded kMaximumLeb128Size (%d) when trying to read LEB128 value", + kMaximumLeb128Size); + return false; +} + +bool RawBitReader::ReadUvlc(uint32_t* const value) { + if (value == nullptr) return false; + int leading_zeros = 0; + while (true) { + const int bit = ReadBit(); + if (bit == -1) { + LIBGAV1_DLOG(ERROR, "Not enough bits to read uvlc value."); + return false; + } + if (bit == 1) break; + ++leading_zeros; + if (leading_zeros == 32) { + LIBGAV1_DLOG(ERROR, + "Exceeded maximum size (32) when trying to read uvlc value"); + return false; + } + } + int literal; + if (leading_zeros != 0) { + literal = static_cast(ReadLiteral(leading_zeros)); + if (literal == -1) { + LIBGAV1_DLOG(ERROR, "Not enough bits to read uvlc value."); + return false; + } + literal += (1U << leading_zeros) - 1; + } else { + literal = 0; + } + *value = literal; + return true; +} + +bool RawBitReader::AlignToNextByte() { + while ((bit_offset_ & 7) != 0) { + if (ReadBit() != 0) { + return false; + } + } + return true; +} + +bool RawBitReader::VerifyAndSkipTrailingBits(size_t num_bits) { + if (ReadBit() != 1) return false; + for (size_t i = 0; i < num_bits - 1; ++i) { + if (ReadBit() != 0) return false; + } + return true; +} + +bool RawBitReader::SkipBytes(size_t num_bytes) { + // If we are not at a byte boundary, return false. + return ((bit_offset_ & 7) != 0) ? false : SkipBits(num_bytes * 8); +} + +bool RawBitReader::SkipBits(size_t num_bits) { + // If the reader is already finished, return false. + if (Finished()) return false; + // If skipping |num_bits| runs out of buffer, return false. + const size_t bit_offset = bit_offset_ + num_bits - 1; + if (DivideBy8(bit_offset, false) >= size_) return false; + bit_offset_ += num_bits; + return true; +} + +bool RawBitReader::CanReadLiteral(size_t num_bits) const { + if (Finished()) return false; + const size_t bit_offset = bit_offset_ + num_bits - 1; + return DivideBy8(bit_offset, false) < size_; +} + +bool RawBitReader::Finished() const { + return DivideBy8(bit_offset_, false) >= size_; +} + +} // namespace libgav1 diff --git a/src/utils/raw_bit_reader.h b/src/utils/raw_bit_reader.h new file mode 100644 index 0000000..da770d1 --- /dev/null +++ b/src/utils/raw_bit_reader.h @@ -0,0 +1,78 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_UTILS_RAW_BIT_READER_H_ +#define LIBGAV1_SRC_UTILS_RAW_BIT_READER_H_ + +#include +#include + +#include "src/utils/bit_reader.h" +#include "src/utils/memory.h" + +namespace libgav1 { + +class RawBitReader final : public BitReader, public Allocable { + public: + RawBitReader(const uint8_t* data, size_t size); + ~RawBitReader() override = default; + + int ReadBit() override; + int64_t ReadLiteral(int num_bits) override; // f(n) in the spec. + bool ReadInverseSignedLiteral(int num_bits, + int* value); // su(1+num_bits) in the spec. + bool ReadLittleEndian(int num_bytes, + size_t* value); // le(n) in the spec. + bool ReadUnsignedLeb128(size_t* value); // leb128() in the spec. + // Reads a variable length unsigned number and stores it in |*value|. On a + // successful return, |*value| is in the range of 0 to UINT32_MAX - 1, + // inclusive. + bool ReadUvlc(uint32_t* value); // uvlc() in the spec. + bool Finished() const; + size_t bit_offset() const { return bit_offset_; } + // Return the bytes consumed so far (rounded up). + size_t byte_offset() const { return (bit_offset() + 7) >> 3; } + size_t size() const { return size_; } + // Move to the next byte boundary if not already at one. Return false if any + // of the bits being skipped over is non-zero. Return true otherwise. If this + // function returns false, the reader is left in an undefined state and must + // not be used further. section 5.3.5. + bool AlignToNextByte(); + // Make sure that the trailing bits structure is as expected and skip over it. + // section 5.3.4. + bool VerifyAndSkipTrailingBits(size_t num_bits); + // Skip |num_bytes| bytes. This only works if the current position is at a + // byte boundary. The function returns false if the current position is not at + // a byte boundary or if skipping |num_bytes| causes the reader to run out of + // buffer. Returns true otherwise. + bool SkipBytes(size_t num_bytes); + // Skip |num_bits| bits. The function returns false if skipping |num_bits| + // causes the reader to run out of buffer. Returns true otherwise. + bool SkipBits(size_t num_bits); + + private: + // Returns true if it is safe to read a literal of size |num_bits|. + bool CanReadLiteral(size_t num_bits) const; + int ReadBitImpl(); + + const uint8_t* const data_; + size_t bit_offset_; + const size_t size_; +}; + +} // namespace libgav1 + +#endif // LIBGAV1_SRC_UTILS_RAW_BIT_READER_H_ diff --git a/src/utils/raw_bit_reader_test.cc b/src/utils/raw_bit_reader_test.cc new file mode 100644 index 0000000..22a97a7 --- /dev/null +++ b/src/utils/raw_bit_reader_test.cc @@ -0,0 +1,580 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
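+
+// Editorial worked example (not original libgav1 text) of the uvlc() coding
+// exercised below: the reader counts leading zero bits up to the first 1,
+// then reads that many literal bits and adds (1 << leading_zeros) - 1.
+// For the byte 0x38 = 00111000b:
+//   leading_zeros = 2, marker bit = 1, literal bits = 11b = 3,
+//   value = 3 + (1 << 2) - 1 = 6,
+// matching the ReadUvlc() expectation in this file.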
+ +#include "src/utils/raw_bit_reader.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "src/utils/constants.h" +#include "tests/third_party/libvpx/acm_random.h" + +namespace libgav1 { +namespace { + +std::string IntegerToString(int x) { return std::bitset<8>(x).to_string(); } + +class RawBitReaderTest : public testing::TestWithParam> { + protected: + RawBitReaderTest() + : literal_size_(std::get<0>(GetParam())), + test_data_size_(std::get<1>(GetParam())) {} + + void CreateReader(const std::vector& data) { + data_ = data; + raw_bit_reader_.reset(new (std::nothrow) + RawBitReader(data_.data(), data_.size())); + } + + void CreateReader(int size) { + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + data_.clear(); + for (int i = 0; i < size; ++i) { + data_.push_back(rnd.Rand8()); + } + raw_bit_reader_.reset(new (std::nothrow) + RawBitReader(data_.data(), data_.size())); + } + + // Some tests don't depend on |literal_size_|. For those tests, return true if + // the |literal_size_| is greater than 1. If this function returns true, the + // test will abort. + bool RunOnlyOnce() const { return literal_size_ > 1; } + + std::unique_ptr raw_bit_reader_; + std::vector data_; + int literal_size_; + int test_data_size_; +}; + +TEST_P(RawBitReaderTest, ReadBit) { + if (RunOnlyOnce()) return; + CreateReader(test_data_size_); + for (const auto& value : data_) { + const std::string expected = IntegerToString(value); + for (int j = 0; j < 8; ++j) { + EXPECT_FALSE(raw_bit_reader_->Finished()); + EXPECT_EQ(static_cast(expected[j] == '1'), + raw_bit_reader_->ReadBit()); + } + } + EXPECT_TRUE(raw_bit_reader_->Finished()); + EXPECT_EQ(raw_bit_reader_->ReadBit(), -1); +} + +TEST_P(RawBitReaderTest, ReadLiteral) { + const int size_bytes = literal_size_; + const int size_bits = 8 * size_bytes; + CreateReader(test_data_size_ * size_bytes); + for (size_t i = 0; i < data_.size(); i += size_bytes) { + uint32_t expected_literal = 0; + for (int j = 0; j < size_bytes; ++j) { + expected_literal |= + static_cast(data_[i + j] << (8 * (size_bytes - j - 1))); + } + EXPECT_FALSE(raw_bit_reader_->Finished()); + const int64_t actual_literal = raw_bit_reader_->ReadLiteral(size_bits); + EXPECT_EQ(static_cast(expected_literal), actual_literal); + EXPECT_GE(actual_literal, 0); + } + EXPECT_TRUE(raw_bit_reader_->Finished()); + EXPECT_EQ(raw_bit_reader_->ReadLiteral(10), -1); +} + +TEST_P(RawBitReaderTest, ReadLiteral32BitsWithMsbSet) { + if (RunOnlyOnce()) return; + // Three 32-bit values with MSB set. + CreateReader({0xff, 0xff, 0xff, 0xff, // 4294967295 + 0x80, 0xff, 0xee, 0xdd, // 2164256477 + 0xa0, 0xaa, 0xbb, 0xcc}); // 2695543756 + static constexpr int64_t expected_literals[] = {4294967295, 2164256477, + 2695543756}; + for (const int64_t expected_literal : expected_literals) { + EXPECT_FALSE(raw_bit_reader_->Finished()); + const int64_t actual_literal = raw_bit_reader_->ReadLiteral(32); + EXPECT_EQ(expected_literal, actual_literal); + EXPECT_GE(actual_literal, 0); + } + EXPECT_TRUE(raw_bit_reader_->Finished()); + EXPECT_EQ(raw_bit_reader_->ReadLiteral(10), -1); +} + +TEST_P(RawBitReaderTest, ReadLiteralNotEnoughBits) { + if (RunOnlyOnce()) return; + CreateReader(4); // 32 bits. + EXPECT_GE(raw_bit_reader_->ReadLiteral(16), 0); + EXPECT_EQ(raw_bit_reader_->ReadLiteral(32), -1); +} + +TEST_P(RawBitReaderTest, ReadLiteralMaxNumBits) { + if (RunOnlyOnce()) return; + CreateReader(4); // 32 bits. 
+ EXPECT_NE(raw_bit_reader_->ReadLiteral(32), -1); +} + +TEST_P(RawBitReaderTest, ReadInverseSignedLiteral) { + if (RunOnlyOnce()) return; + // This is the only usage for this function in the decoding process. So + // testing just that case. + const int size_bits = 6; + data_.clear(); + // Negative value followed by a positive value. + data_.push_back(0xd2); + data_.push_back(0xa4); + raw_bit_reader_.reset(new (std::nothrow) + RawBitReader(data_.data(), data_.size())); + int value; + EXPECT_TRUE(raw_bit_reader_->ReadInverseSignedLiteral(size_bits, &value)); + EXPECT_EQ(value, -23); + EXPECT_TRUE(raw_bit_reader_->ReadInverseSignedLiteral(size_bits, &value)); + EXPECT_EQ(value, 41); + // We have only two bits left. Trying to read an inverse signed literal of 2 + // bits actually needs 3 bits. So this should fail. + EXPECT_FALSE(raw_bit_reader_->ReadInverseSignedLiteral(2, &value)); +} + +TEST_P(RawBitReaderTest, ZeroSize) { + if (RunOnlyOnce()) return; + // Valid data, zero size. + data_.clear(); + data_.push_back(0xf0); + raw_bit_reader_.reset(new (std::nothrow) RawBitReader(data_.data(), 0)); + EXPECT_EQ(raw_bit_reader_->ReadBit(), -1); + EXPECT_EQ(raw_bit_reader_->ReadLiteral(2), -1); + // NULL data, zero size. + raw_bit_reader_.reset(new (std::nothrow) RawBitReader(nullptr, 0)); + EXPECT_EQ(raw_bit_reader_->ReadBit(), -1); + EXPECT_EQ(raw_bit_reader_->ReadLiteral(2), -1); +} + +TEST_P(RawBitReaderTest, AlignToNextByte) { + if (RunOnlyOnce()) return; + CreateReader({0x00, 0x00, 0x00, 0x0f}); + EXPECT_EQ(raw_bit_reader_->bit_offset(), 0); + EXPECT_EQ(raw_bit_reader_->byte_offset(), 0); + EXPECT_TRUE(raw_bit_reader_->AlignToNextByte()); + EXPECT_EQ(raw_bit_reader_->bit_offset(), 0); + EXPECT_EQ(raw_bit_reader_->byte_offset(), 0); + EXPECT_NE(raw_bit_reader_->ReadBit(), -1); + EXPECT_EQ(raw_bit_reader_->bit_offset(), 1); + EXPECT_EQ(raw_bit_reader_->byte_offset(), 1); + EXPECT_TRUE(raw_bit_reader_->AlignToNextByte()); + EXPECT_EQ(raw_bit_reader_->bit_offset(), 8); + EXPECT_EQ(raw_bit_reader_->byte_offset(), 1); + EXPECT_NE(raw_bit_reader_->ReadLiteral(16), -1); + EXPECT_EQ(raw_bit_reader_->bit_offset(), 24); + EXPECT_EQ(raw_bit_reader_->byte_offset(), 3); + EXPECT_TRUE(raw_bit_reader_->AlignToNextByte()); + EXPECT_EQ(raw_bit_reader_->bit_offset(), 24); + EXPECT_EQ(raw_bit_reader_->byte_offset(), 3); + EXPECT_NE(raw_bit_reader_->ReadBit(), -1); + EXPECT_EQ(raw_bit_reader_->bit_offset(), 25); + EXPECT_EQ(raw_bit_reader_->byte_offset(), 4); + // Some bits are non-zero. + EXPECT_FALSE(raw_bit_reader_->AlignToNextByte()); +} + +TEST_P(RawBitReaderTest, VerifyAndSkipTrailingBits) { + if (RunOnlyOnce()) return; + std::vector data; + + // 1 byte trailing byte. + data.push_back(0x80); + CreateReader(data); + EXPECT_EQ(raw_bit_reader_->bit_offset(), 0); + EXPECT_TRUE(raw_bit_reader_->VerifyAndSkipTrailingBits(8)); + EXPECT_EQ(raw_bit_reader_->bit_offset(), 8); + + // 2 byte trailing byte beginning at a byte-aligned offset. + data.clear(); + data.push_back(0xf8); + data.push_back(0x80); + CreateReader(data); + EXPECT_NE(raw_bit_reader_->ReadLiteral(8), -1); + EXPECT_EQ(raw_bit_reader_->bit_offset(), 8); + EXPECT_TRUE(raw_bit_reader_->VerifyAndSkipTrailingBits(8)); + EXPECT_EQ(raw_bit_reader_->bit_offset(), 16); + + // 2 byte trailing byte beginning at a non-byte-aligned offset. 
+ data.clear(); + data.push_back(0xf8); + data.push_back(0x00); + CreateReader(data); + EXPECT_NE(raw_bit_reader_->ReadLiteral(4), -1); + EXPECT_EQ(raw_bit_reader_->bit_offset(), 4); + EXPECT_TRUE(raw_bit_reader_->VerifyAndSkipTrailingBits(4)); + EXPECT_EQ(raw_bit_reader_->bit_offset(), 8); + + // Invalid trailing byte at a byte-aligned offset. + data.clear(); + data.push_back(0xf7); + data.push_back(0x70); + CreateReader(data); + EXPECT_NE(raw_bit_reader_->ReadLiteral(8), -1); + EXPECT_EQ(raw_bit_reader_->bit_offset(), 8); + EXPECT_FALSE(raw_bit_reader_->VerifyAndSkipTrailingBits(8)); + + // Invalid trailing byte at a non-byte-aligned offset. + CreateReader(data); + EXPECT_NE(raw_bit_reader_->ReadLiteral(4), -1); + EXPECT_EQ(raw_bit_reader_->bit_offset(), 4); + EXPECT_FALSE(raw_bit_reader_->VerifyAndSkipTrailingBits(12)); + + // No more data available. + CreateReader(data); + EXPECT_NE(raw_bit_reader_->ReadLiteral(16), -1); + EXPECT_EQ(raw_bit_reader_->bit_offset(), 16); + EXPECT_TRUE(raw_bit_reader_->Finished()); + EXPECT_FALSE(raw_bit_reader_->VerifyAndSkipTrailingBits(8)); +} + +TEST_P(RawBitReaderTest, ReadLittleEndian) { + if (RunOnlyOnce()) return; + std::vector data; + size_t actual; + + // Invalid input. + data.push_back(0x00); // dummy. + CreateReader(data); + EXPECT_FALSE(raw_bit_reader_->ReadLittleEndian(1, nullptr)); + + // One byte value. + data.clear(); + data.push_back(0x01); + CreateReader(data); + ASSERT_TRUE(raw_bit_reader_->ReadLittleEndian(1, &actual)); + EXPECT_EQ(actual, 1); + EXPECT_EQ(raw_bit_reader_->bit_offset(), 8); + EXPECT_TRUE(raw_bit_reader_->Finished()); + + // One byte value with leading bytes. + data.clear(); + data.push_back(0x01); + data.push_back(0x00); + data.push_back(0x00); + data.push_back(0x00); + CreateReader(data); + ASSERT_TRUE(raw_bit_reader_->ReadLittleEndian(4, &actual)); + EXPECT_EQ(actual, 1); + EXPECT_EQ(raw_bit_reader_->bit_offset(), 32); + EXPECT_TRUE(raw_bit_reader_->Finished()); + + // Two byte value. + data.clear(); + data.push_back(0xD9); + data.push_back(0x01); + CreateReader(data); + ASSERT_TRUE(raw_bit_reader_->ReadLittleEndian(2, &actual)); + EXPECT_EQ(actual, 473); + EXPECT_EQ(raw_bit_reader_->bit_offset(), 16); + EXPECT_TRUE(raw_bit_reader_->Finished()); + + // Two byte value with leading bytes. + data.clear(); + data.push_back(0xD9); + data.push_back(0x01); + data.push_back(0x00); + data.push_back(0x00); + CreateReader(data); + ASSERT_TRUE(raw_bit_reader_->ReadLittleEndian(4, &actual)); + EXPECT_EQ(actual, 473); + EXPECT_EQ(raw_bit_reader_->bit_offset(), 32); + EXPECT_TRUE(raw_bit_reader_->Finished()); + + // Not enough bytes. + data.clear(); + data.push_back(0x01); + CreateReader(data); + EXPECT_FALSE(raw_bit_reader_->ReadLittleEndian(2, &actual)); +} + +TEST_P(RawBitReaderTest, ReadUnsignedLeb128) { + if (RunOnlyOnce()) return; + std::vector data; + size_t actual; + + // Invalid input. + data.push_back(0x00); // dummy. + CreateReader(data); + EXPECT_FALSE(raw_bit_reader_->ReadUnsignedLeb128(nullptr)); + + // One byte value. + data.clear(); + data.push_back(0x01); + CreateReader(data); + ASSERT_TRUE(raw_bit_reader_->ReadUnsignedLeb128(&actual)); + EXPECT_EQ(actual, 1); + EXPECT_EQ(raw_bit_reader_->bit_offset(), 8); + EXPECT_TRUE(raw_bit_reader_->Finished()); + + // One byte value with trailing bytes. 
+ data.clear(); + data.push_back(0x81); + data.push_back(0x80); + data.push_back(0x80); + data.push_back(0x00); + CreateReader(data); + ASSERT_TRUE(raw_bit_reader_->ReadUnsignedLeb128(&actual)); + EXPECT_EQ(actual, 1); + EXPECT_EQ(raw_bit_reader_->bit_offset(), 32); + EXPECT_TRUE(raw_bit_reader_->Finished()); + + // Two byte value. + data.clear(); + data.push_back(0xD9); + data.push_back(0x01); + CreateReader(data); + ASSERT_TRUE(raw_bit_reader_->ReadUnsignedLeb128(&actual)); + EXPECT_EQ(actual, 217); + EXPECT_EQ(raw_bit_reader_->bit_offset(), 16); + EXPECT_TRUE(raw_bit_reader_->Finished()); + + // Two byte value with trailing bytes. + data.clear(); + data.push_back(0xD9); + data.push_back(0x81); + data.push_back(0x80); + data.push_back(0x80); + data.push_back(0x00); + CreateReader(data); + ASSERT_TRUE(raw_bit_reader_->ReadUnsignedLeb128(&actual)); + EXPECT_EQ(actual, 217); + EXPECT_EQ(raw_bit_reader_->bit_offset(), 40); + EXPECT_TRUE(raw_bit_reader_->Finished()); + + // Value > 32 bits. + data.clear(); + for (int i = 0; i < 5; ++i) data.push_back(0xD9); + data.push_back(0x00); + CreateReader(data); + EXPECT_FALSE(raw_bit_reader_->ReadUnsignedLeb128(&actual)); + + // Not enough bytes (truncated leb128 value). + data.clear(); + data.push_back(0x81); + data.push_back(0x81); + data.push_back(0x81); + CreateReader(data); + EXPECT_FALSE(raw_bit_reader_->ReadUnsignedLeb128(&actual)); + + // Exceeds kMaximumLeb128Size. + data.clear(); + for (int i = 0; i < 10; ++i) data.push_back(0x80); + CreateReader(data); + EXPECT_FALSE(raw_bit_reader_->ReadUnsignedLeb128(&actual)); +} + +TEST_P(RawBitReaderTest, ReadUvlc) { + if (RunOnlyOnce()) return; + std::vector data; + uint32_t actual; + + // Invalid input. + data.push_back(0x00); // dummy. + CreateReader(data); + EXPECT_FALSE(raw_bit_reader_->ReadUvlc(nullptr)); + + // Zero bit value. + data.clear(); + data.push_back(0x80); + CreateReader(data); + ASSERT_TRUE(raw_bit_reader_->ReadUvlc(&actual)); + EXPECT_EQ(actual, 0); + EXPECT_EQ(raw_bit_reader_->bit_offset(), 1); + + // One bit value. + data.clear(); + data.push_back(0x60); // 011... + CreateReader(data); + ASSERT_TRUE(raw_bit_reader_->ReadUvlc(&actual)); + EXPECT_EQ(actual, 2); + EXPECT_EQ(raw_bit_reader_->bit_offset(), 3); + + // Two bit value. + data.clear(); + data.push_back(0x38); // 00111... + CreateReader(data); + ASSERT_TRUE(raw_bit_reader_->ReadUvlc(&actual)); + EXPECT_EQ(actual, 6); + EXPECT_EQ(raw_bit_reader_->bit_offset(), 5); + + // 31 bit value. + data.clear(); + // (1 << 32) - 2 (= UINT32_MAX - 1) is the largest value that can be encoded + // as uvlc(). + data.push_back(0x00); + data.push_back(0x00); + data.push_back(0x00); + data.push_back(0x01); + data.push_back(0xFF); + data.push_back(0xFF); + data.push_back(0xFF); + data.push_back(0xFE); + CreateReader(data); + ASSERT_TRUE(raw_bit_reader_->ReadUvlc(&actual)); + EXPECT_EQ(actual, UINT32_MAX - 1); + EXPECT_EQ(raw_bit_reader_->bit_offset(), 63); + + // Not enough bits (truncated uvlc value). + data.clear(); + data.push_back(0x07); + CreateReader(data); + EXPECT_FALSE(raw_bit_reader_->ReadUvlc(&actual)); + + // 32 bits. + data.clear(); + data.push_back(0x00); + data.push_back(0x00); + data.push_back(0x00); + data.push_back(0x00); + data.push_back(0xFF); + CreateReader(data); + EXPECT_FALSE(raw_bit_reader_->ReadUvlc(&actual)); + + // Exceeds 32 bits. 
+ data.clear(); + data.push_back(0x00); + data.push_back(0x00); + data.push_back(0x00); + data.push_back(0x00); + data.push_back(0x0F); + CreateReader(data); + EXPECT_FALSE(raw_bit_reader_->ReadUvlc(&actual)); +} + +TEST_P(RawBitReaderTest, DecodeSignedSubexpWithReference) { + if (RunOnlyOnce()) return; + std::vector data; + int actual; + + data.push_back(0xa0); // v = 5; + CreateReader(data); + EXPECT_TRUE(raw_bit_reader_->DecodeSignedSubexpWithReference( + 10, 20, 15, kGlobalMotionReadControl, &actual)); + EXPECT_EQ(actual, 12); + + data.clear(); + data.push_back(0xd0); // v = 6; extra_bit = 1; + CreateReader(data); + EXPECT_TRUE(raw_bit_reader_->DecodeSignedSubexpWithReference( + 10, 20, 15, kGlobalMotionReadControl, &actual)); + EXPECT_EQ(actual, 11); + + data.clear(); + data.push_back(0xc8); // subexp_more_bits = 1; v = 9; + CreateReader(data); + EXPECT_TRUE(raw_bit_reader_->DecodeSignedSubexpWithReference( + 10, 40, 15, kGlobalMotionReadControl, &actual)); + EXPECT_EQ(actual, 27); + + data.clear(); + data.push_back(0x60); // subexp_more_bits = 0; subexp_bits = 6. + CreateReader(data); + EXPECT_TRUE(raw_bit_reader_->DecodeSignedSubexpWithReference( + 10, 40, 15, kGlobalMotionReadControl, &actual)); + EXPECT_EQ(actual, 18); + + data.clear(); + data.push_back(0x60); + CreateReader(data); + // Control is greater than 32, which makes b >= 32 in DecodeSubexp() and + // should return false. + EXPECT_FALSE(raw_bit_reader_->DecodeSignedSubexpWithReference(10, 40, 15, 35, + &actual)); +} + +TEST_P(RawBitReaderTest, DecodeUniform) { + if (RunOnlyOnce()) return; + // Test the example from the AV1 spec, Section 4.10.7. ns(n). + // n = 5 + // Value ns(n) encoding + // ------------------------------- + // 0 00 + // 1 01 + // 2 10 + // 3 110 + // 4 111 + // + // The five encoded values are concatenated into two bytes. + std::vector data = {0x1b, 0x70}; + CreateReader(data); + int actual; + for (int i = 0; i < 5; ++i) { + EXPECT_TRUE(raw_bit_reader_->DecodeUniform(5, &actual)); + EXPECT_EQ(actual, i); + } + + // If n is a power of 2, ns(n) is simply the log2(n)-bit representation of + // the unsigned number. + // Test n = 16. + // The 16 encoded values are concatenated into 8 bytes. + data = {0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef}; + CreateReader(data); + for (int i = 0; i < 16; ++i) { + EXPECT_TRUE(raw_bit_reader_->DecodeUniform(16, &actual)); + EXPECT_EQ(actual, i); + } +} + +TEST_P(RawBitReaderTest, SkipBytes) { + if (RunOnlyOnce()) return; + std::vector data = {0x00, 0x00, 0x00, 0x00, 0x00}; + CreateReader(data); + EXPECT_EQ(raw_bit_reader_->bit_offset(), 0); + EXPECT_TRUE(raw_bit_reader_->SkipBytes(1)); + EXPECT_EQ(raw_bit_reader_->bit_offset(), 8); + EXPECT_GE(raw_bit_reader_->ReadBit(), 0); + EXPECT_EQ(raw_bit_reader_->bit_offset(), 9); + EXPECT_FALSE(raw_bit_reader_->SkipBytes(1)); // Not at a byte boundary. + EXPECT_TRUE(raw_bit_reader_->AlignToNextByte()); + EXPECT_EQ(raw_bit_reader_->bit_offset(), 16); + EXPECT_FALSE(raw_bit_reader_->SkipBytes(10)); // Not enough bytes. 
+ EXPECT_EQ(raw_bit_reader_->bit_offset(), 16); + EXPECT_TRUE(raw_bit_reader_->SkipBytes(3)); + EXPECT_TRUE(raw_bit_reader_->Finished()); + EXPECT_EQ(raw_bit_reader_->ReadBit(), -1); +} + +TEST_P(RawBitReaderTest, SkipBits) { + if (RunOnlyOnce()) return; + std::vector data = {0x00, 0x00, 0x00, 0x00, 0x00}; + CreateReader(data); + EXPECT_EQ(raw_bit_reader_->bit_offset(), 0); + EXPECT_TRUE(raw_bit_reader_->SkipBits(8)); + EXPECT_EQ(raw_bit_reader_->bit_offset(), 8); + EXPECT_GE(raw_bit_reader_->ReadBit(), 0); + EXPECT_EQ(raw_bit_reader_->bit_offset(), 9); + EXPECT_TRUE(raw_bit_reader_->SkipBits(10)); // Not at a byte boundary. + EXPECT_EQ(raw_bit_reader_->bit_offset(), 19); + EXPECT_FALSE(raw_bit_reader_->SkipBits(80)); // Not enough bytes. + EXPECT_EQ(raw_bit_reader_->bit_offset(), 19); + EXPECT_TRUE(raw_bit_reader_->SkipBits(21)); + EXPECT_TRUE(raw_bit_reader_->Finished()); + EXPECT_EQ(raw_bit_reader_->ReadBit(), -1); +} + +INSTANTIATE_TEST_SUITE_P( + RawBitReaderTestInstance, RawBitReaderTest, + testing::Combine(testing::Range(1, 5), // literal size. + testing::Values(100))); // number of bits/literals. + +} // namespace +} // namespace libgav1 diff --git a/src/utils/reference_info.h b/src/utils/reference_info.h new file mode 100644 index 0000000..73c32d9 --- /dev/null +++ b/src/utils/reference_info.h @@ -0,0 +1,93 @@ +/* + * Copyright 2020 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_UTILS_REFERENCE_INFO_H_ +#define LIBGAV1_SRC_UTILS_REFERENCE_INFO_H_ + +#include +#include + +#include "src/utils/array_2d.h" +#include "src/utils/compiler_attributes.h" +#include "src/utils/constants.h" +#include "src/utils/types.h" + +namespace libgav1 { + +// This struct collects some members related to reference frames in one place to +// make it easier to pass them as parameters to some dsp functions. +struct ReferenceInfo { + // Initialize |motion_field_reference_frame| so that + // Tile::StoreMotionFieldMvsIntoCurrentFrame() can skip some updates when + // the updates are the same as the initialized value. + // Set to kReferenceFrameIntra instead of kReferenceFrameNone to simplify + // branch conditions in motion field projection. + // The following memory initialization of contiguous memory is very fast. It + // is not recommended to make the initialization multi-threaded, unless the + // memory which needs to be initialized in each thread is still contiguous. + LIBGAV1_MUST_USE_RESULT bool Reset(int rows, int columns) { + return motion_field_reference_frame.Reset(rows, columns, + /*zero_initialize=*/true) && + motion_field_mv.Reset( + rows, columns, +#if LIBGAV1_MSAN + // It is set in Tile::StoreMotionFieldMvsIntoCurrentFrame() only + // for qualified blocks. In MotionFieldProjectionKernel() dsp + // optimizations, it is read no matter it was set or not. + /*zero_initialize=*/true +#else + /*zero_initialize=*/false +#endif + ); + } + + // All members are used by inter frames only. + // For intra frames, they are not initialized. 
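+  //
+  // Illustrative call pattern (an editorial sketch, not original code). Per
+  // the comments below, the motion field is sized in 8x8 units, i.e. half of
+  // the 4x4 dimensions in each direction:
+  //   ReferenceInfo info;
+  //   if (!info.Reset(rows4x4 / 2, columns4x4 / 2)) return false;  // OOM.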
+ + std::array order_hint; + + // An example when |relative_distance_from| does not equal + // -|relative_distance_to|: + // |relative_distance_from| = GetRelativeDistance(7, 71, 25) = -64 + // -|relative_distance_to| = -GetRelativeDistance(71, 7, 25) = 64 + // This is why we need both |relative_distance_from| and + // |relative_distance_to|. + // |relative_distance_from|: Relative distances from reference frames to this + // frame. + std::array relative_distance_from; + // |relative_distance_to|: Relative distances to reference frames. + std::array relative_distance_to; + + // Skip motion field projection of specific types of frames if their + // |relative_distance_to| is negative or too large. + std::array skip_references; + // Lookup table to get motion field projection division multiplier of specific + // types of frames. Derived from kProjectionMvDivisionLookup. + std::array projection_divisions; + + // The current frame's |motion_field_reference_frame| and |motion_field_mv_| + // are guaranteed to be allocated only when refresh_frame_flags is not 0. + // Array of size (rows4x4 / 2) x (columns4x4 / 2). Entry at i, j corresponds + // to MfRefFrames[i * 2 + 1][j * 2 + 1] in the spec. + Array2D motion_field_reference_frame; + // Array of size (rows4x4 / 2) x (columns4x4 / 2). Entry at i, j corresponds + // to MfMvs[i * 2 + 1][j * 2 + 1] in the spec. + Array2D motion_field_mv; +}; + +} // namespace libgav1 + +#endif // LIBGAV1_SRC_UTILS_REFERENCE_INFO_H_ diff --git a/src/utils/segmentation.cc b/src/utils/segmentation.cc new file mode 100644 index 0000000..75fa776 --- /dev/null +++ b/src/utils/segmentation.cc @@ -0,0 +1,31 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/utils/segmentation.h" + +namespace libgav1 { + +const int8_t kSegmentationFeatureBits[kSegmentFeatureMax] = {8, 6, 6, 6, + 6, 3, 0, 0}; +const int kSegmentationFeatureMaxValues[kSegmentFeatureMax] = { + 255, + kMaxLoopFilterValue, + kMaxLoopFilterValue, + kMaxLoopFilterValue, + kMaxLoopFilterValue, + 7, + 0, + 0}; + +} // namespace libgav1 diff --git a/src/utils/segmentation.h b/src/utils/segmentation.h new file mode 100644 index 0000000..67ff74c --- /dev/null +++ b/src/utils/segmentation.h @@ -0,0 +1,32 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LIBGAV1_SRC_UTILS_SEGMENTATION_H_ +#define LIBGAV1_SRC_UTILS_SEGMENTATION_H_ + +#include + +#include "src/utils/constants.h" +#include "src/utils/types.h" + +namespace libgav1 { + +extern const int8_t kSegmentationFeatureBits[kSegmentFeatureMax]; +extern const int kSegmentationFeatureMaxValues[kSegmentFeatureMax]; + +} // namespace libgav1 + +#endif // LIBGAV1_SRC_UTILS_SEGMENTATION_H_ diff --git a/src/utils/segmentation_map.cc b/src/utils/segmentation_map.cc new file mode 100644 index 0000000..4284ca2 --- /dev/null +++ b/src/utils/segmentation_map.cc @@ -0,0 +1,49 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/utils/segmentation_map.h" + +#include +#include +#include + +namespace libgav1 { + +bool SegmentationMap::Allocate(int32_t rows4x4, int32_t columns4x4) { + rows4x4_ = rows4x4; + columns4x4_ = columns4x4; + segment_id_buffer_.reset(new (std::nothrow) int8_t[rows4x4_ * columns4x4_]); + if (segment_id_buffer_ == nullptr) return false; + segment_id_.Reset(rows4x4_, columns4x4_, segment_id_buffer_.get()); + return true; +} + +void SegmentationMap::Clear() { + memset(segment_id_buffer_.get(), 0, rows4x4_ * columns4x4_); +} + +void SegmentationMap::CopyFrom(const SegmentationMap& from) { + assert(rows4x4_ == from.rows4x4_ && columns4x4_ == from.columns4x4_); + memcpy(segment_id_buffer_.get(), from.segment_id_buffer_.get(), + rows4x4_ * columns4x4_); +} + +void SegmentationMap::FillBlock(int row4x4, int column4x4, int block_width4x4, + int block_height4x4, int8_t segment_id) { + for (int y = 0; y < block_height4x4; ++y) { + memset(&segment_id_[row4x4 + y][column4x4], segment_id, block_width4x4); + } +} + +} // namespace libgav1 diff --git a/src/utils/segmentation_map.h b/src/utils/segmentation_map.h new file mode 100644 index 0000000..499be24 --- /dev/null +++ b/src/utils/segmentation_map.h @@ -0,0 +1,71 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_UTILS_SEGMENTATION_MAP_H_ +#define LIBGAV1_SRC_UTILS_SEGMENTATION_MAP_H_ + +#include +#include + +#include "src/utils/array_2d.h" +#include "src/utils/compiler_attributes.h" + +namespace libgav1 { + +// SegmentationMap stores the segment id associated with each 4x4 block in the +// frame. 
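+//
+// Illustrative usage (an editorial sketch, not original library text):
+//   SegmentationMap map;
+//   if (!map.Allocate(rows4x4, columns4x4)) return false;  // Out of memory.
+//   map.Clear();  // All segment ids are set to 0.
+//   map.FillBlock(row4x4, column4x4, block_width4x4, block_height4x4, 3);
+//   const int8_t id = map.segment_id(row4x4, column4x4);  // id == 3.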
+class SegmentationMap { + public: + SegmentationMap() = default; + + // Not copyable or movable + SegmentationMap(const SegmentationMap&) = delete; + SegmentationMap& operator=(const SegmentationMap&) = delete; + + // Allocates an internal buffer of the given dimensions to hold the + // segmentation map. The memory in the buffer is not initialized. Returns + // true on success, false on failure (for example, out of memory). + LIBGAV1_MUST_USE_RESULT bool Allocate(int32_t rows4x4, int32_t columns4x4); + + int8_t segment_id(int row4x4, int column4x4) const { + return segment_id_[row4x4][column4x4]; + } + + // Sets every element in the segmentation map to 0. + void Clear(); + + // Copies the entire segmentation map. |from| must be of the same dimensions. + void CopyFrom(const SegmentationMap& from); + + // Sets the region of segmentation map covered by the block to |segment_id|. + // The block is located at |row4x4|, |column4x4| and has dimensions + // |block_width4x4| and |block_height4x4|. + void FillBlock(int row4x4, int column4x4, int block_width4x4, + int block_height4x4, int8_t segment_id); + + private: + int32_t rows4x4_ = 0; + int32_t columns4x4_ = 0; + + // segment_id_ is a rows4x4_ by columns4x4_ 2D array. The underlying data + // buffer is dynamically allocated and owned by segment_id_buffer_. + std::unique_ptr segment_id_buffer_; + Array2DView segment_id_; +}; + +} // namespace libgav1 + +#endif // LIBGAV1_SRC_UTILS_SEGMENTATION_MAP_H_ diff --git a/src/utils/segmentation_map_test.cc b/src/utils/segmentation_map_test.cc new file mode 100644 index 0000000..4d8a7c9 --- /dev/null +++ b/src/utils/segmentation_map_test.cc @@ -0,0 +1,120 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/utils/segmentation_map.h" + +#include + +#include "gtest/gtest.h" + +namespace libgav1 { +namespace { + +TEST(SegmentationMapTest, Clear) { + constexpr int32_t kRows4x4 = 60; + constexpr int32_t kColumns4x4 = 80; + SegmentationMap segmentation_map; + ASSERT_TRUE(segmentation_map.Allocate(kRows4x4, kColumns4x4)); + + segmentation_map.Clear(); + for (int row4x4 = 0; row4x4 < kRows4x4; ++row4x4) { + for (int column4x4 = 0; column4x4 < kColumns4x4; ++column4x4) { + EXPECT_EQ(segmentation_map.segment_id(row4x4, column4x4), 0); + } + } +} + +TEST(SegmentationMapTest, FillBlock) { + constexpr int32_t kRows4x4 = 60; + constexpr int32_t kColumns4x4 = 80; + SegmentationMap segmentation_map; + ASSERT_TRUE(segmentation_map.Allocate(kRows4x4, kColumns4x4)); + + // Fill the whole image with 2. + segmentation_map.FillBlock(0, 0, kColumns4x4, kRows4x4, 2); + // Fill a block with 1. 
+ constexpr int kBlockWidth4x4 = 10; + constexpr int kBlockHeight4x4 = 20; + segmentation_map.FillBlock(4, 6, kBlockWidth4x4, kBlockHeight4x4, 1); + for (int row4x4 = 0; row4x4 < kRows4x4; ++row4x4) { + for (int column4x4 = 0; column4x4 < kColumns4x4; ++column4x4) { + if (4 <= row4x4 && row4x4 < 4 + kBlockHeight4x4 && 6 <= column4x4 && + column4x4 < 6 + kBlockWidth4x4) { + // Inside the block. + EXPECT_EQ(segmentation_map.segment_id(row4x4, column4x4), 1); + } else { + // Outside the block. + EXPECT_EQ(segmentation_map.segment_id(row4x4, column4x4), 2); + } + } + } +} + +TEST(SegmentationMapTest, CopyFrom) { + constexpr int32_t kRows4x4 = 60; + constexpr int32_t kColumns4x4 = 80; + SegmentationMap segmentation_map; + ASSERT_TRUE(segmentation_map.Allocate(kRows4x4, kColumns4x4)); + + // Split the segmentation map into four blocks of equal size. + constexpr int kBlockWidth4x4 = 40; + constexpr int kBlockHeight4x4 = 30; + segmentation_map.FillBlock(0, 0, kBlockWidth4x4, kBlockHeight4x4, 1); + segmentation_map.FillBlock(0, kBlockWidth4x4, kBlockWidth4x4, kBlockHeight4x4, + 2); + segmentation_map.FillBlock(kBlockHeight4x4, 0, kBlockWidth4x4, + kBlockHeight4x4, 3); + segmentation_map.FillBlock(kBlockHeight4x4, kBlockWidth4x4, kBlockWidth4x4, + kBlockHeight4x4, 4); + + SegmentationMap segmentation_map2; + ASSERT_TRUE(segmentation_map2.Allocate(kRows4x4, kColumns4x4)); + segmentation_map2.CopyFrom(segmentation_map); + + for (int row4x4 = 0; row4x4 < kBlockHeight4x4; ++row4x4) { + for (int column4x4 = 0; column4x4 < kBlockWidth4x4; ++column4x4) { + EXPECT_EQ(segmentation_map.segment_id(row4x4, column4x4), 1); + EXPECT_EQ(segmentation_map2.segment_id(row4x4, column4x4), 1); + } + } + for (int row4x4 = 0; row4x4 < kBlockHeight4x4; ++row4x4) { + for (int column4x4 = 0; column4x4 < kBlockWidth4x4; ++column4x4) { + EXPECT_EQ(segmentation_map.segment_id(row4x4, kBlockWidth4x4 + column4x4), + 2); + EXPECT_EQ( + segmentation_map2.segment_id(row4x4, kBlockWidth4x4 + column4x4), 2); + } + } + for (int row4x4 = 0; row4x4 < kBlockHeight4x4; ++row4x4) { + for (int column4x4 = 0; column4x4 < kBlockWidth4x4; ++column4x4) { + EXPECT_EQ( + segmentation_map.segment_id(kBlockHeight4x4 + row4x4, column4x4), 3); + EXPECT_EQ( + segmentation_map2.segment_id(kBlockHeight4x4 + row4x4, column4x4), 3); + } + } + for (int row4x4 = 0; row4x4 < kBlockHeight4x4; ++row4x4) { + for (int column4x4 = 0; column4x4 < kBlockWidth4x4; ++column4x4) { + EXPECT_EQ(segmentation_map.segment_id(kBlockHeight4x4 + row4x4, + kBlockWidth4x4 + column4x4), + 4); + EXPECT_EQ(segmentation_map2.segment_id(kBlockHeight4x4 + row4x4, + kBlockWidth4x4 + column4x4), + 4); + } + } +} + +} // namespace +} // namespace libgav1 diff --git a/src/utils/segmentation_test.cc b/src/utils/segmentation_test.cc new file mode 100644 index 0000000..e985b2d --- /dev/null +++ b/src/utils/segmentation_test.cc @@ -0,0 +1,40 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
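+
+// Editorial worked example (not original libgav1 text) of the consistency
+// checked below: assuming kMaxLoopFilterValue is 63, a loop filter feature
+// needs GetUnsignedBits(63) = FloorLog2(63) + 1 = 5 + 1 = 6 bits, which is
+// exactly the value recorded for it in kSegmentationFeatureBits.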
+ +#include "src/utils/segmentation.h" + +#include + +#include "gtest/gtest.h" +#include "src/utils/common.h" +#include "src/utils/types.h" + +namespace libgav1 { +namespace { + +int GetUnsignedBits(const unsigned int num_values) { + return (num_values > 0) ? FloorLog2(num_values) + 1 : 0; +} + +// Check that kSegmentationFeatureBits and kSegmentationFeatureMaxValues are +// consistent with each other. +TEST(SegmentationTest, FeatureBitsAndMaxValuesConsistency) { + for (int feature = 0; feature < kSegmentFeatureMax; feature++) { + EXPECT_EQ(kSegmentationFeatureBits[feature], + GetUnsignedBits(kSegmentationFeatureMaxValues[feature])); + } +} + +} // namespace +} // namespace libgav1 diff --git a/src/utils/stack.h b/src/utils/stack.h new file mode 100644 index 0000000..39133b9 --- /dev/null +++ b/src/utils/stack.h @@ -0,0 +1,59 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_UTILS_STACK_H_ +#define LIBGAV1_SRC_UTILS_STACK_H_ + +#include +#include + +namespace libgav1 { + +// A LIFO stack of a fixed capacity. The elements are moved using std::move, so +// the element type T has to be movable. +// +// WARNING: No error checking is performed. +template +class Stack { + public: + // Pushes the element |value| to the top of the stack. It is an error to call + // Push() when the stack is full. + void Push(T value) { + ++top_; + assert(top_ < capacity); + elements_[top_] = std::move(value); + } + + // Returns the element at the top of the stack and removes it from the stack. + // It is an error to call Pop() when the stack is empty. + T Pop() { + assert(top_ >= 0); + return std::move(elements_[top_--]); + } + + // Returns true if the stack is empty. + bool Empty() const { return top_ < 0; } + + private: + static_assert(capacity > 0, ""); + T elements_[capacity]; + // The array index of the top of the stack. The stack is empty if top_ is -1. + int top_ = -1; +}; + +} // namespace libgav1 + +#endif // LIBGAV1_SRC_UTILS_STACK_H_ diff --git a/src/utils/stack_test.cc b/src/utils/stack_test.cc new file mode 100644 index 0000000..4de2ab6 --- /dev/null +++ b/src/utils/stack_test.cc @@ -0,0 +1,74 @@ +// Copyright 2021 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
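+
+// An illustrative usage sketch of the fixed-capacity Stack declared in
+// src/utils/stack.h above (an editorial example, not original library code):
+//   Stack<int, 4> stack;          // Capacity is a compile-time constant.
+//   stack.Push(1);
+//   stack.Push(2);
+//   const int top = stack.Pop();  // top == 2 (LIFO order).
+//   const bool empty = stack.Empty();  // false; one element remains.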
diff --git a/src/utils/stack_test.cc b/src/utils/stack_test.cc
new file mode 100644
index 0000000..4de2ab6
--- /dev/null
+++ b/src/utils/stack_test.cc
@@ -0,0 +1,74 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/stack.h"
+
+#include <cstdint>
+#include <utility>
+
+#include "gtest/gtest.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int kStackSize = 8;
+
+TEST(StackTest, SimpleType) {
+  Stack<int, kStackSize> stack;
+  EXPECT_TRUE(stack.Empty());
+
+  for (int i = 0; i < kStackSize; ++i) {
+    stack.Push(i);
+    EXPECT_FALSE(stack.Empty());
+  }
+
+  for (int i = kStackSize - 1; i >= 0; --i) {
+    EXPECT_EQ(stack.Pop(), i);
+  }
+  EXPECT_TRUE(stack.Empty());
+}
+
+TEST(StackTest, LargeStruct) {
+  struct LargeMoveOnlyStruct {
+    LargeMoveOnlyStruct() = default;
+    // Move only.
+    LargeMoveOnlyStruct(LargeMoveOnlyStruct&& other) = default;
+    LargeMoveOnlyStruct& operator=(LargeMoveOnlyStruct&& other) = default;
+
+    int32_t array1[1000];
+    uint64_t array2[2000];
+  };
+
+  Stack<LargeMoveOnlyStruct, kStackSize> stack;
+  EXPECT_TRUE(stack.Empty());
+
+  LargeMoveOnlyStruct large_move_only_struct[kStackSize];
+  for (int i = 0; i < kStackSize; ++i) {
+    LargeMoveOnlyStruct& l = large_move_only_struct[i];
+    l.array1[0] = i;
+    l.array2[0] = i;
+    stack.Push(std::move(l));
+    EXPECT_FALSE(stack.Empty());
+  }
+
+  for (int i = kStackSize - 1; i >= 0; --i) {
+    LargeMoveOnlyStruct l = stack.Pop();
+    EXPECT_EQ(l.array1[0], i);
+    EXPECT_EQ(l.array2[0], i);
+  }
+  EXPECT_TRUE(stack.Empty());
+}
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/utils/threadpool.cc b/src/utils/threadpool.cc
new file mode 100644
index 0000000..a3099e1
--- /dev/null
+++ b/src/utils/threadpool.cc
@@ -0,0 +1,327 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/threadpool.h"
+
+#if defined(_MSC_VER)
+#include <process.h>
+#include <windows.h>
+#else  // defined(_MSC_VER)
+#include <pthread.h>
+#endif  // defined(_MSC_VER)
+#if defined(__ANDROID__) || defined(__GLIBC__)
+#include <sys/types.h>
+#include <unistd.h>
+#endif
+#include <algorithm>
+#include <cassert>
+#include <cinttypes>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <new>
+
+#if defined(__ANDROID__)
+#include <chrono>  // NOLINT (unapproved c++11 header)
+#endif
+
+// Define the GetTid() function, a wrapper for the gettid() system call in
+// Linux.
+#if defined(__ANDROID__)
+static pid_t GetTid() { return gettid(); }
+#elif defined(__GLIBC__)
+// The glibc wrapper for the gettid() system call was added in glibc 2.30.
+// Emulate it for older versions of glibc.
+#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 30)
+static pid_t GetTid() { return gettid(); }
+#else  // Older than glibc 2.30
+#include <sys/syscall.h>
+
+static pid_t GetTid() { return static_cast<pid_t>(syscall(SYS_gettid)); }
+#endif  // glibc 2.30 or later.
+#endif  // defined(__GLIBC__)
+
+namespace libgav1 {
+
+#if defined(__ANDROID__)
+namespace {
+
+using Clock = std::chrono::steady_clock;
+using Duration = Clock::duration;
+constexpr Duration kBusyWaitDuration =
+    std::chrono::duration_cast<Duration>(std::chrono::duration<double>(2e-3));
+
+}  // namespace
+#endif  // defined(__ANDROID__)
+
+// static
+std::unique_ptr<ThreadPool> ThreadPool::Create(int num_threads) {
+  return Create(/*name_prefix=*/"", num_threads);
+}
+
+// static
+std::unique_ptr<ThreadPool> ThreadPool::Create(const char name_prefix[],
+                                               int num_threads) {
+  if (name_prefix == nullptr || num_threads <= 0) return nullptr;
+  std::unique_ptr<WorkerThread*[]> threads(new (std::nothrow)
+                                               WorkerThread*[num_threads]);
+  if (threads == nullptr) return nullptr;
+  std::unique_ptr<ThreadPool> pool(new (std::nothrow) ThreadPool(
+      name_prefix, std::move(threads), num_threads));
+  if (pool != nullptr && !pool->StartWorkers()) {
+    pool = nullptr;
+  }
+  return pool;
+}
+
+ThreadPool::ThreadPool(const char name_prefix[],
+                       std::unique_ptr<WorkerThread*[]> threads,
+                       int num_threads)
+    : threads_(std::move(threads)), num_threads_(num_threads) {
+  threads_[0] = nullptr;
+  assert(name_prefix != nullptr);
+  const size_t name_prefix_len =
+      std::min(strlen(name_prefix), sizeof(name_prefix_) - 1);
+  memcpy(name_prefix_, name_prefix, name_prefix_len);
+  name_prefix_[name_prefix_len] = '\0';
+}
+
+ThreadPool::~ThreadPool() { Shutdown(); }
+
+void ThreadPool::Schedule(std::function<void()> closure) {
+  LockMutex();
+  if (!queue_.GrowIfNeeded()) {
+    // queue_ is full and we can't grow it. Run |closure| directly.
+    UnlockMutex();
+    closure();
+    return;
+  }
+  queue_.Push(std::move(closure));
+  UnlockMutex();
+  SignalOne();
+}
+
+int ThreadPool::num_threads() const { return num_threads_; }
+
+// A simple implementation that mirrors the non-portable Thread. We may
+// choose to expand this in the future as a portable implementation of
+// Thread, or replace it at such a time as one is implemented.
+class ThreadPool::WorkerThread : public Allocable {
+ public:
+  // Creates and starts a thread that runs pool->WorkerFunction().
+  explicit WorkerThread(ThreadPool* pool);
+
+  // Not copyable or movable.
+  WorkerThread(const WorkerThread&) = delete;
+  WorkerThread& operator=(const WorkerThread&) = delete;
+
+  // REQUIRES: Join() must have been called if Start() was called and
+  // succeeded.
+  ~WorkerThread() = default;
+
+  LIBGAV1_MUST_USE_RESULT bool Start();
+
+  // Joins with the running thread.
+  void Join();
+
+ private:
+#if defined(_MSC_VER)
+  static unsigned int __stdcall ThreadBody(void* arg);
+#else
+  static void* ThreadBody(void* arg);
+#endif
+
+  void SetupName();
+  void Run();
+
+  ThreadPool* pool_;
+#if defined(_MSC_VER)
+  HANDLE handle_;
+#else
+  pthread_t thread_;
+#endif
+};
+
+ThreadPool::WorkerThread::WorkerThread(ThreadPool* pool) : pool_(pool) {}
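A hedged sketch of the calling contract implied by Schedule() above (DecodeTile is a placeholder job, not a function from this patch): since Schedule() degrades to running the closure on the calling thread when the queue cannot grow, callers must not assume the closure executes on a worker thread, and the pool's destructor is the natural join point.

std::unique_ptr<libgav1::ThreadPool> pool =
    libgav1::ThreadPool::Create("gav1", /*num_threads=*/4);
if (pool != nullptr) {
  for (int tile = 0; tile < 8; ++tile) {
    pool->Schedule([tile]() { DecodeTile(tile); });  // May run inline on OOM.
  }
  pool.reset();  // ~ThreadPool() -> Shutdown(): drains the queue, joins all.
}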
+
+#if defined(_MSC_VER)
+
+bool ThreadPool::WorkerThread::Start() {
+  // Since our code calls the C run-time library (CRT), use _beginthreadex
+  // rather than CreateThread. Microsoft documentation says "If a thread
+  // created using CreateThread calls the CRT, the CRT may terminate the
+  // process in low-memory conditions."
+  uintptr_t handle = _beginthreadex(
+      /*security=*/nullptr, /*stack_size=*/0, ThreadBody, this,
+      /*initflag=*/CREATE_SUSPENDED, /*thrdaddr=*/nullptr);
+  if (handle == 0) return false;
+  handle_ = reinterpret_cast<HANDLE>(handle);
+  ResumeThread(handle_);
+  return true;
+}
+
+void ThreadPool::WorkerThread::Join() {
+  WaitForSingleObject(handle_, INFINITE);
+  CloseHandle(handle_);
+}
+
+unsigned int ThreadPool::WorkerThread::ThreadBody(void* arg) {
+  auto* thread = static_cast<WorkerThread*>(arg);
+  thread->Run();
+  return 0;
+}
+
+void ThreadPool::WorkerThread::SetupName() {
+  // Not currently supported on Windows.
+}
+
+#else  // defined(_MSC_VER)
+
+bool ThreadPool::WorkerThread::Start() {
+  return pthread_create(&thread_, nullptr, ThreadBody, this) == 0;
+}
+
+void ThreadPool::WorkerThread::Join() { pthread_join(thread_, nullptr); }
+
+void* ThreadPool::WorkerThread::ThreadBody(void* arg) {
+  auto* thread = static_cast<WorkerThread*>(arg);
+  thread->Run();
+  return nullptr;
+}
+
+void ThreadPool::WorkerThread::SetupName() {
+  if (pool_->name_prefix_[0] != '\0') {
+#if defined(__APPLE__)
+    // Apple's version of pthread_setname_np takes one argument and operates on
+    // the current thread only. Also, pthread_mach_thread_np is Apple-specific.
+    // The maximum size of the |name| buffer was noted in the Chromium source
+    // code and was confirmed by experiments.
+    char name[64];
+    mach_port_t id = pthread_mach_thread_np(pthread_self());
+    int rv = snprintf(name, sizeof(name), "%s/%" PRId64, pool_->name_prefix_,
+                      static_cast<int64_t>(id));
+    assert(rv >= 0);
+    rv = pthread_setname_np(name);
+    assert(rv == 0);
+    static_cast<void>(rv);
+#elif defined(__ANDROID__) || defined(__GLIBC__)
+    // If the |name| buffer is longer than 16 bytes, pthread_setname_np fails
+    // with error 34 (ERANGE) on Android.
+    char name[16];
+    pid_t id = GetTid();
+    int rv = snprintf(name, sizeof(name), "%s/%" PRId64, pool_->name_prefix_,
+                      static_cast<int64_t>(id));
+    assert(rv >= 0);
+    rv = pthread_setname_np(pthread_self(), name);
+    assert(rv == 0);
+    static_cast<void>(rv);
+#endif
+  }
+}
+
+#endif  // defined(_MSC_VER)
+
+void ThreadPool::WorkerThread::Run() {
+  SetupName();
+  pool_->WorkerFunction();
+}
+
+bool ThreadPool::StartWorkers() {
+  if (!queue_.Init()) return false;
+  for (int i = 0; i < num_threads_; ++i) {
+    threads_[i] = new (std::nothrow) WorkerThread(this);
+    if (threads_[i] == nullptr) return false;
+    if (!threads_[i]->Start()) {
+      delete threads_[i];
+      threads_[i] = nullptr;
+      return false;
+    }
+  }
+  return true;
+}
+
+void ThreadPool::WorkerFunction() {
+  LockMutex();
+  while (true) {
+    if (queue_.Empty()) {
+      if (exit_threads_) {
+        break;  // Queue is empty and exit was requested.
+      }
+#if defined(__ANDROID__)
+      // On android, if we go to a conditional wait right away, the CPU governor
+      // kicks in and starts shutting the cores down. So we do a very small busy
+      // wait to see if we get our next job within that period. This
+      // significantly improves the performance of common cases of tile parallel
+      // decoding. If we don't receive a job in the busy wait time, we then go
+      // to an actual conditional wait as usual.
+      UnlockMutex();
+      bool found_job = false;
+      const auto wait_start = Clock::now();
+      while (Clock::now() - wait_start < kBusyWaitDuration) {
+        LockMutex();
+        if (!queue_.Empty()) {
+          found_job = true;
+          break;
+        }
+        UnlockMutex();
+      }
+      // If |found_job| is true, we simply continue since we already hold the
+      // mutex and we know for sure that the |queue_| is not empty.
+      if (found_job) continue;
+      // Since |found_job| was false, the mutex is not being held at this
+      // point.
+      LockMutex();
+      // Ensure that the queue is still empty.
+      if (!queue_.Empty()) continue;
+      if (exit_threads_) {
+        break;  // Queue is empty and exit was requested.
+      }
+#endif  // defined(__ANDROID__)
+      // Queue is still empty, wait for signal or broadcast.
+      Wait();
+    } else {
+      // Take a job from the queue.
+      std::function<void()> job = std::move(queue_.Front());
+      queue_.Pop();
+
+      UnlockMutex();
+      // Note that it is good practice to surround this with a try/catch so
+      // the thread pool doesn't go to hell if the job throws an exception.
+      // This is omitted here because Google3 doesn't like exceptions.
+      std::move(job)();
+      job = nullptr;
+
+      LockMutex();
+    }
+  }
+  UnlockMutex();
+}
+
+void ThreadPool::Shutdown() {
+  // Tell worker threads how to exit.
+  LockMutex();
+  exit_threads_ = true;
+  UnlockMutex();
+  SignalAll();
+
+  // Join all workers. This will block.
+  for (int i = 0; i < num_threads_; ++i) {
+    if (threads_[i] == nullptr) break;
+    threads_[i]->Join();
+    delete threads_[i];
+  }
+}
+
+}  // namespace libgav1
diff --git a/src/utils/threadpool.h b/src/utils/threadpool.h
new file mode 100644
index 0000000..fac875e
--- /dev/null
+++ b/src/utils/threadpool.h
@@ -0,0 +1,167 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_THREADPOOL_H_
+#define LIBGAV1_SRC_UTILS_THREADPOOL_H_
+
+#include <functional>
+#include <memory>
+
+#if defined(__APPLE__)
+#include <TargetConditionals.h>
+#endif
+
+#if !defined(LIBGAV1_THREADPOOL_USE_STD_MUTEX)
+#if defined(__ANDROID__) || (defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE)
+#define LIBGAV1_THREADPOOL_USE_STD_MUTEX 1
+#else
+#define LIBGAV1_THREADPOOL_USE_STD_MUTEX 0
+#endif
+#endif
+
+#if LIBGAV1_THREADPOOL_USE_STD_MUTEX
+#include <condition_variable>  // NOLINT (unapproved c++11 header)
+#include <mutex>               // NOLINT (unapproved c++11 header)
+#else
+// absl::Mutex & absl::CondVar are significantly faster than the pthread
+// variants on platforms other than Android. iOS may deadlock on Shutdown()
+// using absl, see b/142251739.
+#include "absl/base/thread_annotations.h"
+#include "absl/synchronization/mutex.h"
+#endif
+
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/executor.h"
+#include "src/utils/memory.h"
+#include "src/utils/unbounded_queue.h"
+
+namespace libgav1 {
+
+// An implementation of ThreadPool using POSIX threads (pthreads) or Windows
+// threads.
+//
+// - The pool allocates a fixed number of worker threads on instantiation.
+// - The worker threads will pick up work jobs as they arrive.
+// - If all workers are busy, work jobs are queued for later execution.
+//
+// The thread pool is shut down when the pool is destroyed.
+//
+// Example usage of the thread pool:
+//   {
+//     std::unique_ptr<ThreadPool> pool = ThreadPool::Create(4);
+//     for (int i = 0; i < 100; ++i) {  // Dispatch 100 jobs.
+//       pool->Schedule([&my_data]() { MyFunction(&my_data); });
+//     }
+//   }  // ThreadPool gets destroyed only when all jobs are done.
+class ThreadPool : public Executor, public Allocable {
+ public:
+  // Creates the thread pool with the specified number of worker threads.
+  // If num_threads is 1, the closures are run in FIFO order.
+  static std::unique_ptr<ThreadPool> Create(int num_threads);
+
+  // Like the above factory method, but also sets the name prefix for threads.
+  static std::unique_ptr<ThreadPool> Create(const char name_prefix[],
+                                            int num_threads);
+
+  // The destructor will shut down the thread pool and all jobs are executed.
+  // Note that after shutdown, the thread pool does not accept further jobs.
+  ~ThreadPool() override;
+
+  // Adds the specified "closure" to the queue for processing. If worker
+  // threads are available, "closure" will run immediately. Otherwise
+  // "closure" is queued for later execution.
+  //
+  // NOTE: If the internal queue is full and cannot be resized because of an
+  // out-of-memory error, the current thread runs "closure" before returning
+  // from Schedule(). For our use cases, this seems better than the
+  // alternatives:
+  //   1. Return a failure status.
+  //   2. Have the current thread wait until the queue is not full.
+  void Schedule(std::function<void()> closure) override;
+
+  int num_threads() const;
+
+ private:
+  class WorkerThread;
+
+  // Creates the thread pool with the specified number of worker threads.
+  // If num_threads is 1, the closures are run in FIFO order.
+  ThreadPool(const char name_prefix[],
+             std::unique_ptr<WorkerThread*[]> threads, int num_threads);
+
+  // Starts the worker pool.
+  LIBGAV1_MUST_USE_RESULT bool StartWorkers();
+
+  void WorkerFunction();
+
+  // Shuts down the thread pool, i.e. worker threads finish their work and
+  // pick up new jobs until the queue is empty. This call will block until
+  // the shutdown is complete.
+  //
+  // Note: If a worker encounters an empty queue after this call, it will exit.
+  // Other workers might still be running, and if the queue fills up again, the
+  // thread pool will continue to operate with a decreased number of workers.
+  // It is up to the caller to prevent adding new jobs.
+  void Shutdown();
+
+#if LIBGAV1_THREADPOOL_USE_STD_MUTEX
+
+  void LockMutex() { queue_mutex_.lock(); }
+  void UnlockMutex() { queue_mutex_.unlock(); }
+
+  void Wait() {
+    std::unique_lock<std::mutex> queue_lock(queue_mutex_, std::adopt_lock);
+    condition_.wait(queue_lock);
+    queue_lock.release();
+  }
+
+  void SignalOne() { condition_.notify_one(); }
+  void SignalAll() { condition_.notify_all(); }
+
+  std::condition_variable condition_;
+  std::mutex queue_mutex_;
+
+#else  // !LIBGAV1_THREADPOOL_USE_STD_MUTEX
+
+  void LockMutex() ABSL_EXCLUSIVE_LOCK_FUNCTION() { queue_mutex_.Lock(); }
+  void UnlockMutex() ABSL_UNLOCK_FUNCTION() { queue_mutex_.Unlock(); }
+  void Wait() { condition_.Wait(&queue_mutex_); }
+  void SignalOne() { condition_.Signal(); }
+  void SignalAll() { condition_.SignalAll(); }
+
+  absl::CondVar condition_;
+  absl::Mutex queue_mutex_;
+
+#endif  // LIBGAV1_THREADPOOL_USE_STD_MUTEX
+
+  UnboundedQueue<std::function<void()>> queue_ LIBGAV1_GUARDED_BY(queue_mutex_);
+  // If not all the worker threads are created, the first entry after the
+  // created worker threads is a null pointer.
+  const std::unique_ptr<WorkerThread*[]> threads_;
+
+  bool exit_threads_ LIBGAV1_GUARDED_BY(queue_mutex_) = false;
+  const int num_threads_ = 0;
+  // name_prefix_ is a C string, whose length is restricted to 16 characters,
+  // including the terminating null byte ('\0'). This restriction comes from
+  // the Linux pthread_setname_np() function.
+  char name_prefix_[16];
+};
+
+}  // namespace libgav1
+
+#undef LIBGAV1_THREADPOOL_USE_STD_MUTEX
+
+#endif  // LIBGAV1_SRC_UTILS_THREADPOOL_H_
diff --git a/src/utils/threadpool_test.cc b/src/utils/threadpool_test.cc
new file mode 100644
index 0000000..17854dc
--- /dev/null
+++ b/src/utils/threadpool_test.cc
@@ -0,0 +1,133 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/threadpool.h"
+
+#include <cassert>
+#include <cstdint>
+#include <memory>
+
+#include "absl/synchronization/mutex.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/executor.h"
+
+namespace libgav1 {
+namespace {
+
+class SimpleGuardedInteger {
+ public:
+  explicit SimpleGuardedInteger(int initial_value) : value_(initial_value) {}
+  SimpleGuardedInteger(const SimpleGuardedInteger&) = delete;
+  SimpleGuardedInteger& operator=(const SimpleGuardedInteger&) = delete;
+
+  void Decrement() {
+    absl::MutexLock l(&mutex_);
+    assert(value_ >= 1);
+    --value_;
+    changed_.SignalAll();
+  }
+
+  void Increment() {
+    absl::MutexLock l(&mutex_);
+    ++value_;
+    changed_.SignalAll();
+  }
+
+  int Value() {
+    absl::MutexLock l(&mutex_);
+    return value_;
+  }
+
+  void WaitForZero() {
+    absl::MutexLock l(&mutex_);
+    while (value_ != 0) {
+      changed_.Wait(&mutex_);
+    }
+  }
+
+ private:
+  absl::Mutex mutex_;
+  absl::CondVar changed_;
+  int value_ LIBGAV1_GUARDED_BY(mutex_);
+};
+
+// Loops for |milliseconds| of wall-clock time.
+void LoopForMs(int64_t milliseconds) {
+  const absl::Time deadline = absl::Now() + absl::Milliseconds(milliseconds);
+  while (absl::Now() < deadline) {
+  }
+}
+
+// A function that increments the given integer.
+void IncrementIntegerJob(SimpleGuardedInteger* value) {
+  LoopForMs(100);
+  value->Increment();
+}
+
+TEST(ThreadPoolTest, ThreadedIntegerIncrement) {
+  std::unique_ptr<ThreadPool> thread_pool = ThreadPool::Create(100);
+  ASSERT_NE(thread_pool, nullptr);
+  EXPECT_EQ(thread_pool->num_threads(), 100);
+  SimpleGuardedInteger count(0);
+  for (int i = 0; i < 1000; ++i) {
+    thread_pool->Schedule([&count]() { IncrementIntegerJob(&count); });
+  }
+  thread_pool.reset(nullptr);
+  EXPECT_EQ(count.Value(), 1000);
+}
+
+// Test a ThreadPool via the Executor interface.
+TEST(ThreadPoolTest, ExecutorInterface) {
+  std::unique_ptr<ThreadPool> thread_pool = ThreadPool::Create(100);
+  ASSERT_NE(thread_pool, nullptr);
+  std::unique_ptr<Executor> executor(thread_pool.release());
+  SimpleGuardedInteger count(0);
+  for (int i = 0; i < 1000; ++i) {
+    executor->Schedule([&count]() { IncrementIntegerJob(&count); });
+  }
+  executor.reset(nullptr);
+  EXPECT_EQ(count.Value(), 1000);
+}
+
+TEST(ThreadPoolTest, DestroyWithoutUse) {
+  std::unique_ptr<ThreadPool> thread_pool = ThreadPool::Create(100);
+  EXPECT_NE(thread_pool, nullptr);
+  thread_pool.reset(nullptr);
+}
+
+// If num_threads is 0, ThreadPool::Create() should return a null pointer.
+TEST(ThreadPoolTest, NumThreadsZero) {
+  std::unique_ptr<ThreadPool> thread_pool = ThreadPool::Create(0);
+  EXPECT_EQ(thread_pool, nullptr);
+}
+
+// If num_threads is 1, the closures are run in FIFO order.
+TEST(ThreadPoolTest, OneThreadRunsClosuresFIFO) {
+  int count = 0;  // Declare first so that it outlives the thread pool.
+  std::unique_ptr<ThreadPool> pool = ThreadPool::Create(1);
+  ASSERT_NE(pool, nullptr);
+  EXPECT_EQ(pool->num_threads(), 1);
+  for (int i = 0; i < 1000; ++i) {
+    pool->Schedule([&count, i]() {
+      EXPECT_EQ(count, i);
+      count++;
+    });
+  }
+}
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/utils/types.h b/src/utils/types.h
new file mode 100644
index 0000000..0dd6360
--- /dev/null
+++ b/src/utils/types.h
@@ -0,0 +1,529 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_TYPES_H_
+#define LIBGAV1_SRC_UTILS_TYPES_H_
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+#include "src/utils/array_2d.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+
+union MotionVector {
+  // Motion vectors will always fit in int16_t and using int16_t here instead
+  // of int saves significant memory since some of the frame sized structures
+  // store motion vectors.
+  // Index 0 is the entry for row (horizontal direction) motion vector.
+  // Index 1 is the entry for column (vertical direction) motion vector.
+  int16_t mv[2];
+  // A uint32_t view into the |mv| array. Useful for cases where both the
+  // motion vectors have to be copied or compared with a single 32 bit
+  // instruction.
+  uint32_t mv32;
+};
+
+union CompoundMotionVector {
+  MotionVector mv[2];
+  // A uint64_t view into the |mv| array. Useful for cases where all the motion
+  // vectors have to be copied or compared with a single 64 bit instruction.
+  uint64_t mv64;
+};
+
+// Stores the motion information used for motion field estimation.
+struct TemporalMotionField : public Allocable {
+  Array2D<MotionVector> mv;
+  Array2D<int8_t> reference_offset;
+};
+
+// MvContexts contains the contexts used to decode portions of an inter block
+// mode info to set the y_mode field in BlockParameters.
+//
+// The contexts in the struct correspond to the ZeroMvContext, RefMvContext,
+// and NewMvContext variables in the spec.
+struct MvContexts {
+  int zero_mv;
+  int reference_mv;
+  int new_mv;
+};
+
+struct PaletteModeInfo {
+  uint8_t size[kNumPlaneTypes];
+  uint16_t color[kMaxPlanes][kMaxPaletteSize];
+};
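As a hedged illustration of the integer views in the unions above (the component values are arbitrary): both halves of a motion vector can be copied or compared in one scalar operation instead of two.

libgav1::MotionVector a;
a.mv[0] = -3;  // Row component.
a.mv[1] = 7;   // Column component.
libgav1::MotionVector b;
b.mv32 = a.mv32;                        // Copies both int16_t lanes at once.
const bool equal = (a.mv32 == b.mv32);  // Compares both lanes at once.
static_cast<void>(equal);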
+
+// Stores the parameters used by the prediction process. The members of the
+// struct are filled in when parsing the bitstream and used when the prediction
+// is computed. The information in this struct is associated with a single
+// block.
+// While both BlockParameters and PredictionParameters store information
+// pertaining to a Block, the only difference is that BlockParameters outlives
+// the block itself (for example, some of the variables in BlockParameters are
+// used to compute the context for reading elements in the subsequent blocks).
+struct PredictionParameters : public Allocable {
+  // Restore the index in the unsorted mv stack from the least 3 bits of sorted
+  // |weight_index_stack|.
+  const MotionVector& reference_mv(int stack_index) const {
+    return ref_mv_stack[7 - (weight_index_stack[stack_index] & 7)];
+  }
+  const MotionVector& reference_mv(int stack_index, int mv_index) const {
+    return compound_ref_mv_stack[7 - (weight_index_stack[stack_index] & 7)]
+        .mv[mv_index];
+  }
+
+  void IncreaseWeight(ptrdiff_t index, int weight) {
+    weight_index_stack[index] += weight << 3;
+  }
+
+  void SetWeightIndexStackEntry(int index, int weight) {
+    weight_index_stack[index] = (weight << 3) + 7 - index;
+  }
+
+  bool use_filter_intra;
+  FilterIntraPredictor filter_intra_mode;
+  int angle_delta[kNumPlaneTypes];
+  int8_t cfl_alpha_u;
+  int8_t cfl_alpha_v;
+  int max_luma_width;
+  int max_luma_height;
+  Array2D<uint8_t> color_index_map[kNumPlaneTypes];
+  bool use_intra_block_copy;
+  InterIntraMode inter_intra_mode;
+  bool is_wedge_inter_intra;
+  int wedge_index;
+  int wedge_sign;
+  bool mask_is_inverse;
+  MotionMode motion_mode;
+  CompoundPredictionType compound_prediction_type;
+  union {
+    // |ref_mv_stack| and |compound_ref_mv_stack| are not sorted after
+    // construction. reference_mv() must be called to get the correct element.
+    MotionVector ref_mv_stack[kMaxRefMvStackSize];
+    CompoundMotionVector compound_ref_mv_stack[kMaxRefMvStackSize];
+  };
+  // The least 3 bits of |weight_index_stack| store the index information, and
+  // the other bits store the weight. The index information is actually 7 -
+  // index to make the descending order sort stable (preserves the original
+  // order for elements with the same weight). Sorting an int16_t array is much
+  // faster than sorting a struct array with weight and index stored
+  // separately.
+  int16_t weight_index_stack[kMaxRefMvStackSize];
+  // In the spec, the weights of all the nearest mvs are incremented by a bonus
+  // weight which is larger than any natural weight, and later the weights of
+  // the mvs are compared with this bonus weight to determine their contexts.
+  // We replace this procedure by introducing |nearest_mv_count|, which records
+  // the count of the nearest mvs. Since all the nearest mvs are in the
+  // beginning of the mv stack, the index of a mv in the mv stack can be
+  // compared with |nearest_mv_count| to get that mv's context.
+  int nearest_mv_count;
+  int ref_mv_count;
+  int ref_mv_index;
+  MotionVector global_mv[2];
+  int num_warp_samples;
+  int warp_estimate_candidates[kMaxLeastSquaresSamples][4];
+  PaletteModeInfo palette_mode_info;
+  int8_t segment_id;  // segment_id is in the range [0, 7].
+  PredictionMode uv_mode;
+  bool chroma_top_uses_smooth_prediction;
+  bool chroma_left_uses_smooth_prediction;
+};
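A worked example of the weight/index packing described above (numbers chosen for illustration). An entry at unsorted index 2 with weight 5 is stored as (5 << 3) + 7 - 2 = 45; because the low three bits hold 7 - index, a descending sort of the raw int16_t values stays stable for equal weights, and both fields are recoverable:

const int index = 2, weight = 5;
const int16_t packed = (weight << 3) + 7 - index;  // == 45, as stored by
                                                   // SetWeightIndexStackEntry().
const int recovered_index = 7 - (packed & 7);      // == 2, as in reference_mv().
const int recovered_weight = packed >> 3;          // == 5.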
+
+// A lot of BlockParameters objects are created, so the smallest type is used
+// for each field. The ranges of some fields are documented to justify why
+// their types are large enough.
+struct BlockParameters : public Allocable {
+  BlockSize size;
+  bool skip;
+  bool is_inter;
+  PredictionMode y_mode;
+  TransformSize uv_transform_size;
+  InterpolationFilter interpolation_filter[2];
+  ReferenceFrameType reference_frame[2];
+  // The index of this array is as follows:
+  //   0 - Y plane vertical filtering.
+  //   1 - Y plane horizontal filtering.
+  //   2 - U plane (both directions).
+  //   3 - V plane (both directions).
+  uint8_t deblock_filter_level[kFrameLfCount];
+  CompoundMotionVector mv;
+  // When |Tile::split_parse_and_decode_| is true, each block gets its own
+  // instance of |prediction_parameters|. When it is false, all the blocks
+  // point to |Tile::prediction_parameters_|. This field is valid only as long
+  // as the block is *being* decoded. The lifetime and usage of this field can
+  // be better understood by following its flow in tile.cc.
+  std::unique_ptr<PredictionParameters> prediction_parameters;
+};
+
+// Used to store the left and top block parameters that are used for computing
+// the cdf context of the subsequent blocks.
+struct BlockCdfContext {
+  bool use_predicted_segment_id[32];
+  bool is_explicit_compound_type[32];  // comp_group_idx in the spec.
+  bool is_compound_type_average[32];   // compound_idx in the spec.
+  bool skip_mode[32];
+  uint8_t palette_size[kNumPlaneTypes][32];
+  uint16_t palette_color[32][kNumPlaneTypes][kMaxPaletteSize];
+  PredictionMode uv_mode[32];
+};
+
+// A five dimensional array used to store the wedge masks. The dimensions are:
+//   - block_size_index (returned by GetWedgeBlockSizeIndex() in
+//     prediction.cc).
+//   - flip_sign (0 or 1).
+//   - wedge_index (0 to 15).
+//   - each of those three dimensions is a 2d array of block_width by
+//     block_height.
+using WedgeMaskArray =
+    std::array<std::array<std::array<Array2D<uint8_t>, 16>, 2>, 9>;
+
+enum GlobalMotionTransformationType : uint8_t {
+  kGlobalMotionTransformationTypeIdentity,
+  kGlobalMotionTransformationTypeTranslation,
+  kGlobalMotionTransformationTypeRotZoom,
+  kGlobalMotionTransformationTypeAffine,
+  kNumGlobalMotionTransformationTypes
+};
+
+// Global motion and warped motion parameters. See the paper for more info:
+// S. Parker, Y. Chen, D. Barker, P. de Rivaz, D. Mukherjee, "Global and
+// locally adaptive warped motion compensation in video compression", Proc.
+// IEEE International Conference on Image Processing (ICIP), pp. 275-279,
+// Sep. 2017.
+struct GlobalMotion {
+  GlobalMotionTransformationType type;
+  int32_t params[6];
+
+  // Represent two shearing operations. Computed from |params| by SetupShear().
+  //
+  // The least significant six (= kWarpParamRoundingBits) bits are all zeros.
+  // (This means alpha, beta, gamma, and delta could be represented by a 10-bit
+  // signed integer.) The minimum value is INT16_MIN (= -32768) and the maximum
+  // value is 32704 = 0x7fc0, the largest int16_t value whose least significant
+  // six bits are all zeros.
+  //
+  // Valid warp parameters (as validated by SetupShear()) have smaller ranges.
+  // Their absolute values are less than 2^14 (= 16384). (This follows from
+  // the warpValid check at the end of Section 7.11.3.6.)
+  //
+  // NOTE: Section 7.11.3.6 of the spec allows a maximum value of 32768, which
+  // is outside the range of int16_t. When cast to int16_t, 32768 becomes
+  // -32768. This potential int16_t overflow does not matter because either
+  // 32768 or -32768 causes SetupShear() to return false.
+  int16_t alpha;
+  int16_t beta;
+  int16_t gamma;
+  int16_t delta;
+};
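A hedged indexing sketch for WedgeMaskArray, following the dimension order documented above (the concrete index values are placeholders, and src/utils/types.h is assumed to be included):

libgav1::WedgeMaskArray wedge_masks;
const int block_size_index = 0;  // Placeholder; see GetWedgeBlockSizeIndex().
const int flip_sign = 1;         // 0 or 1.
const int wedge_index = 15;      // 0 to 15.
libgav1::Array2D<uint8_t>& mask =
    wedge_masks[block_size_index][flip_sign][wedge_index];
static_cast<void>(mask);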
+
+// Loop filter parameters:
+//
+// If level[0] and level[1] are both equal to 0, the loop filter process is
+// not invoked.
+//
+// |sharpness| and |delta_enabled| are only used by the loop filter process.
+//
+// The |ref_deltas| and |mode_deltas| arrays are used not only by the loop
+// filter process but also by the reference frame update and loading
+// processes. The loop filter process uses |ref_deltas| and |mode_deltas| only
+// when |delta_enabled| is true.
+struct LoopFilter {
+  // Contains loop filter strength values in the range of [0, 63].
+  std::array<int8_t, kFrameLfCount> level;
+  // Indicates the sharpness level in the range of [0, 7].
+  int8_t sharpness;
+  // Whether the filter level depends on the mode and reference frame used to
+  // predict a block.
+  bool delta_enabled;
+  // Whether additional syntax elements were read that specify which mode and
+  // reference frame deltas are to be updated. loop_filter_delta_update field
+  // in Section 5.9.11 of the spec.
+  bool delta_update;
+  // Contains the adjustment needed for the filter level based on the chosen
+  // reference frame, in the range of [-64, 63].
+  std::array<int8_t, kNumReferenceFrameTypes> ref_deltas;
+  // Contains the adjustment needed for the filter level based on the chosen
+  // mode, in the range of [-64, 63].
+  std::array<int8_t, kLoopFilterMaxModeDeltas> mode_deltas;
+};
+
+struct Delta {
+  bool present;
+  uint8_t scale;
+  bool multi;
+};
+
+struct Cdef {
+  uint8_t damping;  // damping value from the spec + (bitdepth - 8).
+  uint8_t bits;
+  // All the strength values are the values from the spec and left shifted by
+  // (bitdepth - 8).
+  uint8_t y_primary_strength[kMaxCdefStrengths];
+  uint8_t y_secondary_strength[kMaxCdefStrengths];
+  uint8_t uv_primary_strength[kMaxCdefStrengths];
+  uint8_t uv_secondary_strength[kMaxCdefStrengths];
+};
+
+struct TileInfo {
+  bool uniform_spacing;
+  int sb_rows;
+  int sb_columns;
+  int tile_count;
+  int tile_columns_log2;
+  int tile_columns;
+  int tile_column_start[kMaxTileColumns + 1];
+  // This field is not used by libgav1, but is populated for use by some
+  // hardware decoders. So it must not be removed.
+  int tile_column_width_in_superblocks[kMaxTileColumns + 1];
+  int tile_rows_log2;
+  int tile_rows;
+  int tile_row_start[kMaxTileRows + 1];
+  // This field is not used by libgav1, but is populated for use by some
+  // hardware decoders. So it must not be removed.
+  int tile_row_height_in_superblocks[kMaxTileRows + 1];
+  int16_t context_update_id;
+  uint8_t tile_size_bytes;
+};
+
+struct LoopRestoration {
+  LoopRestorationType type[kMaxPlanes];
+  int unit_size_log2[kMaxPlanes];
+};
+
+// Stores the quantization parameters of Section 5.9.12.
+struct QuantizerParameters {
+  // base_index is in the range [0, 255].
+  uint8_t base_index;
+  int8_t delta_dc[kMaxPlanes];
+  // delta_ac[kPlaneY] is always 0.
+  int8_t delta_ac[kMaxPlanes];
+  bool use_matrix;
+  // The |matrix_level| array is used only when |use_matrix| is true.
+  // matrix_level[plane] specifies the level in the quantizer matrix that
+  // should be used for decoding |plane|. The quantizer matrix has 15 levels,
+  // from 0 to 14. The range of matrix_level[plane] is [0, 15]. If
+  // matrix_level[plane] is 15, the quantizer matrix is not used.
+  int8_t matrix_level[kMaxPlanes];
+};
+
+// The corresponding segment feature constants in the AV1 spec are named
+// SEG_LVL_xxx.
+enum SegmentFeature : uint8_t {
+  kSegmentFeatureQuantizer,
+  kSegmentFeatureLoopFilterYVertical,
+  kSegmentFeatureLoopFilterYHorizontal,
+  kSegmentFeatureLoopFilterU,
+  kSegmentFeatureLoopFilterV,
+  kSegmentFeatureReferenceFrame,
+  kSegmentFeatureSkip,
+  kSegmentFeatureGlobalMv,
+  kSegmentFeatureMax
+};
+
+struct Segmentation {
+  // 5.11.14.
+  // Returns true if the feature is enabled in the segment.
+  bool FeatureActive(int segment_id, SegmentFeature feature) const {
+    return enabled && segment_id < kMaxSegments &&
+           feature_enabled[segment_id][feature];
+  }
+
+  // Returns true if the feature is signed.
+  static bool FeatureSigned(SegmentFeature feature) {
+    // Only the first five segment features are signed, so this comparison
+    // suffices.
+    return feature <= kSegmentFeatureLoopFilterV;
+  }
+
+  bool enabled;
+  bool update_map;
+  bool update_data;
+  bool temporal_update;
+  // True if the segment id will be read before the skip syntax element. False
+  // if the skip syntax element will be read first.
+  bool segment_id_pre_skip;
+  // The highest numbered segment id that has some enabled feature. Used as
+  // the upper bound for decoding segment ids.
+  int8_t last_active_segment_id;
+
+  bool feature_enabled[kMaxSegments][kSegmentFeatureMax];
+  int16_t feature_data[kMaxSegments][kSegmentFeatureMax];
+  bool lossless[kMaxSegments];
+  // Cached values of get_qindex(1, segmentId), to be consumed by
+  // Tile::ReadTransformType(). The values are in the range [0, 255].
+  uint8_t qindex[kMaxSegments];
+};
+
+// Section 6.8.20.
+// Note: In spec, film grain section uses YCbCr to denote variable names,
+// such as num_cb_points, num_cr_points. To keep it consistent with other
+// parts of code, we use YUV, i.e., num_u_points, num_v_points, etc.
+struct FilmGrainParams {
+  bool apply_grain;
+  bool update_grain;
+  bool chroma_scaling_from_luma;
+  bool overlap_flag;
+  bool clip_to_restricted_range;
+
+  uint8_t num_y_points;  // [0, 14].
+  uint8_t num_u_points;  // [0, 10].
+  uint8_t num_v_points;  // [0, 10].
+  // Must be [0, 255]. 10/12 bit /= 4 or 16. Must be in increasing order.
+  uint8_t point_y_value[14];
+  uint8_t point_y_scaling[14];
+  uint8_t point_u_value[10];
+  uint8_t point_u_scaling[10];
+  uint8_t point_v_value[10];
+  uint8_t point_v_scaling[10];
+
+  uint8_t chroma_scaling;              // [8, 11].
+  uint8_t auto_regression_coeff_lag;   // [0, 3].
+  int8_t auto_regression_coeff_y[24];  // [-128, 127]
+  int8_t auto_regression_coeff_u[25];  // [-128, 127]
+  int8_t auto_regression_coeff_v[25];  // [-128, 127]
+  // Shift value: auto regression coeffs range
+  // 6: [-2, 2)
+  // 7: [-1, 1)
+  // 8: [-0.5, 0.5)
+  // 9: [-0.25, 0.25)
+  uint8_t auto_regression_shift;
+
+  uint16_t grain_seed;
+  int reference_index;
+  int grain_scale_shift;
+  // These multipliers are encoded as nonnegative values by adding 128 first.
+  // The 128 is subtracted during parsing.
+  int8_t u_multiplier;       // [-128, 127]
+  int8_t u_luma_multiplier;  // [-128, 127]
+  // These offsets are encoded as nonnegative values by adding 256 first. The
+  // 256 is subtracted during parsing.
+  int16_t u_offset;          // [-256, 255]
+  int8_t v_multiplier;       // [-128, 127]
+  int8_t v_luma_multiplier;  // [-128, 127]
+  int16_t v_offset;          // [-256, 255]
+};
+
+struct ObuFrameHeader {
+  uint16_t display_frame_id;
+  uint16_t current_frame_id;
+  int64_t frame_offset;
+  uint16_t expected_frame_id[kNumInterReferenceFrameTypes];
+  int32_t width;
+  int32_t height;
+  int32_t columns4x4;
+  int32_t rows4x4;
+  // The render size (render_width and render_height) is a hint to the
+  // application about the desired display size. It has no effect on the
+  // decoding process.
+  int32_t render_width;
+  int32_t render_height;
+  int32_t upscaled_width;
+  LoopRestoration loop_restoration;
+  uint32_t buffer_removal_time[kMaxOperatingPoints];
+  uint32_t frame_presentation_time;
+  // Note: global_motion[0] (for kReferenceFrameIntra) is not used.
+  std::array<GlobalMotion, kNumReferenceFrameTypes> global_motion;
+  TileInfo tile_info;
+  QuantizerParameters quantizer;
+  Segmentation segmentation;
+  bool show_existing_frame;
+  // frame_to_show is in the range [0, 7]. Only used if show_existing_frame is
+  // true.
+  int8_t frame_to_show;
+  FrameType frame_type;
+  bool show_frame;
+  bool showable_frame;
+  bool error_resilient_mode;
+  bool enable_cdf_update;
+  bool frame_size_override_flag;
+  // The order_hint syntax element in the uncompressed header. If
+  // show_existing_frame is false, the OrderHint variable in the spec is equal
+  // to this field, and so this field can be used in place of OrderHint when
+  // show_existing_frame is known to be false, such as during tile decoding.
+  uint8_t order_hint;
+  int8_t primary_reference_frame;
+  bool render_and_frame_size_different;
+  bool use_superres;
+  uint8_t superres_scale_denominator;
+  bool allow_screen_content_tools;
+  bool allow_intrabc;
+  bool frame_refs_short_signaling;
+  // A bitmask that specifies which reference frame slots will be updated with
+  // the current frame after it is decoded.
+  uint8_t refresh_frame_flags;
+  static_assert(sizeof(ObuFrameHeader::refresh_frame_flags) * 8 ==
+                    kNumReferenceFrameTypes,
+                "");
+  bool found_reference;
+  int8_t force_integer_mv;
+  bool allow_high_precision_mv;
+  InterpolationFilter interpolation_filter;
+  bool is_motion_mode_switchable;
+  bool use_ref_frame_mvs;
+  bool enable_frame_end_update_cdf;
+  // True if all segments are losslessly encoded at the coded resolution.
+  bool coded_lossless;
+  // True if all segments are losslessly encoded at the upscaled resolution.
+  bool upscaled_lossless;
+  TxMode tx_mode;
+  // True means that the mode info for inter blocks contains the syntax
+  // element comp_mode that indicates whether to use single or compound
+  // prediction. False means that all inter blocks will use single prediction.
+  bool reference_mode_select;
+  // The frames to use for compound prediction when skip_mode is true.
+  ReferenceFrameType skip_mode_frame[2];
+  bool skip_mode_present;
+  bool reduced_tx_set;
+  bool allow_warped_motion;
+  Delta delta_q;
+  Delta delta_lf;
+  // A valid value of reference_frame_index[i] is in the range [0, 7]. -1
+  // indicates an invalid value.
+  //
+  // NOTE: When the frame is an intra frame (frame_type is kFrameKey or
+  // kFrameIntraOnly), reference_frame_index is not used and may be
+  // uninitialized.
+  int8_t reference_frame_index[kNumInterReferenceFrameTypes];
+  // The ref_order_hint[ i ] syntax element in the uncompressed header.
+  // Specifies the expected output order hint for each reference frame.
+  uint8_t reference_order_hint[kNumReferenceFrameTypes];
+  LoopFilter loop_filter;
+  Cdef cdef;
+  FilmGrainParams film_grain_params;
+};
+
+// Structure used for traversing the partition tree.
+struct PartitionTreeNode {
+  PartitionTreeNode() = default;
+  PartitionTreeNode(int row4x4, int column4x4, BlockSize block_size)
+      : row4x4(row4x4), column4x4(column4x4), block_size(block_size) {}
+  int row4x4 = -1;
+  int column4x4 = -1;
+  BlockSize block_size = kBlockInvalid;
+};
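PartitionTreeNode pairs naturally with the fixed-capacity Stack from src/utils/stack.h for an iterative depth-first walk of the partition tree. A hedged sketch; the capacity constant and the visit/push-children steps are illustrative, not code from this patch:

constexpr int kMaxStackedNodes = 64;  // Illustrative capacity bound.
libgav1::Stack<libgav1::PartitionTreeNode, kMaxStackedNodes> stack;
stack.Push(libgav1::PartitionTreeNode(0, 0, libgav1::kBlock128x128));
while (!stack.Empty()) {
  const libgav1::PartitionTreeNode node = stack.Pop();
  // Process |node| here, then Push() its child partitions (for example the
  // four quadrants of a split) so that they are visited depth first.
}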
+
+// Structure used for storing the transform parameters in a superblock.
+struct TransformParameters {
+  TransformParameters() = default;
+  TransformParameters(TransformType type, int non_zero_coeff_count)
+      : type(type), non_zero_coeff_count(non_zero_coeff_count) {}
+  TransformType type;
+  int non_zero_coeff_count;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_TYPES_H_
diff --git a/src/utils/unbounded_queue.h b/src/utils/unbounded_queue.h
new file mode 100644
index 0000000..fa0d303
--- /dev/null
+++ b/src/utils/unbounded_queue.h
@@ -0,0 +1,245 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_UNBOUNDED_QUEUE_H_
+#define LIBGAV1_SRC_UTILS_UNBOUNDED_QUEUE_H_
+
+#include <cassert>
+#include <cstddef>
+#include <memory>
+#include <new>
+#include <utility>
+
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+
+// A FIFO queue of an unbounded capacity.
+//
+// This implementation uses the general approach used in std::deque
+// implementations. See, for example,
+// https://stackoverflow.com/questions/6292332/what-really-is-a-deque-in-stl
+//
+// It is much simpler because it just needs to support the queue interface.
+// The blocks are chained into a circular list, not managed by a "map". It
+// does not shrink the internal buffer.
+//
+// An alternative implementation approach is a resizable circular array. See,
+// for example, ResizingArrayQueue.java in https://algs4.cs.princeton.edu/code/
+// and base::circular_deque in Chromium's base/containers library.
+template <typename T>
+class UnboundedQueue {
+ public:
+  UnboundedQueue() = default;
+
+  // Move only.
+  UnboundedQueue(UnboundedQueue&& other)
+      : first_block_(other.first_block_),
+        front_(other.front_),
+        last_block_(other.last_block_),
+        back_(other.back_) {
+    other.first_block_ = nullptr;
+    other.front_ = 0;
+    other.last_block_ = nullptr;
+    other.back_ = 0;
+  }
+  UnboundedQueue& operator=(UnboundedQueue&& other) {
+    if (this != &other) {
+      Destroy();
+      first_block_ = other.first_block_;
+      front_ = other.front_;
+      last_block_ = other.last_block_;
+      back_ = other.back_;
+      other.first_block_ = nullptr;
+      other.front_ = 0;
+      other.last_block_ = nullptr;
+      other.back_ = 0;
+    }
+    return *this;
+  }
+
+  ~UnboundedQueue() { Destroy(); }
+
+  // Allocates two Blocks upfront because most access patterns require at
+  // least two Blocks. Returns false if the allocation of the Blocks failed.
+  LIBGAV1_MUST_USE_RESULT bool Init() {
+    std::unique_ptr<Block> new_block0(new (std::nothrow) Block);
+    std::unique_ptr<Block> new_block1(new (std::nothrow) Block);
+    if (new_block0 == nullptr || new_block1 == nullptr) return false;
+    first_block_ = last_block_ = new_block0.release();
+    new_block1->next = first_block_;
+    last_block_->next = new_block1.release();
+    return true;
+  }
+
+  // Checks if the queue has room for a new element. If the queue is full,
+  // tries to grow it. Returns false if the queue is full and the attempt to
+  // grow it failed.
+  //
+  // NOTE: GrowIfNeeded() must be called before each call to Push().
+  // This inconvenient design is necessary to guarantee a successful Push()
+  // call.
+  //
+  // Push(T&& value) is often called with the argument std::move(value). The
+  // moved-from object |value| won't be usable afterwards, so it would be
+  // problematic if Push(T&& value) failed and we lost access to the original
+  // |value| object.
+  LIBGAV1_MUST_USE_RESULT bool GrowIfNeeded() {
+    assert(last_block_ != nullptr);
+    if (back_ == kBlockCapacity) {
+      if (last_block_->next == first_block_) {
+        // All Blocks are in use.
+        std::unique_ptr<Block> new_block(new (std::nothrow) Block);
+        if (new_block == nullptr) return false;
+        new_block->next = first_block_;
+        last_block_->next = new_block.release();
+      }
+      last_block_ = last_block_->next;
+      back_ = 0;
+    }
+    return true;
+  }
+
+  // Pushes the element |value| to the end of the queue. It is an error to call
+  // Push() when the queue is full.
+  void Push(const T& value) {
+    assert(last_block_ != nullptr);
+    assert(back_ < kBlockCapacity);
+    T* elements = reinterpret_cast<T*>(last_block_->buffer);
+    new (&elements[back_++]) T(value);
+  }
+
+  void Push(T&& value) {
+    assert(last_block_ != nullptr);
+    assert(back_ < kBlockCapacity);
+    T* elements = reinterpret_cast<T*>(last_block_->buffer);
+    new (&elements[back_++]) T(std::move(value));
+  }
+
+  // Returns the element at the front of the queue. It is an error to call
+  // Front() when the queue is empty.
+  T& Front() {
+    assert(!Empty());
+    T* elements = reinterpret_cast<T*>(first_block_->buffer);
+    return elements[front_];
+  }
+
+  const T& Front() const {
+    assert(!Empty());
+    T* elements = reinterpret_cast<T*>(first_block_->buffer);
+    return elements[front_];
+  }
+
+  // Removes the element at the front of the queue from the queue. It is an
+  // error to call Pop() when the queue is empty.
+  void Pop() {
+    assert(!Empty());
+    T* elements = reinterpret_cast<T*>(first_block_->buffer);
+    elements[front_++].~T();
+    if (front_ == kBlockCapacity) {
+      // The first block has become empty.
+      front_ = 0;
+      if (first_block_ == last_block_) {
+        // Only one Block is in use. Simply reset back_.
+        back_ = 0;
+      } else {
+        first_block_ = first_block_->next;
+      }
+    }
+  }
+
+  // Returns true if the queue is empty.
+  bool Empty() const { return first_block_ == last_block_ && front_ == back_; }
+
+ private:
+  // kBlockCapacity is the maximum number of elements each Block can hold.
+  // sizeof(void*) is subtracted from 2048 to account for the |next| pointer in
+  // the Block struct.
+  //
+  // In Linux x86_64, sizeof(std::function<void()>) is 32, so each Block can
+  // hold 63 std::function<void()> objects.
+  //
+  // NOTE: The corresponding value in libc++ revision
+  // 245b5ba3448b9d3f6de5962066557e253a6bc9a4 is:
+  //   template <class _ValueType, class _DiffType>
+  //   struct __deque_block_size {
+  //     static const _DiffType value =
+  //         sizeof(_ValueType) < 256 ? 4096 / sizeof(_ValueType) : 16;
+  //   };
+  //
+  // Note that 4096 / 256 = 16, so apparently this expression is intended to
+  // ensure the block size is at least 4096 bytes and each block can hold at
+  // least 16 elements.
+  static constexpr size_t kBlockCapacity =
+      (sizeof(T) < 128) ? (2048 - sizeof(void*)) / sizeof(T) : 16;
+
+  struct Block : public Allocable {
+    alignas(T) char buffer[kBlockCapacity * sizeof(T)];
+    Block* next;
+  };
+
+  void Destroy() {
+    if (first_block_ == nullptr) return;  // An uninitialized queue.
+
+    // First free the unused blocks, which are located after last_block_ and
+    // before first_block_.
+    Block* block = last_block_->next;
+    // Cut the circular list open after last_block_.
+    last_block_->next = nullptr;
+    while (block != first_block_) {
+      Block* next = block->next;
+      delete block;
+      block = next;
+    }
+
+    // Then free the used blocks. Destruct the elements in the used blocks.
+    while (block != nullptr) {
+      const size_t begin = (block == first_block_) ? front_ : 0;
+      const size_t end = (block == last_block_) ? back_ : kBlockCapacity;
+      T* elements = reinterpret_cast<T*>(block->buffer);
+      for (size_t i = begin; i < end; ++i) {
+        elements[i].~T();
+      }
+      Block* next = block->next;
+      delete block;
+      block = next;
+    }
+  }
+
+  // Blocks are chained in a circular singly-linked list. If the list of Blocks
+  // is empty, both first_block_ and last_block_ are null pointers. If the list
+  // is nonempty, first_block_ points to the first used Block and last_block_
+  // points to the last used Block.
+  //
+  // Invariant: If Init() is called and succeeds, the queue is always nonempty.
+  // This allows all methods (except the destructor) to avoid null pointer
+  // checks for first_block_ and last_block_.
+  Block* first_block_ = nullptr;
+  // The index of the element in first_block_ to be removed by Pop().
+  size_t front_ = 0;
+  Block* last_block_ = nullptr;
+  // The index in last_block_ where the new element is inserted by Push().
+  size_t back_ = 0;
+};
+
+#if !LIBGAV1_CXX17
+template <typename T>
+constexpr size_t UnboundedQueue<T>::kBlockCapacity;
+#endif
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_UNBOUNDED_QUEUE_H_
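A hedged usage sketch of the GrowIfNeeded()-before-Push() contract above, together with the capacity arithmetic from the kBlockCapacity comment: for T = std::function<void()> at 32 bytes on Linux x86_64, each Block holds (2048 - 8) / 32 = 63 elements.

#include "src/utils/unbounded_queue.h"

void QueueSketch() {
  libgav1::UnboundedQueue<int> queue;
  if (!queue.Init()) return;   // Two Blocks are allocated up front.
  if (queue.GrowIfNeeded()) {  // Must precede every Push().
    queue.Push(42);
  }
  while (!queue.Empty()) {
    const int front = queue.Front();  // Peek at the front element, ...
    queue.Pop();                      // ... then destroy it.
    static_cast<void>(front);
  }
}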
diff --git a/src/utils/unbounded_queue_test.cc b/src/utils/unbounded_queue_test.cc
new file mode 100644
index 0000000..b107ad0
--- /dev/null
+++ b/src/utils/unbounded_queue_test.cc
@@ -0,0 +1,163 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/unbounded_queue.h"
+
+#include <new>
+#include <utility>
+
+#include "gtest/gtest.h"
+
+namespace libgav1 {
+namespace {
+
+class Integer {
+ public:
+  explicit Integer(int value) : value_(new (std::nothrow) int{value}) {}
+
+  // Move only.
+  Integer(Integer&& other) : value_(other.value_) { other.value_ = nullptr; }
+  Integer& operator=(Integer&& other) {
+    if (this != &other) {
+      delete value_;
+      value_ = other.value_;
+      other.value_ = nullptr;
+    }
+    return *this;
+  }
+
+  ~Integer() { delete value_; }
+
+  int value() const { return *value_; }
+
+ private:
+  int* value_;
+};
+
+TEST(UnboundedQueueTest, Basic) {
+  UnboundedQueue<int> queue;
+  ASSERT_TRUE(queue.Init());
+  EXPECT_TRUE(queue.Empty());
+
+  for (int i = 0; i < 8; ++i) {
+    EXPECT_TRUE(queue.GrowIfNeeded());
+    queue.Push(i);
+    EXPECT_FALSE(queue.Empty());
+  }
+
+  for (int i = 0; i < 8; ++i) {
+    EXPECT_FALSE(queue.Empty());
+    EXPECT_EQ(queue.Front(), i);
+    queue.Pop();
+  }
+  EXPECT_TRUE(queue.Empty());
+}
+
+TEST(UnboundedQueueTest, WrapAround) {
+  UnboundedQueue<int> queue;
+  ASSERT_TRUE(queue.Init());
+  EXPECT_TRUE(queue.Empty());
+
+  for (int i = 0; i < 1000; ++i) {
+    EXPECT_TRUE(queue.GrowIfNeeded());
+    queue.Push(i);
+    EXPECT_FALSE(queue.Empty());
+    EXPECT_EQ(queue.Front(), i);
+    queue.Pop();
+    EXPECT_TRUE(queue.Empty());
+  }
+}
+
+TEST(UnboundedQueueTest, EmptyBeforeInit) {
+  UnboundedQueue<int> queue;
+  EXPECT_TRUE(queue.Empty());
+}
+
+TEST(UnboundedQueueTest, LotsOfElements) {
+  UnboundedQueue<Integer> queue;
+  ASSERT_TRUE(queue.Init());
+  EXPECT_TRUE(queue.Empty());
+
+  for (int i = 0; i < 10000; ++i) {
+    Integer integer(i);
+    EXPECT_EQ(integer.value(), i);
+    EXPECT_TRUE(queue.GrowIfNeeded());
+    queue.Push(std::move(integer));
+    EXPECT_FALSE(queue.Empty());
+  }
+
+  for (int i = 0; i < 5000; ++i) {
+    EXPECT_FALSE(queue.Empty());
+    const Integer& integer = queue.Front();
+    EXPECT_EQ(integer.value(), i);
+    queue.Pop();
+  }
+  // Leave some elements in the queue to test destroying a nonempty queue.
+  EXPECT_FALSE(queue.Empty());
+}
+
+// Copy constructor and assignment are deleted, but move constructor and
+// assignment are OK.
+TEST(UnboundedQueueTest, Move) {
+  UnboundedQueue<int> ints1;
+  ASSERT_TRUE(ints1.Init());
+  EXPECT_TRUE(ints1.GrowIfNeeded());
+  ints1.Push(2);
+  EXPECT_TRUE(ints1.GrowIfNeeded());
+  ints1.Push(3);
+  EXPECT_TRUE(ints1.GrowIfNeeded());
+  ints1.Push(5);
+  EXPECT_TRUE(ints1.GrowIfNeeded());
+  ints1.Push(7);
+
+  // Move constructor.
+  UnboundedQueue<int> ints2(std::move(ints1));
+  EXPECT_EQ(ints2.Front(), 2);
+  ints2.Pop();
+  EXPECT_EQ(ints2.Front(), 3);
+  ints2.Pop();
+  EXPECT_EQ(ints2.Front(), 5);
+  ints2.Pop();
+  EXPECT_EQ(ints2.Front(), 7);
+  ints2.Pop();
+  EXPECT_TRUE(ints2.Empty());
+
+  EXPECT_TRUE(ints2.GrowIfNeeded());
+  ints2.Push(11);
+  EXPECT_TRUE(ints2.GrowIfNeeded());
+  ints2.Push(13);
+  EXPECT_TRUE(ints2.GrowIfNeeded());
+  ints2.Push(17);
+  EXPECT_TRUE(ints2.GrowIfNeeded());
+  ints2.Push(19);
+
+  // Move assignment.
+  UnboundedQueue<int> ints3;
+  ASSERT_TRUE(ints3.Init());
+  EXPECT_TRUE(ints3.GrowIfNeeded());
+  ints3.Push(23);
+  ints3 = std::move(ints2);
+  EXPECT_EQ(ints3.Front(), 11);
+  ints3.Pop();
+  EXPECT_EQ(ints3.Front(), 13);
+  ints3.Pop();
+  EXPECT_EQ(ints3.Front(), 17);
+  ints3.Pop();
+  EXPECT_EQ(ints3.Front(), 19);
+  ints3.Pop();
+  EXPECT_TRUE(ints3.Empty());
+}
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/utils/vector.h b/src/utils/vector.h
new file mode 100644
index 0000000..9a21aeb
--- /dev/null
+++ b/src/utils/vector.h
@@ -0,0 +1,353 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// libgav1::Vector implementation
+
+#ifndef LIBGAV1_SRC_UTILS_VECTOR_H_
+#define LIBGAV1_SRC_UTILS_VECTOR_H_
+
+#include <cassert>
+#include <cstddef>
+#include <cstdlib>
+#include <cstring>
+#include <iterator>
+#include <new>
+#include <type_traits>
+#include <utility>
+
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+namespace internal {
+
+static constexpr size_t kMinVectorAllocation = 16;
+
+// Returns the smallest power of two greater or equal to 'value'.
+inline size_t NextPow2(size_t value) {
+  if (value == 0) return 0;
+  --value;
+  for (size_t i = 1; i < sizeof(size_t) * 8; i *= 2) value |= value >> i;
+  return value + 1;
+}
+
+// Returns the smallest capacity greater or equal to 'value'.
+inline size_t NextCapacity(size_t value) {
+  if (value == 0) return 0;
+  if (value <= kMinVectorAllocation) return kMinVectorAllocation;
+  return NextPow2(value);
+}
+
+//------------------------------------------------------------------------------
+// Data structure equivalent to std::vector but returning false and to its last
+// valid state on memory allocation failure.
+// std::vector with a custom allocator does not fill this need without
+// exceptions.
+
+template <typename T>
+class VectorBase {
+ public:
+  using iterator = T*;
+  using const_iterator = const T*;
+
+  VectorBase() noexcept = default;
+  // Move only.
+  VectorBase(const VectorBase&) = delete;
+  VectorBase& operator=(const VectorBase&) = delete;
+  VectorBase(VectorBase&& other) noexcept
+      : items_(other.items_),
+        capacity_(other.capacity_),
+        num_items_(other.num_items_) {
+    other.items_ = nullptr;
+    other.capacity_ = 0;
+    other.num_items_ = 0;
+  }
+  VectorBase& operator=(VectorBase&& other) noexcept {
+    if (this != &other) {
+      clear();
+      free(items_);
+      items_ = other.items_;
+      capacity_ = other.capacity_;
+      num_items_ = other.num_items_;
+      other.items_ = nullptr;
+      other.capacity_ = 0;
+      other.num_items_ = 0;
+    }
+    return *this;
+  }
+  ~VectorBase() {
+    clear();
+    free(items_);
+  }
+
+  // Reallocates just enough memory if needed so that 'new_cap' items can fit.
+  LIBGAV1_MUST_USE_RESULT bool reserve(size_t new_cap) {
+    if (capacity_ < new_cap) {
+      T* const new_items = static_cast<T*>(malloc(new_cap * sizeof(T)));
+      if (new_items == nullptr) return false;
+      if (num_items_ > 0) {
+        if (std::is_trivial<T>::value) {
+          // Cast |new_items| and |items_| to void* to avoid the GCC
+          // -Wclass-memaccess warning and additionally the
+          // bugprone-undefined-memory-manipulation clang-tidy warning. The
+          // memcpy is safe because T is a trivial type.
+          memcpy(static_cast<void*>(new_items),
+                 static_cast<void*>(items_), num_items_ * sizeof(T));
+        } else {
+          for (size_t i = 0; i < num_items_; ++i) {
+            new (&new_items[i]) T(std::move(items_[i]));
+            items_[i].~T();
+          }
+        }
+      }
+      free(items_);
+      items_ = new_items;
+      capacity_ = new_cap;
+    }
+    return true;
+  }
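A hedged walk-through of NextPow2() above for value = 37: the decrement gives 36 (0b100100), the shift-or loop smears the highest set bit into every lower position (0b111111 = 63), and the final increment yields 64. NextCapacity() therefore grows capacities as 16, 32, 64, ... once past kMinVectorAllocation:

#include <cassert>

void GrowthSketch() {
  assert(libgav1::internal::NextPow2(37) == 64);
  assert(libgav1::internal::NextCapacity(10) == 16);    // Clamped to minimum.
  assert(libgav1::internal::NextCapacity(100) == 128);  // Next power of two.
}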
+    if (reserve(num_items_)) return true;
+    capacity_ = previous_capacity;
+    return false;
+  }
+
+  // Constructs a new item by copy constructor. May reallocate if
+  // 'resize_if_needed'.
+  LIBGAV1_MUST_USE_RESULT bool push_back(const T& value,
+                                         bool resize_if_needed = true) {
+    if (num_items_ >= capacity_ &&
+        (!resize_if_needed ||
+         !reserve(internal::NextCapacity(num_items_ + 1)))) {
+      return false;
+    }
+    new (&items_[num_items_]) T(value);
+    ++num_items_;
+    return true;
+  }
+
+  // Constructs a new item by copy constructor. reserve() must have been called
+  // with a sufficient capacity.
+  //
+  // WARNING: No error checking is performed.
+  void push_back_unchecked(const T& value) {
+    assert(num_items_ < capacity_);
+    new (&items_[num_items_]) T(value);
+    ++num_items_;
+  }
+
+  // Constructs a new item by move constructor. May reallocate if
+  // 'resize_if_needed'.
+  LIBGAV1_MUST_USE_RESULT bool push_back(T&& value,
+                                         bool resize_if_needed = true) {
+    if (num_items_ >= capacity_ &&
+        (!resize_if_needed ||
+         !reserve(internal::NextCapacity(num_items_ + 1)))) {
+      return false;
+    }
+    new (&items_[num_items_]) T(std::move(value));
+    ++num_items_;
+    return true;
+  }
+
+  // Constructs a new item by move constructor. reserve() must have been called
+  // with a sufficient capacity.
+  //
+  // WARNING: No error checking is performed.
+  void push_back_unchecked(T&& value) {
+    assert(num_items_ < capacity_);
+    new (&items_[num_items_]) T(std::move(value));
+    ++num_items_;
+  }
+
+  // Constructs a new item in place by forwarding the arguments args... to the
+  // constructor. May reallocate.
+  template <typename... Args>
+  LIBGAV1_MUST_USE_RESULT bool emplace_back(Args&&... args) {
+    if (num_items_ >= capacity_ &&
+        !reserve(internal::NextCapacity(num_items_ + 1))) {
+      return false;
+    }
+    new (&items_[num_items_]) T(std::forward<Args>(args)...);
+    ++num_items_;
+    return true;
+  }
+
+  // Destructs the last item.
+  void pop_back() {
+    --num_items_;
+    items_[num_items_].~T();
+  }
+
+  // Destructs the item at 'pos'.
+  void erase(iterator pos) { erase(pos, pos + 1); }
+
+  // Destructs the items in [first,last).
+  void erase(iterator first, iterator last) {
+    for (iterator it = first; it != last; ++it) it->~T();
+    if (last != end()) {
+      if (std::is_trivial<T>::value) {
+        // Cast |first| and |last| to void* to avoid the GCC
+        // -Wclass-memaccess warning and additionally the
+        // bugprone-undefined-memory-manipulation clang-tidy warning. The
+        // memmove is safe because T is a trivial type.
+        memmove(static_cast<void*>(first), static_cast<const void*>(last),
+                (end() - last) * sizeof(T));
+      } else {
+        for (iterator it_src = last, it_dst = first; it_src != end();
+             ++it_src, ++it_dst) {
+          new (it_dst) T(std::move(*it_src));
+          it_src->~T();
+        }
+      }
+    }
+    num_items_ -= std::distance(first, last);
+  }
+
+  // Destructs all the items.
+  void clear() { erase(begin(), end()); }
+
+  // Destroys (including deallocating) all the items.
+  void reset() {
+    clear();
+    if (!shrink_to_fit()) assert(false);
+  }
+
+  // Accessors
+  bool empty() const { return (num_items_ == 0); }
+  size_t size() const { return num_items_; }
+  size_t capacity() const { return capacity_; }
+
+  T* data() { return items_; }
+  T& front() { return items_[0]; }
+  T& back() { return items_[num_items_ - 1]; }
+  T& operator[](size_t i) { return items_[i]; }
+  T& at(size_t i) { return items_[i]; }
+  const T* data() const { return items_; }
+  const T& front() const { return items_[0]; }
+  const T& back() const { return items_[num_items_ - 1]; }
+  const T& operator[](size_t i) const { return items_[i]; }
+  const T& at(size_t i) const { return items_[i]; }
+
+  iterator begin() { return &items_[0]; }
+  const_iterator begin() const { return &items_[0]; }
+  iterator end() { return &items_[num_items_]; }
+  const_iterator end() const { return &items_[num_items_]; }
+
+  void swap(VectorBase& b) {
+    // Although not necessary here, adding "using std::swap;" and then calling
+    // swap() without namespace qualification is recommended. See Effective
+    // C++, Item 25.
+    using std::swap;
+    swap(items_, b.items_);
+    swap(capacity_, b.capacity_);
+    swap(num_items_, b.num_items_);
+  }
+
+ protected:
+  T* items_ = nullptr;
+  size_t capacity_ = 0;
+  size_t num_items_ = 0;
+};
+
+}  // namespace internal
+
+//------------------------------------------------------------------------------
+
+// Vector class that does *NOT* construct the content on resize().
+// Should be reserved to plain old data.
+template <typename T>
+class VectorNoCtor : public internal::VectorBase<T> {
+ public:
+  // Creates or destructs items so that 'new_num_items' exist.
+  // Allocated memory grows every power-of-two items.
+  LIBGAV1_MUST_USE_RESULT bool resize(size_t new_num_items) {
+    using super = internal::VectorBase<T>;
+    if (super::num_items_ < new_num_items) {
+      if (super::capacity_ < new_num_items) {
+        if (!super::reserve(internal::NextCapacity(new_num_items))) {
+          return false;
+        }
+      }
+      super::num_items_ = new_num_items;
+    } else {
+      while (super::num_items_ > new_num_items) {
+        --super::num_items_;
+        super::items_[super::num_items_].~T();
+      }
+    }
+    return true;
+  }
+};
+
+// This generic vector class will call the constructors.
+template <typename T>
+class Vector : public internal::VectorBase<T> {
+ public:
+  // Constructs or destructs items so that 'new_num_items' exist.
+  // Allocated memory grows every power-of-two items.
+  LIBGAV1_MUST_USE_RESULT bool resize(size_t new_num_items) {
+    using super = internal::VectorBase<T>;
+    if (super::num_items_ < new_num_items) {
+      if (super::capacity_ < new_num_items) {
+        if (!super::reserve(internal::NextCapacity(new_num_items))) {
+          return false;
+        }
+      }
+      while (super::num_items_ < new_num_items) {
+        new (&super::items_[super::num_items_]) T();
+        ++super::num_items_;
+      }
+    } else {
+      while (super::num_items_ > new_num_items) {
+        --super::num_items_;
+        super::items_[super::num_items_].~T();
+      }
+    }
+    return true;
+  }
+};
+
+//------------------------------------------------------------------------------
+
+// Define non-member swap() functions in the namespace in which VectorNoCtor
+// and Vector are implemented. See Effective C++, Item 25.
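+//
+// A minimal usage sketch (illustrative only, not part of this header): the
+// non-member swap() defined below is found by argument-dependent lookup, so
+// generic code written as
+//
+//   libgav1::Vector<int> a, b;
+//   using std::swap;
+//   swap(a, b);  // Resolves to libgav1::swap(): an O(1) pointer exchange,
+//                // with no element copies and no allocation.
+//
+// avoids both element-wise copying and an accidental std::swap of the whole
+// object.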
+
+template <typename T>
+void swap(VectorNoCtor<T>& a, VectorNoCtor<T>& b) {
+  a.swap(b);
+}
+
+template <typename T>
+void swap(Vector<T>& a, Vector<T>& b) {
+  a.swap(b);
+}
+
+//------------------------------------------------------------------------------
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_VECTOR_H_
diff --git a/src/utils/vector_test.cc b/src/utils/vector_test.cc
new file mode 100644
index 0000000..5b0127c
--- /dev/null
+++ b/src/utils/vector_test.cc
@@ -0,0 +1,234 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/vector.h"
+
+#include <memory>
+#include <new>
+#include <utility>
+
+#include "gtest/gtest.h"
+#include "src/utils/compiler_attributes.h"
+
+#if LIBGAV1_MSAN
+#include <sanitizer/msan_interface.h>
+#endif
+
+namespace libgav1 {
+namespace {
+
+class Foo {
+ public:
+  Foo() = default;
+
+  int x() const { return x_; }
+
+ private:
+  int x_ = 38;
+};
+
+class Point {
+ public:
+  Point(int x, int y) : x_(x), y_(y) {}
+
+  int x() const { return x_; }
+  int y() const { return y_; }
+
+ private:
+  int x_;
+  int y_;
+};
+
+TEST(VectorTest, NoCtor) {
+  VectorNoCtor<int> v;
+  EXPECT_TRUE(v.resize(100));
+  Vector<int> w;
+  EXPECT_TRUE(w.resize(100));
+
+#if LIBGAV1_MSAN
+  // Use MemorySanitizer to check VectorNoCtor::resize() does not initialize
+  // the memory while Vector::resize() does.
+  //
+  // __msan_test_shadow(const void *x, uptr size) returns the offset of the
+  // first (at least partially) poisoned byte in the range, or -1 if the whole
+  // range is good.
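+  //
+  // For example (illustrative): immediately after the resize(100) calls
+  // above, __msan_test_shadow(&v[0], sizeof(int)) should report offset 0,
+  // because no byte of the uninitialized element has been written, while the
+  // value-initialized element of |w| is fully initialized and the call
+  // returns -1. That is exactly what the loop below asserts for every
+  // element.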
+  for (size_t i = 0; i < 100; ++i) {
+    EXPECT_EQ(__msan_test_shadow(&v[i], sizeof(int)), 0);
+    EXPECT_EQ(__msan_test_shadow(&w[i], sizeof(int)), -1);
+  }
+#endif
+}
+
+TEST(VectorTest, Constructor) {
+  Vector<Foo> v;
+  EXPECT_TRUE(v.resize(100));
+  for (const Foo& foo : v) {
+    EXPECT_EQ(foo.x(), 38);
+  }
+}
+
+TEST(VectorTest, PushBack) {
+  // Create a vector containing integers
+  Vector<int> v;
+  EXPECT_TRUE(v.reserve(8));
+  EXPECT_EQ(v.size(), 0);
+
+  EXPECT_TRUE(v.push_back(25));
+  EXPECT_EQ(v.size(), 1);
+  EXPECT_EQ(v[0], 25);
+
+  EXPECT_TRUE(v.push_back(13));
+  EXPECT_EQ(v.size(), 2);
+  EXPECT_EQ(v[0], 25);
+  EXPECT_EQ(v[1], 13);
+}
+
+TEST(VectorTest, PushBackUnchecked) {
+  Vector<std::unique_ptr<Point>> v;
+  EXPECT_TRUE(v.reserve(2));
+  EXPECT_EQ(v.size(), 0);
+
+  std::unique_ptr<Point> point(new (std::nothrow) Point(1, 2));
+  EXPECT_NE(point, nullptr);
+  v.push_back_unchecked(std::move(point));
+  EXPECT_EQ(v.size(), 1);
+  EXPECT_EQ(v[0]->x(), 1);
+  EXPECT_EQ(v[0]->y(), 2);
+
+  point.reset(new (std::nothrow) Point(3, 4));
+  EXPECT_NE(point, nullptr);
+  v.push_back_unchecked(std::move(point));
+  EXPECT_EQ(v.size(), 2);
+  EXPECT_EQ(v[0]->x(), 1);
+  EXPECT_EQ(v[0]->y(), 2);
+  EXPECT_EQ(v[1]->x(), 3);
+  EXPECT_EQ(v[1]->y(), 4);
+}
+
+TEST(VectorTest, EmplaceBack) {
+  Vector<Point> v;
+  EXPECT_EQ(v.size(), 0);
+
+  EXPECT_TRUE(v.emplace_back(1, 2));
+  EXPECT_EQ(v.size(), 1);
+  EXPECT_EQ(v[0].x(), 1);
+  EXPECT_EQ(v[0].y(), 2);
+
+  EXPECT_TRUE(v.emplace_back(3, 4));
+  EXPECT_EQ(v.size(), 2);
+  EXPECT_EQ(v[0].x(), 1);
+  EXPECT_EQ(v[0].y(), 2);
+  EXPECT_EQ(v[1].x(), 3);
+  EXPECT_EQ(v[1].y(), 4);
+}
+
+// Copy constructor and assignment are deleted, but move constructor and
+// assignment are OK.
+TEST(VectorTest, Move) {
+  Vector<int> ints1;
+  EXPECT_TRUE(ints1.reserve(4));
+  EXPECT_TRUE(ints1.push_back(2));
+  EXPECT_TRUE(ints1.push_back(3));
+  EXPECT_TRUE(ints1.push_back(5));
+  EXPECT_TRUE(ints1.push_back(7));
+
+  // Move constructor.
+  Vector<int> ints2(std::move(ints1));
+  EXPECT_EQ(ints2.size(), 4);
+  EXPECT_EQ(ints2[0], 2);
+  EXPECT_EQ(ints2[1], 3);
+  EXPECT_EQ(ints2[2], 5);
+  EXPECT_EQ(ints2[3], 7);
+
+  // Move assignment.
+  Vector<int> ints3;
+  EXPECT_TRUE(ints3.reserve(1));
+  EXPECT_TRUE(ints3.push_back(11));
+  ints3 = std::move(ints2);
+  EXPECT_EQ(ints3.size(), 4);
+  EXPECT_EQ(ints3[0], 2);
+  EXPECT_EQ(ints3[1], 3);
+  EXPECT_EQ(ints3[2], 5);
+  EXPECT_EQ(ints3[3], 7);
+}
+
+TEST(VectorTest, Erase) {
+  Vector<int> ints;
+  EXPECT_TRUE(ints.reserve(4));
+  EXPECT_TRUE(ints.push_back(2));
+  EXPECT_TRUE(ints.push_back(3));
+  EXPECT_TRUE(ints.push_back(5));
+  EXPECT_TRUE(ints.push_back(7));
+
+  EXPECT_EQ(ints.size(), 4);
+  EXPECT_EQ(ints[0], 2);
+  EXPECT_EQ(ints[1], 3);
+  EXPECT_EQ(ints[2], 5);
+  EXPECT_EQ(ints[3], 7);
+
+  ints.erase(ints.begin());
+  EXPECT_EQ(ints.size(), 3);
+  EXPECT_EQ(ints[0], 3);
+  EXPECT_EQ(ints[1], 5);
+  EXPECT_EQ(ints[2], 7);
+}
+
+TEST(VectorTest, EraseNonTrivial) {
+  // A simple class that sets an int value to 0 in the destructor.
+  class Cleaner {
+   public:
+    explicit Cleaner(int* value) : value_(value) {}
+    ~Cleaner() { *value_ = 0; }
+
+    int value() const { return *value_; }
+
+   private:
+    int* value_;
+  };
+  int value1 = 100;
+  int value2 = 200;
+  Vector<std::unique_ptr<Cleaner>> v;
+  EXPECT_TRUE(v.reserve(2));
+  EXPECT_EQ(v.capacity(), 2);
+
+  std::unique_ptr<Cleaner> c(new (std::nothrow) Cleaner(&value1));
+  EXPECT_NE(c, nullptr);
+  EXPECT_TRUE(v.push_back(std::move(c)));
+  c.reset(new (std::nothrow) Cleaner(&value2));
+  EXPECT_NE(c, nullptr);
+  EXPECT_TRUE(v.push_back(std::move(c)));
+  EXPECT_EQ(v.size(), 2);
+  EXPECT_EQ(value1, 100);
+  EXPECT_EQ(value2, 200);
+
+  v.erase(v.begin());
+  EXPECT_EQ(v.size(), 1);
+  EXPECT_EQ(v.capacity(), 2);
+  EXPECT_EQ(value1, 0);
+  EXPECT_EQ(value2, 200);
+  EXPECT_EQ(v[0].get()->value(), value2);
+
+  EXPECT_TRUE(v.shrink_to_fit());
+  EXPECT_EQ(v.size(), 1);
+  EXPECT_EQ(v.capacity(), 1);
+  EXPECT_EQ(value2, 200);
+  EXPECT_EQ(v[0].get()->value(), value2);
+
+  v.clear();
+  EXPECT_TRUE(v.empty());
+  EXPECT_EQ(value2, 0);
+}
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/version.cc b/src/version.cc
new file mode 100644
index 0000000..8d1e5a9
--- /dev/null
+++ b/src/version.cc
@@ -0,0 +1,39 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/gav1/version.h"
+
+#define LIBGAV1_TOSTRING(x) #x
+#define LIBGAV1_STRINGIFY(x) LIBGAV1_TOSTRING(x)
+#define LIBGAV1_DOT_SEPARATED(M, m, p) M##.##m##.##p
+#define LIBGAV1_DOT_SEPARATED_VERSION(M, m, p) LIBGAV1_DOT_SEPARATED(M, m, p)
+#define LIBGAV1_DOT_VERSION \
+  LIBGAV1_DOT_SEPARATED_VERSION(LIBGAV1_MAJOR_VERSION, LIBGAV1_MINOR_VERSION, \
+                                LIBGAV1_PATCH_VERSION)
+
+#define LIBGAV1_VERSION_STRING LIBGAV1_STRINGIFY(LIBGAV1_DOT_VERSION)
+
+extern "C" {
+
+int Libgav1GetVersion() { return LIBGAV1_VERSION; }
+const char* Libgav1GetVersionString() { return LIBGAV1_VERSION_STRING; }
+
+const char* Libgav1GetBuildConfiguration() {
+  // TODO(jzern): cmake can generate the detail or in other cases we could
+  // produce one based on the known defines along with the defaults based on
+  // the toolchain, e.g., LIBGAV1_ENABLE_NEON from cpu.h.
+  return "Not available.";
+}
+
+}  // extern "C"
diff --git a/src/version_test.cc b/src/version_test.cc
new file mode 100644
index 0000000..aaa5e1c
--- /dev/null
+++ b/src/version_test.cc
@@ -0,0 +1,66 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
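+
+// Worked example of the stringification in src/version.cc (the version
+// numbers here are illustrative, not asserted by this test): with
+// LIBGAV1_MAJOR_VERSION=0, LIBGAV1_MINOR_VERSION=17 and
+// LIBGAV1_PATCH_VERSION=0, token pasting in LIBGAV1_DOT_SEPARATED produces
+// the preprocessing number 0.17.0, and LIBGAV1_STRINGIFY then yields the
+// string literal "0.17.0" whose shape is validated below.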
+
+#include "src/gav1/version.h"
+
+#include <regex>  // NOLINT (unapproved c++11 header)
+
+#include "gtest/gtest.h"
+
+namespace libgav1 {
+namespace {
+
+TEST(VersionTest, GetVersion) {
+  const int library_version = GetVersion();
+  EXPECT_EQ((library_version >> 24) & 0xff, 0);
+  // Note if we link against a shared object there's potential for a mismatch
+  // if a different library is loaded at runtime.
+  EXPECT_EQ((library_version >> 16) & 0xff, LIBGAV1_MAJOR_VERSION);
+  EXPECT_EQ((library_version >> 8) & 0xff, LIBGAV1_MINOR_VERSION);
+  EXPECT_EQ(library_version & 0xff, LIBGAV1_PATCH_VERSION);
+
+  const int header_version = LIBGAV1_VERSION;
+  EXPECT_EQ((header_version >> 24) & 0xff, 0);
+  EXPECT_EQ((header_version >> 16) & 0xff, LIBGAV1_MAJOR_VERSION);
+  EXPECT_EQ((header_version >> 8) & 0xff, LIBGAV1_MINOR_VERSION);
+  EXPECT_EQ(header_version & 0xff, LIBGAV1_PATCH_VERSION);
+}
+
+TEST(VersionTest, GetVersionString) {
+  const char* version = GetVersionString();
+  ASSERT_NE(version, nullptr);
+  // https://semver.org/#is-there-a-suggested-regular-expression-regex-to-check-a-semver-string
+  const std::regex semver_regex(
+      R"(^(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*))"
+      R"((?:-((?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))"
+      R"((?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?)"
+      R"((?:\+([0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?$)");
+
+  EXPECT_TRUE(std::regex_match(version, semver_regex)) << version;
+  // Regex validation:
+  // It shouldn't accept a version starting with a non-digit.
+  version = "v1.2.3";
+  EXPECT_FALSE(std::regex_match(version, semver_regex)) << version;
+  // It shouldn't accept a version with spaces.
+  version = "1.2.3 alpha";
+  EXPECT_FALSE(std::regex_match(version, semver_regex)) << version;
+}
+
+TEST(VersionTest, GetBuildConfiguration) {
+  const char* config = GetBuildConfiguration();
+  ASSERT_NE(config, nullptr);
+}
+
+}  // namespace
+}  // namespace libgav1
diff --git a/src/warp_prediction.cc b/src/warp_prediction.cc
new file mode 100644
index 0000000..69b40e8
--- /dev/null
+++ b/src/warp_prediction.cc
@@ -0,0 +1,240 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
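+
+// Worked example of the approximate division implemented below (an
+// illustration derived from the table, not normative text from the spec):
+// for value = 3, FloorLog2(3) = 1 and the fractional remainder e = 1 maps to
+// entry 1 << (8 - 1) = 128, so division_factor = kDivisorLookup[128] = 10923
+// and division_shift = 1 + 14 = 15. x / 3 is then approximated by
+// (x * 10923) >> 15, since 10923 / 32768 = 0.33334...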
+
+#include "src/warp_prediction.h"
+
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/tile.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int kWarpModelTranslationClamp = 1 << 23;
+constexpr int kWarpModelAffineClamp = 1 << 13;
+constexpr int kLargestMotionVectorDiff = 256;
+
+constexpr uint16_t kDivisorLookup[257] = {
+    16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768,
+    15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142,
+    15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564,
+    14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028,
+    13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530,
+    13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066,
+    13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633,
+    12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228,
+    12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848,
+    11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491,
+    11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155,
+    11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838,
+    10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538,
+    10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255,
+    10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010,  9986,
+     9963,  9939,  9916,  9892,  9869,  9846,  9823,  9800,  9777,  9754,  9732,
+     9709,  9687,  9664,  9642,  9620,  9598,  9576,  9554,  9533,  9511,  9489,
+     9468,  9447,  9425,  9404,  9383,  9362,  9341,  9321,  9300,  9279,  9259,
+     9239,  9218,  9198,  9178,  9158,  9138,  9118,  9098,  9079,  9059,  9039,
+     9020,  9001,  8981,  8962,  8943,  8924,  8905,  8886,  8867,  8849,  8830,
+     8812,  8793,  8775,  8756,  8738,  8720,  8702,  8684,  8666,  8648,  8630,
+     8613,  8595,  8577,  8560,  8542,  8525,  8508,  8490,  8473,  8456,  8439,
+     8422,  8405,  8389,  8372,  8355,  8339,  8322,  8306,  8289,  8273,  8257,
+     8240,  8224,  8208,  8192};
+
+// Number of fractional bits of lookup in divisor lookup table.
+constexpr int kDivisorLookupBits = 8;
+// Number of fractional bits of entries in divisor lookup table.
+constexpr int kDivisorLookupPrecisionBits = 14;
+
+// 7.11.3.7.
+template <typename T>
+void GenerateApproximateDivisor(T value, int16_t* division_factor,
+                                int16_t* division_shift) {
+  const int n = FloorLog2(std::abs(value));
+  const T e = std::abs(value) - (static_cast<T>(1) << n);
+  const int entry = (n > kDivisorLookupBits)
+                        ? RightShiftWithRounding(e, n - kDivisorLookupBits)
+                        : static_cast<int>(e << (kDivisorLookupBits - n));
+  *division_shift = n + kDivisorLookupPrecisionBits;
+  *division_factor =
+      (value < 0) ? -kDivisorLookup[entry] : kDivisorLookup[entry];
+}
+
+// 7.11.3.8.
+int LeastSquareProduct(int a, int b) { return ((a * b) >> 2) + a + b; }
+
+// 7.11.3.8.
+int DiagonalClamp(int32_t value) {
+  return Clip3(value,
+               (1 << kWarpedModelPrecisionBits) - kWarpModelAffineClamp + 1,
+               (1 << kWarpedModelPrecisionBits) + kWarpModelAffineClamp - 1);
+}
+
+// 7.11.3.8.
+int NonDiagonalClamp(int32_t value) {
+  return Clip3(value, -kWarpModelAffineClamp + 1, kWarpModelAffineClamp - 1);
+}
+
+int16_t GetShearParameter(int value) {
+  return static_cast<int16_t>(
+      LeftShift(RightShiftWithRoundingSigned(
+                    Clip3(value, INT16_MIN, INT16_MAX), kWarpParamRoundingBits),
+                kWarpParamRoundingBits));
+}
+
+}  // namespace
+
+bool SetupShear(GlobalMotion* const warp_params) {
+  int16_t division_shift;
+  int16_t division_factor;
+  const auto* const params = warp_params->params;
+  GenerateApproximateDivisor(params[2], &division_factor, &division_shift);
+  const int alpha = params[2] - (1 << kWarpedModelPrecisionBits);
+  const int beta = params[3];
+  const int64_t v = LeftShift(params[4], kWarpedModelPrecisionBits);
+  const int gamma =
+      RightShiftWithRoundingSigned(v * division_factor, division_shift);
+  const int64_t w = static_cast<int64_t>(params[3]) * params[4];
+  const int delta =
+      params[5] -
+      RightShiftWithRoundingSigned(w * division_factor, division_shift) -
+      (1 << kWarpedModelPrecisionBits);
+
+  warp_params->alpha = GetShearParameter(alpha);
+  warp_params->beta = GetShearParameter(beta);
+  warp_params->gamma = GetShearParameter(gamma);
+  warp_params->delta = GetShearParameter(delta);
+  if ((4 * std::abs(warp_params->alpha) + 7 * std::abs(warp_params->beta) >=
+       (1 << kWarpedModelPrecisionBits)) ||
+      (4 * std::abs(warp_params->gamma) + 4 * std::abs(warp_params->delta) >=
+       (1 << kWarpedModelPrecisionBits))) {
+    return false;  // NOLINT (easier condition to understand).
+  }
+
+  return true;
+}
+
+bool WarpEstimation(const int num_samples, const int block_width4x4,
+                    const int block_height4x4, const int row4x4,
+                    const int column4x4, const MotionVector& mv,
+                    const int candidates[kMaxLeastSquaresSamples][4],
+                    GlobalMotion* const warp_params) {
+  // |a| fits into int32_t. To avoid cast to int64_t in the following
+  // computation, we declare |a| as int64_t.
+  int64_t a[2][2] = {};
+  int bx[2] = {};
+  int by[2] = {};
+
+  // Note: for simplicity, the spec always uses absolute coordinates
+  // in the warp estimation process. subpixel_mid_x, subpixel_mid_y,
+  // and candidates are relative to the top left of the frame.
+  // In contrast, libaom uses a mixture of coordinate systems. In
+  // av1/common/warped_motion.c:find_affine_int(), the coordinates are
+  // relative to the top left of the block.
+  // mid_y/mid_x: the row/column coordinate of the center of the block.
+  const int mid_y = MultiplyBy4(row4x4) + MultiplyBy2(block_height4x4) - 1;
+  const int mid_x = MultiplyBy4(column4x4) + MultiplyBy2(block_width4x4) - 1;
+  const int subpixel_mid_y = MultiplyBy8(mid_y);
+  const int subpixel_mid_x = MultiplyBy8(mid_x);
+  const int reference_subpixel_mid_y = subpixel_mid_y + mv.mv[0];
+  const int reference_subpixel_mid_x = subpixel_mid_x + mv.mv[1];
+
+  for (int i = 0; i < num_samples; ++i) {
+    // candidates[][0] and candidates[][1] are the row/column coordinates of
+    // the sample point in this block, relative to the top left of the frame.
+    // candidates[][2] and candidates[][3] are the row/column coordinates of
+    // the sample point in the reference block, relative to the top left of
+    // the frame.
+    // sy/sx: the row/column coordinates of the sample point, with center of
+    // the block as origin.
+    const int sy = candidates[i][0] - subpixel_mid_y;
+    const int sx = candidates[i][1] - subpixel_mid_x;
+    // dy/dx: the row/column coordinates of the sample point in the reference
+    // block, with center of the reference block as origin.
+    const int dy = candidates[i][2] - reference_subpixel_mid_y;
+    const int dx = candidates[i][3] - reference_subpixel_mid_x;
+    if (std::abs(sx - dx) < kLargestMotionVectorDiff &&
+        std::abs(sy - dy) < kLargestMotionVectorDiff) {
+      a[0][0] += LeastSquareProduct(sx, sx) + 8;
+      a[0][1] += LeastSquareProduct(sx, sy) + 4;
+      a[1][1] += LeastSquareProduct(sy, sy) + 8;
+      bx[0] += LeastSquareProduct(sx, dx) + 8;
+      bx[1] += LeastSquareProduct(sy, dx) + 4;
+      by[0] += LeastSquareProduct(sx, dy) + 4;
+      by[1] += LeastSquareProduct(sy, dy) + 8;
+    }
+  }
+
+  // a[0][1] == a[1][0], because the matrix is symmetric. We don't have to
+  // compute a[1][0].
+  const int64_t determinant = a[0][0] * a[1][1] - a[0][1] * a[0][1];
+  if (determinant == 0) return false;
+
+  int16_t division_shift;
+  int16_t division_factor;
+  GenerateApproximateDivisor(determinant, &division_factor, &division_shift);
+
+  division_shift -= kWarpedModelPrecisionBits;
+
+  const int64_t params_2 = a[1][1] * bx[0] - a[0][1] * bx[1];
+  const int64_t params_3 = -a[0][1] * bx[0] + a[0][0] * bx[1];
+  const int64_t params_4 = a[1][1] * by[0] - a[0][1] * by[1];
+  const int64_t params_5 = -a[0][1] * by[0] + a[0][0] * by[1];
+  auto* const params = warp_params->params;
+
+  if (division_shift <= 0) {
+    division_factor <<= -division_shift;
+    params[2] = static_cast<int>(params_2) * division_factor;
+    params[3] = static_cast<int>(params_3) * division_factor;
+    params[4] = static_cast<int>(params_4) * division_factor;
+    params[5] = static_cast<int>(params_5) * division_factor;
+  } else {
+    params[2] = RightShiftWithRoundingSigned(params_2 * division_factor,
+                                             division_shift);
+    params[3] = RightShiftWithRoundingSigned(params_3 * division_factor,
+                                             division_shift);
+    params[4] = RightShiftWithRoundingSigned(params_4 * division_factor,
+                                             division_shift);
+    params[5] = RightShiftWithRoundingSigned(params_5 * division_factor,
+                                             division_shift);
+  }
+
+  params[2] = DiagonalClamp(params[2]);
+  params[3] = NonDiagonalClamp(params[3]);
+  params[4] = NonDiagonalClamp(params[4]);
+  params[5] = DiagonalClamp(params[5]);
+
+  const int vx = mv.mv[1] * (1 << (kWarpedModelPrecisionBits - 3)) -
+                 (mid_x * (params[2] - (1 << kWarpedModelPrecisionBits)) +
+                  mid_y * params[3]);
+  const int vy = mv.mv[0] * (1 << (kWarpedModelPrecisionBits - 3)) -
+                 (mid_x * params[4] +
+                  mid_y * (params[5] - (1 << kWarpedModelPrecisionBits)));
+  params[0] =
+      Clip3(vx, -kWarpModelTranslationClamp, kWarpModelTranslationClamp - 1);
+  params[1] =
+      Clip3(vy, -kWarpModelTranslationClamp, kWarpModelTranslationClamp - 1);
+
+  params[6] = 0;
+  params[7] = 0;
+  return true;
+}
+
+}  // namespace libgav1
diff --git a/src/warp_prediction.h b/src/warp_prediction.h
new file mode 100644
index 0000000..6c86df3
--- /dev/null
+++ b/src/warp_prediction.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_WARP_PREDICTION_H_
+#define LIBGAV1_SRC_WARP_PREDICTION_H_
+
+#include "src/obu_parser.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+// Sets the alpha, beta, gamma, delta fields in warp_params using the
+// warp_params->params array as input (only array entries at indexes 2, 3, 4,
+// 5 are used). Returns whether alpha, beta, gamma, delta are valid.
+bool SetupShear(GlobalMotion* warp_params);  // 7.11.3.6.
+
+// Computes local warp parameters by performing a least square fit.
+// Returns whether the computed parameters are valid.
+bool WarpEstimation(int num_samples, int block_width4x4, int block_height4x4,
+                    int row4x4, int column4x4, const MotionVector& mv,
+                    const int candidates[kMaxLeastSquaresSamples][4],
+                    GlobalMotion* warp_params);  // 7.11.3.8.
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_WARP_PREDICTION_H_
diff --git a/src/warp_prediction_test.cc b/src/warp_prediction_test.cc
new file mode 100644
index 0000000..46f262f
--- /dev/null
+++ b/src/warp_prediction_test.cc
@@ -0,0 +1,246 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/warp_prediction.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <ostream>
+
+#include "absl/base/macros.h"
+#include "gtest/gtest.h"
+#include "src/obu_parser.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+#include "tests/third_party/libvpx/acm_random.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int16_t kExpectedWarpParamsOutput[10][4] = {
+    {0, 0, 0, 0},
+    {2880, 2880, 2752, 2752},
+    {-1408, -1408, -1472, -1472},
+    {0, 0, 0, 0},
+    {6784, 6784, 6144, 6144},  // Invalid.
+    {-5312, -5312, -5824, -5824},
+    {-3904, -3904, -4160, -4160},
+    {2496, 2496, 2368, 2368},
+    {1024, 1024, 1024, 1024},
+    {-7808, -7808, -8832, -8832},  // Invalid.
+};
+
+constexpr bool kExpectedWarpValid[10] = {
+    true, true, true, true, false, true, true, true, true, false,
+};
+
+int RandomWarpedParam(int seed_offset, int bits) {
+  libvpx_test::ACMRandom rnd(seed_offset +
+                             libvpx_test::ACMRandom::DeterministicSeed());
+  // 1 in 8 chance of generating zero (arbitrary).
+  const bool zero = (rnd.Rand16() & 7) == 0;
+  if (zero) return 0;
+  // Generate uniform values in the range [-(1 << bits), -1] U [1, 1 << bits].
+  const int mask = (1 << bits) - 1;
+  const int value = 1 + (rnd.RandRange(1U << 31) & mask);
+  const bool sign = (rnd.Rand16() & 1) != 0;
+  return sign ? value : -value;
+}
+
+void GenerateWarpedModel(GlobalMotion* warp_params, int seed) {
+  do {
+    warp_params->params[0] =
+        RandomWarpedParam(seed, kWarpedModelPrecisionBits + 6);
+    warp_params->params[1] =
+        RandomWarpedParam(seed, kWarpedModelPrecisionBits + 6);
+    warp_params->params[2] =
+        RandomWarpedParam(seed, kWarpedModelPrecisionBits - 3) +
+        (1 << kWarpedModelPrecisionBits);
+    warp_params->params[3] =
+        RandomWarpedParam(seed, kWarpedModelPrecisionBits - 3);
+    warp_params->params[4] =
+        RandomWarpedParam(seed, kWarpedModelPrecisionBits - 3);
+    warp_params->params[5] =
+        RandomWarpedParam(seed, kWarpedModelPrecisionBits - 3) +
+        (1 << kWarpedModelPrecisionBits);
+  } while (warp_params->params[2] == 0);
+}
+
+TEST(WarpPredictionTest, SetupShear) {
+  for (size_t i = 0; i < ABSL_ARRAYSIZE(kExpectedWarpParamsOutput); ++i) {
+    GlobalMotion warp_params;
+    GenerateWarpedModel(&warp_params, static_cast<int>(i));
+    const bool warp_valid = SetupShear(&warp_params);
+
+    SCOPED_TRACE(testing::Message() << "Test failure at iteration: " << i);
+    EXPECT_EQ(warp_valid, kExpectedWarpValid[i]);
+    EXPECT_EQ(warp_params.alpha, kExpectedWarpParamsOutput[i][0]);
+    EXPECT_EQ(warp_params.beta, kExpectedWarpParamsOutput[i][1]);
+    EXPECT_EQ(warp_params.gamma, kExpectedWarpParamsOutput[i][2]);
+    EXPECT_EQ(warp_params.delta, kExpectedWarpParamsOutput[i][3]);
+  }
+
+  // Test signed shift behavior in delta and gamma generation.
+  GlobalMotion warp_params;
+  warp_params.params[0] = 24748;
+  warp_params.params[1] = -142530;
+  warp_params.params[2] = 65516;
+  warp_params.params[3] = -640;
+  warp_params.params[4] = 256;
+  warp_params.params[5] = 65310;
+  EXPECT_TRUE(SetupShear(&warp_params));
+  EXPECT_EQ(warp_params.alpha, 0);
+  EXPECT_EQ(warp_params.beta, -640);
+  EXPECT_EQ(warp_params.gamma, 256);
+  EXPECT_EQ(warp_params.delta, -192);
+
+  warp_params.params[0] = 24748;
+  warp_params.params[1] = -142530;
+  warp_params.params[2] = 61760;
+  warp_params.params[3] = -640;
+  warp_params.params[4] = -13312;
+  warp_params.params[5] = 65310;
+  EXPECT_TRUE(SetupShear(&warp_params));
+  EXPECT_EQ(warp_params.alpha, -3776);
+  EXPECT_EQ(warp_params.beta, -640);
+  EXPECT_EQ(warp_params.gamma, -14144);
+  EXPECT_EQ(warp_params.delta, -384);
+}
+
+struct WarpInputParam {
+  WarpInputParam(int num_samples, int block_width4x4, int block_height4x4)
+      : num_samples(num_samples),
+        block_width4x4(block_width4x4),
+        block_height4x4(block_height4x4) {}
+  int num_samples;
+  int block_width4x4;
+  int block_height4x4;
+};
+
+std::ostream& operator<<(std::ostream& os, const WarpInputParam& param) {
+  return os << "num_samples: " << param.num_samples
+            << ", block_(width/height)4x4: " << param.block_width4x4 << "x"
+            << param.block_height4x4;
+}
+
+const WarpInputParam warp_test_param[] = {
+    // sample = 1.
+    WarpInputParam(1, 1, 1),
+    WarpInputParam(1, 1, 2),
+    WarpInputParam(1, 2, 1),
+    WarpInputParam(1, 2, 2),
+    WarpInputParam(1, 2, 4),
+    WarpInputParam(1, 4, 2),
+    WarpInputParam(1, 4, 4),
+    WarpInputParam(1, 4, 8),
+    WarpInputParam(1, 8, 4),
+    WarpInputParam(1, 8, 8),
+    WarpInputParam(1, 8, 16),
+    WarpInputParam(1, 16, 8),
+    WarpInputParam(1, 16, 16),
+    WarpInputParam(1, 16, 32),
+    WarpInputParam(1, 32, 16),
+    WarpInputParam(1, 32, 32),
+    // sample = 8.
+    WarpInputParam(8, 1, 1),
+    WarpInputParam(8, 1, 2),
+    WarpInputParam(8, 2, 1),
+    WarpInputParam(8, 2, 2),
+    WarpInputParam(8, 2, 4),
+    WarpInputParam(8, 4, 2),
+    WarpInputParam(8, 4, 4),
+    WarpInputParam(8, 4, 8),
+    WarpInputParam(8, 8, 4),
+    WarpInputParam(8, 8, 8),
+    WarpInputParam(8, 8, 16),
+    WarpInputParam(8, 16, 8),
+    WarpInputParam(8, 16, 16),
+    WarpInputParam(8, 16, 32),
+    WarpInputParam(8, 32, 16),
+    WarpInputParam(8, 32, 32),
+};
+
+constexpr bool kExpectedWarpEstimationValid[2] = {false, true};
+
+constexpr int kExpectedWarpEstimationOutput[16][6] = {
+    {8388607, 8388607, 57345, -8191, -8191, 57345},
+    {8388607, 8388607, 57345, -8191, -8191, 57345},
+    {8388607, 8388607, 57345, -8191, -8191, 57345},
+    {8388607, 8388607, 57345, -8191, -8191, 57345},
+    {8388607, 8388607, 57345, -8191, -8191, 57345},
+    {8388607, 8388607, 57345, -8191, -8191, 57345},
+    {8388607, 8388607, 57345, -8191, -8191, 57345},
+    {8388607, 8388607, 57345, -8191, -8191, 57345},
+    {8388607, 8388607, 57345, -8191, -8191, 57345},
+    {8388607, 8388607, 57345, -8191, -8191, 57345},
+    {2146296, 1589240, 57345, 8191, -8191, 73727},
+    {1753128, 1196072, 73727, -8191, 8191, 57345},
+    {-8388608, -8388608, 73727, 8191, 8191, 73727},
+    {-4435485, -8388608, 65260, 8191, 8191, 73727},
+    {-8388608, -7552929, 73727, 8191, 8191, 68240},
+    {-8388608, -8388608, 73727, 8191, 8191, 70800},
+};
+
+class WarpEstimationTest : public testing::TestWithParam<WarpInputParam> {
+ public:
+  WarpEstimationTest() = default;
+  ~WarpEstimationTest() override = default;
+
+ protected:
+  WarpInputParam param_ = GetParam();
+};
+
+TEST_P(WarpEstimationTest, WarpEstimation) {
+  // Set input params.
+  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+  const int row4x4 = rnd.Rand8();
+  const int column4x4 = rnd.Rand8();
+  MotionVector mv;
+  mv.mv[0] = rnd.Rand8();
+  mv.mv[1] = rnd.Rand8();
+  int candidates[kMaxLeastSquaresSamples][4];
+  for (int i = 0; i < param_.num_samples; ++i) {
+    // Make candidates relative to the top left of frame.
+    candidates[i][0] = rnd.Rand8() + MultiplyBy32(row4x4);
+    candidates[i][1] = rnd.Rand8() + MultiplyBy32(column4x4);
+    candidates[i][2] = rnd.Rand8() + MultiplyBy32(row4x4);
+    candidates[i][3] = rnd.Rand8() + MultiplyBy32(column4x4);
+  }
+
+  // Get output.
+  GlobalMotion warp_params;
+  const bool warp_success = WarpEstimation(
+      param_.num_samples, param_.block_width4x4, param_.block_height4x4,
+      row4x4, column4x4, mv, candidates, &warp_params);
+  if (param_.num_samples == 1) {
+    EXPECT_EQ(warp_success, kExpectedWarpEstimationValid[0]);
+  } else {
+    EXPECT_EQ(warp_success, kExpectedWarpEstimationValid[1]);
+    int index = FloorLog2(param_.block_width4x4) * 3 - 1;
+    if (param_.block_width4x4 == param_.block_height4x4) {
+      index += 1;
+    } else if (param_.block_width4x4 < param_.block_height4x4) {
+      index += 2;
+    }
+    for (size_t i = 0; i < ABSL_ARRAYSIZE(warp_params.params); ++i) {
+      EXPECT_EQ(warp_params.params[i],
+                kExpectedWarpEstimationOutput[index][i]);
+    }
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(WarpFuncTest, WarpEstimationTest,
+                         testing::ValuesIn(warp_test_param));
+}  // namespace
+}  // namespace libgav1
diff --git a/src/yuv_buffer.cc b/src/yuv_buffer.cc
new file mode 100644
index 0000000..efb8016
--- /dev/null
+++ b/src/yuv_buffer.cc
@@ -0,0 +1,256 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/yuv_buffer.h"
+
+#include <cassert>
+#include <cstddef>
+#include <new>
+
+#include "src/frame_buffer_utils.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+
+// Size conventions:
+// * Widths, heights, and border sizes are in pixels.
+// * Strides and plane sizes are in bytes.
+//
+// YuvBuffer objects may be reused through the BufferPool. Realloc() must
+// assume that data members (except buffer_alloc_ and buffer_alloc_size_) may
+// contain stale values from the previous use, and must set all data members
+// from scratch. In particular, Realloc() must not rely on the initial values
+// of data members set by the YuvBuffer constructor.
+bool YuvBuffer::Realloc(int bitdepth, bool is_monochrome, int width, int height,
+                        int8_t subsampling_x, int8_t subsampling_y,
+                        int left_border, int right_border, int top_border,
+                        int bottom_border,
+                        GetFrameBufferCallback get_frame_buffer,
+                        void* callback_private_data,
+                        void** buffer_private_data) {
+  // Only support allocating buffers that have borders that are a multiple of
+  // 2. The border restriction is required because we may subsample the
+  // borders in the chroma planes.
+  if (((left_border | right_border | top_border | bottom_border) & 1) != 0) {
+    LIBGAV1_DLOG(ERROR,
+                 "Borders must be a multiple of 2: left_border = %d, "
+                 "right_border = %d, top_border = %d, bottom_border = %d.",
+                 left_border, right_border, top_border, bottom_border);
+    return false;
+  }
+
+  // Every row in the plane buffers needs to be kFrameBufferRowAlignment-byte
+  // aligned. Since the strides are multiples of kFrameBufferRowAlignment
+  // bytes, it suffices to just make the plane buffers
+  // kFrameBufferRowAlignment-byte aligned.
+  const int plane_align = kFrameBufferRowAlignment;
+  const int uv_width =
+      is_monochrome ? 0 : SubsampledValue(width, subsampling_x);
+  const int uv_height =
+      is_monochrome ? 0 : SubsampledValue(height, subsampling_y);
+  const int uv_left_border = is_monochrome ? 0 : left_border >> subsampling_x;
+  const int uv_right_border =
+      is_monochrome ? 0 : right_border >> subsampling_x;
+  const int uv_top_border = is_monochrome ? 0 : top_border >> subsampling_y;
+  const int uv_bottom_border =
+      is_monochrome ? 0 : bottom_border >> subsampling_y;
+
+  if (get_frame_buffer != nullptr) {
+    assert(buffer_private_data != nullptr);
+
+    const Libgav1ImageFormat image_format =
+        ComposeImageFormat(is_monochrome, subsampling_x, subsampling_y);
+    FrameBuffer frame_buffer;
+    if (get_frame_buffer(callback_private_data, bitdepth, image_format, width,
+                         height, left_border, right_border, top_border,
+                         bottom_border, kFrameBufferRowAlignment,
+                         &frame_buffer) != kStatusOk) {
+      return false;
+    }
+
+    if (frame_buffer.plane[0] == nullptr ||
+        (!is_monochrome && frame_buffer.plane[1] == nullptr) ||
+        (!is_monochrome && frame_buffer.plane[2] == nullptr)) {
+      assert(false && "The get_frame_buffer callback malfunctioned.");
+      LIBGAV1_DLOG(ERROR, "The get_frame_buffer callback malfunctioned.");
+      return false;
+    }
+
+    stride_[kPlaneY] = frame_buffer.stride[0];
+    stride_[kPlaneU] = frame_buffer.stride[1];
+    stride_[kPlaneV] = frame_buffer.stride[2];
+    buffer_[kPlaneY] = frame_buffer.plane[0];
+    buffer_[kPlaneU] = frame_buffer.plane[1];
+    buffer_[kPlaneV] = frame_buffer.plane[2];
+    *buffer_private_data = frame_buffer.private_data;
+  } else {
+    assert(callback_private_data == nullptr);
+    assert(buffer_private_data == nullptr);
+
+    // Calculate y_stride (in bytes). It is padded to a multiple of
+    // kFrameBufferRowAlignment bytes.
+    int y_stride = width + left_border + right_border;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    if (bitdepth > 8) y_stride *= sizeof(uint16_t);
+#endif
+    y_stride = Align(y_stride, kFrameBufferRowAlignment);
+    // Size of the Y plane in bytes.
+    const uint64_t y_plane_size = (height + top_border + bottom_border) *
+                                      static_cast<uint64_t>(y_stride) +
+                                  (plane_align - 1);
+
+    // Calculate uv_stride (in bytes). It is padded to a multiple of
+    // kFrameBufferRowAlignment bytes.
+    int uv_stride = uv_width + uv_left_border + uv_right_border;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    if (bitdepth > 8) uv_stride *= sizeof(uint16_t);
+#endif
+    uv_stride = Align(uv_stride, kFrameBufferRowAlignment);
+    // Size of the U or V plane in bytes.
+    const uint64_t uv_plane_size =
+        is_monochrome ? 0
+                      : (uv_height + uv_top_border + uv_bottom_border) *
+                                static_cast<uint64_t>(uv_stride) +
+                            (plane_align - 1);
+
+    // Allocate unaligned y_buffer, u_buffer, and v_buffer.
+    uint8_t* y_buffer = nullptr;
+    uint8_t* u_buffer = nullptr;
+    uint8_t* v_buffer = nullptr;
+
+    const uint64_t frame_size = y_plane_size + 2 * uv_plane_size;
+    if (frame_size > buffer_alloc_size_) {
+      // Allocation to hold larger frame, or first allocation.
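+      //
+      // Note on the check below (explanatory, not from the original source):
+      // |frame_size| is a uint64_t, so on targets where size_t is 32 bits a
+      // frame larger than SIZE_MAX would be silently truncated by the cast.
+      // Comparing the round-tripped value against the original detects that.
+      // For example (illustrative), frame_size = 2^32 casts to 0 on a 32-bit
+      // system, the comparison fails, and the allocation is rejected.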
+      if (frame_size != static_cast<size_t>(frame_size)) return false;
+
+      buffer_alloc_.reset(new (std::nothrow)
+                              uint8_t[static_cast<size_t>(frame_size)]);
+      if (buffer_alloc_ == nullptr) {
+        buffer_alloc_size_ = 0;
+        return false;
+      }
+
+      buffer_alloc_size_ = static_cast<size_t>(frame_size);
+    }
+
+    y_buffer = buffer_alloc_.get();
+    if (!is_monochrome) {
+      u_buffer = y_buffer + y_plane_size;
+      v_buffer = u_buffer + uv_plane_size;
+    }
+
+    stride_[kPlaneY] = y_stride;
+    stride_[kPlaneU] = stride_[kPlaneV] = uv_stride;
+
+    int left_border_bytes = left_border;
+    int uv_left_border_bytes = uv_left_border;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    if (bitdepth > 8) {
+      left_border_bytes *= sizeof(uint16_t);
+      uv_left_border_bytes *= sizeof(uint16_t);
+    }
+#endif
+    buffer_[kPlaneY] = AlignAddr(
+        y_buffer + (top_border * y_stride) + left_border_bytes, plane_align);
+    buffer_[kPlaneU] = AlignAddr(
+        u_buffer + (uv_top_border * uv_stride) + uv_left_border_bytes,
+        plane_align);
+    buffer_[kPlaneV] = AlignAddr(
+        v_buffer + (uv_top_border * uv_stride) + uv_left_border_bytes,
+        plane_align);
+  }
+
+  y_width_ = width;
+  y_height_ = height;
+  left_border_[kPlaneY] = left_border;
+  right_border_[kPlaneY] = right_border;
+  top_border_[kPlaneY] = top_border;
+  bottom_border_[kPlaneY] = bottom_border;
+
+  uv_width_ = uv_width;
+  uv_height_ = uv_height;
+  left_border_[kPlaneU] = left_border_[kPlaneV] = uv_left_border;
+  right_border_[kPlaneU] = right_border_[kPlaneV] = uv_right_border;
+  top_border_[kPlaneU] = top_border_[kPlaneV] = uv_top_border;
+  bottom_border_[kPlaneU] = bottom_border_[kPlaneV] = uv_bottom_border;
+
+  subsampling_x_ = subsampling_x;
+  subsampling_y_ = subsampling_y;
+
+  bitdepth_ = bitdepth;
+  is_monochrome_ = is_monochrome;
+  assert(!is_monochrome || stride_[kPlaneU] == 0);
+  assert(!is_monochrome || stride_[kPlaneV] == 0);
+  assert(!is_monochrome || buffer_[kPlaneU] == nullptr);
+  assert(!is_monochrome || buffer_[kPlaneV] == nullptr);
+
+#if LIBGAV1_MSAN
+  const int pixel_size = (bitdepth == 8) ? sizeof(uint8_t) : sizeof(uint16_t);
+  int width_in_bytes = width * pixel_size;
+  // The optimized loop restoration code will overread the visible frame buffer
+  // into the right border. The optimized cfl subsampler uses the right border
+  // as well. Initialize the right border and padding to prevent msan warnings.
+  int right_border_size_in_bytes = right_border * pixel_size;
+  // Calculate the padding bytes for the buffer. Note: The stride of the buffer
+  // is always a multiple of 16. (See yuv_buffer.h.)
+  const int right_padding_in_bytes =
+      stride_[kPlaneY] - (pixel_size * (width + left_border + right_border));
+  const int padded_right_border_size =
+      right_border_size_in_bytes + right_padding_in_bytes;
+  constexpr uint8_t right_val = 0x55;
+  uint8_t* rb = buffer_[kPlaneY] + width_in_bytes;
+  for (int i = 0; i < height + bottom_border; ++i) {
+    memset(rb, right_val, padded_right_border_size);
+    rb += stride_[kPlaneY];
+  }
+  if (!is_monochrome) {
+    int uv_width_in_bytes = uv_width * pixel_size;
+    int uv_right_border_size_in_bytes = uv_right_border * pixel_size;
+    const int u_right_padding_in_bytes =
+        stride_[kPlaneU] -
+        (pixel_size * (uv_width + uv_left_border + uv_right_border));
+    const int u_padded_right_border_size =
+        uv_right_border_size_in_bytes + u_right_padding_in_bytes;
+    rb = buffer_[kPlaneU] + uv_width_in_bytes;
+    for (int i = 0; i < uv_height; ++i) {
+      memset(rb, right_val, u_padded_right_border_size);
+      rb += stride_[kPlaneU];
+    }
+    const int v_right_padding_in_bytes =
+        stride_[kPlaneV] -
+        ((uv_width + uv_left_border + uv_right_border) * pixel_size);
+    const int v_padded_right_border_size =
+        uv_right_border_size_in_bytes + v_right_padding_in_bytes;
+    rb = buffer_[kPlaneV] + uv_width_in_bytes;
+    for (int i = 0; i < uv_height; ++i) {
+      memset(rb, right_val, v_padded_right_border_size);
+      rb += stride_[kPlaneV];
+    }
+  }
+
+  // The optimized cfl subsampler will overread (to the right of the current
+  // block) into the uninitialized visible area. The cfl subsampler can
+  // overread into the bottom border as well. Initialize both to quiet msan
+  // warnings.
+  uint8_t* y_visible = buffer_[kPlaneY];
+  for (int i = 0; i < height + bottom_border; ++i) {
+    memset(y_visible, right_val, width_in_bytes);
+    y_visible += stride_[kPlaneY];
+  }
+#endif
+
+  return true;
+}
+
+}  // namespace libgav1
diff --git a/src/yuv_buffer.h b/src/yuv_buffer.h
new file mode 100644
index 0000000..b9e8cd3
--- /dev/null
+++ b/src/yuv_buffer.h
@@ -0,0 +1,183 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_YUV_BUFFER_H_
+#define LIBGAV1_SRC_YUV_BUFFER_H_
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <type_traits>
+
+#include "src/gav1/frame_buffer.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+
+class YuvBuffer {
+ public:
+  // Allocates the buffer. Returns true on success. Returns false on failure.
+  //
+  // * |width| and |height| are the image dimensions in pixels.
+  // * |subsampling_x| and |subsampling_y| (either 0 or 1) specify the
+  //   subsampling of the width and height of the chroma planes, respectively.
+  // * |left_border|, |right_border|, |top_border|, and |bottom_border| are
+  //   the sizes (in pixels) of the borders on the left, right, top, and
+  //   bottom sides, respectively. The four border sizes must all be a
+  //   multiple of 2.
+  // * If |get_frame_buffer| is not null, it is invoked to allocate the memory.
+  //   If |get_frame_buffer| is null, YuvBuffer allocates the memory directly
+  //   and ignores the |callback_private_data| and |buffer_private_data|
+  //   parameters, which should be null.
+  //
+  // NOTE: The strides are a multiple of 16. Since the first row in each plane
+  // is 16-byte aligned, subsequent rows are also 16-byte aligned.
+  //
+  // Example: bitdepth=8 width=20 height=6 left/right/top/bottom_border=2. The
+  // diagram below shows how Realloc() allocates the data buffer for the Y
+  // plane.
+  //
+  //       16-byte aligned
+  //       |
+  //       v
+  //     ++++++++++++++++++++++++pppppppp
+  //     ++++++++++++++++++++++++pppppppp
+  //     ++01234567890123456789++pppppppp
+  //     ++11234567890123456789++pppppppp
+  //     ++21234567890123456789++pppppppp
+  //     ++31234567890123456789++pppppppp
+  //     ++41234567890123456789++pppppppp
+  //     ++51234567890123456789++pppppppp
+  //     ++++++++++++++++++++++++pppppppp
+  //     ++++++++++++++++++++++++pppppppp
+  //     |                              |
+  //     |<-- stride (multiple of 16) ->|
+  //
+  // The video frame has 6 rows of 20 pixels each. Each row is shown as the
+  // pattern r1234567890123456789, where |r| is 0, 1, 2, 3, 4, 5.
+  //
+  // Realloc() first adds a border of 2 pixels around the video frame. The
+  // border pixels are shown as '+'.
+  //
+  // Each row is then padded to a multiple of the default alignment in bytes,
+  // which is 16. The padding bytes are shown as lowercase 'p'. (Since
+  // |bitdepth| is 8 in this example, each pixel is one byte.) The padded size
+  // in bytes is the stride. In this example, the stride is 32 bytes.
+  //
+  // Finally, Realloc() aligns the first byte of frame data, which is the '0'
+  // pixel/byte in the upper left corner of the frame, to the default (16-byte)
+  // alignment boundary.
+  //
+  // TODO(wtc): Add a check for width and height limits to defend against
+  // invalid bitstreams.
+  bool Realloc(int bitdepth, bool is_monochrome, int width, int height,
+               int8_t subsampling_x, int8_t subsampling_y, int left_border,
+               int right_border, int top_border, int bottom_border,
+               GetFrameBufferCallback get_frame_buffer,
+               void* callback_private_data, void** buffer_private_data);
+
+  int bitdepth() const { return bitdepth_; }
+
+  bool is_monochrome() const { return is_monochrome_; }
+
+  int8_t subsampling_x() const { return subsampling_x_; }
+  int8_t subsampling_y() const { return subsampling_y_; }
+
+  int width(int plane) const {
+    return (plane == kPlaneY) ? y_width_ : uv_width_;
+  }
+  int height(int plane) const {
+    return (plane == kPlaneY) ? y_height_ : uv_height_;
+  }
+
+  // Returns border sizes in pixels.
+  int left_border(int plane) const { return left_border_[plane]; }
+  int right_border(int plane) const { return right_border_[plane]; }
+  int top_border(int plane) const { return top_border_[plane]; }
+  int bottom_border(int plane) const { return bottom_border_[plane]; }
+
+  // Returns the alignment of frame buffer row in bytes.
+  int alignment() const { return kFrameBufferRowAlignment; }
+
+  // Backup the current set of warnings and disable -Warray-bounds for the
+  // following three functions as the compiler cannot, in all cases, determine
+  // whether |plane| is within [0, kMaxPlanes), e.g., with a variable based for
+  // loop.
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
+#endif
+  // Returns the data buffer for |plane|.
+  uint8_t* data(int plane) {
+    assert(plane >= 0);
+    assert(static_cast<size_t>(plane) < std::extent<decltype(buffer_)>::value);
+    return buffer_[plane];
+  }
+  const uint8_t* data(int plane) const {
+    assert(plane >= 0);
+    assert(static_cast<size_t>(plane) < std::extent<decltype(buffer_)>::value);
+    return buffer_[plane];
+  }
+
+  // Returns the stride in bytes for |plane|.
+  int stride(int plane) const {
+    assert(plane >= 0);
+    assert(static_cast<size_t>(plane) < std::extent<decltype(stride_)>::value);
+    return stride_[plane];
+  }
+  // Restore the previous set of compiler warnings.
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+
+ private:
+  static constexpr int kFrameBufferRowAlignment = 16;
+  int bitdepth_ = 0;
+  bool is_monochrome_ = false;
+
+  // y_width_ and y_height_ are the |width| and |height| arguments passed to
+  // the Realloc() method.
+  //
+  // uv_width_ and uv_height_ are computed from y_width_ and y_height_ as
+  // follows:
+  //   uv_width_ = (y_width_ + subsampling_x_) >> subsampling_x_
+  //   uv_height_ = (y_height_ + subsampling_y_) >> subsampling_y_
+  int y_width_ = 0;
+  int uv_width_ = 0;
+  int y_height_ = 0;
+  int uv_height_ = 0;
+
+  int left_border_[kMaxPlanes] = {};
+  int right_border_[kMaxPlanes] = {};
+  int top_border_[kMaxPlanes] = {};
+  int bottom_border_[kMaxPlanes] = {};
+
+  int stride_[kMaxPlanes] = {};
+  uint8_t* buffer_[kMaxPlanes] = {};
+
+  // buffer_alloc_ and buffer_alloc_size_ are only used if the
+  // get_frame_buffer callback is null and we allocate the buffer ourselves.
+  std::unique_ptr<uint8_t[]> buffer_alloc_;
+  size_t buffer_alloc_size_ = 0;
+
+  int8_t subsampling_x_ = 0;  // 0 or 1.
+  int8_t subsampling_y_ = 0;  // 0 or 1.
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_YUV_BUFFER_H_
diff --git a/tests/block_utils.cc b/tests/block_utils.cc
new file mode 100644
index 0000000..07337c4
--- /dev/null
+++ b/tests/block_utils.cc
@@ -0,0 +1,134 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tests/block_utils.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+
+namespace libgav1 {
+namespace test_utils {
+namespace {
+
+#define LIBGAV1_DEBUG_FORMAT_CODE "x"
+template <typename Pixel>
+void PrintBlockDiff(const Pixel* block1, const Pixel* block2, int width,
+                    int height, int stride1, int stride2,
+                    const bool print_padding) {
+  const int print_width = print_padding ? std::min(stride1, stride2) : width;
+  const int field_width = (sizeof(Pixel) == 1) ? 4 : 5;
+
+  for (int y = 0; y < height; ++y) {
+    printf("[%2d] ", y);
+    for (int x = 0; x < print_width; ++x) {
+      if (x >= width) {
+        if (block1[x] == block2[x]) {
+          printf("[%*" LIBGAV1_DEBUG_FORMAT_CODE "] ", field_width, block1[x]);
+        } else {
+          printf("[*%*" LIBGAV1_DEBUG_FORMAT_CODE "] ", field_width - 1,
+                 block1[x]);
+        }
+      } else {
+        if (block1[x] == block2[x]) {
+          printf("%*" LIBGAV1_DEBUG_FORMAT_CODE " ", field_width, block1[x]);
+        } else {
+          printf("*%*" LIBGAV1_DEBUG_FORMAT_CODE " ", field_width - 1,
+                 block1[x]);
+        }
+      }
+    }
+    printf("\n");
+    block1 += stride1;
+    block2 += stride2;
+  }
+}
+#undef LIBGAV1_DEBUG_FORMAT_CODE
+
+}  // namespace
+
+template <typename Pixel>
+void PrintBlock(const Pixel* block, int width, int height, int stride,
+                const bool print_padding /*= false*/) {
+  const int print_width = print_padding ? stride : width;
+  const int field_width = (sizeof(Pixel) == 1) ? 4 : 5;
+  for (int y = 0; y < height; ++y) {
+    printf("[%2d] ", y);
+    for (int x = 0; x < print_width; ++x) {
+      if (x >= width) {
+        printf("[%*d] ", field_width, block[x]);
+      } else {
+        printf("%*d ", field_width, block[x]);
+      }
+    }
+    printf("\n");
+    block += stride;
+  }
+}
+
+template void PrintBlock(const uint8_t* block, int width, int height,
+                         int stride, bool print_padding /*= false*/);
+template void PrintBlock(const uint16_t* block, int width, int height,
+                         int stride, bool print_padding /*= false*/);
+template void PrintBlock(const int8_t* block, int width, int height,
+                         int stride, bool print_padding /*= false*/);
+template void PrintBlock(const int16_t* block, int width, int height,
+                         int stride, bool print_padding /*= false*/);
+
+template <typename Pixel>
+bool CompareBlocks(const Pixel* block1, const Pixel* block2, int width,
+                   int height, int stride1, int stride2,
+                   const bool check_padding, const bool print_diff /*= true*/) {
+  bool ok = true;
+  const int check_width = check_padding ? std::min(stride1, stride2) : width;
+  for (int y = 0; y < height; ++y) {
+    const uint64_t row1 = static_cast<uint64_t>(y) * stride1;
+    const uint64_t row2 = static_cast<uint64_t>(y) * stride2;
+    ok = memcmp(block1 + row1, block2 + row2,
+                sizeof(block1[0]) * check_width) == 0;
+    if (!ok) break;
+  }
+  if (!ok && print_diff) {
+    printf("block1 (width: %d height: %d stride: %d):\n", width, height,
+           stride1);
+    PrintBlockDiff(block1, block2, width, height, stride1, stride2,
+                   check_padding);
+    printf("\nblock2 (width: %d height: %d stride: %d):\n", width, height,
+           stride2);
+    PrintBlockDiff(block2, block1, width, height, stride2, stride1,
+                   check_padding);
+  }
+  return ok;
+}
+
+template bool CompareBlocks(const uint8_t* block1, const uint8_t* block2,
+                            int width, int height, int stride1, int stride2,
+                            const bool check_padding,
+                            const bool print_diff /*= true*/);
+template bool CompareBlocks(const uint16_t* block1, const uint16_t* block2,
+                            int width, int height, int stride1, int stride2,
+                            const bool check_padding,
+                            const bool print_diff /*= true*/);
+template bool CompareBlocks(const int8_t* block1, const int8_t* block2,
+                            int width, int height, int stride1, int stride2,
+                            const bool check_padding,
+                            const bool print_diff /*= true*/);
+template bool CompareBlocks(const int16_t* block1, const int16_t* block2,
+                            int width, int height, int stride1, int stride2,
+                            const bool check_padding,
+                            const bool print_diff /*= true*/);
+
+}  // namespace test_utils
+}  // namespace libgav1
diff --git a/tests/block_utils.h b/tests/block_utils.h
new file mode 100644
index 0000000..4542420
--- /dev/null
+++ b/tests/block_utils.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_TESTS_BLOCK_UTILS_H_
+#define LIBGAV1_TESTS_BLOCK_UTILS_H_
+
+#include <cstdint>
+
+namespace libgav1 {
+namespace test_utils {
+
+//------------------------------------------------------------------------------
+// Prints |block| pixel by pixel with |width| pixels per row if |print_padding|
+// is false, |stride| otherwise. If |print_padding| is true padding pixels are
+// surrounded in '[]'.
+template <typename Pixel>
+void PrintBlock(const Pixel* block, int width, int height, int stride,
+                bool print_padding = false);
+
+extern template void PrintBlock(const uint8_t* block, int width, int height,
+                                int stride, bool print_padding /*= false*/);
+extern template void PrintBlock(const uint16_t* block, int width, int height,
+                                int stride, bool print_padding /*= false*/);
+
+//------------------------------------------------------------------------------
+// Compares |block1| and |block2| pixel by pixel checking |width| pixels per
+// row if |check_padding| is false, min(|stride1|, |stride2|) pixels otherwise.
+// Prints the blocks with differences marked with a '*' if |print_diff| is
+// true (the default).
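+//
+// Usage sketch (illustrative only; |ref| and |simd| are hypothetical names):
+// a typical dsp test compares a C reference implementation's output block
+// against a SIMD path's output and dumps both blocks on mismatch, e.g.
+//
+//   EXPECT_TRUE(test_utils::CompareBlocks(ref, simd, width, height,
+//                                         /*stride1=*/width,
+//                                         /*stride2=*/width,
+//                                         /*check_padding=*/false));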
+ +template <typename Pixel> +bool CompareBlocks(const Pixel* block1, const Pixel* block2, int width, + int height, int stride1, int stride2, bool check_padding, + bool print_diff = true); + +extern template bool CompareBlocks(const uint8_t* block1, const uint8_t* block2, + int width, int height, int stride1, + int stride2, bool check_padding, + bool print_diff /*= true*/); +extern template bool CompareBlocks(const uint16_t* block1, + const uint16_t* block2, int width, + int height, int stride1, int stride2, + bool check_padding, + bool print_diff /*= true*/); + +} // namespace test_utils +} // namespace libgav1 + +#endif  // LIBGAV1_TESTS_BLOCK_UTILS_H_ diff --git a/tests/data/five-frames.ivf b/tests/data/five-frames.ivf new file mode 100644 index 0000000000000000000000000000000000000000..08bc6dbaceb9f46ab5715e87c934322f6fde9e10 GIT binary patch literal 883 zcmZ?q_H<)lP+)KjGcZhGRA7{2U|?VbVpbpq0V^g54I+gYxVV87i))=-?Vke*Mk`nZ z80SA=U^u{cp+ReX!S^PHueQzIg(t*1iYxD}_v3sXakuDq`{$1r?-bkm8ckI_uzXU2 z&=Z|S?|)=Bt(~CqYr&(~$Y+X5-yTZwbw>-Ziv*NCz5YQp`Bc{HyiXEAVV<{k9d0=E zb~Sg1y{|g&yTa`jADCX~|Lv4>ucc=VPReG=9 z`#`bDl(XI3WgFv`**ku>Fm+w&Jp6RZn`41yh0|WIwfvT|@mNOeRQrifbiDJmIap8J z58*oAHT!|U`x*Tnr?G&RZH07Zhhg!r$ghmx`=Q-aW%vg!LA4)ofqKPue1--FKYtD;y2A z{qnr|Txp=fv$q$z*_Q}Zu38*n>aDGOW&QgCi*kJ}-Hd4=mrHUgoj+v?B>p}(A;rJO z(OKitc5$IE-ASpU(;qSNsH~VdQ-CvUg~-f(U*m#OYxXVu@IXW|Q|6PeHQF+hE?WAnZn_h0TTJI~eLerT z+5O(0_l1ksus_?%XlA|fM#(DIr7<$?zMj8AW~bipK3Zw9H8A7t-_-|JUqASNeRf*@ z=W9Ve)2i1C{d)cC*%$4}fuBsLz2`e+{VTuvSHeNYRX68J8vEUy6*FPwjh%nELpG^$ z=Dj$-?8e^(;+;WSz+?kTRUjb5V5DZi#K^E*j~$eFHh@y!jQ@+5v)nj)(<06Fz(wV? z&)-g&=9sZ4*`d0b9TXU5%I*{Nw!NkFjcaksB3-FNrq@m60~T&_ zR1!a}vP^ literal 0 HcmV?d00001 diff --git a/tests/data/ivf-header-and-truncated-frame-header b/tests/data/ivf-header-and-truncated-frame-header new file mode 100644 index 0000000000000000000000000000000000000000..c6d7a6a102b09fe7d9704f02f10cf928f52aaa4a GIT binary patch literal 36 kcmZ?q_H<)lP+)KjGcZhGRA7{2U|?W`Vj#s}!o-nS>_=k_Td^1i(3!}6i{~M*YG7{%4+Lw@)%N9 zN!q%@fWqyo3sgT%Ck^g#w>a_S#t$Q(E#8h~;6%`#nu^tm^WgAD0{o-^!iM8DyJY$$ zBJU@S;30xF35PdCwgI%8xch_xiqwh2<&^BkQ8saw>#aHVY_Y~@U6ntPSpYdd;1n5Y9P$$T8!Qt|!5e1SsY@l;qv*xC zGu~J$_nY@L_pwkXlR<;N*x#7{PS4zYEdhFhzXO53+5*qo_0Ro-N8Y^eaigpU=CuJf zJF(bwszs$-9EVLw`c#``*h|HEIJHq|?ft93tJlH*uWM#+^VU*KmV2)f`s?cE^edB5 u@->$44$3?FZ+rS+!2zn-o*6Sv-I`pGso1&v3skZt32x}mrr7-nS>_=k_Td^1i(3!}6i{~M*YG7{%4+Lw@)%N9 zN!q%@fWqyo3sgT%Ck^g#w>a_S#t$Q(E#8h~;6%`#nu^tm^WgAD0{o-^!iM8DyJY$$ zBJU@S;30xF35PdCwgI%8xch_xiqwh2<&^BkQ8saw>#aHVY_Y~@U6ntPSpYdd;1n5Y9P$$T8!Qt|!5e1SsY@l;qv*xC zGu~J$_nY@L_pwkXlR<;N*x#7{PS4zYEdhFhzXO53+5*qo_0Ro-N8Y^eaigpU=CuJf zJF(bwszs$-9EVLw`c#``*h|HEIJHq|?ft93tJlH*uWM#+^VU*KmV2)f`s?cE^edB5 u@->$44$3?FZ+rS+!2zn-o*6Sv-I`pGso1&v3skZt32x}mrr7 +#include <cstddef> +#include <cstdint> +#include <memory> +#include <vector> + +#include "examples/file_reader.h" +#include "examples/file_reader_constants.h" +#include "examples/file_reader_interface.h" +#include "src/gav1/decoder.h" +#include "tests/fuzzer/fuzzer_temp_file.h" + +namespace { + +#if defined(LIBGAV1_EXHAUSTIVE_FUZZING) +// Set a large upper bound to give more coverage of a single input; this value +// should be larger than most of the frame counts in the corpus. +constexpr int kMaxFrames = 100; +constexpr size_t kMaxDataSize = 400 * 1024; +#else +// Restrict the number of frames to improve fuzzer throughput.
+constexpr int kMaxFrames = 5; +constexpr size_t kMaxDataSize = 200 * 1024; +#endif + +void Decode(const uint8_t* const data, const size_t size, + libgav1::Decoder* const decoder) { + decoder->EnqueueFrame(data, size, /*user_private_data=*/0, + /*buffer_private_data=*/nullptr); + const libgav1::DecoderBuffer* buffer; + decoder->DequeueFrame(&buffer); +} + +} // namespace + +// Always returns 0. Nonzero return values are reserved by libFuzzer for future +// use. +extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { + // Reject large chunks of data to improve fuzzer throughput. + if (size > kMaxDataSize) return 0; + + libgav1::Decoder decoder; + libgav1::DecoderSettings settings = {}; + // Use the low byte of the width to seed the number of threads. + // We use both nibbles of the lower byte as this results in values != 1 much + // more quickly than using the lower nibble alone. + settings.threads = (size >= 13) ? ((data[12] >> 4 | data[12]) & 0xF) + 1 : 1; + if (decoder.Init(&settings) != libgav1::kStatusOk) return 0; + + // Treat the input as a raw OBU stream. + Decode(data, size, &decoder); + + // Use the first frame from an IVF to bypass any read errors from the parser. + static constexpr size_t kIvfHeaderSize = + libgav1::kIvfFileHeaderSize + libgav1::kIvfFrameHeaderSize; + if (size >= kIvfHeaderSize) { + Decode(data + kIvfHeaderSize, size - kIvfHeaderSize, &decoder); + } + + FuzzerTemporaryFile tempfile(data, size); + auto file_reader = + libgav1::FileReader::Open(tempfile.filename(), /*error_tolerant=*/true); + if (file_reader == nullptr) return 0; + + std::vector<uint8_t> buffer; + int decoded_frames = 0; + do { + if (!file_reader->ReadTemporalUnit(&buffer, nullptr)) break; + Decode(buffer.data(), buffer.size(), &decoder); + if (++decoded_frames >= kMaxFrames) break; + } while (!file_reader->IsEndOfFile()); + + return 0; +} diff --git a/tests/fuzzer/decoder_fuzzer_frame_parallel.cc b/tests/fuzzer/decoder_fuzzer_frame_parallel.cc new file mode 100644 index 0000000..d1b1c54 --- /dev/null +++ b/tests/fuzzer/decoder_fuzzer_frame_parallel.cc @@ -0,0 +1,139 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <cstddef> +#include <cstdint> +#include <deque> +#include <memory> +#include <vector> + +#include "examples/file_reader.h" +#include "examples/file_reader_constants.h" +#include "examples/file_reader_interface.h" +#include "src/gav1/decoder.h" +#include "src/gav1/status_code.h" +#include "tests/fuzzer/fuzzer_temp_file.h" + +namespace { + +#if defined(LIBGAV1_EXHAUSTIVE_FUZZING) +// Set a large upper bound to give more coverage of a single input; this value +// should be larger than most of the frame counts in the corpus.
+constexpr size_t kMaxDataSize = 400 * 1024; +#else +constexpr size_t kMaxDataSize = 200 * 1024; +#endif + +using InputBuffer = std::vector<uint8_t>; + +struct InputBuffers { + ~InputBuffers() { + for (auto& buffer : free_buffers) { + delete buffer; + } + } + std::deque<InputBuffer*> free_buffers; +}; + +void ReleaseInputBuffer(void* callback_private_data, + void* buffer_private_data) { + auto* const test = static_cast<InputBuffers*>(callback_private_data); + test->free_buffers.push_back( + static_cast<InputBuffer*>(buffer_private_data)); +} + +} // namespace + +// Always returns 0. Nonzero return values are reserved by libFuzzer for future +// use. +extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { + // Reject large chunks of data to improve fuzzer throughput. + if (size > kMaxDataSize) return 0; + + // Note that |input_buffers| has to outlive the |decoder| object since the + // |release_input_buffer| callback could be called on the |decoder|'s + // destructor. + InputBuffers input_buffers; + + libgav1::Decoder decoder; + libgav1::DecoderSettings settings = {}; + // Use the 33 + low byte of the width to seed the number of threads. This + // ensures that we will trigger the frame parallel path in most cases. + // We use both nibbles of the lower byte as this results in values != 1 much + // more quickly than using the lower nibble alone. + settings.threads = + 33 + ((size >= 13) ? ((data[12] >> 4 | data[12]) & 0xF) + 1 : 1); + + settings.frame_parallel = true; + settings.blocking_dequeue = true; + settings.callback_private_data = &input_buffers; + settings.release_input_buffer = ReleaseInputBuffer; + if (decoder.Init(&settings) != libgav1::kStatusOk) return 0; + + FuzzerTemporaryFile tempfile(data, size); + auto file_reader = + libgav1::FileReader::Open(tempfile.filename(), /*error_tolerant=*/true); + if (file_reader == nullptr) return 0; + + InputBuffer* input_buffer = nullptr; + bool dequeue_finished = false; + + do { + if (input_buffer == nullptr && !file_reader->IsEndOfFile()) { + if (input_buffers.free_buffers.empty()) { + auto* const buffer = new (std::nothrow) InputBuffer(); + if (buffer == nullptr) { + break; + } + input_buffers.free_buffers.push_back(buffer); + } + input_buffer = input_buffers.free_buffers.front(); + input_buffers.free_buffers.pop_front(); + if (!file_reader->ReadTemporalUnit(input_buffer, nullptr)) { + break; + } + } + + if (input_buffer != nullptr) { + libgav1::StatusCode status = + decoder.EnqueueFrame(input_buffer->data(), input_buffer->size(), + /*user_private_data=*/0, + /*buffer_private_data=*/input_buffer); + if (status == libgav1::kStatusOk) { + input_buffer = nullptr; + // Continue to enqueue frames until we get a kStatusTryAgain status. + continue; + } + if (status != libgav1::kStatusTryAgain) { + break; + } + } + + const libgav1::DecoderBuffer* buffer; + libgav1::StatusCode status = decoder.DequeueFrame(&buffer); + if (status == libgav1::kStatusNothingToDequeue) { + dequeue_finished = true; + } else if (status == libgav1::kStatusOk) { + dequeue_finished = false; + } else { + break; + } + } while (input_buffer != nullptr || !file_reader->IsEndOfFile() || + !dequeue_finished); + + if (input_buffer != nullptr) { + input_buffers.free_buffers.push_back(input_buffer); + } + + return 0; +} diff --git a/tests/fuzzer/fuzzer_temp_file.h b/tests/fuzzer/fuzzer_temp_file.h new file mode 100644 index 0000000..5d12bbe --- /dev/null +++ b/tests/fuzzer/fuzzer_temp_file.h @@ -0,0 +1,148 @@ +/* + * Copyright 2020 Google Inc.
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_TESTS_FUZZER_FUZZER_TEMP_FILE_H_ +#define LIBGAV1_TESTS_FUZZER_FUZZER_TEMP_FILE_H_ + +// Adapter utility from fuzzer input to a temporary file, for fuzzing APIs that +// require a file instead of an input buffer. + +#include <limits.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +// Pure-C interface for creating and cleaning up temporary files. + +static char* fuzzer_get_tmpfile_with_suffix(const uint8_t* data, size_t size, + const char* suffix) { + if (suffix == NULL) { // NOLINT (this could be a C compilation unit) + suffix = ""; + } + const size_t suffix_len = strlen(suffix); + if (suffix_len > INT_MAX) { // mkstemps takes int for suffixlen param + perror("Suffix too long"); + abort(); + } + +#ifdef __ANDROID__ + const char* leading_temp_path = + "/data/local/tmp/generate_temporary_file.XXXXXX"; +#else + const char* leading_temp_path = "/tmp/generate_temporary_file.XXXXXX"; +#endif + const size_t buffer_sz = strlen(leading_temp_path) + suffix_len + 1; + char* filename_buffer = + (char*)malloc(buffer_sz); // NOLINT (this could be a C compilation unit) + if (!filename_buffer) { + perror("Failed to allocate file name buffer."); + abort(); + } + + if (snprintf(filename_buffer, buffer_sz, "%s%s", leading_temp_path, suffix) >= + buffer_sz) { + perror("File name buffer too short."); + abort(); + } + + const int file_descriptor = mkstemps(filename_buffer, suffix_len); + if (file_descriptor < 0) { + perror("Failed to make temporary file."); + abort(); + } + FILE* file = fdopen(file_descriptor, "wb"); + if (!file) { + perror("Failed to open file descriptor."); + close(file_descriptor); + abort(); + } + const size_t bytes_written = fwrite(data, sizeof(uint8_t), size, file); + if (bytes_written < size) { + close(file_descriptor); + fprintf(stderr, "Failed to write all bytes to file (%zu out of %zu)", + bytes_written, size); + abort(); + } + fclose(file); + return filename_buffer; +} + +static char* fuzzer_get_tmpfile( + const uint8_t* data, + size_t size) { // NOLINT (people include this .inc file directly) + return fuzzer_get_tmpfile_with_suffix(data, size, NULL); // NOLINT +} + +static void fuzzer_release_tmpfile(char* filename) { + if (unlink(filename) != 0) { + perror("WARNING: Failed to delete temporary file."); + } + free(filename); +} + +// C++ RAII object for creating temporary files.
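+// +// Minimal usage sketch (editor's illustration; it mirrors how the fuzzers in +// this patch use the class, with fopen standing in for any file-based API +// under test): +//   extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { +//     FuzzerTemporaryFile tempfile(data, size); +//     FILE* file = fopen(tempfile.filename(), "rb"); +//     if (file != NULL) fclose(file); +//     return 0;  // The file is unlinked when |tempfile| goes out of scope. +//   }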
+ +#ifdef __cplusplus +class FuzzerTemporaryFile { + public: + FuzzerTemporaryFile(const uint8_t* data, size_t size) + : original_filename_(fuzzer_get_tmpfile(data, size)) { + filename_ = strdup(original_filename_); + if (!filename_) { + perror("Failed to allocate file name copy."); + abort(); + } + } + + FuzzerTemporaryFile(const uint8_t* data, size_t size, const char* suffix) + : original_filename_(fuzzer_get_tmpfile_with_suffix(data, size, suffix)) { + filename_ = strdup(original_filename_); + if (!filename_) { + perror("Failed to allocate file name copy."); + abort(); + } + } + + ~FuzzerTemporaryFile() { + free(filename_); + fuzzer_release_tmpfile(original_filename_); + } + + FuzzerTemporaryFile(const FuzzerTemporaryFile& other) = delete; + FuzzerTemporaryFile operator=(const FuzzerTemporaryFile& other) = delete; + + FuzzerTemporaryFile(const FuzzerTemporaryFile&& other) = delete; + FuzzerTemporaryFile operator=(const FuzzerTemporaryFile&& other) = delete; + + const char* filename() const { return filename_; } + + // Returns a mutable pointer to the file name. Should be used sparingly, only + // in case the fuzzed API demands it or when making a mutable copy is + // inconvenient (e.g., in auto-generated code). + char* mutable_filename() const { return filename_; } + + private: + char* original_filename_; + + // A mutable copy of the original filename, returned by the accessor. This + // guarantees that the original filename can always be used to release the + // temporary path. + char* filename_; +}; +#endif // __cplusplus +#endif // LIBGAV1_TESTS_FUZZER_FUZZER_TEMP_FILE_H_ diff --git a/tests/fuzzer/obu_parser_fuzzer.cc b/tests/fuzzer/obu_parser_fuzzer.cc new file mode 100644 index 0000000..634a802 --- /dev/null +++ b/tests/fuzzer/obu_parser_fuzzer.cc @@ -0,0 +1,89 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <cstddef> +#include <cstdint> +#include <memory> +#include <vector> + +#include "examples/file_reader.h" +#include "examples/file_reader_constants.h" +#include "examples/file_reader_interface.h" +#include "src/buffer_pool.h" +#include "src/decoder_impl.h" +#include "src/decoder_state.h" +#include "src/internal_frame_buffer_list.h" +#include "src/obu_parser.h" +#include "tests/fuzzer/fuzzer_temp_file.h" + +namespace { + +#if defined(LIBGAV1_EXHAUSTIVE_FUZZING) +// Set a large upper bound to give more coverage of a single input; this value +// should be larger than most of the frame counts in the corpus. +constexpr int kMaxFrames = 100; +constexpr size_t kMaxDataSize = 400 * 1024; +#else +// Restrict the number of frames and obus to improve fuzzer throughput.
+constexpr int kMaxFrames = 5; +constexpr size_t kMaxDataSize = 200 * 1024; +#endif + +inline void ParseObu(const uint8_t* const data, size_t size) { + libgav1::InternalFrameBufferList buffer_list; + libgav1::BufferPool buffer_pool(libgav1::OnInternalFrameBufferSizeChanged, + libgav1::GetInternalFrameBuffer, + libgav1::ReleaseInternalFrameBuffer, + &buffer_list); + libgav1::DecoderState decoder_state; + libgav1::ObuParser parser(data, size, 0, &buffer_pool, &decoder_state); + libgav1::RefCountedBufferPtr current_frame; + int parsed_frames = 0; + while (parser.HasData()) { + if (parser.ParseOneFrame(&current_frame) != libgav1::kStatusOk) break; + if (++parsed_frames >= kMaxFrames) break; + } +} + +} // namespace + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { + // Reject large chunks of data to improve fuzzer throughput. + if (size > kMaxDataSize) return 0; + + // Treat the input as a raw OBU stream. + ParseObu(data, size); + + // Use the first frame from an IVF to bypass any read errors from the parser. + static constexpr size_t kIvfHeaderSize = + libgav1::kIvfFileHeaderSize + libgav1::kIvfFrameHeaderSize; + if (size >= kIvfHeaderSize) { + ParseObu(data + kIvfHeaderSize, size - kIvfHeaderSize); + } + + FuzzerTemporaryFile tempfile(data, size); + auto file_reader = + libgav1::FileReader::Open(tempfile.filename(), /*error_tolerant=*/true); + if (file_reader == nullptr) return 0; + + std::vector<uint8_t> buffer; + int parsed_frames = 0; + do { + if (!file_reader->ReadTemporalUnit(&buffer, nullptr)) break; + ParseObu(buffer.data(), buffer.size()); + if (++parsed_frames >= kMaxFrames) break; + } while (!file_reader->IsEndOfFile()); + + return 0; +} diff --git a/tests/libgav1_tests.cmake b/tests/libgav1_tests.cmake new file mode 100644 index 0000000..2b3f41c --- /dev/null +++ b/tests/libgav1_tests.cmake @@ -0,0 +1,1375 @@ +# Copyright 2020 The libgav1 Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(LIBGAV1_LIBGAV1_TESTS_CMAKE_) + return() +endif() # LIBGAV1_LIBGAV1_TESTS_CMAKE_ +set(LIBGAV1_LIBGAV1_TESTS_CMAKE_ 1) + +set(libgav1_googletest "${libgav1_root}/third_party/googletest") +if(NOT LIBGAV1_ENABLE_TESTS OR NOT EXISTS "${libgav1_googletest}") + macro(libgav1_add_tests_targets) + + endmacro() + + if(LIBGAV1_ENABLE_TESTS AND NOT EXISTS "${libgav1_googletest}") + message( + "GoogleTest not found, setting LIBGAV1_ENABLE_TESTS to false.\n" + "To enable tests download the GoogleTest repository to" + " third_party/googletest:\n\n git \\\n -C ${libgav1_root} \\\n" + " clone \\\n" + " https://github.com/google/googletest.git third_party/googletest\n") + set(LIBGAV1_ENABLE_TESTS FALSE CACHE BOOL "Enables tests." FORCE) + endif() + return() +endif() + +# Check GoogleTest compiler requirements.
+if((CMAKE_CXX_COMPILER_ID + MATCHES + "Clang|GNU" + AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "5") + OR (MSVC AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "19")) + macro(libgav1_add_tests_targets) + + endmacro() + + message( + WARNING + "${CMAKE_CXX_COMPILER} (${CMAKE_CXX_COMPILER_ID} version" + " ${CMAKE_CXX_COMPILER_VERSION}) is below the minimum requirements for" + " GoogleTest; disabling unit tests. See" + " https://github.com/google/googletest#compilers for more detail.") + set(LIBGAV1_ENABLE_TESTS FALSE CACHE BOOL "Enables tests." FORCE) + return() +endif() + +list(APPEND libgav1_tests_block_utils_sources + "${libgav1_root}/tests/block_utils.h" + "${libgav1_root}/tests/block_utils.cc") + +list(APPEND libgav1_tests_utils_sources + "${libgav1_root}/tests/third_party/libvpx/acm_random.h" + "${libgav1_root}/tests/third_party/libvpx/md5_helper.h" + "${libgav1_root}/tests/third_party/libvpx/md5_utils.cc" + "${libgav1_root}/tests/third_party/libvpx/md5_utils.h" + "${libgav1_root}/tests/utils.h" "${libgav1_root}/tests/utils.cc") + +list(APPEND libgav1_tests_utils_test_sources + "${libgav1_root}/tests/utils_test.cc") + +list(APPEND libgav1_array_2d_test_sources + "${libgav1_source}/utils/array_2d_test.cc") +list(APPEND libgav1_average_blend_test_sources + "${libgav1_source}/dsp/average_blend_test.cc") +list(APPEND libgav1_block_parameters_holder_test_sources + "${libgav1_source}/utils/block_parameters_holder_test.cc") +list(APPEND libgav1_blocking_counter_test_sources + "${libgav1_source}/utils/blocking_counter_test.cc") +list(APPEND libgav1_buffer_pool_test_sources + "${libgav1_source}/buffer_pool_test.cc") +list(APPEND libgav1_cdef_test_sources "${libgav1_source}/dsp/cdef_test.cc") +list( + APPEND libgav1_common_test_sources "${libgav1_source}/utils/common_test.cc") +list(APPEND libgav1_common_avx2_test_sources + "${libgav1_source}/dsp/x86/common_avx2.h" + "${libgav1_source}/dsp/x86/common_avx2.inc" + "${libgav1_source}/dsp/x86/common_avx2_test.cc" + "${libgav1_source}/dsp/x86/common_sse4.inc") +list(APPEND libgav1_common_neon_test_sources + "${libgav1_source}/dsp/arm/common_neon_test.cc") +list(APPEND libgav1_common_sse4_test_sources + "${libgav1_source}/dsp/x86/common_sse4.h" + "${libgav1_source}/dsp/x86/common_sse4.inc" + "${libgav1_source}/dsp/x86/common_sse4_test.cc") +list(APPEND libgav1_convolve_test_sources + "${libgav1_source}/dsp/convolve_test.cc") +list(APPEND libgav1_cpu_test_sources "${libgav1_source}/utils/cpu_test.cc") +list(APPEND libgav1_c_decoder_test_sources "${libgav1_source}/c_decoder_test.c") +list(APPEND libgav1_c_version_test_sources "${libgav1_source}/c_version_test.c") +list(APPEND libgav1_decoder_test_sources "${libgav1_source}/decoder_test.cc") +list(APPEND libgav1_decoder_buffer_test_sources + "${libgav1_source}/decoder_buffer_test.cc") +list(APPEND libgav1_distance_weighted_blend_test_sources + "${libgav1_source}/dsp/distance_weighted_blend_test.cc") +list(APPEND libgav1_dsp_test_sources "${libgav1_source}/dsp/dsp_test.cc") +list(APPEND libgav1_entropy_decoder_test_sources + "${libgav1_source}/utils/entropy_decoder_test.cc" + "${libgav1_source}/utils/entropy_decoder_test_data.inc") +list(APPEND libgav1_file_reader_test_sources + "${libgav1_examples}/file_reader_test.cc" + "${libgav1_examples}/file_reader_test_common.cc" + "${libgav1_examples}/file_reader_test_common.h") +list(APPEND libgav1_film_grain_test_sources + "${libgav1_source}/film_grain_test.cc") +list(APPEND libgav1_file_reader_factory_test_sources + 
"${libgav1_examples}/file_reader_factory_test.cc") +list(APPEND libgav1_file_writer_test_sources + "${libgav1_examples}/file_writer_test.cc") +list(APPEND libgav1_internal_frame_buffer_list_test_sources + "${libgav1_source}/internal_frame_buffer_list_test.cc") +list(APPEND libgav1_intra_edge_test_sources + "${libgav1_source}/dsp/intra_edge_test.cc") +list(APPEND libgav1_intrapred_cfl_test_sources + "${libgav1_source}/dsp/intrapred_cfl_test.cc") +list(APPEND libgav1_intrapred_directional_test_sources + "${libgav1_source}/dsp/intrapred_directional_test.cc") +list(APPEND libgav1_intrapred_filter_test_sources + "${libgav1_source}/dsp/intrapred_filter_test.cc") +list(APPEND libgav1_intrapred_test_sources + "${libgav1_source}/dsp/intrapred_test.cc") +list(APPEND libgav1_inverse_transform_test_sources + "${libgav1_source}/dsp/inverse_transform_test.cc") +list(APPEND libgav1_loop_filter_test_sources + "${libgav1_source}/dsp/loop_filter_test.cc") +list(APPEND libgav1_loop_restoration_test_sources + "${libgav1_source}/dsp/loop_restoration_test.cc") +list(APPEND libgav1_mask_blend_test_sources + "${libgav1_source}/dsp/mask_blend_test.cc") +list(APPEND libgav1_motion_field_projection_test_sources + "${libgav1_source}/dsp/motion_field_projection_test.cc") +list(APPEND libgav1_motion_vector_search_test_sources + "${libgav1_source}/dsp/motion_vector_search_test.cc") +list(APPEND libgav1_super_res_test_sources + "${libgav1_source}/dsp/super_res_test.cc") +list(APPEND libgav1_weight_mask_test_sources + "${libgav1_source}/dsp/weight_mask_test.cc") +list( + APPEND libgav1_memory_test_sources "${libgav1_source}/utils/memory_test.cc") +list(APPEND libgav1_obmc_test_sources "${libgav1_source}/dsp/obmc_test.cc") +list(APPEND libgav1_obu_parser_test_sources + "${libgav1_source}/obu_parser_test.cc") +list(APPEND libgav1_post_filter_test_sources + "${libgav1_source}/post_filter_test.cc") +list(APPEND libgav1_prediction_mask_test_sources + "${libgav1_source}/prediction_mask_test.cc") +list( + APPEND libgav1_quantizer_test_sources "${libgav1_source}/quantizer_test.cc") +list(APPEND libgav1_queue_test_sources "${libgav1_source}/utils/queue_test.cc") +list(APPEND libgav1_raw_bit_reader_test_sources + "${libgav1_source}/utils/raw_bit_reader_test.cc") +list(APPEND libgav1_reconstruction_test_sources + "${libgav1_source}/reconstruction_test.cc") +list(APPEND libgav1_residual_buffer_pool_test_sources + "${libgav1_source}/residual_buffer_pool_test.cc") +list(APPEND libgav1_scan_test_sources "${libgav1_source}/scan_test.cc") +list(APPEND libgav1_segmentation_map_test_sources + "${libgav1_source}/utils/segmentation_map_test.cc") +list(APPEND libgav1_segmentation_test_sources + "${libgav1_source}/utils/segmentation_test.cc") +list(APPEND libgav1_stack_test_sources "${libgav1_source}/utils/stack_test.cc") +list(APPEND libgav1_symbol_decoder_context_test_sources + "${libgav1_source}/symbol_decoder_context_test.cc") +list(APPEND libgav1_threadpool_test_sources + "${libgav1_source}/utils/threadpool_test.cc") +list(APPEND libgav1_threading_strategy_test_sources + "${libgav1_source}/threading_strategy_test.cc") +list(APPEND libgav1_unbounded_queue_test_sources + "${libgav1_source}/utils/unbounded_queue_test.cc") +list( + APPEND libgav1_vector_test_sources "${libgav1_source}/utils/vector_test.cc") +list(APPEND libgav1_version_test_sources "${libgav1_source}/version_test.cc") +list(APPEND libgav1_warp_test_sources "${libgav1_source}/dsp/warp_test.cc") +list(APPEND libgav1_warp_prediction_test_sources + 
"${libgav1_source}/warp_prediction_test.cc") + +macro(libgav1_add_tests_targets) + if(NOT LIBGAV1_ENABLE_TESTS) + message( + FATAL_ERROR + "This version of libgav1_add_tests_targets() should only be used with" + " LIBGAV1_ENABLE_TESTS set to true.") + endif() + libgav1_add_library(TEST + NAME + libgav1_gtest + TYPE + STATIC + SOURCES + "${libgav1_googletest}/googletest/src/gtest-all.cc" + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_gtest_include_paths} + ${libgav1_include_paths}) + + libgav1_add_library(TEST + NAME + libgav1_gtest_main + TYPE + STATIC + SOURCES + "${libgav1_googletest}/googletest/src/gtest_main.cc" + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_gtest_include_paths} + ${libgav1_include_paths}) + + if(ANDROID OR IOS) + if(DEFINED LIBGAV1_THREADPOOL_USE_STD_MUTEX + AND NOT LIBGAV1_THREADPOOL_USE_STD_MUTEX) + set(use_absl_threading TRUE) + endif() + elseif(NOT + (DEFINED + LIBGAV1_THREADPOOL_USE_STD_MUTEX + AND LIBGAV1_THREADPOOL_USE_STD_MUTEX)) + set(use_absl_threading TRUE) + endif() + + if(use_absl_threading) + list(APPEND libgav1_common_test_absl_deps absl::synchronization) + endif() + + libgav1_add_executable(TEST + NAME + array_2d_test + SOURCES + ${libgav1_array_2d_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_utils + LIB_DEPS + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + block_parameters_holder_test + SOURCES + ${libgav1_block_parameters_holder_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_utils + LIB_DEPS + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + blocking_counter_test + SOURCES + ${libgav1_blocking_counter_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_utils + LIB_DEPS + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + if(libgav1_have_avx2) + libgav1_add_executable(TEST + NAME + common_avx2_test + SOURCES + ${libgav1_common_avx2_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + LIB_DEPS + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + endif() + + if(libgav1_have_neon) + libgav1_add_executable(TEST + NAME + common_neon_test + SOURCES + ${libgav1_common_neon_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_tests_block_utils + LIB_DEPS + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + endif() + + if(libgav1_have_sse4) + libgav1_add_executable(TEST + NAME + common_sse4_test + SOURCES + ${libgav1_common_sse4_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + LIB_DEPS + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + endif() + + libgav1_add_executable(TEST + NAME + common_test + SOURCES + ${libgav1_common_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_utils + LIB_DEPS + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + cpu_test + SOURCES + ${libgav1_cpu_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_utils + LIB_DEPS + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) 
+ + libgav1_add_executable(TEST + NAME + entropy_decoder_test + SOURCES + ${libgav1_entropy_decoder_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_utils + LIB_DEPS + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + file_reader_test + SOURCES + ${libgav1_file_reader_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_dsp + libgav1_file_reader + libgav1_utils + libgav1_tests_utils + LIB_DEPS + absl::strings + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + file_reader_factory_test + SOURCES + ${libgav1_file_reader_factory_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_file_reader + libgav1_utils + LIB_DEPS + absl::memory + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + film_grain_test + SOURCES + ${libgav1_film_grain_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_block_utils + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::str_format_internal + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + memory_test + SOURCES + ${libgav1_memory_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + LIB_DEPS + absl::base + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + queue_test + SOURCES + ${libgav1_queue_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_utils + LIB_DEPS + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + segmentation_map_test + SOURCES + ${libgav1_segmentation_map_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_utils + LIB_DEPS + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + segmentation_test + SOURCES + ${libgav1_segmentation_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_utils + LIB_DEPS + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + stack_test + SOURCES + ${libgav1_stack_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_utils + LIB_DEPS + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + symbol_decoder_context_test + SOURCES + ${libgav1_symbol_decoder_context_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_utils + LIB_DEPS + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + threadpool_test + SOURCES + ${libgav1_threadpool_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_utils + LIB_DEPS + absl::synchronization + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + unbounded_queue_test + SOURCES 
+ ${libgav1_unbounded_queue_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_utils + LIB_DEPS + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + tests_utils_test + SOURCES + ${libgav1_tests_utils_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_dsp + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::strings + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + vector_test + SOURCES + ${libgav1_vector_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_utils + LIB_DEPS + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + version_test + SOURCES + ${libgav1_version_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + LIB_DEPS + ${libgav1_dependency} + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_library(TEST + NAME + libgav1_tests_block_utils + TYPE + OBJECT + SOURCES + ${libgav1_tests_block_utils_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths}) + + libgav1_add_library(TEST + NAME + libgav1_tests_utils + TYPE + OBJECT + SOURCES + ${libgav1_tests_utils_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths}) + + libgav1_add_executable(TEST + NAME + average_blend_test + SOURCES + ${libgav1_average_blend_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_block_utils + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::strings + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + buffer_pool_test + SOURCES + ${libgav1_buffer_pool_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_utils + LIB_DEPS + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + cdef_test + SOURCES + ${libgav1_cdef_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::strings + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + convolve_test + SOURCES + ${libgav1_convolve_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_block_utils + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::str_format_internal + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + c_decoder_test + SOURCES + ${libgav1_c_decoder_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_include_paths} + LIB_DEPS + ${libgav1_dependency}) + + libgav1_add_executable(TEST + NAME + c_version_test + SOURCES + ${libgav1_c_version_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_include_paths} + LIB_DEPS + ${libgav1_dependency}) + + libgav1_add_executable(TEST + NAME + decoder_test + SOURCES + 
${libgav1_decoder_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + LIB_DEPS + ${libgav1_dependency} + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + decoder_buffer_test + SOURCES + ${libgav1_decoder_buffer_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + LIB_DEPS + ${libgav1_dependency} + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + distance_weighted_blend_test + SOURCES + ${libgav1_distance_weighted_blend_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::strings + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + dsp_test + SOURCES + ${libgav1_dsp_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::strings + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + file_writer_test + SOURCES + ${libgav1_file_writer_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_file_writer + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::memory + absl::strings + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + intrapred_cfl_test + SOURCES + ${libgav1_intrapred_cfl_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_block_utils + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + intrapred_directional_test + SOURCES + ${libgav1_intrapred_directional_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_block_utils + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + intrapred_filter_test + SOURCES + ${libgav1_intrapred_filter_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_block_utils + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + intrapred_test + SOURCES + ${libgav1_intrapred_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_block_utils + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + intra_edge_test + SOURCES + ${libgav1_intra_edge_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_tests_utils + libgav1_dsp + libgav1_utils + LIB_DEPS + 
absl::strings + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + inverse_transform_test + SOURCES + ${libgav1_inverse_transform_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_tests_block_utils + libgav1_tests_utils + libgav1_dsp + libgav1_utils + LIB_DEPS + absl::strings + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + internal_frame_buffer_list_test + SOURCES + ${libgav1_internal_frame_buffer_list_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_utils + LIB_DEPS + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + loop_filter_test + SOURCES + ${libgav1_loop_filter_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_block_utils + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + loop_restoration_test + SOURCES + ${libgav1_loop_restoration_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_block_utils + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + mask_blend_test + SOURCES + ${libgav1_mask_blend_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::strings + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + motion_field_projection_test + SOURCES + ${libgav1_motion_field_projection_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::str_format_internal + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + motion_vector_search_test + SOURCES + ${libgav1_motion_vector_search_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::str_format_internal + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + obmc_test + SOURCES + ${libgav1_obmc_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_block_utils + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::str_format_internal + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + obu_parser_test + SOURCES + ${libgav1_obu_parser_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_utils + LIB_DEPS + ${libgav1_common_test_absl_deps} + 
libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + post_filter_test + SOURCES + ${libgav1_post_filter_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_block_utils + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + prediction_mask_test + SOURCES + ${libgav1_prediction_mask_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::strings + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + quantizer_test + SOURCES + ${libgav1_quantizer_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_utils + LIB_DEPS + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + raw_bit_reader_test + SOURCES + ${libgav1_raw_bit_reader_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_utils + LIB_DEPS + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + reconstruction_test + SOURCES + ${libgav1_reconstruction_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_block_utils + libgav1_tests_utils + libgav1_utils + ${libgav1_test_objlib_deps} + LIB_DEPS + absl::strings + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + residual_buffer_pool_test + SOURCES + ${libgav1_residual_buffer_pool_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_utils + ${libgav1_test_objlib_deps} + LIB_DEPS + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + scan_test + SOURCES + ${libgav1_scan_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_utils + ${libgav1_test_objlib_deps} + LIB_DEPS + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + super_res_test + SOURCES + ${libgav1_super_res_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::str_format_internal + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + threading_strategy_test + SOURCES + ${libgav1_threading_strategy_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_utils + ${libgav1_test_objlib_deps} + LIB_DEPS + absl::str_format_internal + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + warp_test + SOURCES + ${libgav1_warp_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + 
libgav1_tests_block_utils + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::str_format_internal + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + warp_prediction_test + SOURCES + ${libgav1_warp_prediction_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_utils + LIB_DEPS + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) + + libgav1_add_executable(TEST + NAME + weight_mask_test + SOURCES + ${libgav1_weight_mask_test_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_test_include_paths} + OBJLIB_DEPS + libgav1_decoder + libgav1_dsp + libgav1_tests_utils + libgav1_utils + LIB_DEPS + absl::str_format_internal + absl::time + ${libgav1_common_test_absl_deps} + libgav1_gtest + libgav1_gtest_main) +endmacro() diff --git a/tests/third_party/libvpx/LICENSE b/tests/third_party/libvpx/LICENSE new file mode 100644 index 0000000..83ef339 --- /dev/null +++ b/tests/third_party/libvpx/LICENSE @@ -0,0 +1,30 @@ +Copyright (c) 2010, The WebM Project authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + * Neither the name of Google, nor the WebM Project, nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/tests/third_party/libvpx/acm_random.h b/tests/third_party/libvpx/acm_random.h new file mode 100644 index 0000000..e8cfc9c --- /dev/null +++ b/tests/third_party/libvpx/acm_random.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_ACM_RANDOM_H_ +#define LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_ACM_RANDOM_H_ + +#include <assert.h> +#include <limits.h> +#include <stdint.h> + +#include "gtest/gtest.h" + +namespace libvpx_test { + +class ACMRandom { + public: + ACMRandom() : random_(DeterministicSeed()) {} + + explicit ACMRandom(int seed) : random_(seed) {} + + void Reset(int seed) { random_.Reseed(seed); } + uint16_t Rand16(void) { + const uint32_t value = + random_.Generate(testing::internal::Random::kMaxRange); + return (value >> 15) & 0xffff; + } + + int32_t Rand20Signed(void) { + // Use 20 bits: values between 524287 and -524288. + const uint32_t value = random_.Generate(1048576); + return static_cast<int32_t>(value) - 524288; + } + + int16_t Rand16Signed(void) { + // Use 16 bits: values between 32767 and -32768. + return static_cast<int16_t>(random_.Generate(65536)); + } + + int16_t Rand13Signed(void) { + // Use 13 bits: values between 4095 and -4096. + const uint32_t value = random_.Generate(8192); + return static_cast<int16_t>(value) - 4096; + } + + int16_t Rand9Signed(void) { + // Use 9 bits: values between 255 (0x0FF) and -256 (0x100). + const uint32_t value = random_.Generate(512); + return static_cast<int16_t>(value) - 256; + } + + uint8_t Rand8(void) { + const uint32_t value = + random_.Generate(testing::internal::Random::kMaxRange); + // There's a bit more entropy in the upper bits of this implementation. + return (value >> 23) & 0xff; + } + + uint8_t Rand8Extremes(void) { + // Returns a random value near 0 or near 255, to better exercise + // saturation behavior. + const uint8_t r = Rand8(); + return static_cast<uint8_t>((r < 128) ? r << 4 : r >> 4); + } + + uint32_t RandRange(const uint32_t range) { + // testing::internal::Random::Generate provides values in the range + // testing::internal::Random::kMaxRange. + assert(range <= testing::internal::Random::kMaxRange); + return random_.Generate(range); + } + + int PseudoUniform(int range) { return random_.Generate(range); } + + int operator()(int n) { return PseudoUniform(n); } + + static constexpr int DeterministicSeed(void) { return 0xbaba; } + + private: + testing::internal::Random random_; +}; + +} // namespace libvpx_test + +#endif // LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_ACM_RANDOM_H_ diff --git a/tests/third_party/libvpx/md5_helper.h b/tests/third_party/libvpx/md5_helper.h new file mode 100644 index 0000000..c97b590 --- /dev/null +++ b/tests/third_party/libvpx/md5_helper.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree.
+ */ + +#ifndef LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_MD5_HELPER_H_ +#define LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_MD5_HELPER_H_ + +#include <stddef.h> +#include <stdint.h> + +#include "tests/third_party/libvpx/md5_utils.h" + +namespace libvpx_test { +class MD5 { + public: + MD5() { MD5Init(&md5_); } + + void Add(const uint8_t *data, size_t size) { + MD5Update(&md5_, data, static_cast<uint32_t>(size)); + } + + const char *Get(void) { + static const char hex[16] = { + '0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', + }; + uint8_t tmp[16]; + MD5Context ctx_tmp = md5_; + + MD5Final(tmp, &ctx_tmp); + for (int i = 0; i < 16; i++) { + res_[i * 2 + 0] = hex[tmp[i] >> 4]; + res_[i * 2 + 1] = hex[tmp[i] & 0xf]; + } + res_[32] = 0; + + return res_; + } + + protected: + char res_[33]; + MD5Context md5_; +}; + +} // namespace libvpx_test + +#endif // LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_MD5_HELPER_H_ diff --git a/tests/third_party/libvpx/md5_utils.cc b/tests/third_party/libvpx/md5_utils.cc new file mode 100644 index 0000000..4638e54 --- /dev/null +++ b/tests/third_party/libvpx/md5_utils.cc @@ -0,0 +1,249 @@ +/* + * This code implements the MD5 message-digest algorithm. + * The algorithm is due to Ron Rivest. This code was + * written by Colin Plumb in 1993, no copyright is claimed. + * This code is in the public domain; do with it what you wish. + * + * Equivalent code is available from RSA Data Security, Inc. + * This code has been tested against that, and is equivalent, + * except that you don't need to include two pages of legalese + * with every copy. + * + * To compute the message digest of a chunk of bytes, declare an + * MD5Context structure, pass it to MD5Init, call MD5Update as + * needed on buffers full of bytes, and then call MD5Final, which + * will fill a supplied 16-byte array with the digest. + * + * Changed so as no longer to depend on Colin Plumb's `usual.h' header + * definitions + * - Ian Jackson . + * Still in the public domain. + */ + +#include "tests/third_party/libvpx/md5_utils.h" + +#include <string.h> + +static void byteSwap(UWORD32 *buf, unsigned words) { + md5byte *p; + + /* Only swap bytes for big endian machines */ + int i = 1; + + if (*(char *)&i == 1) return; + + p = (md5byte *)buf; + + do { + *buf++ = (UWORD32)((unsigned)p[3] << 8 | p[2]) << 16 | + ((unsigned)p[1] << 8 | p[0]); + p += 4; + } while (--words); +} + +/* + * Start MD5 accumulation. Set bit count to 0 and buffer to mysterious + * initialization constants. + */ +void MD5Init(struct MD5Context *ctx) { + ctx->buf[0] = 0x67452301; + ctx->buf[1] = 0xefcdab89; + ctx->buf[2] = 0x98badcfe; + ctx->buf[3] = 0x10325476; + + ctx->bytes[0] = 0; + ctx->bytes[1] = 0; +} + +/* + * Update context to reflect the concatenation of another buffer full + * of bytes. + */ +void MD5Update(struct MD5Context *ctx, md5byte const *buf, unsigned len) { + UWORD32 t; + + /* Update byte count */ + + t = ctx->bytes[0]; + + if ((ctx->bytes[0] = t + len) < t) + ctx->bytes[1]++; /* Carry from low to high */ + + t = 64 - (t & 0x3f); /* Space available in ctx->in (at least 1) */ + + if (t > len) { + memcpy((md5byte *)ctx->in + 64 - t, buf, len); + return; + } + + /* First chunk is an odd size */ + memcpy((md5byte *)ctx->in + 64 - t, buf, t); + byteSwap(ctx->in, 16); + MD5Transform(ctx->buf, ctx->in); + buf += t; + len -= t; + + /* Process data in 64-byte chunks */ + while (len >= 64) { + memcpy(ctx->in, buf, 64); + byteSwap(ctx->in, 16); + MD5Transform(ctx->buf, ctx->in); + buf += 64; + len -= 64; + } + + /* Handle any remaining bytes of data.
+  memcpy(ctx->in, buf, len);
+}
+
+/*
+ * Final wrapup - pad to 64-byte boundary with the bit pattern
+ * 1 0* (64-bit count of bits processed, MSB-first)
+ */
+void MD5Final(md5byte digest[16], struct MD5Context *ctx) {
+  int count = ctx->bytes[0] & 0x3f; /* Number of bytes in ctx->in */
+  md5byte *p = (md5byte *)ctx->in + count;
+
+  /* Set the first char of padding to 0x80. There is always room. */
+  *p++ = 0x80;
+
+  /* Bytes of padding needed to make 56 bytes (-8..55) */
+  count = 56 - 1 - count;
+
+  if (count < 0) { /* Padding forces an extra block */
+    memset(p, 0, count + 8);
+    byteSwap(ctx->in, 16);
+    MD5Transform(ctx->buf, ctx->in);
+    p = (md5byte *)ctx->in;
+    count = 56;
+  }
+
+  memset(p, 0, count);
+  byteSwap(ctx->in, 14);
+
+  /* Append length in bits and transform */
+  ctx->in[14] = ctx->bytes[0] << 3;
+  ctx->in[15] = ctx->bytes[1] << 3 | ctx->bytes[0] >> 29;
+  MD5Transform(ctx->buf, ctx->in);
+
+  byteSwap(ctx->buf, 4);
+  memcpy(digest, ctx->buf, 16);
+  memset(ctx, 0, sizeof(*ctx)); /* In case it's sensitive */
+}
+
+#ifndef ASM_MD5
+
+/* The four core functions - F1 is optimized somewhat */
+
+/* #define F1(x, y, z) (x & y | ~x & z) */
+#define F1(x, y, z) (z ^ (x & (y ^ z)))
+#define F2(x, y, z) F1(z, x, y)
+#define F3(x, y, z) (x ^ y ^ z)
+#define F4(x, y, z) (y ^ (x | ~z))
+
+/* This is the central step in the MD5 algorithm. */
+#define MD5STEP(f, w, x, y, z, in, s) \
+  (w += f(x, y, z) + in, w = (w << s | w >> (32 - s)) + x)
+
+#if defined(__clang__) && defined(__has_attribute)
+#if __has_attribute(no_sanitize)
+#define VPX_NO_UNSIGNED_OVERFLOW_CHECK \
+  __attribute__((no_sanitize("unsigned-integer-overflow")))
+#endif
+#endif
+
+#ifndef VPX_NO_UNSIGNED_OVERFLOW_CHECK
+#define VPX_NO_UNSIGNED_OVERFLOW_CHECK
+#endif
+
+/*
+ * The core of the MD5 algorithm, this alters an existing MD5 hash to
+ * reflect the addition of 16 longwords of new data. MD5Update blocks
+ * the data and converts bytes into longwords for this routine.
+ */
+VPX_NO_UNSIGNED_OVERFLOW_CHECK void MD5Transform(UWORD32 buf[4],
+                                                 UWORD32 const in[16]) {
+  UWORD32 a, b, c, d;
+
+  a = buf[0];
+  b = buf[1];
+  c = buf[2];
+  d = buf[3];
+
+  MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7);
+  MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12);
+  MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17);
+  MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22);
+  MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7);
+  MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12);
+  MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17);
+  MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22);
+  MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7);
+  MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12);
+  MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17);
+  MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22);
+  MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7);
+  MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12);
+  MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17);
+  MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22);
+
+  MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5);
+  MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9);
+  MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14);
+  MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20);
+  MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5);
+  MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9);
+  MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14);
+  MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20);
+  MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5);
+  MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9);
+  MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14);
+  MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20);
+  MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5);
+  MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9);
+  MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14);
+  MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20);
+
+  MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4);
+  MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11);
+  MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16);
+  MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23);
+  MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4);
+  MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11);
+  MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16);
+  MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23);
+  MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4);
+  MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11);
+  MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16);
+  MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23);
+  MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4);
+  MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11);
+  MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16);
+  MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23);
+
+  MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6);
+  MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10);
+  MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15);
+  MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21);
+  MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6);
+  MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10);
+  MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15);
+  MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21);
+  MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6);
+  MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10);
+  MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15);
+  MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21);
+  MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6);
+  MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10);
+  MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15);
+  MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21);
+
+  buf[0] += a;
+  buf[1] += b;
+  buf[2] += c;
+  buf[3] += d;
+}
+
+#undef VPX_NO_UNSIGNED_OVERFLOW_CHECK
+
+#endif
diff --git a/tests/third_party/libvpx/md5_utils.h b/tests/third_party/libvpx/md5_utils.h
new file mode 100644
index 0000000..13be035
--- /dev/null
+++ b/tests/third_party/libvpx/md5_utils.h
@@ -0,0 +1,41 @@
+/*
+ * This is the header file for the MD5 message-digest algorithm.
+ * The algorithm is due to Ron Rivest. This code was
+ * written by Colin Plumb in 1993, no copyright is claimed.
+ * This code is in the public domain; do with it what you wish.
+ *
+ * Equivalent code is available from RSA Data Security, Inc.
+ * This code has been tested against that, and is equivalent,
+ * except that you don't need to include two pages of legalese
+ * with every copy.
+ *
+ * To compute the message digest of a chunk of bytes, declare an
+ * MD5Context structure, pass it to MD5Init, call MD5Update as
+ * needed on buffers full of bytes, and then call MD5Final, which
+ * will fill a supplied 16-byte array with the digest.
+ *
+ * Changed so as no longer to depend on Colin Plumb's `usual.h'
+ * header definitions
+ *  - Ian Jackson.
+ * Still in the public domain.
+ */
+
+#ifndef LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_MD5_UTILS_H_
+#define LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_MD5_UTILS_H_
+
+#define md5byte unsigned char
+#define UWORD32 unsigned int
+
+typedef struct MD5Context MD5Context;
+struct MD5Context {
+  UWORD32 buf[4];
+  UWORD32 bytes[2];
+  UWORD32 in[16];
+};
+
+void MD5Init(struct MD5Context *context);
+void MD5Update(struct MD5Context *context, md5byte const *buf, unsigned len);
+void MD5Final(unsigned char digest[16], struct MD5Context *context);
+void MD5Transform(UWORD32 buf[4], UWORD32 const in[16]);
+
+#endif  // LIBGAV1_TESTS_THIRD_PARTY_LIBVPX_MD5_UTILS_H_
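The usage comment in md5_utils.h spells out the whole API contract; as a
minimal sketch (the input bytes are illustrative, not from this patch):

  MD5Context ctx;
  MD5Init(&ctx);
  const unsigned char data[] = {'a', 'b', 'c'};
  MD5Update(&ctx, data, 3);
  unsigned char digest[16];
  MD5Final(digest, &ctx);  // digest now holds the 16-byte MD5 of "abc".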
diff --git a/tests/utils.cc b/tests/utils.cc
new file mode 100644
index 0000000..e91ea87
--- /dev/null
+++ b/tests/utils.cc
@@ -0,0 +1,197 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tests/utils.h"
+
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <string>
+
+#include "absl/strings/string_view.h"
+#include "absl/time/time.h"
+#include "gtest/gtest.h"
+#include "src/dsp/dsp.h"
+#include "src/gav1/decoder_buffer.h"
+#include "src/utils/constants.h"
+#include "tests/third_party/libvpx/md5_helper.h"
+
+namespace libgav1 {
+namespace test_utils {
+namespace {
+
+int CloseFile(FILE* stream) { return fclose(stream); }
+
+bool ReadFileToString(absl::string_view file_name, std::string* const string) {
+  using FilePtr = std::unique_ptr<FILE, decltype(&CloseFile)>;
+  FilePtr file(fopen(std::string(file_name).c_str(), "rb"), &CloseFile);
+  if (file == nullptr) return false;
+
+  do {
+    int c = fgetc(file.get());
+    if (ferror(file.get()) != 0) return false;
+
+    if (c != EOF) {
+      string->append(1, static_cast<char>(c));
+    } else {
+      break;
+    }
+  } while (true);
+
+  return true;
+}
+
+}  // namespace
+
+void ResetDspTable(const int bitdepth) {
+  dsp::Dsp* const dsp = dsp_internal::GetWritableDspTable(bitdepth);
+  ASSERT_NE(dsp, nullptr);
+  memset(dsp, 0, sizeof(dsp::Dsp));
+}
+
+std::string GetMd5Sum(const void* bytes, size_t size) {
+  libvpx_test::MD5 md5;
+  md5.Add(static_cast<const uint8_t*>(bytes), size);
+  return md5.Get();
+}
+
+template <typename Pixel>
+std::string GetMd5Sum(const Pixel* block, int width, int height, int stride) {
+  libvpx_test::MD5 md5;
+  const Pixel* row = block;
+  for (int i = 0; i < height; ++i) {
+    md5.Add(reinterpret_cast<const uint8_t*>(row), width * sizeof(Pixel));
+    row += stride;
+  }
+  return md5.Get();
+}
+
+template std::string GetMd5Sum(const int8_t* block, int width, int height,
+                               int stride);
+template std::string GetMd5Sum(const int16_t* block, int width, int height,
+                               int stride);
+
+std::string GetMd5Sum(const DecoderBuffer& buffer) {
+  libvpx_test::MD5 md5;
+  const size_t pixel_size =
+      (buffer.bitdepth == 8) ? sizeof(uint8_t) : sizeof(uint16_t);
+  for (int plane = kPlaneY; plane < buffer.NumPlanes(); ++plane) {
+    const int height = buffer.displayed_height[plane];
+    const size_t width = buffer.displayed_width[plane] * pixel_size;
+    const int stride = buffer.stride[plane];
+    const uint8_t* plane_buffer = buffer.plane[plane];
+    for (int row = 0; row < height; ++row) {
+      md5.Add(plane_buffer, width);
+      plane_buffer += stride;
+    }
+  }
+  return md5.Get();
+}
+
+void CheckMd5Digest(const char name[], const char function_name[],
+                    const char expected_digest[], const void* data,
+                    size_t size, absl::Duration elapsed_time) {
+  const std::string digest = test_utils::GetMd5Sum(data, size);
+  printf("Mode %s[%31s]: %5d us MD5: %s\n", name, function_name,
+         static_cast<int>(absl::ToInt64Microseconds(elapsed_time)),
+         digest.c_str());
+  EXPECT_STREQ(expected_digest, digest.c_str());
+}
+
+template <typename Pixel>
+void CheckMd5Digest(const char name[], const char function_name[],
+                    const char expected_digest[], const Pixel* block,
+                    int width, int height, int stride,
+                    absl::Duration elapsed_time) {
+  const std::string digest =
+      test_utils::GetMd5Sum(block, width, height, stride);
+  printf("Mode %s[%31s]: %5d us MD5: %s\n", name, function_name,
+         static_cast<int>(absl::ToInt64Microseconds(elapsed_time)),
+         digest.c_str());
+  EXPECT_STREQ(expected_digest, digest.c_str());
+}
+
+template void CheckMd5Digest(const char name[], const char function_name[],
+                             const char expected_digest[],
+                             const int8_t* block, int width, int height,
+                             int stride, absl::Duration elapsed_time);
+template void CheckMd5Digest(const char name[], const char function_name[],
+                             const char expected_digest[],
+                             const int16_t* block, int width, int height,
+                             int stride, absl::Duration elapsed_time);
+
+void CheckMd5Digest(const char name[], const char function_name[],
+                    const char expected_digest[], const char actual_digest[],
+                    absl::Duration elapsed_time) {
+  printf("Mode %s[%31s]: %5d us MD5: %s\n", name, function_name,
+         static_cast<int>(absl::ToInt64Microseconds(elapsed_time)),
+         actual_digest);
+  EXPECT_STREQ(expected_digest, actual_digest);
+}
+
+namespace {
+
+std::string GetSourceDir() {
+#if defined(__ANDROID__)
+  // Test files must be manually supplied. This path is frequently
+  // available on development devices.
+  return std::string("/data/local/tmp/tests/data");
+#elif defined(LIBGAV1_FLAGS_SRCDIR)
+  return std::string(LIBGAV1_FLAGS_SRCDIR) + "/tests/data";
+#else
+  return std::string(".");
+#endif  // defined(__ANDROID__)
+}
+
+std::string GetTempDir() {
+  const char* path = getenv("TMPDIR");
+  if (path == nullptr || path[0] == '\0') path = getenv("TEMP");
+  if (path != nullptr && path[0] != '\0') return std::string(path);
+
+#if defined(__ANDROID__)
+  return std::string("/data/local/tmp");
+#elif defined(LIBGAV1_FLAGS_TMPDIR)
+  return std::string(LIBGAV1_FLAGS_TMPDIR);
+#else
+  return std::string(".");
+#endif  // defined(__ANDROID__)
+}
+
+}  // namespace
+
+std::string GetTestInputFilePath(absl::string_view file_name) {
+  const char* const path = getenv("LIBGAV1_TEST_DATA_PATH");
+  if (path != nullptr && path[0] != '\0') {
+    return std::string(path) + "/" + std::string(file_name);
+  }
+  return GetSourceDir() + "/" + std::string(file_name);
+}
+
+std::string GetTestOutputFilePath(absl::string_view file_name) {
+  return GetTempDir() + "/" + std::string(file_name);
+}
+
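+// A sketch of typical use (the file name below is purely illustrative; real
+// callers pass names of files under tests/data or previously written output):
+//
+//   std::string contents;
+//   GetTestData("decoded_frame.yuv", /*is_output_file=*/true, &contents);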
+void GetTestData(absl::string_view file_name, const bool is_output_file,
+                 std::string* const output) {
+  ASSERT_NE(output, nullptr);
+  const std::string absolute_file_path =
+      is_output_file ? GetTestOutputFilePath(file_name)
+                     : GetTestInputFilePath(file_name);
+
+  ASSERT_TRUE(ReadFileToString(absolute_file_path, output));
+}
+
+}  // namespace test_utils
+}  // namespace libgav1
diff --git a/tests/utils.h b/tests/utils.h
new file mode 100644
index 0000000..4d73070
--- /dev/null
+++ b/tests/utils.h
@@ -0,0 +1,157 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_TESTS_UTILS_H_
+#define LIBGAV1_TESTS_UTILS_H_
+
+#include <cstddef>
+#include <new>
+#include <string>
+
+#include "absl/base/config.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/time.h"
+#include "src/gav1/decoder_buffer.h"
+#include "src/utils/memory.h"
+#include "tests/third_party/libvpx/acm_random.h"
+
+#ifdef ABSL_HAVE_EXCEPTIONS
+#include <exception>
+#endif
+
+namespace libgav1 {
+namespace test_utils {
+
+enum { kAlternateDeterministicSeed = 0x9571 };
+static_assert(kAlternateDeterministicSeed !=
+                  libvpx_test::ACMRandom::DeterministicSeed(),
+              "");
+
+// Similar to libgav1::MaxAlignedAllocable, but retains the throwing versions
+// of new to support googletest allocations.
+struct MaxAlignedAllocable {
+  // Class-specific allocation functions.
+  static void* operator new(size_t size) {
+    void* const p =
+        libgav1::MaxAlignedAllocable::operator new(size, std::nothrow);
+#ifdef ABSL_HAVE_EXCEPTIONS
+    if (p == nullptr) throw std::bad_alloc();
+#endif
+    return p;
+  }
+  static void* operator new[](size_t size) {
+    void* const p =
+        libgav1::MaxAlignedAllocable::operator new[](size, std::nothrow);
+#ifdef ABSL_HAVE_EXCEPTIONS
+    if (p == nullptr) throw std::bad_alloc();
+#endif
+    return p;
+  }
+
+  // Class-specific non-throwing allocation functions
+  static void* operator new(size_t size, const std::nothrow_t& tag) noexcept {
+    return libgav1::MaxAlignedAllocable::operator new(size, tag);
+  }
+  static void* operator new[](size_t size,
+                              const std::nothrow_t& tag) noexcept {
+    return libgav1::MaxAlignedAllocable::operator new[](size, tag);
+  }
+
+  // Class-specific deallocation functions.
+  static void operator delete(void* ptr) noexcept {
+    libgav1::MaxAlignedAllocable::operator delete(ptr);
+  }
+  static void operator delete[](void* ptr) noexcept {
+    libgav1::MaxAlignedAllocable::operator delete[](ptr);
+  }
+
+  // Only called if new (std::nothrow) is used and the constructor throws an
+  // exception.
+  static void operator delete(void* ptr, const std::nothrow_t& tag) noexcept {
+    libgav1::MaxAlignedAllocable::operator delete(ptr, tag);
+  }
+  // Only called if new[] (std::nothrow) is used and the constructor throws an
+  // exception.
+  static void operator delete[](void* ptr,
+                                const std::nothrow_t& tag) noexcept {
+    libgav1::MaxAlignedAllocable::operator delete[](ptr, tag);
+  }
+};
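+
+// A hypothetical usage sketch: a test type derives from MaxAlignedAllocable,
+// after which both plain and nothrow forms of new return storage aligned to
+// kMaxAlignment:
+//
+//   struct AlignedProbe : public MaxAlignedAllocable {
+//     uint8_t data[64];
+//   };
+//   AlignedProbe* const probe = new (std::nothrow) AlignedProbe;
+//   delete probe;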
+
+// Clears dsp table entries for |bitdepth|. This function is not thread safe.
+void ResetDspTable(int bitdepth);
+
+//------------------------------------------------------------------------------
+// Gets a human-readable, hexadecimal-encoded MD5 sum from the given data,
+// block, or frame buffer.
+
+std::string GetMd5Sum(const void* bytes, size_t size);
+template <typename Pixel>
+std::string GetMd5Sum(const Pixel* block, int width, int height, int stride);
+std::string GetMd5Sum(const DecoderBuffer& buffer);
+
+//------------------------------------------------------------------------------
+// Compares the md5 digest of |size| bytes of |data| with |expected_digest|.
+// Prints a log message with |name|, |function_name|, md5 digest and
+// |elapsed_time|. |name| and |function_name| are merely tags used for logging
+// and can be any meaningful string depending on the caller's context.
+
+void CheckMd5Digest(const char name[], const char function_name[],
+                    const char expected_digest[], const void* data,
+                    size_t size, absl::Duration elapsed_time);
+
+//------------------------------------------------------------------------------
+// Compares the md5 digest of |block| with |expected_digest|. The width,
+// height, and stride of |block| are |width|, |height|, and |stride|,
+// respectively. Prints a log message with |name|, |function_name|, md5 digest
+// and |elapsed_time|. |name| and |function_name| are merely tags used for
+// logging and can be any meaningful string depending on the caller's context.
+
+template <typename Pixel>
+void CheckMd5Digest(const char name[], const char function_name[],
+                    const char expected_digest[], const Pixel* block,
+                    int width, int height, int stride,
+                    absl::Duration elapsed_time);
+
+//------------------------------------------------------------------------------
+// Compares |actual_digest| with |expected_digest|. Prints a log message with
+// |name|, |function_name|, md5 digest and |elapsed_time|. |name| and
+// |function_name| are merely tags used for logging and can be any meaningful
+// string depending on the caller's context.
+
+void CheckMd5Digest(const char name[], const char function_name[],
+                    const char expected_digest[], const char actual_digest[],
+                    absl::Duration elapsed_time);
+
+//------------------------------------------------------------------------------
+// Reads the test data from |file_name| as a string into |output|. The
+// |is_output_file| argument controls the expansion of |file_name| to its full
+// path. When |is_output_file| is true GetTestData() reads from
+// utils.cc::GetTempDir(), and when it is false the file is read from
+// utils.cc::GetSourceDir().
+void GetTestData(absl::string_view file_name, bool is_output_file,
+                 std::string* output);
+
+//------------------------------------------------------------------------------
+// Returns the full path to |file_name| from libgav1/tests/data.
+std::string GetTestInputFilePath(absl::string_view file_name);
+
+//------------------------------------------------------------------------------
+// Returns the full path to |file_name| in a location where the file can be
+// opened for writing.
+std::string GetTestOutputFilePath(absl::string_view file_name);
+
+}  // namespace test_utils
+}  // namespace libgav1
+
+#endif  // LIBGAV1_TESTS_UTILS_H_
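These declarations pair naturally in a caller; a hedged sketch (the buffer,
digest string, and timing below are placeholders, not real expectations):

  const uint8_t buffer[64] = {};
  test_utils::CheckMd5Digest("ExampleMode", "ExampleFunction",
                             "expected-md5-hex-digest",  // placeholder
                             buffer, sizeof(buffer), absl::Microseconds(123));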
diff --git a/tests/utils_test.cc b/tests/utils_test.cc
new file mode 100644
index 0000000..1d5b598
--- /dev/null
+++ b/tests/utils_test.cc
@@ -0,0 +1,190 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tests/utils.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <new>
+
+#include "absl/base/config.h"
+#include "gtest/gtest.h"
+#include "src/utils/memory.h"
+
+#ifdef ABSL_HAVE_EXCEPTIONS
+#include <exception>
+#endif
+
+namespace libgav1 {
+namespace test_utils {
+namespace {
+
+constexpr size_t kMaxAllocableSize = 0x40000000;
+
+// Has a trivial default constructor that performs no action.
+struct SmallMaxAligned : public MaxAlignedAllocable {
+  alignas(kMaxAlignment) uint8_t x;
+};
+
+// Has a nontrivial default constructor that initializes the data member.
+struct SmallMaxAlignedNontrivialConstructor : public MaxAlignedAllocable {
+  alignas(kMaxAlignment) uint8_t x = 0;
+};
+
+// Has a trivial default constructor that performs no action.
+struct HugeMaxAligned : public MaxAlignedAllocable {
+  alignas(kMaxAlignment) uint8_t x[kMaxAllocableSize + 1];
+};
+
+// Has a nontrivial default constructor that initializes the data member.
+struct HugeMaxAlignedNontrivialConstructor : public MaxAlignedAllocable {
+  alignas(kMaxAlignment) uint8_t x[kMaxAllocableSize + 1] = {};
+};
+
+#ifdef ABSL_HAVE_EXCEPTIONS
+struct MaxAlignedThrowingConstructor : public MaxAlignedAllocable {
+  MaxAlignedThrowingConstructor() { throw std::exception(); }
+
+  uint8_t x;
+};
+#endif
+
+TEST(TestUtilsTest, TestMaxAlignedAllocable) {
+  {
+    // MaxAlignedAllocable::operator new (std::nothrow) is called.
+    std::unique_ptr<SmallMaxAligned> small(new (std::nothrow)
+                                               SmallMaxAligned);
+    EXPECT_NE(small, nullptr);
+    // Note this check doesn't guarantee conformance as a suitably aligned
+    // address may be returned from any allocator.
+    EXPECT_EQ(reinterpret_cast<uintptr_t>(small.get()) & (kMaxAlignment - 1),
+              0);
+    // MaxAlignedAllocable::operator delete is called.
+  }
+
+  {
+    // MaxAlignedAllocable::operator new is called.
+    std::unique_ptr<SmallMaxAligned> small(new SmallMaxAligned);
+    EXPECT_NE(small, nullptr);
+    // Note this check doesn't guarantee conformance as a suitably aligned
+    // address may be returned from any allocator.
+    EXPECT_EQ(reinterpret_cast<uintptr_t>(small.get()) & (kMaxAlignment - 1),
+              0);
+    // MaxAlignedAllocable::operator delete is called.
+  }
+
+  {
+    // MaxAlignedAllocable::operator new[] (std::nothrow) is called.
+    std::unique_ptr<SmallMaxAligned[]> small_array_of_smalls(
+        new (std::nothrow) SmallMaxAligned[10]);
+    EXPECT_NE(small_array_of_smalls, nullptr);
+    EXPECT_EQ(reinterpret_cast<uintptr_t>(small_array_of_smalls.get()) &
+                  (kMaxAlignment - 1),
+              0);
+    // MaxAlignedAllocable::operator delete[] is called.
+  }
+
+  {
+    // MaxAlignedAllocable::operator new[] is called.
+    std::unique_ptr<SmallMaxAligned[]> small_array_of_smalls(
+        new SmallMaxAligned[10]);
+    EXPECT_NE(small_array_of_smalls, nullptr);
+    EXPECT_EQ(reinterpret_cast<uintptr_t>(small_array_of_smalls.get()) &
+                  (kMaxAlignment - 1),
+              0);
+    // MaxAlignedAllocable::operator delete[] is called.
+  }
+
+  {
+    // MaxAlignedAllocable::operator new (std::nothrow) is called.
+    std::unique_ptr<HugeMaxAligned> huge(new (std::nothrow) HugeMaxAligned);
+    EXPECT_EQ(huge, nullptr);
+  }
+
+  {
+    // MaxAlignedAllocable::operator new[] (std::nothrow) is called.
+    std::unique_ptr<SmallMaxAligned[]> huge_array_of_smalls(
+        new (std::nothrow)
+            SmallMaxAligned[kMaxAllocableSize / sizeof(SmallMaxAligned) + 1]);
+    EXPECT_EQ(huge_array_of_smalls, nullptr);
+  }
+
+#ifdef ABSL_HAVE_EXCEPTIONS
+  try {
+    // MaxAlignedAllocable::operator new (std::nothrow) is called.
+    // The constructor throws an exception.
+    // MaxAlignedAllocable::operator delete (std::nothrow) is called.
+    auto* always = new (std::nothrow) MaxAlignedThrowingConstructor;
+    static_cast<void>(always);
+  } catch (...) {
+  }
+
+  try {
+    // MaxAlignedAllocable::operator new is called.
+    // The constructor throws an exception.
+    // MaxAlignedAllocable::operator delete is called.
+    auto* always = new MaxAlignedThrowingConstructor;
+    static_cast<void>(always);
+  } catch (...) {
+  }
+
+  try {
+    // MaxAlignedAllocable::operator new[] (std::nothrow) is called.
+    // The constructor throws an exception.
+    // MaxAlignedAllocable::operator delete[] (std::nothrow) is called.
+    auto* always = new (std::nothrow) MaxAlignedThrowingConstructor[2];
+    static_cast<void>(always);
+  } catch (...) {
+  }
+
+  try {
+    // MaxAlignedAllocable::operator new[] is called.
+    // The constructor throws an exception.
+    // MaxAlignedAllocable::operator delete[] is called.
+    auto* always = new MaxAlignedThrowingConstructor[2];
+    static_cast<void>(always);
+  } catch (...) {
+  }
+
+  // Note these calls are only safe with exceptions enabled: if the throwing
+  // operator new returns, the object is expected to be valid. In this case an
+  // attempt to invoke the object's constructor on a nullptr may be made, which
+  // is undefined behavior.
+  try {
+    // MaxAlignedAllocable::operator new is called.
+    std::unique_ptr<HugeMaxAlignedNontrivialConstructor> huge(
+        new HugeMaxAlignedNontrivialConstructor);
+    ADD_FAILURE() << "huge allocation should fail.";
+  } catch (...) {
+    SUCCEED();
+  }
+
+  try {
+    // MaxAlignedAllocable::operator new[] is called.
+    std::unique_ptr<SmallMaxAlignedNontrivialConstructor[]>
+        huge_array_of_smalls(
+            new SmallMaxAlignedNontrivialConstructor
+                [kMaxAllocableSize /
+                     sizeof(SmallMaxAlignedNontrivialConstructor) +
+                 1]);
+    ADD_FAILURE() << "huge_array_of_smalls allocation should fail.";
+  } catch (...) {
+    SUCCEED();
+  }
+#endif  // ABSL_HAVE_EXCEPTIONS
+}
+
+}  // namespace
+}  // namespace test_utils
+}  // namespace libgav1
-- 
2.30.2